No OneTemporary
Actions

Size

4 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c (revision 332525)
	@@ -1,4186 +1,4756 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Nexenta Systems, Inc.
	*/

	#include <stdio.h>
	#include <unistd.h>
	#include <stdio_ext.h>
	#include <stdlib.h>
	#include <ctype.h>
	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/dmu.h>
	#include <sys/zap.h>
	#include <sys/fs/zfs.h>
	#include <sys/zfs_znode.h>
	#include <sys/zfs_sa.h>
	#include <sys/sa.h>
	#include <sys/sa_impl.h>
	#include <sys/vdev.h>
	#include <sys/vdev_impl.h>
	#include <sys/metaslab_impl.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_pool.h>
	#include <sys/dbuf.h>
	#include <sys/zil.h>
	#include <sys/zil_impl.h>
	#include <sys/stat.h>
	#include <sys/resource.h>
	#include <sys/dmu_traverse.h>
	#include <sys/zio_checksum.h>
	#include <sys/zio_compress.h>
	#include <sys/zfs_fuid.h>
	#include <sys/arc.h>
	#include <sys/ddt.h>
	#include <sys/zfeature.h>
	#include <sys/abd.h>
	#include <sys/blkptr.h>
	#include <zfs_comutil.h>
	#include <libcmdutils.h>
	#undef verify
	#include <libzfs.h>

	#include "zdb.h"

	/*
	 * Bounds-checked name lookups.  Each macro yields the name stored in the
	 * corresponding table when idx is below the table's entry count and the
	 * literal "UNKNOWN" otherwise.  ZDB_OT_NAME additionally falls back to the
	 * dmu_ot_byteswap[] name when idx is not below DMU_OT_NUMTYPES but
	 * DMU_OT_IS_VALID(idx) holds.
	 */
	#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
	#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
	#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \
	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
	/*
	 * ZDB_OT_TYPE maps idx to itself when it is below DMU_OT_NUMTYPES.  The
	 * DMU_OTN_ZAP_* values collapse to DMU_OT_ZAP_OTHER; in the "+" revision
	 * the DMU_OTN_UINT64_* values also collapse to DMU_OT_UINT64_OTHER.
	 * Anything else yields DMU_OT_NUMTYPES.
	 */
	#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \
	- (((idx) == DMU_OTN_ZAP_DATA \|\| (idx) == DMU_OTN_ZAP_METADATA) ? \
	- DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
	+ (idx) == DMU_OTN_ZAP_DATA \|\| (idx) == DMU_OTN_ZAP_METADATA ? \
	+ DMU_OT_ZAP_OTHER : \
	+ (idx) == DMU_OTN_UINT64_DATA \|\| (idx) == DMU_OTN_UINT64_METADATA ? \
	+ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)

	/*
	 * Variables shared with other modules.  Normal builds only declare them
	 * extern; builds with "lint" defined get local definitions instead.
	 */
	#ifndef lint
	extern int reference_tracking_enable;
	extern boolean_t zfs_recover;
	extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
	extern int zfs_vdev_async_read_max_active;
	#else
	int reference_tracking_enable;
	boolean_t zfs_recover;
	uint64_t zfs_arc_max, zfs_arc_meta_limit;
	int zfs_vdev_async_read_max_active;
	#endif

	static const char cmdname[] = "zdb";
	uint8_t dump_opt[256];

	typedef void object_viewer_t(objset_t , uint64_t, void data, size_t size);

	static uint64_t *zopt_object = NULL;
	static unsigned zopt_objects = 0;
	static libzfs_handle_t *g_zfs;
	static uint64_t max_inflight = 1000;

	static void snprintf_blkptr_compact(char , size_t, const blkptr_t );

	/*
	 * These libumem hooks provide a reasonable set of defaults for the allocator's
	 * debugging facilities.
	 */
	const char *
	_umem_debug_init(void)
	{
		return ("default,verbose"); /* $UMEM_DEBUG setting */
	}

	/* libumem hook: returns the default $UMEM_LOGGING setting string. */
	const char *
	_umem_logging_init(void)
	{
		static const char logging_setting[] = "fail,contents";

		return (logging_setting);
	}

	/*
	 * Print the command synopsis and option summary to stderr, then exit with
	 * status 1.  Does not return.
	 */
	static void
	usage(void)
	{
		(void) fprintf(stderr,
		    "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
		    "[-I <inflight I/Os>]\n"
		    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
		    "\t\t[<poolname> [<object> ...]]\n"
		    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
		    "[<object> ...]\n"
		    "\t%s -C [-A] [-U <cache>]\n"
		    "\t%s -l [-Aqu] <device>\n"
		    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
		    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
		    "\t%s -O <dataset> <path>\n"
		    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
		    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
		    "\t%s -E [-A] word0:word1:...:word15\n"
		    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
		    "<poolname>\n\n",
		    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
		    cmdname, cmdname);

		(void) fprintf(stderr, " Dataset name must include at least one "
		    "separator character '/' or '@'\n");
		(void) fprintf(stderr, " If dataset name is specified, only that "
		    "dataset is dumped\n");
		(void) fprintf(stderr, " If object numbers are specified, only "
		    "those objects are dumped\n\n");
		(void) fprintf(stderr, " Options to control amount of output:\n");
		(void) fprintf(stderr, " -b block statistics\n");
		(void) fprintf(stderr, " -c checksum all metadata (twice for "
		    "all data) blocks\n");
		(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
		(void) fprintf(stderr, " -d dataset(s)\n");
		(void) fprintf(stderr, " -D dedup statistics\n");
		(void) fprintf(stderr, " -E decode and display block from an "
		    "embedded block pointer\n");
		(void) fprintf(stderr, " -h pool history\n");
		(void) fprintf(stderr, " -i intent logs\n");
		(void) fprintf(stderr, " -l read label contents\n");
		(void) fprintf(stderr, " -L disable leak tracking (do not "
		    "load spacemaps)\n");
		(void) fprintf(stderr, " -m metaslabs\n");
		(void) fprintf(stderr, " -M metaslab groups\n");
		(void) fprintf(stderr, " -O perform object lookups by path\n");
		(void) fprintf(stderr, " -R read and display block from a "
		    "device\n");
		(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
		(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
		(void) fprintf(stderr, " -v verbose (applies to all "
		    "others)\n\n");
		(void) fprintf(stderr, " Below options are intended for use "
		    "with other options:\n");
		(void) fprintf(stderr, " -A ignore assertions (-A), enable "
		    "panic recovery (-AA) or both (-AAA)\n");
		(void) fprintf(stderr, " -e pool is exported/destroyed/"
		    "has altroot/not in a cachefile\n");
		(void) fprintf(stderr, " -F attempt automatic rewind within "
		    "safe range of transaction groups\n");
		(void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
		    "exiting\n");
		/* The stated default must track the initializer of max_inflight. */
		(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
		    "specify the maximum number of "
		    "checksumming I/Os [default is 1000]\n");
		(void) fprintf(stderr, " -o <variable>=<value> set global "
		    "variable to an unsigned 32-bit integer value\n");
		(void) fprintf(stderr, " -p <path> -- use one or more with "
		    "-e to specify path to vdev dir\n");
		(void) fprintf(stderr, " -P print numbers in parseable form\n");
		(void) fprintf(stderr, " -q don't print label contents\n");
		(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
		    "searching for uberblocks\n");
		(void) fprintf(stderr, " -u uberblock\n");
		(void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
		    "cachefile\n");
		(void) fprintf(stderr, " -V do verbatim import\n");
		(void) fprintf(stderr, " -x <dumpdir> -- "
		    "dump all read blocks into specified directory\n");
		(void) fprintf(stderr, " -X attempt extreme rewind (does not "
		    "work with dataset)\n\n");
		(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
		    "to make only that option verbose\n");
		(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
		exit(1);
	}

	/*
	 * If the 'G' option was given, print a blank line and then the zfs_dbgmsg
	 * buffer via zfs_dbgmsg_print(); otherwise do nothing.
	 */
	static void
	dump_debug_buffer(void)
	{
		if (dump_opt['G']) {
			(void) printf("\n");
			zfs_dbgmsg_print("zdb");
		}
	}

	/*
	 * Report a usage error found after spa_open(), dmu_bonus_hold(), or
	 * pool_match() has been called; other errors go through abort().
	 *
	 * Writes "<cmdname>: <formatted message>\n" to stderr, dumps the debug
	 * buffer if requested, and exits with status 1.
	 */

	static void
	fatal(const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		(void) fprintf(stderr, "%s: ", cmdname);
		(void) vfprintf(stderr, fmt, args);
		va_end(args);
		(void) fprintf(stderr, "\n");

		dump_debug_buffer();

		exit(1);
	}

	/*
	 * Object viewer for packed nvlists.  "data" points at a uint64_t holding
	 * the packed size; that many bytes are read from the start of the object,
	 * unpacked, printed with dump_nvlist(), and released.
	 */
	/* ARGSUSED */
	static void
	dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
	{
		nvlist_t *nv;
		size_t nvsize = *(uint64_t *)data;
		char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

		VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

		VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

		/* The packed buffer is no longer needed once unpacked. */
		umem_free(packed, nvsize);

		dump_nvlist(nv, 8);

		nvlist_free(nv);
	}

	/*
	 * Object viewer that interprets "data" as a spa_history_phys_t and prints
	 * each of its fields.  Prints nothing when data is NULL.
	 */
	/* ARGSUSED */
	static void
	dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
	{
		spa_history_phys_t *shp = data;

		if (shp == NULL)
			return;

		(void) printf("\t\tpool_create_len = %llu\n",
		    (u_longlong_t)shp->sh_pool_create_len);
		(void) printf("\t\tphys_max_off = %llu\n",
		    (u_longlong_t)shp->sh_phys_max_off);
		(void) printf("\t\tbof = %llu\n",
		    (u_longlong_t)shp->sh_bof);
		(void) printf("\t\teof = %llu\n",
		    (u_longlong_t)shp->sh_eof);
		(void) printf("\t\trecords_lost = %llu\n",
		    (u_longlong_t)shp->sh_records_lost);
	}

	static void
	zdb_nicenum(uint64_t num, char *buf, size_t buflen)
	{
	if (dump_opt['P'])
	(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
	else
	nicenum(num, buf, sizeof (buf));
	}

	static const char histo_stars[] = "****************************************";
	static const uint64_t histo_width = sizeof (histo_stars) - 1;

	/*
	 * Print a bar chart of histo[0 .. size-1], one row per bucket from the
	 * first to the last non-zero bucket.  Each row shows the bucket index plus
	 * "offset", the count, and a bar of stars scaled against the largest count
	 * (or against histo_width when every count is smaller than that).
	 * Nothing is printed when size is not positive.
	 */
	static void
	dump_histogram(const uint64_t *histo, int size, int offset)
	{
		int i;
		int minidx = size - 1;
		int maxidx = 0;
		uint64_t max = 0;

		/* With no buckets the print loop below would read histo[-1]. */
		if (size <= 0)
			return;

		for (i = 0; i < size; i++) {
			if (histo[i] > max)
				max = histo[i];
			if (histo[i] > 0 && i > maxidx)
				maxidx = i;
			if (histo[i] > 0 && i < minidx)
				minidx = i;
		}

		/* Never scale a bar up: one star represents at least one count. */
		if (max < histo_width)
			max = histo_width;

		for (i = minidx; i <= maxidx; i++) {
			/* Index into the star string so the tail forms the bar. */
			(void) printf("\t\t\t%3u: %6llu %s\n",
			    (unsigned)(i + offset), (u_longlong_t)histo[i],
			    &histo_stars[(max - histo[i]) * histo_width / max]);
		}
	}

	/*
	 * Print the statistics zap_get_stats() reports for a ZAP object.  Nothing
	 * is printed if the statistics cannot be fetched.  A ZAP whose pointer
	 * table length is zero is reported as a microzap on a single line;
	 * otherwise the fat-ZAP pointer table, counters and histograms are printed.
	 */
	static void
	dump_zap_stats(objset_t *os, uint64_t object)
	{
		zap_stats_t stats;

		if (zap_get_stats(os, object, &stats) != 0)
			return;

		if (stats.zs_ptrtbl_len == 0) {
			ASSERT(stats.zs_num_blocks == 1);
			(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
			    (u_longlong_t)stats.zs_blocksize,
			    (u_longlong_t)stats.zs_num_entries);
			return;
		}

		(void) printf("\tFat ZAP stats:\n");

		(void) printf("\t\tPointer table:\n");
		(void) printf("\t\t\t%llu elements\n",
		    (u_longlong_t)stats.zs_ptrtbl_len);
		(void) printf("\t\t\tzt_blk: %llu\n",
		    (u_longlong_t)stats.zs_ptrtbl_zt_blk);
		(void) printf("\t\t\tzt_numblks: %llu\n",
		    (u_longlong_t)stats.zs_ptrtbl_zt_numblks);
		(void) printf("\t\t\tzt_shift: %llu\n",
		    (u_longlong_t)stats.zs_ptrtbl_zt_shift);
		(void) printf("\t\t\tzt_blks_copied: %llu\n",
		    (u_longlong_t)stats.zs_ptrtbl_blks_copied);
		(void) printf("\t\t\tzt_nextblk: %llu\n",
		    (u_longlong_t)stats.zs_ptrtbl_nextblk);

		(void) printf("\t\tZAP entries: %llu\n",
		    (u_longlong_t)stats.zs_num_entries);
		(void) printf("\t\tLeaf blocks: %llu\n",
		    (u_longlong_t)stats.zs_num_leafs);
		(void) printf("\t\tTotal blocks: %llu\n",
		    (u_longlong_t)stats.zs_num_blocks);
		(void) printf("\t\tzap_block_type: 0x%llx\n",
		    (u_longlong_t)stats.zs_block_type);
		(void) printf("\t\tzap_magic: 0x%llx\n",
		    (u_longlong_t)stats.zs_magic);
		(void) printf("\t\tzap_salt: 0x%llx\n",
		    (u_longlong_t)stats.zs_salt);

		/* The five histograms share one layout; print them from a table. */
		const struct {
			const char *heading;
			const uint64_t *buckets;
		} histograms[] = {
			{ "\t\tLeafs with 2^n pointers:\n",
			    stats.zs_leafs_with_2n_pointers },
			{ "\t\tBlocks with n*5 entries:\n",
			    stats.zs_blocks_with_n5_entries },
			{ "\t\tBlocks n/10 full:\n",
			    stats.zs_blocks_n_tenths_full },
			{ "\t\tEntries with n chunks:\n",
			    stats.zs_entries_using_n_chunks },
			{ "\t\tBuckets with n entries:\n",
			    stats.zs_buckets_with_n_entries },
		};

		for (size_t k = 0;
		    k < sizeof (histograms) / sizeof (histograms[0]); k++) {
			(void) printf("%s", histograms[k].heading);
			dump_histogram(histograms[k].buckets, ZAP_HISTOGRAM_SIZE, 0);
		}
	}

	/ARGSUSED/
	static void
	dump_none(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	/ARGSUSED/
	static void
	dump_unknown(objset_t os, uint64_t object, void data, size_t size)
	{
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
	}

	/ARGSUSED/
	static void
	dump_uint8(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	/ARGSUSED/
	static void
	dump_uint64(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	/ARGSUSED/
	static void
	dump_zap(objset_t os, uint64_t object, void data, size_t size)
	{
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	zap_cursor_retrieve(&zc, &attr) == 0;
	zap_cursor_advance(&zc)) {
	(void) printf("\t\t%s = ", attr.za_name);
	if (attr.za_num_integers == 0) {
	(void) printf("\n");
	continue;
	}
	prop = umem_zalloc(attr.za_num_integers *
	attr.za_integer_length, UMEM_NOFAIL);
	(void) zap_lookup(os, object, attr.za_name,
	attr.za_integer_length, attr.za_num_integers, prop);
	if (attr.za_integer_length == 1) {
	(void) printf("%s", (char *)prop);
	} else {
	for (i = 0; i < attr.za_num_integers; i++) {
	switch (attr.za_integer_length) {
	case 2:
	(void) printf("%u ",
	((uint16_t *)prop)[i]);
	break;
	case 4:
	(void) printf("%u ",
	((uint32_t *)prop)[i]);
	break;
	case 8:
	(void) printf("%lld ",
	(u_longlong_t)((int64_t *)prop)[i]);
	break;
	}
	}
	}
	(void) printf("\n");
	umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
	}

	/*
	 * Object viewer for bpobj objects.  "data" is interpreted as a
	 * bpobj_phys_t of "size" bytes; fields beyond the size actually present
	 * are not printed.  When the 'd' option was given at least five times,
	 * every block pointer stored in the object is read and printed as well.
	 */
	static void
	dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
	{
		bpobj_phys_t *bpop = data;
		char bytes[32], comp[32], uncomp[32];

		/* make sure the output won't get truncated */
		CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);

		if (bpop == NULL)
			return;

		zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
		zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
		zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

		(void) printf("\t\tnum_blkptrs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_blkptrs);
		(void) printf("\t\tbytes = %s\n", bytes);
		if (size >= BPOBJ_SIZE_V1) {
			(void) printf("\t\tcomp = %s\n", comp);
			(void) printf("\t\tuncomp = %s\n", uncomp);
		}
		if (size >= sizeof (*bpop)) {
			(void) printf("\t\tsubobjs = %llu\n",
			    (u_longlong_t)bpop->bpo_subobjs);
			(void) printf("\t\tnum_subobjs = %llu\n",
			    (u_longlong_t)bpop->bpo_num_subobjs);
		}

		if (dump_opt['d'] < 5)
			return;

		for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
			char blkbuf[BP_SPRINTF_LEN];
			blkptr_t bp;

			int err = dmu_read(os, object,
			    i * sizeof (bp), sizeof (bp), &bp, 0);
			if (err != 0) {
				/* err is an int, so print it with %d */
				(void) printf("got error %d from dmu_read\n", err);
				break;
			}
			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
			(void) printf("\t%s\n", blkbuf);
		}
	}

	/*
	 * Object viewer that reads the whole object (doi_max_offset bytes) as an
	 * array of uint64_t and prints every element up to and including the last
	 * non-zero one, one per line.
	 */
	/* ARGSUSED */
	static void
	dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
	{
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

		int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
		if (err != 0) {
			(void) printf("got error %d from dmu_read\n", err);
			kmem_free(subobjs, doi.doi_max_offset);
			return;
		}

		/* Trailing zero entries are not printed. */
		int64_t last_nonzero = -1;
		for (uint64_t i = 0;
		    i < doi.doi_max_offset / sizeof (uint64_t); i++) {
			if (subobjs[i] != 0)
				last_nonzero = i;
		}

		for (int64_t i = 0; i <= last_nonzero; i++) {
			(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
		}
		kmem_free(subobjs, doi.doi_max_offset);
	}

	/ARGSUSED/
	static void
	dump_ddt_zap(objset_t os, uint64_t object, void data, size_t size)
	{
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
	}

	/ARGSUSED/
	static void
	dump_sa_attrs(objset_t os, uint64_t object, void data, size_t size)
	{
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	zap_cursor_retrieve(&zc, &attr) == 0;
	zap_cursor_advance(&zc)) {
	(void) printf("\t\t%s = ", attr.za_name);
	if (attr.za_num_integers == 0) {
	(void) printf("\n");
	continue;
	}
	(void) printf(" %llx : [%d:%d:%d]\n",
	(u_longlong_t)attr.za_first_integer,
	(int)ATTR_LENGTH(attr.za_first_integer),
	(int)ATTR_BSWAP(attr.za_first_integer),
	(int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
	}

	/ARGSUSED/
	static void
	dump_sa_layouts(objset_t os, uint64_t object, void data, size_t size)
	{
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	zap_cursor_retrieve(&zc, &attr) == 0;
	zap_cursor_advance(&zc)) {
	(void) printf("\t\t%s = [", attr.za_name);
	if (attr.za_num_integers == 0) {
	(void) printf("\n");
	continue;
	}

	VERIFY(attr.za_integer_length == 2);
	layout_attrs = umem_zalloc(attr.za_num_integers *
	attr.za_integer_length, UMEM_NOFAIL);

	VERIFY(zap_lookup(os, object, attr.za_name,
	attr.za_integer_length,
	attr.za_num_integers, layout_attrs) == 0);

	for (i = 0; i != attr.za_num_integers; i++)
	(void) printf(" %d ", (int)layout_attrs[i]);
	(void) printf("]\n");
	umem_free(layout_attrs,
	attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
	}

	/ARGSUSED/
	static void
	dump_zpldir(objset_t os, uint64_t object, void data, size_t size)
	{
	zap_cursor_t zc;
	zap_attribute_t attr;
	const char *typenames[] = {
	/* 0 */ "not specified",
	/* 1 */ "FIFO",
	/* 2 */ "Character Device",
	/* 3 */ "3 (invalid)",
	/* 4 */ "Directory",
	/* 5 */ "5 (invalid)",
	/* 6 */ "Block Device",
	/* 7 */ "7 (invalid)",
	/* 8 */ "Regular File",
	/* 9 */ "9 (invalid)",
	/* 10 */ "Symbolic Link",
	/* 11 */ "11 (invalid)",
	/* 12 */ "Socket",
	/* 13 */ "Door",
	/* 14 */ "Event Port",
	/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	zap_cursor_retrieve(&zc, &attr) == 0;
	zap_cursor_advance(&zc)) {
	(void) printf("\t\t%s = %lld (type: %s)\n",
	attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
	typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
	}

	/*
	 * Count, over the vdev tree rooted at vd, the leaf vdevs whose DTL space
	 * map exists and has a dbuf exactly sizeof (space_map_phys_t) bytes long.
	 */
	static int
	get_dtl_refcount(vdev_t *vd)
	{
		if (!vd->vdev_ops->vdev_op_leaf) {
			int total = 0;

			for (unsigned child = 0; child < vd->vdev_children; child++)
				total += get_dtl_refcount(vd->vdev_child[child]);
			return (total);
		}

		space_map_t *dtl_sm = vd->vdev_dtl_sm;

		if (dtl_sm == NULL)
			return (0);
		return (dtl_sm->sm_dbuf->db_size == sizeof (space_map_phys_t) ?
		    1 : 0);
	}

	/*
	 * Count, over the vdev tree rooted at vd, the metaslab space maps that
	 * exist and whose dbuf is exactly sizeof (space_map_phys_t) bytes long.
	 * Metaslabs are only examined on top-level vdevs (vdev_top == vd).  The
	 * "-" revision also skipped vdevs with vdev_removing set; the "+"
	 * revision drops that condition and widens the loop index to uint64_t.
	 */
	static int
	get_metaslab_refcount(vdev_t *vd)
	{
	int refcount = 0;

	- if (vd->vdev_top == vd && !vd->vdev_removing) {
	- for (unsigned m = 0; m < vd->vdev_ms_count; m++) {
	+ if (vd->vdev_top == vd) {
	+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
	space_map_t *sm = vd->vdev_ms[m]->ms_sm;

	if (sm != NULL &&
	sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
	refcount++;
	}
	}
	for (unsigned c = 0; c < vd->vdev_children; c++)
	refcount += get_metaslab_refcount(vd->vdev_child[c]);

	return (refcount);
	}

	/*
	 * Count, over the vdev tree rooted at vd, the top-level vdevs that have
	 * an obsolete space map object whose bonus buffer is exactly
	 * sizeof (space_map_phys_t) bytes long.  Any vdev that is not counted as
	 * a candidate is asserted to have no obsolete space map at all.
	 */
	static int
	+get_obsolete_refcount(vdev_t *vd)
	+{
	+ int refcount = 0;
	+
	+ uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	+ if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
	+ dmu_object_info_t doi;
	+ VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
	+ obsolete_sm_obj, &doi));
	+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
	+ refcount++;
	+ }
	+ } else {
	+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
	+ ASSERT3U(obsolete_sm_obj, ==, 0);
	+ }
	+ for (unsigned c = 0; c < vd->vdev_children; c++) {
	+ refcount += get_obsolete_refcount(vd->vdev_child[c]);
	+ }
	+
	+ return (refcount);
	+}
	+
	/*
	 * Return 1 when the pool's scip_prev_obsolete_sm_object is non-zero and
	 * that object's bonus buffer is exactly sizeof (space_map_phys_t) bytes
	 * long; return 0 otherwise.
	 */
	+static int
	+get_prev_obsolete_spacemap_refcount(spa_t *spa)
	+{
	+ uint64_t prev_obj =
	+ spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
	+ if (prev_obj != 0) {
	+ dmu_object_info_t doi;
	+ VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
	+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
	+ return (1);
	+ }
	+ }
	+ return (0);
	+}
	+
	/*
	 * Compare the SPA_FEATURE_SPACEMAP_HISTOGRAM feature refcount with the
	 * number of qualifying space maps counted by the get_*_refcount() helpers.
	 * Prints a message and returns 2 on mismatch; returns 0 when they agree.
	 * The result of feature_get_refcount() is ignored, so on failure the
	 * expected count stays at its initial value of 0.
	 */
	+static int
	verify_spacemap_refcounts(spa_t *spa)
	{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	&spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	&expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	+ actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	+ actual_refcount += get_prev_obsolete_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
	(void) printf("space map refcount mismatch: expected %lld != "
	"actual %lld\n",
	(longlong_t)expected_refcount,
	(longlong_t)actual_refcount);
	return (2);
	}
	return (0);
	}

	/*
	 * Print every entry of space map sm, read from objset os one 64-bit word
	 * at a time.  Debug entries are printed with their action, txg and sync
	 * pass; other entries are printed as allocated ('A') or freed ('F')
	 * ranges.  A running allocation total is kept from the range entries and
	 * a warning is printed if it differs from space_map_allocated(sm).
	 * Returns immediately when sm is NULL.  The "+" revision additionally
	 * prints the space map's object number, smp_objsize and smp_alloc first.
	 *
	 * NOTE(review): the parameter list below appears to have lost its "*"
	 * declarators in extraction (sm is compared to NULL and dereferenced).
	 */
	static void
	dump_spacemap(objset_t os, space_map_t sm)
	{
	uint64_t alloc, offset, entry;
	- const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	- "INVALID", "INVALID", "INVALID", "INVALID" };
	+ char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	+ "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
	return;

	+ (void) printf("space map object %llu:\n",
	+ (longlong_t)sm->sm_phys->smp_object);
	+ (void) printf(" smp_objsize = 0x%llx\n",
	+ (longlong_t)sm->sm_phys->smp_objsize);
	+ (void) printf(" smp_alloc = 0x%llx\n",
	+ (longlong_t)sm->sm_phys->smp_alloc);
	+
	/*
	* Print out the freelist entries in both encoded and decoded form.
	*/
	alloc = 0;
	for (offset = 0; offset < space_map_length(sm);
	offset += sizeof (entry)) {
	uint8_t mapshift = sm->sm_shift;

	VERIFY0(dmu_read(os, space_map_object(sm), offset,
	sizeof (entry), &entry, DMU_READ_PREFETCH));
	if (SM_DEBUG_DECODE(entry)) {

	(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
	(u_longlong_t)(offset / sizeof (entry)),
	ddata[SM_DEBUG_ACTION_DECODE(entry)],
	(u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
	(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
	} else {
	(void) printf("\t [%6llu] %c range:"
	" %010llx-%010llx size: %06llx\n",
	(u_longlong_t)(offset / sizeof (entry)),
	SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
	(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
	mapshift) + sm->sm_start),
	(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
	mapshift) + sm->sm_start +
	(SM_RUN_DECODE(entry) << mapshift)),
	(u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
	if (SM_TYPE_DECODE(entry) == SM_ALLOC)
	alloc += SM_RUN_DECODE(entry) << mapshift;
	else
	alloc -= SM_RUN_DECODE(entry) << mapshift;
	}
	}
	if (alloc != space_map_allocated(sm)) {
	(void) printf("space_map_object alloc (%llu) INCONSISTENT "
	"with space map summary (%llu)\n",
	(u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
	}
	}

	/*
	 * Print one summary line for a metaslab (segment count, largest block
	 * size, percentage free) followed by the histogram of its in-memory range
	 * tree.
	 */
	static void
	dump_metaslab_stats(metaslab_t *msp)
	{
		char largest[32];
		range_tree_t *tree = msp->ms_tree;
		avl_tree_t *by_size = &msp->ms_size_tree;
		int pct_free = range_tree_space(tree) * 100 / msp->ms_size;

		/* make sure nicenum has enough space */
		CTASSERT(sizeof (largest) >= NN_NUMBUF_SZ);

		zdb_nicenum(metaslab_block_maxsize(msp), largest, sizeof (largest));

		(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
		    "segments", avl_numnodes(by_size),
		    "maxsize", largest,
		    "freepct", pct_free);

		(void) printf("\tIn-memory histogram:\n");
		dump_histogram(tree->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	/*
	 * Print one metaslab: always a summary line (id, start offset, space map
	 * object, free space).  Depending on how many times the 'm' and 'd'
	 * options were given it also prints the in-memory statistics (loading the
	 * metaslab under ms_lock first, unless 'L' was given), the on-disk
	 * histogram (when the space map histogram feature is active), and the
	 * full space map contents.  The "+" revision no longer takes ms_lock
	 * around the dump_spacemap() call.
	 */
	static void
	dump_metaslab(metaslab_t *msp)
	{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	sizeof (freebuf));

	(void) printf(
	"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
	(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	(u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);
	if (!msp->ms_loaded) {
	VERIFY0(metaslab_load(msp));
	range_tree_stat_verify(msp->ms_tree);
	}
	dump_metaslab_stats(msp);
	metaslab_unload(msp);
	mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
	/*
	* The space map histogram represents free space in chunks
	* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
	*/
	(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
	(u_longlong_t)msp->ms_fragmentation);
	dump_histogram(sm->sm_phys->smp_histogram,
	SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	if (dump_opt['d'] > 5 \|\| dump_opt['m'] > 3) {
	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));

	- mutex_enter(&msp->ms_lock);
	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
	- mutex_exit(&msp->ms_lock);
	}
	}

	/*
	 * Print the two-line column header (vdev id, metaslab count, column
	 * titles) and the dashed separator that precede a vdev's metaslab list.
	 */
	static void
	print_vdev_metaslab_header(vdev_t *vd)
	{
		u_longlong_t id = (u_longlong_t)vd->vdev_id;
		u_longlong_t nmetaslabs = (u_longlong_t)vd->vdev_ms_count;

		(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
		    id, "metaslabs", nmetaslabs, "offset", "spacemap", "free");

		(void) printf("\t%15s %19s %15s %10s\n",
		    "---------------", "-------------------",
		    "---------------", "-------------");
	}

	/*
	 * For every top-level vdev whose metaslab group belongs to the pool's
	 * normal class, recompute and print the group's fragmentation and its
	 * histogram; then print the same for the class as a whole.  A
	 * fragmentation of ZFS_FRAG_INVALID is shown as "-".
	 */
	static void
	dump_metaslab_groups(spa_t *spa)
	{
		vdev_t *root = spa->spa_root_vdev;
		metaslab_class_t *normal = spa_normal_class(spa);

		metaslab_class_histogram_verify(normal);

		for (unsigned idx = 0; idx < root->vdev_children; idx++) {
			vdev_t *top = root->vdev_child[idx];
			metaslab_group_t *group = top->vdev_mg;

			if (group->mg_class == normal) {
				metaslab_group_histogram_verify(group);
				group->mg_fragmentation =
				    metaslab_group_fragmentation(group);

				(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
				    "fragmentation",
				    (u_longlong_t)top->vdev_id,
				    (u_longlong_t)top->vdev_ms_count);
				if (group->mg_fragmentation != ZFS_FRAG_INVALID) {
					(void) printf("%3llu%%\n",
					    (u_longlong_t)group->mg_fragmentation);
				} else {
					(void) printf("%3s\n", "-");
				}
				dump_histogram(group->mg_histogram,
				    RANGE_TREE_HISTOGRAM_SIZE, 0);
			}
		}

		(void) printf("\tpool %s\tfragmentation", spa_name(spa));

		uint64_t class_frag = metaslab_class_fragmentation(normal);

		if (class_frag != ZFS_FRAG_INVALID)
			(void) printf("\t%3llu%%\n", (u_longlong_t)class_frag);
		else
			(void) printf("\t%3s\n", "-");
		dump_histogram(normal->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	/*
	 * Print the indirect-vdev state of vd.  Returns without output when vd
	 * has no indirect mapping.  Otherwise prints the indirect births object
	 * and its entries, then the summary of the indirect mapping object.  When
	 * 'd' was given more than five times or 'm' more than three times, it
	 * also prints every mapping entry with its obsolete count and, if
	 * present, the obsolete space map.
	 */
	static void
	+print_vdev_indirect(vdev_t *vd)
	+{
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
	+
	+ if (vim == NULL) {
	+ ASSERT3P(vib, ==, NULL);
	+ return;
	+ }
	+
	+ ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	+ vic->vic_mapping_object);
	+ ASSERT3U(vdev_indirect_births_object(vib), ==,
	+ vic->vic_births_object);
	+
	+ (void) printf("indirect births obj %llu:\n",
	+ (longlong_t)vic->vic_births_object);
	+ (void) printf(" vib_count = %llu\n",
	+ (longlong_t)vdev_indirect_births_count(vib));
	+ for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
	+ vdev_indirect_birth_entry_phys_t *cur_vibe =
	+ &vib->vib_entries[i];
	+ (void) printf("\toffset %llx -> txg %llu\n",
	+ (longlong_t)cur_vibe->vibe_offset,
	+ (longlong_t)cur_vibe->vibe_phys_birth_txg);
	+ }
	+ (void) printf("\n");
	+
	+ (void) printf("indirect mapping obj %llu:\n",
	+ (longlong_t)vic->vic_mapping_object);
	+ (void) printf(" vim_max_offset = 0x%llx\n",
	+ (longlong_t)vdev_indirect_mapping_max_offset(vim));
	+ (void) printf(" vim_bytes_mapped = 0x%llx\n",
	+ (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	+ (void) printf(" vim_count = %llu\n",
	+ (longlong_t)vdev_indirect_mapping_num_entries(vim));
	+
	+ if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
	+ return;
	+
	+ uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	+
	+ for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
	+ vdev_indirect_mapping_entry_phys_t *vimep =
	+ &vim->vim_entries[i];
	+ (void) printf("\t<%llx:%llx:%llx> -> "
	+ "<%llx:%llx:%llx> (%x obsolete)\n",
	+ (longlong_t)vd->vdev_id,
	+ (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
	+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
	+ (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
	+ (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
	+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
	+ counts[i]);
	+ }
	+ (void) printf("\n");
	+
	+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
	+ if (obsolete_sm_object != 0) {
	+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
	+ (void) printf("obsolete space map object %llu:\n",
	+ (u_longlong_t)obsolete_sm_object);
	+ ASSERT(vd->vdev_obsolete_sm != NULL);
	+ ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
	+ obsolete_sm_object);
	+ dump_spacemap(mos, vd->vdev_obsolete_sm);
	+ (void) printf("\n");
	+ }
	+}
	+
	/*
	 * Print the metaslabs of the pool.  When 'd' was not given and object
	 * arguments are present, the first argument selects a top-level vdev
	 * (fatal error if out of range) and any further arguments select
	 * individual metaslabs on it; out-of-range metaslab numbers are reported
	 * on stderr and skipped.  Otherwise every metaslab of the selected vdev,
	 * or of all top-level vdevs, is printed.  The "+" revision also prints
	 * each vdev's indirect state in that full listing.
	 *
	 * NOTE(review): the first declaration below appears to have lost its "*"
	 * declarators in extraction (vd and rvd are used as pointers).
	 */
	+static void
	dump_metaslabs(spa_t *spa)
	{
	vdev_t vd, rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_objects > 0) {
	c = zopt_object[0];

	if (c >= children)
	(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

	if (zopt_objects > 1) {
	vd = rvd->vdev_child[c];
	print_vdev_metaslab_header(vd);

	for (m = 1; m < zopt_objects; m++) {
	if (zopt_object[m] < vd->vdev_ms_count)
	dump_metaslab(
	vd->vdev_ms[zopt_object[m]]);
	else
	(void) fprintf(stderr, "bad metaslab "
	"number %llu\n",
	(u_longlong_t)zopt_object[m]);
	}
	(void) printf("\n");
	return;
	}
	children = c + 1;
	}
	for (; c < children; c++) {
	vd = rvd->vdev_child[c];
	print_vdev_metaslab_header(vd);

	+ print_vdev_indirect(vd);
	+
	for (m = 0; m < vd->vdev_ms_count; m++)
	dump_metaslab(vd->vdev_ms[m]);
	(void) printf("\n");
	}
	}

	static void
	dump_dde(const ddt_t ddt, const ddt_entry_t dde, uint64_t index)
	{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
	if (ddp->ddp_phys_birth == 0)
	continue;
	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
	(void) printf("index %llx refcnt %llu %s %s\n",
	(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
	types[p], blkbuf);
	}
	}

	/*
	 * Print the dedup, compression and copies ratios derived from dds, plus
	 * their combined value.  Prints nothing when dds records no blocks.
	 */
	static void
	dump_dedup_ratio(const ddt_stat_t *dds)
	{
		if (dds->dds_blocks == 0)
			return;

		double ref_lsize = (double)dds->dds_ref_lsize;
		double ref_psize = (double)dds->dds_ref_psize;
		double ref_dsize = (double)dds->dds_ref_dsize;
		double dsize = (double)dds->dds_dsize;

		double dedup = ref_dsize / dsize;
		double compress = ref_lsize / ref_psize;
		double copies = ref_dsize / ref_psize;

		(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
		    "dedup * compress / copies = %.2f\n\n",
		    dedup, compress, copies, dedup * compress / copies);
	}

	/*
	 * Print one dedup table object, selected by (type, class).  Objects that
	 * do not exist or hold no entries are skipped silently.  The summary line
	 * is always printed; the histogram needs 'D' given at least three times,
	 * the entry listing at least four times, and the listing of the unique
	 * class at least five times.
	 */
	static void
	dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
	{
		dmu_object_info_t info;
		uint64_t entries;
		int err;

		err = ddt_object_info(ddt, type, class, &info);
		if (err == ENOENT)
			return;
		ASSERT(err == 0);

		err = ddt_object_count(ddt, type, class, &entries);
		ASSERT(err == 0);
		if (entries == 0)
			return;

		uint64_t disk_bytes = info.doi_physical_blocks_512 << 9;
		uint64_t core_bytes = info.doi_fill_count * info.doi_data_block_size;

		char ddt_name[DDT_NAMELEN];
		ddt_object_name(ddt, type, class, ddt_name);

		/* Sizes are reported as averages per entry. */
		(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
		    ddt_name,
		    (u_longlong_t)entries,
		    (u_longlong_t)(disk_bytes / entries),
		    (u_longlong_t)(core_bytes / entries));

		if (dump_opt['D'] < 3)
			return;

		zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

		if (dump_opt['D'] < 4)
			return;

		if (class == DDT_CLASS_UNIQUE && dump_opt['D'] < 5)
			return;

		(void) printf("%s contents:\n\n", ddt_name);

		ddt_entry_t entry;
		uint64_t cursor = 0;

		for (;;) {
			err = ddt_object_walk(ddt, type, class, &cursor, &entry);
			if (err != 0)
				break;
			dump_dde(ddt, &entry, cursor);
		}

		ASSERT(err == ENOENT);

		(void) printf("\n");
	}

	/*
	 * Dump every DDT object of the pool (each checksum, type and class),
	 * followed by the pool-wide dedup statistics and ratios.
	 */
	static void
	dump_all_ddts(spa_t *spa)
	{
		ddt_histogram_t hist_all;
		ddt_stat_t stats_all;
		enum zio_checksum cksum;
		enum ddt_type type;
		enum ddt_class class;

		bzero(&hist_all, sizeof (hist_all));
		bzero(&stats_all, sizeof (stats_all));

		for (cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
			ddt_t *table = spa->spa_ddt[cksum];

			for (type = 0; type < DDT_TYPES; type++) {
				for (class = 0; class < DDT_CLASSES; class++)
					dump_ddt(table, type, class);
			}
		}

		ddt_get_dedup_stats(spa, &stats_all);

		if (stats_all.dds_blocks == 0) {
			(void) printf("All DDTs are empty\n");
			return;
		}

		(void) printf("\n");

		/* The aggregated histogram is only shown above level 1. */
		if (dump_opt['D'] > 1) {
			(void) printf("DDT histogram (aggregated over all DDTs):\n");
			ddt_get_dedup_histogram(spa, &hist_all);
			zpool_dump_ddt(&stats_all, &hist_all);
		}

		dump_dedup_ratio(&stats_all);
	}

	/*
	 * Callback handed to range_tree_walk() by dump_dtl(): prints one
	 * segment as a half-open interval plus its length.  arg is the prefix
	 * string to print in front of the segment.
	 */
	static void
	dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
	{
		char *label = arg;
		uint64_t end = start + size;

		(void) printf("%s [%llu,%llu) length %llu\n",
		    label, (u_longlong_t)start, (u_longlong_t)end,
		    (u_longlong_t)(size));
	}

	/*
	 * Print the dirty time logs (DTLs) of a vdev and, recursively, of all
	 * of its children.
	 *
	 * vd:     vdev whose DTLs are printed.
	 * indent: width of the indentation used for this vdev's lines; the
	 *         "Dirty time logs" heading is printed only when it is 0, and
	 *         each level of children is called with indent + 4.
	 *
	 * NOTE(review): the two lines below that start with "- " look like
	 * removed-line markers from a diff view rather than C source -- confirm
	 * against the actual file.
	 */
	static void
	dump_dtl(vdev_t *vd, int indent)
	{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	"outage" };
	char prefix[256];

	/* vdev_dtl_required() is called between state enter and exit. */
	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
	(void) printf("\nDirty time logs:\n\n");

	/* Label: the vdev path, else the vdev type, else (no parent) the pool name. */
	(void) printf("\t%*s%s [%s]\n", indent, "",
	vd->vdev_path ? vd->vdev_path :
	vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
	range_tree_t *rt = vd->vdev_dtl[t];
	/* DTL types with no space in their range tree are skipped. */
	if (range_tree_space(rt) == 0)
	continue;
	(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
	indent + 2, "", name[t]);
	- mutex_enter(rt->rt_lock);
	range_tree_walk(rt, dump_dtl_seg, prefix);
	- mutex_exit(rt->rt_lock);
	/* The DTL space map is dumped only for childless vdevs above level 5. */
	if (dump_opt['d'] > 5 && vd->vdev_children == 0)
	dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
	dump_dtl(vd->vdev_child[c], indent + 4);
	}

	/* from spa_history.c: spa_history_create_obj() */
	#define HIS_BUF_LEN_DEF (128 << 10)
	#define HIS_BUF_LEN_MAX (1 << 30)

	/*
	 * Read the pool history with spa_history_get(), unpack it into nvlist
	 * records and print one timestamped line per record.
	 *
	 * The read buffer starts at HIS_BUF_LEN_DEF bytes and is doubled
	 * whenever a pass consumes nothing (resid == len); the function gives
	 * up once the size would reach HIS_BUF_LEN_MAX.  Allocation and read
	 * failures are reported on stderr and end the function.
	 *
	 * Records without a command string are printed as internal events when
	 * they carry a legacy internal event number.  With dump_opt['h'] > 1
	 * every record's nvlist is dumped as well, and records that could not
	 * be printed are flagged as unrecognized.
	 */
	static void
	dump_history(spa_t *spa)
	{
		nvlist_t **events = NULL;
		char *buf = NULL;
		uint64_t bufsize = HIS_BUF_LEN_DEF;
		uint64_t resid, len, off = 0;
		uint_t num = 0;
		int error;
		time_t tsec;
		struct tm t;
		char tbuf[30];
		char internalstr[MAXPATHLEN];

		if ((buf = malloc(bufsize)) == NULL) {
			(void) fprintf(stderr, "Unable to read history: "
			    "out of memory\n");
			/* Without a buffer there is nothing to read into. */
			return;
		}
		do {
			len = bufsize;

			if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
				(void) fprintf(stderr, "Unable to read history: "
				    "error %d\n", error);
				free(buf);
				return;
			}

			if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
				break;
			/* Rewind over the bytes that were not consumed. */
			off -= resid;

			/*
			 * If the history block is too big, double the buffer
			 * size and try again.
			 */
			if (resid == len) {
				free(buf);
				buf = NULL;

				bufsize <<= 1;
				if ((bufsize >= HIS_BUF_LEN_MAX) ||
				    ((buf = malloc(bufsize)) == NULL)) {
					(void) fprintf(stderr, "Unable to read history: "
					    "out of memory\n");
					return;
				}
			}
		} while (len != 0);
		free(buf);

		(void) printf("\nHistory:\n");
		for (unsigned i = 0; i < num; i++) {
			uint64_t time, txg, ievent;
			char *cmd, *intstr;
			boolean_t printed = B_FALSE;

			if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
			    &time) != 0)
				goto next;
			if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
			    &cmd) != 0) {
				/* No command string: try an internal event. */
				if (nvlist_lookup_uint64(events[i],
				    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
					goto next;
				verify(nvlist_lookup_uint64(events[i],
				    ZPOOL_HIST_TXG, &txg) == 0);
				verify(nvlist_lookup_string(events[i],
				    ZPOOL_HIST_INT_STR, &intstr) == 0);
				if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
					goto next;

				(void) snprintf(internalstr,
				    sizeof (internalstr),
				    "[internal %s txg:%ju] %s",
				    zfs_history_event_names[ievent], (uintmax_t)txg,
				    intstr);
				cmd = internalstr;
			}
			tsec = time;
			(void) localtime_r(&tsec, &t);
			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
			(void) printf("%s %s\n", tbuf, cmd);
			printed = B_TRUE;

	next:
			if (dump_opt['h'] > 1) {
				if (!printed)
					(void) printf("unrecognized record:\n");
				dump_nvlist(events[i], 2);
			}
		}
	}

	/ARGSUSED/
	static void
	dump_dnode(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	static uint64_t
	blkid2offset(const dnode_phys_t dnp, const blkptr_t bp,
	const zbookmark_phys_t *zb)
	{
	if (dnp == NULL) {
	ASSERT(zb->zb_level < 0);
	if (zb->zb_object == 0)
	return (zb->zb_blkid);
	return (zb->zb_blkid * BP_GET_LSIZE(bp));
	}

	ASSERT(zb->zb_level >= 0);

	return ((zb->zb_blkid <<
	(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
	dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	}

	static void
	snprintf_blkptr_compact(char blkbuf, size_t buflen, const blkptr_t bp)
	{
	const dva_t *dva = bp->blk_dva;
	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;

	if (dump_opt['b'] >= 6) {
	snprintf_blkptr(blkbuf, buflen, bp);
	return;
	}

	if (BP_IS_EMBEDDED(bp)) {
	(void) sprintf(blkbuf,
	"EMBEDDED et=%u %llxL/%llxP B=%llu",
	(int)BPE_GET_ETYPE(bp),
	(u_longlong_t)BPE_GET_LSIZE(bp),
	(u_longlong_t)BPE_GET_PSIZE(bp),
	(u_longlong_t)bp->blk_birth);
	return;
	}

	blkbuf[0] = '\0';
	for (int i = 0; i < ndvas; i++)
	(void) snprintf(blkbuf + strlen(blkbuf),
	buflen - strlen(blkbuf), "%llu:%llx:%llx ",
	(u_longlong_t)DVA_GET_VDEV(&dva[i]),
	(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
	(u_longlong_t)DVA_GET_ASIZE(&dva[i]));

	if (BP_IS_HOLE(bp)) {
	(void) snprintf(blkbuf + strlen(blkbuf),
	buflen - strlen(blkbuf),
	"%llxL B=%llu",
	(u_longlong_t)BP_GET_LSIZE(bp),
	(u_longlong_t)bp->blk_birth);
	} else {
	(void) snprintf(blkbuf + strlen(blkbuf),
	buflen - strlen(blkbuf),
	"%llxL/%llxP F=%llu B=%llu/%llu",
	(u_longlong_t)BP_GET_LSIZE(bp),
	(u_longlong_t)BP_GET_PSIZE(bp),
	(u_longlong_t)BP_GET_FILL(bp),
	(u_longlong_t)bp->blk_birth,
	(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
	}
	}

	/*
	 * Print one line for a block visited by visit_indirect(): its offset,
	 * a level marker placed in the column matching the block's level, and
	 * the compact block pointer description.
	 *
	 * bp:  block pointer being printed.
	 * zb:  bookmark giving the block's level and id.
	 * dnp: dnode the block belongs to; supplies the number of levels.
	 */
	static void
	print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
	    const dnode_phys_t *dnp)
	{
		char blkbuf[BP_SPRINTF_LEN];
		int l;

		/* Type and level are only checked for non-embedded pointers. */
		if (!BP_IS_EMBEDDED(bp)) {
			ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
			ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
		}

		(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));

		ASSERT(zb->zb_level >= 0);

		/* One column per level, from the top level down to -1. */
		for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
			if (l == zb->zb_level) {
				(void) printf("L%llx", (u_longlong_t)zb->zb_level);
			} else {
				(void) printf(" ");
			}
		}

		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
		(void) printf("%s\n", blkbuf);
	}

	/*
	 * Print a block and, when it is a non-hole indirect block, read it and
	 * recurse into every block pointer it contains.
	 *
	 * spa: pool to read from.
	 * dnp: dnode that owns the block tree.
	 * bp:  block pointer to visit; pointers with a zero birth are ignored.
	 * zb:  bookmark describing bp's position.
	 *
	 * Returns 0 on success, or the first error from arc_read() or from a
	 * recursive visit.
	 */
	static int
	visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
	    blkptr_t *bp, const zbookmark_phys_t *zb)
	{
		int err = 0;

		if (bp->blk_birth == 0)
			return (0);

		print_indirect(bp, zb, dnp);

		if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
			arc_flags_t flags = ARC_FLAG_WAIT;
			int i;
			blkptr_t *cbp;
			/* Number of block pointers held by this indirect block. */
			int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
			arc_buf_t *buf;
			uint64_t fill = 0;

			err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
			if (err)
				return (err);
			ASSERT(buf->b_data);

			/* recursively visit blocks below this */
			cbp = buf->b_data;
			for (i = 0; i < epb; i++, cbp++) {
				zbookmark_phys_t czb;

				SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
				    zb->zb_level - 1,
				    zb->zb_blkid * epb + i);
				err = visit_indirect(spa, dnp, cbp, &czb);
				if (err)
					break;
				fill += BP_GET_FILL(cbp);
			}
			/* The children's fill counts must add up to the parent's. */
			if (!err)
				ASSERT3U(fill, ==, BP_GET_FILL(bp));
			arc_buf_destroy(buf, &buf);
		}

		return (err);
	}

	/ARGSUSED/
	static void
	dump_indirect(dnode_t *dn)
	{
	dnode_phys_t *dnp = dn->dn_phys;
	int j;
	zbookmark_phys_t czb;

	(void) printf("Indirect blocks:\n");

	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
	dn->dn_object, dnp->dn_nlevels - 1, 0);
	for (j = 0; j < dnp->dn_nblkptr; j++) {
	czb.zb_blkid = j;
	(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
	&dnp->dn_blkptr[j], &czb);
	}

	(void) printf("\n");
	}

	/ARGSUSED/
	static void
	dump_dsl_dir(objset_t os, uint64_t object, void data, size_t size)
	{
	dsl_dir_phys_t *dd = data;
	time_t crtime;
	char nice[32];

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);

	if (dd == NULL)
	return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	crtime = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	(u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	(u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	(u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	(u_longlong_t)dd->dd_child_dir_zapobj);
	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
	(void) printf("\t\tused_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tcompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
	(void) printf("\t\tquota = %s\n", nice);
	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
	(void) printf("\t\treserved = %s\n", nice);
	(void) printf("\t\tprops_zapobj = %llu\n",
	(u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	(u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	(u_longlong_t)dd->dd_flags);

	#define DO(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
	sizeof (nice)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
	DO(HEAD);
	DO(SNAP);
	DO(CHILD);
	DO(CHILD_RSRV);
	DO(REFRSRV);
	#undef DO
	}

	/ARGSUSED/
	static void
	dump_dsl_dataset(objset_t os, uint64_t object, void data, size_t size)
	{
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);

	if (ds == NULL)
	return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	(u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	(u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	(u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	(u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	(u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	(u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	(u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	(u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	(u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
	(void) printf("\t\tunique = %s\n", unique);
	(void) printf("\t\tfsid_guid = %llu\n",
	(u_longlong_t)ds->ds_fsid_guid);
	(void) printf("\t\tguid = %llu\n",
	(u_longlong_t)ds->ds_guid);
	(void) printf("\t\tflags = %llx\n",
	(u_longlong_t)ds->ds_flags);
	(void) printf("\t\tnext_clones_obj = %llu\n",
	(u_longlong_t)ds->ds_next_clones_obj);
	(void) printf("\t\tprops_obj = %llu\n",
	(u_longlong_t)ds->ds_props_obj);
	(void) printf("\t\tbp = %s\n", blkbuf);
	}

	/*
	 * Callback passed to bptree_iterate() by dump_bptree(): prints the
	 * block pointer unless its birth is zero.  arg and tx are unused.
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		char blkbuf[BP_SPRINTF_LEN];

		if (bp->blk_birth != 0) {
			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
			(void) printf("\t%s\n", blkbuf);
		}
		return (0);
	}

	/*
	 * Print a summary of a bptree object and, at dump_opt['d'] >= 5, each
	 * block pointer it holds.  Nothing is printed below level 3.
	 *
	 * os:   objset containing the bptree.
	 * obj:  object number of the bptree.
	 * name: label printed in front of the summary.
	 */
	static void
	dump_bptree(objset_t *os, uint64_t obj, const char *name)
	{
		char bytes[32];
		bptree_phys_t *bt;
		dmu_buf_t *db;

		/* make sure nicenum has enough space */
		CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);

		if (dump_opt['d'] < 3)
			return;

		/* The summary fields are read from the object's bonus buffer. */
		VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
		bt = db->db_data;
		zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
		(void) printf("\n %s: %llu datasets, %s\n",
		    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
		dmu_buf_rele(db, FTAG);

		if (dump_opt['d'] < 5)
			return;

		(void) printf("\n");

		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
	}

	/*
	 * Callback passed to bpobj_iterate_nofree() by dump_full_bpobj():
	 * prints the block pointer in compact form.  arg and tx are unused.
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		char blkbuf[BP_SPRINTF_LEN];

		ASSERT(bp->blk_birth != 0);
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
		(void) printf("\t%s\n", blkbuf);
		return (0);
	}

	/*
	 * Print a summary of a bpobj and, recursively, of its sub-objects.
	 * Nothing is printed when dump_opt['d'] < 3.
	 *
	 * bpo:    open bpobj to describe.
	 * name:   label printed in front of the summary.
	 * indent: nesting depth; the label is right-aligned in a field of
	 *         indent * 8 columns, and the block pointers are listed
	 *         (dump_opt['d'] >= 5) only by the depth-0 call.
	 */
	static void
	dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
	{
		char bytes[32];
		char comp[32];
		char uncomp[32];

		/* make sure nicenum has enough space */
		CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);

		if (dump_opt['d'] < 3)
			return;

		zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
		if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
			zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
			zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
			(void) printf(" %*s: object %llu, %llu local blkptrs, "
			    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);

			for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
				uint64_t subobj;
				bpobj_t subbpo;
				int error;
				/* Sub-object ids are stored as an array of uint64_t. */
				VERIFY0(dmu_read(bpo->bpo_os,
				    bpo->bpo_phys->bpo_subobjs,
				    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
				error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
				if (error != 0) {
					/* Report it and carry on with the next one. */
					(void) printf("ERROR %u while trying to open "
					    "subobj id %llu\n",
					    error, (u_longlong_t)subobj);
					continue;
				}
				dump_full_bpobj(&subbpo, "subobj", indent + 1);
				bpobj_close(&subbpo);
			}
		} else {
			(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}

		if (dump_opt['d'] < 5)
			return;


		if (indent == 0) {
			(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
			(void) printf("\n");
		}
	}

	/*
	 * Print a deadlist: a one-line space summary and, at higher
	 * dump_opt['d'] levels, one line (level 4) or a full bpobj dump
	 * (level 5 and up) per entry.  Nothing is printed below level 3.
	 * Old-format deadlists are handed to dump_full_bpobj() as a whole.
	 */
	static void
	dump_deadlist(dsl_deadlist_t *dl)
	{
		dsl_deadlist_entry_t *dle;
		uint64_t unused;
		char bytes[32];
		char comp[32];
		char uncomp[32];

		/* make sure nicenum has enough space */
		CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);

		if (dump_opt['d'] < 3)
			return;

		if (dl->dl_oldfmt) {
			dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
			return;
		}

		zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
		zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
		zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
		(void) printf("\n Deadlist: %s (%s/%s comp)\n",
		    bytes, comp, uncomp);

		if (dump_opt['d'] < 4)
			return;

		(void) printf("\n");

		/* force the tree to be loaded */
		dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);

		for (dle = avl_first(&dl->dl_tree); dle;
		    dle = AVL_NEXT(&dl->dl_tree, dle)) {
			/*
			 * %llu takes an unsigned argument, so cast to
			 * u_longlong_t as the rest of the file does.
			 */
			if (dump_opt['d'] >= 5) {
				char buf[128];
				(void) snprintf(buf, sizeof (buf),
				    "mintxg %llu -> obj %llu",
				    (u_longlong_t)dle->dle_mintxg,
				    (u_longlong_t)dle->dle_bpobj.bpo_object);
				dump_full_bpobj(&dle->dle_bpobj, buf, 0);
			} else {
				(void) printf("mintxg %llu -> obj %llu\n",
				    (u_longlong_t)dle->dle_mintxg,
				    (u_longlong_t)dle->dle_bpobj.bpo_object);
			}
		}
	}

	/* AVL trees passed to the zfs_fuid_* helpers by dump_uidgid() and print_idstr(). */
	static avl_tree_t idx_tree;
	static avl_tree_t domain_tree;
	/* Set once dump_uidgid() has loaded the trees; cleared by fuid_table_destroy(). */
	static boolean_t fuid_table_loaded;
	/* Objset recorded by open_objset(); reset to NULL by close_objset(). */
	static objset_t *sa_os = NULL;
	/* Attribute table filled in by sa_setup() in open_objset(); reset by close_objset(). */
	static sa_attr_type_t *sa_attr_table = NULL;

	/*
	 * Own the dataset at path and, for ZFS (ZPL) objsets, set up its
	 * system-attribute table.  The opened objset is also recorded in the
	 * file-scope sa_os; only one objset may be open this way at a time.
	 *
	 * path: dataset name.
	 * type: objset type passed to dmu_objset_own().
	 * tag:  ownership tag, also needed by close_objset().
	 * osp:  receives the objset; set to NULL when sa_setup() fails.
	 *
	 * Returns 0 on success, otherwise the error from dmu_objset_own() or
	 * sa_setup().  On failure nothing is left owned.
	 */
	static int
	open_objset(const char *path, dmu_objset_type_t type, void *tag,
	    objset_t **osp)
	{
		int err;
		uint64_t sa_attrs = 0;
		uint64_t version = 0;

		VERIFY3P(sa_os, ==, NULL);
		err = dmu_objset_own(path, type, B_TRUE, tag, osp);
		if (err != 0) {
			(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
			    strerror(err));
			return (err);
		}

		if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
			/* Failed lookups leave version and sa_attrs at 0. */
			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
			    8, 1, &version);
			if (version >= ZPL_VERSION_SA) {
				(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
				    8, 1, &sa_attrs);
			}
			err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
			    &sa_attr_table);
			if (err != 0) {
				(void) fprintf(stderr, "sa_setup failed: %s\n",
				    strerror(err));
				dmu_objset_disown(*osp, tag);
				*osp = NULL;
			}
		}
		sa_os = *osp;

		/*
		 * err is 0 here unless sa_setup() failed; report that failure
		 * so callers do not use the NULL objset.
		 */
		return (err);
	}

	/*
	 * Release an objset opened with open_objset(): tear down its SA state
	 * if present, disown it, and clear the file-scope sa_os/sa_attr_table.
	 * os must be the objset currently recorded in sa_os; tag must be the
	 * tag it was owned with.
	 */
	static void
	close_objset(objset_t *os, void *tag)
	{
		VERIFY3P(os, ==, sa_os);
		if (os->os_sa != NULL)
			sa_tear_down(os);
		dmu_objset_disown(os, tag);
		sa_attr_table = NULL;
		sa_os = NULL;
	}

	/*
	 * Destroy the FUID trees if dump_uidgid() loaded them, and clear the
	 * loaded flag so a later call is a no-op.
	 */
	static void
	fuid_table_destroy(void)
	{
		if (fuid_table_loaded) {
			zfs_fuid_table_destroy(&idx_tree, &domain_tree);
			fuid_table_loaded = B_FALSE;
		}
	}

	/*
	 * Print one uid or gid line, labelled with id_type.
	 * An id without a FUID index is a plain POSIX id and is shown in
	 * decimal.  An id with a FUID index (CIFS files) is shown in hex,
	 * followed by its domain string and rid.
	 */
	static void
	print_idstr(uint64_t id, const char *id_type)
	{
		if (!FUID_INDEX(id)) {
			(void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
			return;
		}

		char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));

		(void) printf("\t%s %llx [%s-%d]\n", id_type,
		    (u_longlong_t)id, domain, (int)FUID_RID(id));
	}

	/*
	 * Print the uid and gid lines of an object.  The FUID domain table is
	 * loaded from the objset the first time either id carries a FUID
	 * index.
	 *
	 * os:  objset used to look up and load the FUID table.
	 * uid: owner id to print.
	 * gid: group id to print.
	 */
	static void
	dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
	{
		uint32_t uid_idx, gid_idx;

		uid_idx = FUID_INDEX(uid);
		gid_idx = FUID_INDEX(gid);

		/* Load domain table, if not already loaded */
		if (!fuid_table_loaded && (uid_idx || gid_idx)) {
			uint64_t fuid_obj;

			/* first find the fuid object. It lives in the master node */
			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
			    8, 1, &fuid_obj) == 0);
			zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
			(void) zfs_fuid_table_load(os, fuid_obj,
			    &idx_tree, &domain_tree);
			fuid_table_loaded = B_TRUE;
		}

		print_idstr(uid, "uid");
		print_idstr(gid, "gid");
	}

	/ARGSUSED/
	static void
	dump_znode(objset_t os, uint64_t object, void data, size_t size)
	{
	char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
	(void) printf("Failed to get handle for SA znode\n");
	return;
	}

	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	&links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	&mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	&fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	&pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
	(void) sa_handle_destroy(hdl);
	return;
	}

	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
	error = zfs_obj_to_path(os, object, path, sizeof (path));
	if (error != 0) {
	(void) snprintf(path, sizeof (path),
	"\?\?\?<object#%llu>", (u_longlong_t)object);
	}
	(void) printf("\tpath %s\n", path);
	}
	dump_uidgid(os, uid, gid);
	(void) printf("\tatime %s", ctime(&z_atime));
	(void) printf("\tmtime %s", ctime(&z_mtime));
	(void) printf("\tctime %s", ctime(&z_ctime));
	(void) printf("\tcrtime %s", ctime(&z_crtime));
	(void) printf("\tgen %llu\n", (u_longlong_t)gen);
	(void) printf("\tmode %llo\n", (u_longlong_t)mode);
	(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent %llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks %llu\n", (u_longlong_t)links);
	(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	sizeof (uint64_t)) == 0)
	(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	sizeof (uint64_t)) == 0)
	(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
	sa_handle_destroy(hdl);
	}

	/ARGSUSED/
	static void
	dump_acl(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	/ARGSUSED/
	static void
	dump_dmu_objset(objset_t os, uint64_t object, void data, size_t size)
	{
	}

	/*
	 * Table of per-object-type viewer functions.  dump_object() indexes it
	 * with ZDB_OT_TYPE() of an object's type and of its bonus type and
	 * calls the selected viewer.  The table has DMU_OT_NUMTYPES + 1 slots;
	 * the extra last slot holds dump_unknown.
	 *
	 * NOTE(review): entries are positional -- presumably in the same order
	 * as the DMU object type enumeration; confirm before inserting or
	 * reordering entries.
	 */
	static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none, /* unallocated */
	dump_zap, /* object directory */
	dump_uint64, /* object array */
	dump_none, /* packed nvlist */
	dump_packed_nvlist, /* packed nvlist size */
	dump_none, /* bpobj */
	dump_bpobj, /* bpobj header */
	dump_none, /* SPA space map header */
	dump_none, /* SPA space map */
	dump_none, /* ZIL intent log */
	dump_dnode, /* DMU dnode */
	dump_dmu_objset, /* DMU objset */
	dump_dsl_dir, /* DSL directory */
	dump_zap, /* DSL directory child map */
	dump_zap, /* DSL dataset snap map */
	dump_zap, /* DSL props */
	dump_dsl_dataset, /* DSL dataset */
	dump_znode, /* ZFS znode */
	dump_acl, /* ZFS V0 ACL */
	dump_uint8, /* ZFS plain file */
	dump_zpldir, /* ZFS directory */
	dump_zap, /* ZFS master node */
	dump_zap, /* ZFS delete queue */
	dump_uint8, /* zvol object */
	dump_zap, /* zvol prop */
	dump_uint8, /* other uint8[] */
	dump_uint64, /* other uint64[] */
	dump_zap, /* other ZAP */
	dump_zap, /* persistent error log */
	dump_uint8, /* SPA history */
	dump_history_offsets, /* SPA history offsets */
	dump_zap, /* Pool properties */
	dump_zap, /* DSL permissions */
	dump_acl, /* ZFS ACL */
	dump_uint8, /* ZFS SYSACL */
	dump_none, /* FUID nvlist */
	dump_packed_nvlist, /* FUID nvlist size */
	dump_zap, /* DSL dataset next clones */
	dump_zap, /* DSL scrub queue */
	dump_zap, /* ZFS user/group used */
	dump_zap, /* ZFS user/group quota */
	dump_zap, /* snapshot refcount tags */
	dump_ddt_zap, /* DDT ZAP object */
	dump_zap, /* DDT statistics */
	dump_znode, /* SA object */
	dump_zap, /* SA Master Node */
	dump_sa_attrs, /* SA attribute registration */
	dump_sa_layouts, /* SA attribute layouts */
	dump_zap, /* DSL scrub translations */
	dump_none, /* fake dedup BP */
	dump_zap, /* deadlist */
	dump_none, /* deadlist hdr */
	dump_zap, /* dsl clones */
	dump_bpobj_subobjs, /* bpobj subobjs */
	dump_unknown, /* Unknown type, must be last */
	};

	/*
	 * Print a one-line summary of an object and, depending on verbosity,
	 * its bonus buffer, type-specific contents, indirect blocks and
	 * allocated segments.
	 *
	 * os:           objset containing the object.
	 * object:       object number; 0 selects the objset's meta dnode.
	 * verbosity:    detail level: > 3 adds the bonus line, >= 4 adds the
	 *               dnode flags and the type viewers, >= 5 adds indirect
	 *               blocks and the segment list.
	 * print_header: in/out flag.  When nonzero the column header is
	 *               printed and the flag is cleared; it is set again after
	 *               the viewers have run.
	 *
	 * A failing dmu_bonus_hold() is reported through fatal().
	 */
	static void
	dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
	{
		dmu_buf_t *db = NULL;
		dmu_object_info_t doi;
		dnode_t *dn;
		void *bonus = NULL;
		size_t bsize = 0;
		char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
		char bonus_size[32];
		char aux[50];
		int error;

		/* make sure nicenum has enough space */
		CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
		CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);

		if (*print_header) {
			(void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n",
			    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
			    "%full", "type");
			*print_header = 0;
		}

		if (object == 0) {
			dn = DMU_META_DNODE(os);
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
		dmu_object_info_from_dnode(dn, &doi);

		zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
		zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
		zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
		zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
		zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
		/* Bounded: an out-of-range ratio must not overrun fill[]. */
		(void) snprintf(fill, sizeof (fill), "%6.2f",
		    100.0 * doi.doi_fill_count *
		    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
		    doi.doi_max_offset);

		aux[0] = '\0';

		/*
		 * Each append is limited to the space left in aux, not to the
		 * size of the whole buffer.
		 */
		if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux), " (K=%s)",
			    ZDB_CHECKSUM_NAME(doi.doi_checksum));
		}

		if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux), " (Z=%s)",
			    ZDB_COMPRESS_NAME(doi.doi_compress));
		}

		(void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n",
		    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
		    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);

		if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
			(void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n",
			    "", "", "", "", "", bonus_size, "bonus",
			    ZDB_OT_NAME(doi.doi_bonus_type));
		}

		if (verbosity >= 4) {
			(void) printf("\tdnode flags: %s%s%s\n",
			    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
			    "USED_BYTES " : "",
			    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
			    "USERUSED_ACCOUNTED " : "",
			    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
			    "SPILL_BLKPTR" : "");
			(void) printf("\tdnode maxblkid: %llu\n",
			    (longlong_t)dn->dn_phys->dn_maxblkid);

			/* Bonus viewer first, then the viewer for the object type. */
			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
			    bonus, bsize);
			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
			*print_header = 1;
		}

		if (verbosity >= 5)
			dump_indirect(dn);

		if (verbosity >= 5) {
			/*
			 * Report the list of segments that comprise the object.
			 */
			uint64_t start = 0;
			uint64_t end;
			uint64_t blkfill = 1;
			int minlvl = 1;

			if (dn->dn_type == DMU_OT_DNODE) {
				minlvl = 0;
				blkfill = DNODES_PER_BLOCK;
			}

			for (;;) {
				char segsize[32];
				/* make sure nicenum has enough space */
				CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
				/* Find the next data, then the hole that ends it. */
				error = dnode_next_offset(dn,
				    0, &start, minlvl, blkfill, 0);
				if (error)
					break;
				end = start;
				error = dnode_next_offset(dn,
				    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
				zdb_nicenum(end - start, segsize, sizeof (segsize));
				(void) printf("\t\tsegment [%016llx, %016llx)"
				    " size %5s\n", (u_longlong_t)start,
				    (u_longlong_t)end, segsize);
				if (error)
					break;
				start = end;
			}
		}

		if (db != NULL)
			dmu_buf_rele(db, FTAG);
	}

	/* Printable objset type names; dump_dir() indexes this by dds_type. */
	static const char *objset_types[DMU_OST_NUMTYPES] = {
		"NONE",
		"META",
		"ZPL",
		"ZVOL",
		"OTHER",
		"ANY"
	};

	/*
	 * Print a summary line for an objset and, depending on dump_opt['d'],
	 * its intent log, deadlist(s) and every object it contains.
	 *
	 * When specific objects were requested (zopt_objects != 0) only those
	 * objects are dumped.  Otherwise the full object walk runs only when
	 * the verbosity is at least 2 and the root block pointer is not a hole.
	 *
	 * NOTE(review): the lines below that start with "- " or "+ " look like
	 * removed/added-line markers from a diff view rather than C source --
	 * confirm against the actual file.
	 */
	static void
	dump_dir(objset_t *os)
	{
	dmu_objset_stats_t dds;
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	const char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	int print_header = 1;
	unsigned i;
	int error;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);

	/* The fast stats are fetched between pool config enter and exit. */
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	/* Types outside the table keep the "UNKNOWN" label. */
	if (dds.dds_type < DMU_OST_NUMTYPES)
	type = objset_types[dds.dds_type];

	/* For the META objset the counts come from the root bp and the MOS dir. */
	if (dds.dds_type == DMU_OST_META) {
	dds.dds_creation_txg = TXG_INITIAL;
	usedobjs = BP_GET_FILL(os->os_rootbp);
	refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
	dd_used_bytes;
	} else {
	dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));

	/* The root block pointer is appended to the summary from level 4. */
	if (verbosity >= 4) {
	(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
	(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
	sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
	blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	"%s, %llu objects%s\n",
	osname, type, (u_longlong_t)dmu_objset_id(os),
	(u_longlong_t)dds.dds_creation_txg,
	numbuf, (u_longlong_t)usedobjs, blkbuf);

	/* Explicitly requested objects: dump just those and return. */
	if (zopt_objects != 0) {
	for (i = 0; i < zopt_objects; i++)
	dump_object(os, zopt_object[i], verbosity,
	&print_header);
	(void) printf("\n");
	return;
	}

	if (dump_opt['i'] != 0 \|\| verbosity >= 2)
	dump_intent_log(dmu_objset_zil(os));

	- if (dmu_objset_ds(os) != NULL)
	- dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
	+ if (dmu_objset_ds(os) != NULL) {
	+ dsl_dataset_t *ds = dmu_objset_ds(os);
	+ dump_deadlist(&ds->ds_deadlist);

	+ if (dsl_dataset_remap_deadlist_exists(ds)) {
	+ (void) printf("ds_remap_deadlist:\n");
	+ dump_deadlist(&ds->ds_remap_deadlist);
	+ }
	+ }
	+
	if (verbosity < 2)
	return;

	if (BP_IS_HOLE(os->os_rootbp))
	return;

	/* Object 0 and the user/group accounting objects are not counted. */
	dump_object(os, 0, verbosity, &print_header);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	DMU_USERUSED_DNODE(os)->dn_type != 0) {
	dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
	dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
	}

	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
	dump_object(os, object, verbosity, &print_header);
	object_count++;
	}

	ASSERT3U(object_count, ==, usedobjs);

	(void) printf("\n");

	/* Any walk result other than ESRCH is reported and aborts. */
	if (error != ESRCH) {
	(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
	abort();
	}
	}

	/*
	 * Print the fields of an uberblock.
	 *
	 * ub:     uberblock to print.
	 * header: text printed before the fields; NULL prints nothing.
	 * footer: text printed after the fields; NULL prints nothing.
	 *
	 * The root block pointer is included when dump_opt['u'] >= 3.
	 */
	static void
	dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
	{
		time_t timestamp = ub->ub_timestamp;

		(void) printf("%s", header ? header : "");
		(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
		(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
		(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
		(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
		(void) printf("\ttimestamp = %llu UTC = %s",
		    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
		if (dump_opt['u'] >= 3) {
			char blkbuf[BP_SPRINTF_LEN];
			snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
			(void) printf("\trootbp = %s\n", blkbuf);
		}
		(void) printf("%s", footer ? footer : "");
	}

	/*
	 * Print the pool configuration stored in the MOS config object.
	 * The packed nvlist size is read from the object's bonus buffer and
	 * handed to dump_packed_nvlist().  A failing dmu_bonus_hold() is
	 * reported on stderr.
	 */
	static void
	dump_config(spa_t *spa)
	{
		dmu_buf_t *db;
		size_t nvsize = 0;
		int error = 0;


		error = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db);

		if (error == 0) {
			/* The bonus buffer is read as a uint64_t size. */
			nvsize = *(uint64_t *)db->db_data;
			dmu_buf_rele(db, FTAG);

			(void) printf("\nMOS Configuration:\n");
			dump_packed_nvlist(spa->spa_meta_objset,
			    spa->spa_config_object, (void *)&nvsize, 1);
		} else {
			/* Newline-terminated so later output starts on its own line. */
			(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d\n",
			    (u_longlong_t)spa->spa_config_object, error);
		}
	}

	static void
	dump_cachefile(const char *cachefile)
	{
	int fd;
	struct stat64 statbuf;
	char *buf;
	nvlist_t *config;

	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
	(void) fprintf(stderr, "cannot open '%s': %s\n", cachefile,
	strerror(errno));
	exit(1);
	}

	if (fstat64(fd, &statbuf) != 0) {
	(void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile,
	strerror(errno));
	exit(1);
	}

	if ((buf = malloc(statbuf.st_size)) == NULL) {
	(void) fprintf(stderr, "failed to allocate %llu bytes\n",
	(u_longlong_t)statbuf.st_size);
	exit(1);
	}

	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
	(void) fprintf(stderr, "failed to read %llu bytes\n",
	(u_longlong_t)statbuf.st_size);
	exit(1);
	}

	(void) close(fd);

	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
	(void) fprintf(stderr, "failed to unpack nvlist\n");
	exit(1);
	}

	free(buf);

	dump_nvlist(config, 0);

	nvlist_free(config);
	}

	/* Size of the buffer holding the "Uberblock[N]\n" header text. */
	#define ZDB_MAX_UB_HEADER_SIZE 32

	/*
	 * Dump every uberblock slot of an in-memory vdev label that passes
	 * uberblock_verify().
	 *
	 * lbl:    label whose uberblock slots are examined.
	 * ashift: ashift used to compute the slot count and slot offsets.
	 *
	 * A stack vdev_t is set up with only vdev_ashift and vdev_top so it can
	 * be passed to the VDEV_UBERBLOCK_COUNT/VDEV_UBERBLOCK_OFFSET macros.
	 */
	static void
	dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
	{
		vdev_t vd;
		vdev_t *vdp = &vd;
		char header[ZDB_MAX_UB_HEADER_SIZE];

		vd.vdev_ashift = ashift;
		vdp->vdev_top = vdp;

		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
			uberblock_t *ub = (void *)((char *)lbl + uoff);

			/* Skip slots for which uberblock_verify() reports an error. */
			if (uberblock_verify(ub))
				continue;
			(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
			    "Uberblock[%d]\n", i);
			dump_uberblock(ub, header, "");
		}
	}

	/* Human-readable path walked so far; used in messages by dump_path*(). */
	static char curpath[PATH_MAX];

	/*
	 * Iterate through the path components, recursively passing
	 * current one's obj and remaining path until we find the obj
	 * for the last one.
	 *
	 * os:   objset being searched.
	 * obj:  directory object in which the first component is looked up.
	 * name: remaining path; modified in place (the first '/' is replaced
	 *       by a NUL so the leading component can be looked up).
	 *
	 * Returns 0 after dumping the final object, the zap_lookup() error if
	 * a component cannot be found, or EINVAL for any other failure.
	 */
	static int
	dump_path_impl(objset_t *os, uint64_t obj, char *name)
	{
		int err;
		int header = 1;
		uint64_t child_obj;
		char *s;
		dmu_buf_t *db;
		dmu_object_info_t doi;

		if ((s = strchr(name, '/')) != NULL)
			*s = '\0';
		err = zap_lookup(os, obj, name, 8, 1, &child_obj);

		/* Extend curpath first so a lookup failure names this component. */
		(void) strlcat(curpath, name, sizeof (curpath));

		if (err != 0) {
			(void) fprintf(stderr, "failed to lookup %s: %s\n",
			    curpath, strerror(err));
			return (err);
		}

		child_obj = ZFS_DIRENT_OBJ(child_obj);
		err = sa_buf_hold(os, child_obj, FTAG, &db);
		if (err != 0) {
			(void) fprintf(stderr,
			    "failed to get SA dbuf for obj %llu: %s\n",
			    (u_longlong_t)child_obj, strerror(err));
			return (EINVAL);
		}
		dmu_object_info_from_db(db, &doi);
		sa_buf_rele(db, FTAG);

		if (doi.doi_bonus_type != DMU_OT_SA &&
		    doi.doi_bonus_type != DMU_OT_ZNODE) {
			(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
			    doi.doi_bonus_type, (u_longlong_t)child_obj);
			return (EINVAL);
		}

		if (dump_opt['v'] > 6) {
			(void) printf("obj=%llu %s type=%d bonustype=%d\n",
			    (u_longlong_t)child_obj, curpath, doi.doi_type,
			    doi.doi_bonus_type);
		}

		(void) strlcat(curpath, "/", sizeof (curpath));

		switch (doi.doi_type) {
		case DMU_OT_DIRECTORY_CONTENTS:
			/* Descend only if something follows the '/'. */
			if (s != NULL && *(s + 1) != '\0')
				return (dump_path_impl(os, child_obj, s + 1));
			/*FALLTHROUGH*/
		case DMU_OT_PLAIN_FILE_CONTENTS:
			dump_object(os, child_obj, dump_opt['v'], &header);
			return (0);
		default:
			/* doi describes child_obj, so that is the object to name. */
			(void) fprintf(stderr, "object %llu has non-file/directory "
			    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
			break;
		}

		return (EINVAL);
	}

	/*
	 * Dump the blocks for the object specified by path inside the dataset.
	 *
	 * ds:   dataset name, opened with open_objset() as DMU_OST_ZFS.
	 * path: path inside the dataset; passed to dump_path_impl(), which
	 *       modifies it in place.
	 *
	 * Returns 0 on success, the open_objset() error if the dataset cannot
	 * be opened, EINVAL if the root znode cannot be looked up, or whatever
	 * dump_path_impl() returns.
	 */
	static int
	dump_path(char *ds, char *path)
	{
		int err;
		objset_t *os;
		uint64_t root_obj;

		err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
		if (err != 0)
			return (err);

		err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
		if (err != 0) {
			(void) fprintf(stderr, "can't lookup root znode: %s\n",
			    strerror(err));
			/* Release through close_objset(), as the success path does. */
			close_objset(os, FTAG);
			return (EINVAL);
		}

		(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);

		err = dump_path_impl(os, root_obj, path);

		close_objset(os, FTAG);
		return (err);
	}

	static int
	dump_label(const char *dev)
	{
	int fd;
	vdev_label_t label;
	char path[MAXPATHLEN];
	char *buf = label.vl_vdev_phys.vp_nvlist;
	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
	struct stat64 statbuf;
	uint64_t psize, ashift;
	boolean_t label_found = B_FALSE;

	(void) strlcpy(path, dev, sizeof (path));
	if (dev[0] == '/') {
	if (strncmp(dev, ZFS_DISK_ROOTD,
	strlen(ZFS_DISK_ROOTD)) == 0) {
	(void) snprintf(path, sizeof (path), "%s%s",
	ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
	}
	} else if (stat64(path, &statbuf) != 0) {
	char *s;

	(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
	dev);
	if (((s = strrchr(dev, 's')) == NULL &&
	(s = strchr(dev, 'p')) == NULL) \|\|
	!isdigit(*(s + 1)))
	(void) strlcat(path, "s0", sizeof (path));
	}

	if ((fd = open64(path, O_RDONLY)) < 0) {
	(void) fprintf(stderr, "cannot open '%s': %s\n", path,
	strerror(errno));
	exit(1);
	}

	if (fstat64(fd, &statbuf) != 0) {
	(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
	strerror(errno));
	(void) close(fd);
	exit(1);
	}

	if (S_ISBLK(statbuf.st_mode)) {
	(void) fprintf(stderr,
	"cannot use '%s': character device required\n", path);
	(void) close(fd);
	exit(1);
	}

	psize = statbuf.st_size;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));

	for (int l = 0; l < VDEV_LABELS; l++) {
	nvlist_t *config = NULL;

	if (!dump_opt['q']) {
	(void) printf("------------------------------------\n");
	(void) printf("LABEL %d\n", l);
	(void) printf("------------------------------------\n");
	}

	if (pread64(fd, &label, sizeof (label),
	vdev_label_offset(psize, l, 0)) != sizeof (label)) {
	if (!dump_opt['q'])
	(void) printf("failed to read label %d\n", l);
	continue;
	}

	if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
	if (!dump_opt['q'])
	(void) printf("failed to unpack label %d\n", l);
	ashift = SPA_MINBLOCKSHIFT;
	} else {
	nvlist_t *vdev_tree = NULL;

	if (!dump_opt['q'])
	dump_nvlist(config, 4);
	if ((nvlist_lookup_nvlist(config,
	ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) \|\|
	(nvlist_lookup_uint64(vdev_tree,
	ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
	ashift = SPA_MINBLOCKSHIFT;
	nvlist_free(config);
	label_found = B_TRUE;
	}
	if (dump_opt['u'])
	dump_label_uberblocks(&label, ashift);
	}

	(void) close(fd);

	return (label_found ? 0 : 2);
	}

	static uint64_t dataset_feature_count[SPA_FEATURES];
	+static uint64_t remap_deadlist_count = 0;

	/ARGSUSED/
	static int
	dump_one_dir(const char dsname, void arg)
	{
	int error;
	objset_t *os;

	error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
	if (error != 0)
	return (0);

	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
	if (!dmu_objset_ds(os)->ds_feature_inuse[f])
	continue;
	ASSERT(spa_feature_table[f].fi_flags &
	ZFEATURE_FLAG_PER_DATASET);
	dataset_feature_count[f]++;
	}

	+ if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
	+ remap_deadlist_count++;
	+ }
	+
	dump_dir(os);
	close_objset(os, FTAG);
	fuid_table_destroy();
	return (0);
	}

	/*
	* Block statistics.
	*
	* One zdb_blkstats_t is kept per (level, type) pair in zdb_cb_t and is
	* updated by zdb_count_block() for every block pointer it counts.
	*/
	/* Histogram buckets; the last one collects blocks too large to index. */
	#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
	typedef struct zdb_blkstats {
	uint64_t zb_asize; /* sum of BP_GET_ASIZE() */
	uint64_t zb_lsize; /* sum of BP_GET_LSIZE() */
	uint64_t zb_psize; /* sum of BP_GET_PSIZE() */
	uint64_t zb_count; /* number of block pointers counted */
	uint64_t zb_gangs; /* sum of BP_COUNT_GANG() */
	uint64_t zb_ditto_samevdev; /* bps with 2-3 DVAs, two on one vdev */
	/* indexed by BP_GET_PSIZE() >> SPA_MINBLOCKSHIFT, clamped to the end */
	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
	} zdb_blkstats_t;

	/*
	* Extended object types to report deferred frees and dedup auto-ditto blocks.
	*
	* These values follow the real DMU object types and are used as the
	* second index of zdb_cb_t's zcb_type[][] array.
	*/
	#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
	#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
	#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
	#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)

	/* Display names for the extended types, indexed by (type - DMU_OT_NUMTYPES). */
	static const char *zdb_ot_extname[] = {
	"deferred free",
	"dedup ditto",
	"other",
	"Total",
	};

	/* Level index in zcb_type[][] that accumulates blocks of every level. */
	#define ZB_TOTAL DN_MAX_LEVELS

	/*
	 * State shared by the block-traversal callbacks (passed around as 'zcb').
	 */
	typedef struct zdb_cb {
	zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; /* [level][type] */
	+ uint64_t zcb_removing_size; /* space summed by zdb_claim_removing() */
	uint64_t zcb_dedup_asize; /* asize accounted to extra dedup references */
	uint64_t zcb_dedup_blocks; /* non-ditto DDT phys entries seen */
	uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; /* by BPE_GET_ETYPE() */
	uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	[BPE_PAYLOAD_SIZE]; /* [etype][BPE_GET_PSIZE()] */
	uint64_t zcb_start; /* gethrtime() when the traversal started */
	hrtime_t zcb_lastprint; /* gethrtime() of the last progress line */
	uint64_t zcb_totalasize; /* allocated size of the normal class */
	uint64_t zcb_errors[256]; /* read-error counts, indexed by errno */
	int zcb_readfails;
	int zcb_haderrors; /* nonzero once any error has been recorded */
	spa_t *zcb_spa; /* pool being examined */
	+ uint32_t **zcb_vd_obsolete_counts; /* per top-level vdev; NULL if none */
	} zdb_cb_t;

	/*
	 * Account for one block pointer in the traversal statistics and, unless
	 * leak detection is disabled (dump_opt['L']), claim its space.
	 *
	 * zcb:   traversal state whose statistics are updated.
	 * zilog: if non-NULL, the block is recorded with zil_bp_tree_add() and
	 *        ignored when that call returns nonzero.
	 * bp:    block pointer to count.
	 * type:  statistics bucket; must be below ZDB_OT_TOTAL.
	 */
	static void
	zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
	    dmu_object_type_t type)
	{
		uint64_t refcnt = 0;

		ASSERT(type < ZDB_OT_TOTAL);

		if (zilog && zil_bp_tree_add(zilog, bp) != 0)
			return;

		/*
		 * Update four buckets: {this level, all levels} crossed with
		 * {this type, all types}.
		 */
		for (int i = 0; i < 4; i++) {
			int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
			int t = (i & 1) ? type : ZDB_OT_TOTAL;
			int equal;
			zdb_blkstats_t *zb = &zcb->zcb_type[l][t];

			zb->zb_asize += BP_GET_ASIZE(bp);
			zb->zb_lsize += BP_GET_LSIZE(bp);
			zb->zb_psize += BP_GET_PSIZE(bp);
			zb->zb_count++;

			/*
			 * The histogram is only big enough to record blocks up to
			 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
			 * "other", bucket.
			 */
			unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
			idx = MIN(idx, PSIZE_HISTO_SIZE - 1);
			zb->zb_psize_histogram[idx]++;

			zb->zb_gangs += BP_COUNT_GANG(bp);

			/* Count blocks that have two copies on the same vdev. */
			switch (BP_GET_NDVAS(bp)) {
			case 2:
				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1]))
					zb->zb_ditto_samevdev++;
				break;
			case 3:
				equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1])) +
				    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2])) +
				    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]));
				if (equal != 0)
					zb->zb_ditto_samevdev++;
				break;
			}

		}

		/* Embedded block pointers are only counted, never claimed. */
		if (BP_IS_EMBEDDED(bp)) {
			zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
			zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
			    [BPE_GET_PSIZE(bp)]++;
			return;
		}

		if (dump_opt['L'])
			return;

		if (BP_GET_DEDUP(bp)) {
			ddt_t *ddt;
			ddt_entry_t *dde;

			ddt = ddt_select(zcb->zcb_spa, bp);
			ddt_enter(ddt);
			dde = ddt_lookup(ddt, bp, B_FALSE);

			if (dde == NULL) {
				refcnt = 0;
			} else {
				/* Drop one reference; remove the entry at zero. */
				ddt_phys_t *ddp = ddt_phys_select(dde, bp);
				ddt_phys_decref(ddp);
				refcnt = ddp->ddp_refcnt;
				if (ddt_phys_total_refcnt(dde) == 0)
					ddt_remove(ddt, dde);
			}
			ddt_exit(ddt);
		}

		/* A txg of 0 is passed while dedup references remain. */
		VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
		    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
		    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
	}

	/*
	 * Completion callback for the reads issued by zdb_blkptr_cb().
	 *
	 * Frees the read buffer, decrements spa_scrub_inflight and broadcasts
	 * on spa_scrub_io_cv, and records and prints any error from a read
	 * that was not flagged ZIO_FLAG_SPECULATIVE.
	 */
	/* ARGSUSED */
	static void
	zdb_blkptr_done(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		blkptr_t *bp = zio->io_bp;
		int ioerr = zio->io_error;
		zdb_cb_t *zcb = zio->io_private;
		zbookmark_phys_t *zb = &zio->io_bookmark;

		abd_free(zio->io_abd);

		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_inflight--;
		cv_broadcast(&spa->spa_scrub_io_cv);

		if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			char blkbuf[BP_SPRINTF_LEN];

			zcb->zcb_haderrors = 1;
			zcb->zcb_errors[ioerr]++;

			/* The block pointer is only printed at -bb and above. */
			if (dump_opt['b'] >= 2)
				snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
			else
				blkbuf[0] = '\0';

			/* zb_level is printed with %lld, so cast to longlong_t. */
			(void) printf("zdb_blkptr_cb: "
			    "Got error %d reading "
			    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
			    ioerr,
			    (u_longlong_t)zb->zb_objset,
			    (u_longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (u_longlong_t)zb->zb_blkid,
			    blkbuf);
		}
		mutex_exit(&spa->spa_scrub_lock);
	}

	/*
	 * Per-block callback passed to traverse_pool().
	 *
	 * Counts the block via zdb_count_block(), optionally issues an
	 * asynchronous read of it (dump_opt['c']: metadata only at level 1,
	 * everything above that), and periodically prints a progress line to
	 * stderr.  'arg' is the zdb_cb_t.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
	{
		zdb_cb_t *zcb = arg;
		dmu_object_type_t type;
		boolean_t is_metadata;

		if (bp == NULL)
			return (0);

		if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
			char blkbuf[BP_SPRINTF_LEN];
			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
			(void) printf("objset %llu object %llu "
			    "level %lld offset 0x%llx %s\n",
			    (u_longlong_t)zb->zb_objset,
			    (u_longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (u_longlong_t)blkid2offset(dnp, bp, zb),
			    blkbuf);
		}

		if (BP_IS_HOLE(bp))
			return (0);

		type = BP_GET_TYPE(bp);

		/* Types with DMU_OT_NEWTYPE set share the "other" bucket. */
		zdb_count_block(zcb, zilog, bp,
		    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);

		is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));

		if (!BP_IS_EMBEDDED(bp) &&
		    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
			size_t size = BP_GET_PSIZE(bp);
			abd_t *abd = abd_alloc(size, B_FALSE);
			int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

			/* If it's an intent log block, failure is expected. */
			if (zb->zb_level == ZB_ZIL_LEVEL)
				flags |= ZIO_FLAG_SPECULATIVE;

			/* Throttle: wait while more than max_inflight are pending. */
			mutex_enter(&spa->spa_scrub_lock);
			while (spa->spa_scrub_inflight > max_inflight)
				cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_inflight++;
			mutex_exit(&spa->spa_scrub_lock);

			/* zdb_blkptr_done() frees abd and drops the inflight count. */
			zio_nowait(zio_read(NULL, spa, bp, abd, size,
			    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
		}

		zcb->zcb_readfails = 0;

		/* only call gethrtime() every 100 blocks */
		static int iters;
		if (++iters > 100)
			iters = 0;
		else
			return (0);

		if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
			uint64_t now = gethrtime();
			char buf[10];
			uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
			/* The "1 +" terms keep both divisors nonzero. */
			int kb_per_sec =
			    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
			int sec_remaining =
			    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;

			/* make sure nicenum has enough space */
			CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);

			zfs_nicenum(bytes, buf, sizeof (buf));
			(void) fprintf(stderr,
			    "\r%5s completed (%4dMB/s) "
			    "estimated time remaining: %uhr %02umin %02usec ",
			    buf, kb_per_sec / 1024,
			    sec_remaining / 60 / 60,
			    sec_remaining / 60 % 60,
			    sec_remaining % 60);

			zcb->zcb_lastprint = now;
		}

		return (0);
	}

	static void
	zdb_leak(void *arg, uint64_t start, uint64_t size)
	{
	vdev_t *vd = arg;

	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
	(u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
	}

	/*
	 * Metaslab ops with no alloc routine.  zdb_leak_init() installs these on
	 * the normal and log classes so the allocator does not use ms_tree
	 * while zdb has changed its meaning.
	 */
	static metaslab_ops_t zdb_metaslab_ops = {
	NULL /* alloc */
	};

	/*
	 * Walk the pool's dedup tables before the block traversal.
	 *
	 * The walk returns as soon as it reaches the DDT_CLASS_UNIQUE class.
	 * For each entry before that, ditto copies are counted as ZDB_OT_DITTO
	 * blocks and the other copies add to the dedup totals in 'zcb'.
	 * Unless dump_opt['L'] is set, the entry is also looked up with
	 * ddt_lookup(..., B_TRUE).
	 */
	static void
	zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
	{
		ddt_bookmark_t ddb;
		ddt_entry_t dde;
		int error;

		bzero(&ddb, sizeof (ddb));
		while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
			blkptr_t blk;
			ddt_phys_t *ddp = dde.dde_phys;

			if (ddb.ddb_class == DDT_CLASS_UNIQUE)
				return;

			ASSERT(ddt_phys_total_refcnt(&dde) > 1);

			for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
				/* Skip phys slots that are not in use. */
				if (ddp->ddp_phys_birth == 0)
					continue;
				ddt_bp_create(ddb.ddb_checksum,
				    &dde.dde_key, ddp, &blk);
				if (p == DDT_PHYS_DITTO) {
					zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
				} else {
					/* Every reference past the first is deduped. */
					zcb->zcb_dedup_asize +=
					    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
					zcb->zcb_dedup_blocks++;
				}
			}
			if (!dump_opt['L']) {
				ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
				ddt_enter(ddt);
				VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
				ddt_exit(ddt);
			}
		}

		ASSERT(error == ENOENT);
	}

	+/* ARGSUSED */
	static void
	+claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
	+    uint64_t size, void *arg)
	+{
	+	uint64_t claim_txg;
	+
	+	/*
	+	 * We only get here by way of a remap from a device that is
	+	 * being removed, so the vdev handed to us must be a concrete
	+	 * one.
	+	 */
	+	ASSERT(vdev_is_concrete(vd));
	+
	+	/* Claim the mapped segment as of the pool's first txg. */
	+	claim_txg = spa_first_txg(vd->vdev_spa);
	+	VERIFY0(metaslab_claim_impl(vd, offset, size, claim_txg));
	+}
	+
	+/*
	+ * Segment callback used by zdb_claim_removing(): remaps the segment
	+ * [offset, offset + size) of the vdev passed as 'arg' through the
	+ * indirect vdev remap op, claiming each resulting piece with
	+ * claim_segment_impl_cb().
	+ */
	+static void
	+claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
	+{
	+	vdev_t *removing_vd = arg;
	+
	+	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
	+	    claim_segment_impl_cb, NULL);
	+}
	+
	+/*
	+ * After accounting for all allocated blocks that are directly referenced,
	+ * we might have missed a reference to a block from a partially complete
	+ * (and thus unused) indirect mapping object. We perform a secondary pass
	+ * through the metaslabs we have already mapped and claim the destination
	+ * blocks.
	+ *
	+ * Does nothing when no removal is in progress (spa_vdev_removal is NULL).
	+ * The space claimed this way is added to zcb->zcb_removing_size.
	+ */
	+static void
	+zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
	+{
	+	if (spa->spa_vdev_removal == NULL)
	+		return;
	+
	+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	+
	+	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+	vdev_t *vd = svr->svr_vdev;
	+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+
	+	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
	+		metaslab_t *msp = vd->vdev_ms[msi];
	+
	+		/* Stop at the first metaslab with no mappings yet. */
	+		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
	+			break;
	+
	+		/* svr_allocd_segs is reused as scratch; it must be empty. */
	+		ASSERT0(range_tree_space(svr->svr_allocd_segs));
	+
	+		if (msp->ms_sm != NULL) {
	+			VERIFY0(space_map_load(msp->ms_sm,
	+			    svr->svr_allocd_segs, SM_ALLOC));
	+
	+			/*
	+			 * Clear everything past what has been synced,
	+			 * because we have not allocated mappings for it yet.
	+			 */
	+			range_tree_clear(svr->svr_allocd_segs,
	+			    vdev_indirect_mapping_max_offset(vim),
	+			    msp->ms_sm->sm_start + msp->ms_sm->sm_size -
	+			    vdev_indirect_mapping_max_offset(vim));
	+		}
	+
	+		zcb->zcb_removing_size +=
	+		    range_tree_space(svr->svr_allocd_segs);
	+		/* Emptying the tree claims each segment via the callback. */
	+		range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
	+	}
	+
	+	spa_config_exit(spa, SCL_CONFIG, FTAG);
	+}
	+
	+/*
	+ * Load one metaslab's ms_tree with its allocated segments for leak
	+ * detection.
	+ *
	+ * vim_idxp is an in-out parameter which (for indirect vdevs) is the
	+ * index in vim_entries that has the first entry in this metaslab. On
	+ * return, it will be set to the first entry after this metaslab.
	+ */
	+static void
	+zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
	+{
	+	metaslab_group_t *mg = msp->ms_group;
	+	vdev_t *vd = mg->mg_vd;
	+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	+
	+	mutex_enter(&msp->ms_lock);
	+	metaslab_unload(msp);
	+
	+	/*
	+	 * We don't want to spend the CPU manipulating the size-ordered
	+	 * tree, so clear the range_tree ops.
	+	 */
	+	msp->ms_tree->rt_ops = NULL;
	+
	+	(void) fprintf(stderr,
	+	    "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
	+	    (u_longlong_t)vd->vdev_id,
	+	    (u_longlong_t)rvd->vdev_children,
	+	    (u_longlong_t)msp->ms_id,
	+	    (u_longlong_t)vd->vdev_ms_count);
	+
	+	/*
	+	 * For leak detection, we overload the metaslab ms_tree to
	+	 * contain allocated segments instead of free segments. As a
	+	 * result, we can't use the normal metaslab_load/unload
	+	 * interfaces.
	+	 */
	+	if (vd->vdev_ops == &vdev_indirect_ops) {
	+		/* Indirect vdev: the mapping entries are the "allocations". */
	+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+		for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
	+		    (*vim_idxp)++) {
	+			vdev_indirect_mapping_entry_phys_t *vimep =
	+			    &vim->vim_entries[*vim_idxp];
	+			uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
	+			uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
	+			ASSERT3U(ent_offset, >=, msp->ms_start);
	+			/* This entry belongs to a later metaslab. */
	+			if (ent_offset >= msp->ms_start + msp->ms_size)
	+				break;
	+
	+			/*
	+			 * Mappings do not cross metaslab boundaries,
	+			 * because we create them by walking the metaslabs.
	+			 */
	+			ASSERT3U(ent_offset + ent_len, <=,
	+			    msp->ms_start + msp->ms_size);
	+			range_tree_add(msp->ms_tree, ent_offset, ent_len);
	+		}
	+	} else if (msp->ms_sm != NULL) {
	+		VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
	+	}
	+
	+	if (!msp->ms_loaded) {
	+		msp->ms_loaded = B_TRUE;
	+	}
	+	mutex_exit(&msp->ms_lock);
	+}
	+
	+/*
	+ * bpobj iteration callback (used on dp_obsolete_bpobj by zdb_leak_init()):
	+ * adds the block's single DVA to the obsolete counts kept in the
	+ * zdb_cb_t passed as 'arg' for the DVA's top-level vdev.  'tx' is
	+ * unused.  Always returns 0.
	+ */
	+/* ARGSUSED */
	+static int
	+increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	+{
	+	zdb_cb_t *zcb = arg;
	+	spa_t *spa = zcb->zcb_spa;
	+	vdev_t *vd;
	+	const dva_t *dva = &bp->blk_dva[0];
	+
	+	ASSERT(!dump_opt['L']);
	+	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
	+
	+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
	+	ASSERT3P(vd, !=, NULL);
	+	spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	+	/* The vdev must have a mapping and a loaded counts array. */
	+	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	+	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
	+
	+	vdev_indirect_mapping_increment_obsolete_count(
	+	    vd->vdev_indirect_mapping,
	+	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
	+	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
	+
	+	return (0);
	+}
	+
	+/*
	+ * Build the obsolete-counts array for the indirect vdev 'vd'.
	+ *
	+ * Starts from the counts loaded from the vdev's indirect mapping, then
	+ * folds in the vdev's obsolete space map (if it has one) and, when the
	+ * pool's condensing state names this vdev and has a previous obsolete
	+ * space map object, that space map as well.  Returns the array.
	+ */
	+static uint32_t *
	+zdb_load_obsolete_counts(vdev_t *vd)
	+{
	+	spa_t *spa = vd->vdev_spa;
	+	vdev_indirect_mapping_t *mapping = vd->vdev_indirect_mapping;
	+	spa_condensing_indirect_phys_t *condensing =
	+	    &spa->spa_condensing_indirect_phys;
	+	uint32_t *obsolete_counts;
	+
	+	EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
	+	obsolete_counts = vdev_indirect_mapping_load_obsolete_counts(mapping);
	+
	+	if (vd->vdev_obsolete_sm != NULL) {
	+		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
	+		    obsolete_counts, vd->vdev_obsolete_sm);
	+	}
	+
	+	if (condensing->scip_vdev == vd->vdev_id &&
	+	    condensing->scip_prev_obsolete_sm_object != 0) {
	+		/* Open the previous obsolete space map just long enough. */
	+		space_map_t *prev_sm = NULL;
	+
	+		VERIFY0(space_map_open(&prev_sm, spa->spa_meta_objset,
	+		    condensing->scip_prev_obsolete_sm_object, 0,
	+		    vd->vdev_asize, 0));
	+		space_map_update(prev_sm);
	+		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
	+		    obsolete_counts, prev_sm);
	+		space_map_close(prev_sm);
	+	}
	+
	+	return (obsolete_counts);
	+}
	+
	/*
	 * Prepare 'zcb' and the pool for the block traversal.
	 *
	 * Unless dump_opt['L'] is set: the normal and log classes get
	 * zdb_metaslab_ops, every metaslab's ms_tree is loaded with allocated
	 * segments (zdb_leak_init_ms), obsolete counts are loaded for indirect
	 * vdevs, and the pool's obsolete bpobj (if open) is folded into those
	 * counts.  The dedup tables are then walked by zdb_ddt_leak_init().
	 */
	+static void
	zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
	{
	zcb->zcb_spa = spa;

	if (!dump_opt['L']) {
	+ dsl_pool_t *dp = spa->spa_dsl_pool;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	* We are going to be changing the meaning of the metaslab's
	* ms_tree. Ensure that the allocator doesn't try to
	* use the tree.
	*/
	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;

	+ zcb->zcb_vd_obsolete_counts =
	+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
	+ UMEM_NOFAIL);
	+
	+
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
	vdev_t *vd = rvd->vdev_child[c];
	- metaslab_group_t *mg = vd->vdev_mg;
	- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
	- metaslab_t *msp = vd->vdev_ms[m];
	- ASSERT3P(msp->ms_group, ==, mg);
	- mutex_enter(&msp->ms_lock);
	- metaslab_unload(msp);
	+ uint64_t vim_idx = 0;

	+ ASSERT3U(c, ==, vd->vdev_id);
	+
	+ /*
	+ * Note: we don't check for mapping leaks on
	+ * removing vdevs because their ms_tree's are
	+ * used to look for leaks in allocated space.
	+ */
	+ if (vd->vdev_ops == &vdev_indirect_ops) {
	+ zcb->zcb_vd_obsolete_counts[c] =
	+ zdb_load_obsolete_counts(vd);
	+
	/*
	- * For leak detection, we overload the metaslab
	- * ms_tree to contain allocated segments
	- * instead of free segments. As a result,
	- * we can't use the normal metaslab_load/unload
	- * interfaces.
	+ * Normally, indirect vdevs don't have any
	+ * metaslabs. We want to set them up for
	+ * zio_claim().
	*/
	- if (msp->ms_sm != NULL) {
	- (void) fprintf(stderr,
	- "\rloading space map for "
	- "vdev %llu of %llu, "
	- "metaslab %llu of %llu ...",
	- (longlong_t)c,
	- (longlong_t)rvd->vdev_children,
	- (longlong_t)m,
	- (longlong_t)vd->vdev_ms_count);
	+ VERIFY0(vdev_metaslab_init(vd, 0));
	+ }

	- /*
	- * We don't want to spend the CPU
	- * manipulating the size-ordered
	- * tree, so clear the range_tree
	- * ops.
	- */
	- msp->ms_tree->rt_ops = NULL;
	- VERIFY0(space_map_load(msp->ms_sm,
	- msp->ms_tree, SM_ALLOC));
	-
	- if (!msp->ms_loaded) {
	- msp->ms_loaded = B_TRUE;
	- }
	- }
	- mutex_exit(&msp->ms_lock);
	+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
	+ zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
	}
	+ if (vd->vdev_ops == &vdev_indirect_ops) {
	+ ASSERT3U(vim_idx, ==,
	+ vdev_indirect_mapping_num_entries(
	+ vd->vdev_indirect_mapping));
	+ }
	}
	(void) fprintf(stderr, "\n");
	+
	+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
	+ ASSERT(spa_feature_is_enabled(spa,
	+ SPA_FEATURE_DEVICE_REMOVAL));
	+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
	+ increment_indirect_mapping_cb, zcb, NULL);
	+ }
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	zdb_ddt_leak_init(spa, zcb);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	-static void
	-zdb_leak_fini(spa_t *spa)
	+/*
	+ * Check the obsolete counts of the indirect vdev 'vd' after the block
	+ * traversal.
	+ *
	+ * For every mapping entry, the bytes of the entry still present in the
	+ * metaslab's ms_tree are compared with the entry's count in
	+ * zcb->zcb_vd_obsolete_counts.  Differences are reported on stdout.
	+ * They only count as leaks (return B_TRUE) when
	+ * vdev_obsolete_counts_are_precise(vd) holds.  The vdev's counts array
	+ * is freed and its slot set to NULL before returning.
	+ */
	+static boolean_t
	+zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
	{
	+	boolean_t leaks = B_FALSE;
	+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+	uint64_t total_leaked = 0;
	+	/* Granularity of the per-entry scan: one minimum-size block. */
	+	const uint64_t unit = 1ULL << vd->vdev_ashift;
	+
	+	ASSERT(vim != NULL);
	+
	+	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
	+		vdev_indirect_mapping_entry_phys_t *vimep =
	+		    &vim->vim_entries[i];
	+		uint64_t obsolete_bytes = 0;
	+		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
	+		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+
	+		/*
	+		 * This is not very efficient but it's easy to
	+		 * verify correctness.
	+		 */
	+		for (uint64_t inner_offset = 0;
	+		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
	+		    inner_offset += unit) {
	+			if (range_tree_contains(msp->ms_tree,
	+			    offset + inner_offset, unit)) {
	+				obsolete_bytes += unit;
	+			}
	+		}
	+
	+		int64_t bytes_leaked = obsolete_bytes -
	+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
	+		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
	+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
	+		if (bytes_leaked != 0 &&
	+		    (vdev_obsolete_counts_are_precise(vd) ||
	+		    dump_opt['d'] >= 5)) {
	+			(void) printf("obsolete indirect mapping count "
	+			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
	+			    (u_longlong_t)vd->vdev_id,
	+			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
	+			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
	+			    (u_longlong_t)bytes_leaked);
	+		}
	+		total_leaked += ABS(bytes_leaked);
	+	}
	+
	+	if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
	+		/* Imprecise counts: report, but do not flag a leak. */
	+		int pct_leaked = total_leaked * 100 /
	+		    vdev_indirect_mapping_bytes_mapped(vim);
	+		(void) printf("cannot verify obsolete indirect mapping "
	+		    "counts of vdev %llu because precise feature was not "
	+		    "enabled when it was removed: %d%% (%llx bytes) of mapping "
	+		    "unreferenced\n",
	+		    (u_longlong_t)vd->vdev_id, pct_leaked,
	+		    (u_longlong_t)total_leaked);
	+	} else if (total_leaked > 0) {
	+		(void) printf("obsolete indirect mapping count mismatch "
	+		    "for vdev %llu -- %llx total bytes mismatched\n",
	+		    (u_longlong_t)vd->vdev_id,
	+		    (u_longlong_t)total_leaked);
	+		leaks |= B_TRUE;
	+	}
	+
	+	vdev_indirect_mapping_free_obsolete_counts(vim,
	+	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
	+	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
	+
	+	return (leaks);
	+}
	+
	/*
	 * Finish leak detection after the traversal (no-op with dump_opt['L']).
	 *
	 * Obsolete counts are checked for every vdev that has them, each
	 * metaslab's ms_tree is emptied (reporting leftovers through zdb_leak()
	 * except on indirect vdevs), and the counts array is freed.  Returns
	 * B_TRUE if zdb_check_for_obsolete_leaks() reported a leak.
	 */
	+static boolean_t
	+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
	+{
	+ boolean_t leaks = B_FALSE;
	if (!dump_opt['L']) {
	vdev_t *rvd = spa->spa_root_vdev;
	for (unsigned c = 0; c < rvd->vdev_children; c++) {
	vdev_t *vd = rvd->vdev_child[c];
	metaslab_group_t *mg = vd->vdev_mg;
	- for (unsigned m = 0; m < vd->vdev_ms_count; m++) {
	+
	+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
	+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
	+ }
	+
	+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
	metaslab_t *msp = vd->vdev_ms[m];
	ASSERT3P(mg, ==, msp->ms_group);
	- mutex_enter(&msp->ms_lock);

	/*
	* The ms_tree has been overloaded to
	* contain allocated segments. Now that we
	* finished traversing all blocks, any
	* block that remains in the ms_tree
	* represents an allocated block that we
	* did not claim during the traversal.
	* Claimed blocks would have been removed
	- * from the ms_tree.
	+ * from the ms_tree. For indirect vdevs,
	+ * space remaining in the tree represents
	+ * parts of the mapping that are not
	+ * referenced, which is not a bug.
	*/
	- range_tree_vacate(msp->ms_tree, zdb_leak, vd);
	+ if (vd->vdev_ops == &vdev_indirect_ops) {
	+ range_tree_vacate(msp->ms_tree,
	+ NULL, NULL);
	+ } else {
	+ range_tree_vacate(msp->ms_tree,
	+ zdb_leak, vd);
	+ }

	if (msp->ms_loaded) {
	msp->ms_loaded = B_FALSE;
	}
	-
	- mutex_exit(&msp->ms_lock);
	}
	}
	+
	+ umem_free(zcb->zcb_vd_obsolete_counts,
	+ rvd->vdev_children * sizeof (uint32_t *));
	+ zcb->zcb_vd_obsolete_counts = NULL;
	}
	+ return (leaks);
	}

	/*
	 * Iteration callback that counts one block pointer as a deferred free
	 * (ZDB_OT_DEFERRED).  'arg' is the zdb_cb_t; 'tx' is unused.  The block
	 * pointer is also printed when dump_opt['b'] >= 5.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		zdb_cb_t *zcb = arg;

		if (dump_opt['b'] >= 5) {
			char blkbuf[BP_SPRINTF_LEN];
			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
			(void) printf("[%s] %s\n",
			    "deferred free", blkbuf);
		}
		zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
		return (0);
	}

	static int
	dump_block_stats(spa_t *spa)
	{
	zdb_cb_t zcb;
	zdb_blkstats_t zb, tzb;
	uint64_t norm_alloc, norm_space, total_alloc, total_found;
	int flags = TRAVERSE_PRE \| TRAVERSE_PREFETCH_METADATA \| TRAVERSE_HARD;
	boolean_t leaks = B_FALSE;

	bzero(&zcb, sizeof (zcb));
	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
	(dump_opt['c'] \|\| !dump_opt['L']) ? "to verify " : "",
	(dump_opt['c'] == 1) ? "metadata " : "",
	dump_opt['c'] ? "checksums " : "",
	(dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
	!dump_opt['L'] ? "nothing leaked " : "");

	/*
	* Load all space maps as SM_ALLOC maps, then traverse the pool
	* claiming each block we discover. If the pool is perfectly
	* consistent, the space maps will be empty when we're done.
	* Anything left over is a leak; any block we can't claim (because
	* it's not part of any space map) is a double allocation,
	* reference to a freed block, or an unclaimed log block.
	*/
	zdb_leak_init(spa, &zcb);

	/*
	* If there's a deferred-free bplist, process that first.
	*/
	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
	count_block_cb, &zcb, NULL);
	+
	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
	(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
	count_block_cb, &zcb, NULL);
	}
	+
	+ zdb_claim_removing(spa, &zcb);
	+
	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
	VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
	spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
	&zcb, NULL));
	}

	if (dump_opt['c'] > 1)
	flags \|= TRAVERSE_PREFETCH_DATA;

	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
	zcb.zcb_haderrors \|= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);

	/*
	* If we've traversed the data blocks then we need to wait for those
	* I/Os to complete. We leverage "The Godfather" zio to wait on
	* all async I/Os to complete.
	*/
	if (dump_opt['c']) {
	for (int i = 0; i < max_ncpus; i++) {
	(void) zio_wait(spa->spa_async_zio_root[i]);
	spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
	ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \|
	ZIO_FLAG_GODFATHER);
	}
	}

	if (zcb.zcb_haderrors) {
	(void) printf("\nError counts:\n\n");
	(void) printf("\t%5s %s\n", "errno", "count");
	for (int e = 0; e < 256; e++) {
	if (zcb.zcb_errors[e] != 0) {
	(void) printf("\t%5d %llu\n",
	e, (u_longlong_t)zcb.zcb_errors[e]);
	}
	}
	}

	/*
	* Report any leaked segments.
	*/
	- zdb_leak_fini(spa);
	+ leaks \|= zdb_leak_fini(spa, &zcb);

	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];

	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	norm_space = metaslab_class_get_space(spa_normal_class(spa));

	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
	- total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
	+ total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
	+ zcb.zcb_removing_size;

	if (total_found == total_alloc) {
	if (!dump_opt['L'])
	(void) printf("\n\tNo leaks (block sum matches space"
	" maps exactly)\n");
	} else {
	(void) printf("block traversal size %llu != alloc %llu "
	"(%s %lld)\n",
	(u_longlong_t)total_found,
	(u_longlong_t)total_alloc,
	(dump_opt['L']) ? "unreachable" : "leaked",
	(longlong_t)(total_alloc - total_found));
	leaks = B_TRUE;
	}

	if (tzb->zb_count == 0)
	return (2);

	(void) printf("\n");
	(void) printf("\tbp count: %10llu\n",
	(u_longlong_t)tzb->zb_count);
	(void) printf("\tganged count: %10llu\n",
	(longlong_t)tzb->zb_gangs);
	(void) printf("\tbp logical: %10llu avg: %6llu\n",
	(u_longlong_t)tzb->zb_lsize,
	(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
	(void) printf("\tbp physical: %10llu avg:"
	" %6llu compression: %6.2f\n",
	(u_longlong_t)tzb->zb_psize,
	(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
	(double)tzb->zb_lsize / tzb->zb_psize);
	(void) printf("\tbp allocated: %10llu avg:"
	" %6llu compression: %6.2f\n",
	(u_longlong_t)tzb->zb_asize,
	(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
	(double)tzb->zb_lsize / tzb->zb_asize);
	(void) printf("\tbp deduped: %10llu ref>1:"
	" %6llu deduplication: %6.2f\n",
	(u_longlong_t)zcb.zcb_dedup_asize,
	(u_longlong_t)zcb.zcb_dedup_blocks,
	(double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
	(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
	(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);

	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
	if (zcb.zcb_embedded_blocks[i] == 0)
	continue;
	(void) printf("\n");
	(void) printf("\tadditional, non-pointer bps of type %u: "
	"%10llu\n",
	i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);

	if (dump_opt['b'] >= 3) {
	(void) printf("\t number of (compressed) bytes: "
	"number of bps\n");
	dump_histogram(zcb.zcb_embedded_histogram[i],
	sizeof (zcb.zcb_embedded_histogram[i]) /
	sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
	}
	}

	if (tzb->zb_ditto_samevdev != 0) {
	(void) printf("\tDittoed blocks on same vdev: %llu\n",
	(longlong_t)tzb->zb_ditto_samevdev);
	}

	+ for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
	+ vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+
	+ if (vim == NULL) {
	+ continue;
	+ }
	+
	+ char mem[32];
	+ zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
	+ mem, vdev_indirect_mapping_size(vim));
	+
	+ (void) printf("\tindirect vdev id %llu has %llu segments "
	+ "(%s in memory)\n",
	+ (longlong_t)vd->vdev_id,
	+ (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
	+ }
	+
	if (dump_opt['b'] >= 2) {
	int l, t, level;
	(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
	"\t avg\t comp\t%%Total\tType\n");

	for (t = 0; t <= ZDB_OT_TOTAL; t++) {
	char csize[32], lsize[32], psize[32], asize[32];
	char avg[32], gang[32];
	const char *typename;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);

	if (t < DMU_OT_NUMTYPES)
	typename = dmu_ot[t].ot_name;
	else
	typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];

	if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
	(void) printf("%6s\t%5s\t%5s\t%5s"
	"\t%5s\t%5s\t%6s\t%s\n",
	"-",
	"-",
	"-",
	"-",
	"-",
	"-",
	"-",
	typename);
	continue;
	}

	for (l = ZB_TOTAL - 1; l >= -1; l--) {
	level = (l == -1 ? ZB_TOTAL : l);
	zb = &zcb.zcb_type[level][t];

	if (zb->zb_asize == 0)
	continue;

	if (dump_opt['b'] < 3 && level != ZB_TOTAL)
	continue;

	if (level == 0 && zb->zb_asize ==
	zcb.zcb_type[ZB_TOTAL][t].zb_asize)
	continue;

	zdb_nicenum(zb->zb_count, csize,
	sizeof (csize));
	zdb_nicenum(zb->zb_lsize, lsize,
	sizeof (lsize));
	zdb_nicenum(zb->zb_psize, psize,
	sizeof (psize));
	zdb_nicenum(zb->zb_asize, asize,
	sizeof (asize));
	zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
	sizeof (avg));
	zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));

	(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
	"\t%5.2f\t%6.2f\t",
	csize, lsize, psize, asize, avg,
	(double)zb->zb_lsize / zb->zb_psize,
	100.0 * zb->zb_asize / tzb->zb_asize);

	if (level == ZB_TOTAL)
	(void) printf("%s\n", typename);
	else
	(void) printf(" L%d %s\n",
	level, typename);

	if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
	(void) printf("\t number of ganged "
	"blocks: %s\n", gang);
	}

	if (dump_opt['b'] >= 4) {
	(void) printf("psize "
	"(in 512-byte sectors): "
	"number of blocks\n");
	dump_histogram(zb->zb_psize_histogram,
	PSIZE_HISTO_SIZE, 0);
	}
	}
	}
	}

	(void) printf("\n");

	if (leaks)
	return (2);

	if (zcb.zcb_haderrors)
	return (3);

	return (0);
	}

	/*
	 * One node of the AVL tree built by dump_simulated_ddt().  Nodes are
	 * created and updated by zdb_ddt_add_cb(), which accumulates the
	 * counters below once per block pointer that maps to the same key.
	 */
	typedef struct zdb_ddt_entry {
	ddt_key_t zdde_key;	/* filled in by ddt_key_fill() from the bp */
	uint64_t zdde_ref_blocks;	/* number of bps seen with this key */
	uint64_t zdde_ref_lsize;	/* sum of BP_GET_LSIZE() over those bps */
	uint64_t zdde_ref_psize;	/* sum of BP_GET_PSIZE() over those bps */
	uint64_t zdde_ref_dsize;	/* sum of bp_get_dsize_sync() over those bps */
	avl_node_t zdde_node;	/* linkage; its offset is given to avl_create() */
	} zdb_ddt_entry_t;

	/*
	 * traverse_pool() callback for dump_simulated_ddt().
	 *
	 * 'arg' is the AVL tree of zdb_ddt_entry_t nodes.  Each level-0,
	 * checksummed, non-metadata block pointer is keyed with ddt_key_fill()
	 * and its sizes are added to the matching node, which is created on
	 * first sight.  Holes, embedded bps and everything else are ignored.
	 * With -SS (dump_opt['S'] > 1) a progress line is printed for every
	 * objset root.  'zilog' and 'dnp' are unused.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
	{
		avl_tree_t *t = arg;
		avl_index_t where;
		zdb_ddt_entry_t *zdde, zdde_search;

		if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
			return (0);

		if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
			(void) printf("traversing objset %llu, %llu objects, "
			    "%lu blocks so far\n",
			    (u_longlong_t)zb->zb_objset,
			    (u_longlong_t)BP_GET_FILL(bp),
			    avl_numnodes(t));
		}

		/* Holes were already rejected by the guard above. */
		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
		    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
			return (0);

		ddt_key_fill(&zdde_search.zdde_key, bp);

		zdde = avl_find(t, &zdde_search, &where);

		if (zdde == NULL) {
			/* First block with this key: insert a zeroed node. */
			zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
			zdde->zdde_key = zdde_search.zdde_key;
			avl_insert(t, zdde, where);
		}

		zdde->zdde_ref_blocks += 1;
		zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
		zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
		zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);

		return (0);
	}

	/*
	 * Print the dedup table the pool would have if every eligible block
	 * were deduplicated.
	 *
	 * The pool is traversed with zdb_ddt_add_cb() collecting one AVL node
	 * per distinct key.  Each node is then converted to a ddt_stat_t
	 * (per-unique-block averages plus referenced totals), added to the
	 * histogram bucket highbit64(refcnt) - 1, and freed.  Finally the
	 * histogram and the dedup ratio are printed.
	 */
	static void
	dump_simulated_ddt(spa_t *spa)
	{
		avl_tree_t t;
		void *cookie = NULL;
		zdb_ddt_entry_t *zdde;
		ddt_histogram_t ddh_total;
		ddt_stat_t dds_total;

		bzero(&ddh_total, sizeof (ddh_total));
		bzero(&dds_total, sizeof (dds_total));
		avl_create(&t, ddt_entry_compare,
		    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		/* The traversal result is deliberately ignored. */
		(void) traverse_pool(spa, 0,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    zdb_ddt_add_cb, &t);

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
			ddt_stat_t dds;
			uint64_t refcnt = zdde->zdde_ref_blocks;

			/* Every node was bumped at least once when created. */
			ASSERT(refcnt != 0);

			dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
			dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
			dds.dds_psize = zdde->zdde_ref_psize / refcnt;
			dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;

			dds.dds_ref_blocks = zdde->zdde_ref_blocks;
			dds.dds_ref_lsize = zdde->zdde_ref_lsize;
			dds.dds_ref_psize = zdde->zdde_ref_psize;
			dds.dds_ref_dsize = zdde->zdde_ref_dsize;

			ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
			    &dds, 0);

			umem_free(zdde, sizeof (*zdde));
		}

		avl_destroy(&t);

		ddt_histogram_stat(&dds_total, &ddh_total);

		(void) printf("Simulated DDT histogram:\n");

		zpool_dump_ddt(&dds_total, &ddh_total);

		dump_dedup_ratio(&dds_total);
	}

	/*
	 * Cross-check two feature refcounts against numbers derived from the
	 * pool's top-level vdevs:
	 *  - SPA_FEATURE_DEVICE_REMOVAL against the number of vdevs that have
	 *    an indirect mapping object;
	 *  - SPA_FEATURE_OBSOLETE_COUNTS against the sum of the per-category
	 *    counters computed below.
	 * A "Verified ..." or mismatch line is printed for each.  Returns 0
	 * when both match and 1 when either differs.
	 */
	+static int
	+verify_device_removal_feature_counts(spa_t *spa)
	+{
	+ uint64_t dr_feature_refcount = 0;
	+ uint64_t oc_feature_refcount = 0;
	+ uint64_t indirect_vdev_count = 0;
	+ uint64_t precise_vdev_count = 0;
	+ uint64_t obsolete_counts_object_count = 0;
	+ uint64_t obsolete_sm_count = 0;
	+ uint64_t obsolete_counts_count = 0;
	+ uint64_t scip_count = 0;
	+ uint64_t obsolete_bpobj_count = 0;
	+ int ret = 0;
	+
	/* If a condense is recorded in the spa, report it and dump its old obsolete space map. */
	+ spa_condensing_indirect_phys_t *scip =
	+ &spa->spa_condensing_indirect_phys;
	+ if (scip->scip_next_mapping_object != 0) {
	+ vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
	+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+
	+ (void) printf("Condensing indirect vdev %llu: new mapping "
	+ "object %llu, prev obsolete sm %llu\n",
	+ (u_longlong_t)scip->scip_vdev,
	+ (u_longlong_t)scip->scip_next_mapping_object,
	+ (u_longlong_t)scip->scip_prev_obsolete_sm_object);
	/* NOTE(review): same condition as the ASSERT above; only matters when assertions are off or ignored. */
	+ if (scip->scip_prev_obsolete_sm_object != 0) {
	+ space_map_t *prev_obsolete_sm = NULL;
	+ VERIFY0(space_map_open(&prev_obsolete_sm,
	+ spa->spa_meta_objset,
	+ scip->scip_prev_obsolete_sm_object,
	+ 0, vd->vdev_asize, 0));
	+ space_map_update(prev_obsolete_sm);
	+ dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
	+ (void) printf("\n");
	+ space_map_close(prev_obsolete_sm);
	+ }
	+
	/* An in-progress condense adds two to the expected obsolete_counts total. */
	+ scip_count += 2;
	+ }
	+
	/* Tally the per-vdev contributions over every top-level vdev. */
	+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
	+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+
	+ if (vic->vic_mapping_object != 0) {
	+ ASSERT(vd->vdev_ops == &vdev_indirect_ops \|\|
	+ vd->vdev_removing);
	+ indirect_vdev_count++;
	+
	/* presumably vdev_indirect_mapping is non-NULL whenever a mapping object exists; verify */
	+ if (vd->vdev_indirect_mapping->vim_havecounts) {
	+ obsolete_counts_count++;
	+ }
	+ }
	+ if (vdev_obsolete_counts_are_precise(vd)) {
	+ ASSERT(vic->vic_mapping_object != 0);
	+ precise_vdev_count++;
	+ }
	+ if (vdev_obsolete_sm_object(vd) != 0) {
	+ ASSERT(vic->vic_mapping_object != 0);
	+ obsolete_sm_count++;
	+ }
	+ }
	+
	/* Return values are ignored; the refcounts stay 0 if a lookup fails. */
	+ (void) feature_get_refcount(spa,
	+ &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
	+ &dr_feature_refcount);
	+ (void) feature_get_refcount(spa,
	+ &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
	+ &oc_feature_refcount);
	+
	+ if (dr_feature_refcount != indirect_vdev_count) {
	+ ret = 1;
	+ (void) printf("Number of indirect vdevs (%llu) " \
	+ "does not match feature count (%llu)\n",
	+ (u_longlong_t)indirect_vdev_count,
	+ (u_longlong_t)dr_feature_refcount);
	+ } else {
	+ (void) printf("Verified device_removal feature refcount " \
	+ "of %llu is correct\n",
	+ (u_longlong_t)dr_feature_refcount);
	+ }
	+
	/* The pool-wide obsolete bpobj counts once if its directory entry exists. */
	+ if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	+ DMU_POOL_OBSOLETE_BPOBJ) == 0) {
	+ obsolete_bpobj_count++;
	+ }
	+
	+
	/* remap_deadlist_count is not declared in this function; it is maintained elsewhere in the file. */
	+ obsolete_counts_object_count = precise_vdev_count;
	+ obsolete_counts_object_count += obsolete_sm_count;
	+ obsolete_counts_object_count += obsolete_counts_count;
	+ obsolete_counts_object_count += scip_count;
	+ obsolete_counts_object_count += obsolete_bpobj_count;
	+ obsolete_counts_object_count += remap_deadlist_count;
	+
	+ if (oc_feature_refcount != obsolete_counts_object_count) {
	+ ret = 1;
	+ (void) printf("Number of obsolete counts objects (%llu) " \
	+ "does not match feature count (%llu)\n",
	+ (u_longlong_t)obsolete_counts_object_count,
	+ (u_longlong_t)oc_feature_refcount);
	/* Per-category breakdown, in the same order as the sum above. */
	+ (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
	+ "ob:%llu rd:%llu\n",
	+ (u_longlong_t)precise_vdev_count,
	+ (u_longlong_t)obsolete_sm_count,
	+ (u_longlong_t)obsolete_counts_count,
	+ (u_longlong_t)scip_count,
	+ (u_longlong_t)obsolete_bpobj_count,
	+ (u_longlong_t)remap_deadlist_count);
	+ } else {
	/* NOTE(review): the message says "indirect_refcount" but the refcount checked is SPA_FEATURE_OBSOLETE_COUNTS. */
	+ (void) printf("Verified indirect_refcount feature refcount " \
	+ "of %llu is correct\n",
	+ (u_longlong_t)oc_feature_refcount);
	+ }
	+ return (ret);
	+}
	+
	/*
	 * Pool-level driver: run whichever dumps the dump_opt[] flags select,
	 * then the verification passes.  -S short-circuits to the simulated
	 * DDT.  'rc' carries the first verification failure; when it is
	 * non-zero at the end the debug buffer is dumped and the process exits
	 * with that status instead of returning.
	 */
	static void
	dump_zpool(spa_t *spa)
	{
	dsl_pool_t *dp = spa_get_dsl(spa);
	int rc = 0;

	if (dump_opt['S']) {
	dump_simulated_ddt(spa);
	return;
	}

	if (!dump_opt['e'] && dump_opt['C'] > 1) {
	(void) printf("\nCached configuration:\n");
	dump_nvlist(spa->spa_config, 8);
	}

	if (dump_opt['C'])
	dump_config(spa);

	if (dump_opt['u'])
	dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");

	if (dump_opt['D'])
	dump_all_ddts(spa);

	if (dump_opt['d'] > 2 \|\| dump_opt['m'])
	dump_metaslabs(spa);
	if (dump_opt['M'])
	dump_metaslab_groups(spa);

	if (dump_opt['d'] \|\| dump_opt['i']) {
	dump_dir(dp->dp_meta_objset);
	if (dump_opt['d'] >= 3) {
	/* NOTE(review): this declaration shadows the function-level 'dp' above. */
	+ dsl_pool_t *dp = spa->spa_dsl_pool;
	dump_full_bpobj(&spa->spa_deferred_bpobj,
	"Deferred frees", 0);
	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
	- dump_full_bpobj(
	- &spa->spa_dsl_pool->dp_free_bpobj,
	+ dump_full_bpobj(&dp->dp_free_bpobj,
	"Pool snapshot frees", 0);
	}
	+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
	+ ASSERT(spa_feature_is_enabled(spa,
	+ SPA_FEATURE_DEVICE_REMOVAL));
	+ dump_full_bpobj(&dp->dp_obsolete_bpobj,
	+ "Pool obsolete blocks", 0);
	+ }

	if (spa_feature_is_active(spa,
	SPA_FEATURE_ASYNC_DESTROY)) {
	dump_bptree(spa->spa_meta_objset,
	- spa->spa_dsl_pool->dp_bptree_obj,
	+ dp->dp_bptree_obj,
	"Pool dataset frees");
	}
	dump_dtl(spa->spa_root_vdev, 0);
	}
	/* Visit every dataset and snapshot with dump_one_dir(); the result is ignored. */
	(void) dmu_objset_find(spa_name(spa), dump_one_dir,
	NULL, DS_FIND_SNAPSHOTS \| DS_FIND_CHILDREN);

	/*
	 * Compare each per-dataset feature's refcount with
	 * dataset_feature_count[]; a mismatch sets rc to 2.
	 */
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
	uint64_t refcount;

	if (!(spa_feature_table[f].fi_flags &
	ZFEATURE_FLAG_PER_DATASET)) {
	ASSERT0(dataset_feature_count[f]);
	continue;
	}
	/* NOTE(review): 'refcount' is uninitialized if this call fails without writing it. */
	(void) feature_get_refcount(spa,
	&spa_feature_table[f], &refcount);
	if (dataset_feature_count[f] != refcount) {
	(void) printf("%s feature refcount mismatch: "
	"%lld datasets != %lld refcount\n",
	spa_feature_table[f].fi_uname,
	(longlong_t)dataset_feature_count[f],
	(longlong_t)refcount);
	rc = 2;
	} else {
	/* NOTE(review): %llu is paired with a signed (longlong_t) cast here. */
	(void) printf("Verified %s feature refcount "
	"of %llu is correct\n",
	spa_feature_table[f].fi_uname,
	(longlong_t)refcount);
	}
	}
	+
	+ if (rc == 0) {
	+ rc = verify_device_removal_feature_counts(spa);
	+ }
	}
	/* Each later verification runs only while rc is still 0. */
	if (rc == 0 && (dump_opt['b'] \|\| dump_opt['c']))
	rc = dump_block_stats(spa);

	if (rc == 0)
	rc = verify_spacemap_refcounts(spa);

	if (dump_opt['s'])
	show_pool_stats(spa);

	if (dump_opt['h'])
	dump_history(spa);

	if (rc != 0) {
	dump_debug_buffer();
	exit(rc);
	}
	}

	/*
	 * Bit flags for the optional ":flags" field of a -R block specifier
	 * (see zdb_read_block()).  Each bit is selected by one flag character
	 * through flagbits[], which main() fills in when -R is given.
	 */
	#define ZDB_FLAG_CHECKSUM 0x0001	/* 'c': listed as not yet implemented */
	#define ZDB_FLAG_DECOMPRESS 0x0002	/* 'd': try to decompress before dumping */
	#define ZDB_FLAG_BSWAP 0x0004	/* 'e': byteswap before dumping */
	#define ZDB_FLAG_GBH 0x0008	/* 'g': display as a gang block header */
	#define ZDB_FLAG_INDIRECT 0x0010	/* 'i': display as an indirect block */
	#define ZDB_FLAG_PHYS 0x0020	/* 'p': do I/O to physical offset */
	#define ZDB_FLAG_RAW 0x0040	/* 'r': dump raw data to stdout */
	#define ZDB_FLAG_PRINT_BLKPTR 0x0080	/* 'b': decode a blkptr at an offset */

	/* Flag character (as unsigned char) -> ZDB_FLAG_* bit; 0 means invalid flag. */
	static int flagbits[256];

	/*
	 * Format the block pointer at 'bp' with snprintf_blkptr() and print it
	 * on its own line.  When ZDB_FLAG_BSWAP is set in 'flags' the block
	 * pointer is byteswapped in place first, so the caller's copy is
	 * modified.
	 */
	static void
	zdb_print_blkptr(blkptr_t *bp, int flags)
	{
		char text[BP_SPRINTF_LEN];

		if ((flags & ZDB_FLAG_BSWAP) != 0) {
			byteswap_uint64_array((void *)bp, sizeof (*bp));
		}

		snprintf_blkptr(text, sizeof (text), bp);
		(void) printf("%s\n", text);
	}

	/*
	 * Print 'nbps' consecutive block pointers starting at 'bp', one per
	 * line, passing 'flags' through to zdb_print_blkptr() for each.
	 */
	static void
	zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
	{
		blkptr_t *cur = bp;

		for (int n = 0; n < nbps; n++, cur++) {
			zdb_print_blkptr(cur, flags);
		}
	}

	static void
	zdb_dump_gbh(void *buf, int flags)
	{
	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
	}

	/*
	 * Write 'size' bytes of 'buf' unformatted to file descriptor 1.  With
	 * ZDB_FLAG_BSWAP set the buffer is byteswapped in place first.  The
	 * result of write() is deliberately ignored.
	 */
	static void
	zdb_dump_block_raw(void *buf, uint64_t size, int flags)
	{
		int want_swap = (flags & ZDB_FLAG_BSWAP) != 0;

		if (want_swap) {
			byteswap_uint64_array(buf, size);
		}

		(void) write(1, buf, size);
	}

	/*
	 * Hex-dump 'size' bytes of 'buf' under the heading 'label'.
	 *
	 * Each output line shows the byte offset, two 64-bit words in hex and
	 * the same 16 bytes as characters, with non-printable bytes shown as
	 * '.'.  When ZDB_FLAG_BSWAP is set in 'flags' the hex words are printed
	 * byteswapped and the column header is reordered to match; the
	 * character column always shows the bytes as stored.  Words are
	 * consumed in pairs.
	 */
	static void
	zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
	{
		uint64_t *d = (uint64_t *)buf;
		unsigned nwords = size / sizeof (uint64_t);
		int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
		unsigned i, j;
		const char *hdr;
		const unsigned char *c;

		if (do_bswap)
			hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
		else
			hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";

		(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);

		for (i = 0; i < nwords; i += 2) {
			(void) printf("%06llx: %016llx %016llx ",
			    (u_longlong_t)(i * sizeof (uint64_t)),
			    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
			    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));

			/*
			 * Read the bytes as unsigned char: isprint() is
			 * undefined for negative values other than EOF.
			 */
			c = (const unsigned char *)&d[i];
			for (j = 0; j < 2 * sizeof (uint64_t); j++)
				(void) printf("%c", isprint(c[j]) ? c[j] : '.');
			(void) printf("\n");
		}
	}

	/*
	 * Find the vdev named by 'path' beneath 'vdev'.  Returns NULL when
	 * 'vdev' is NULL or nothing matches.
	 *
	 * There are two acceptable formats:
	 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
	 *	child[.child]*    - For example: 0.1.1
	 *
	 * The second form can be used to specify arbitrary vdevs anywhere
	 * in the hierarchy.  For example, in a pool with a mirror of
	 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
	 *
	 * In the name form a child matches when 'path' equals its full
	 * vdev_path, equals the component after the last '/', or, for a
	 * vdev_path ending in "s0", equals that component without the "s0".
	 * Children without a vdev_path are searched recursively.
	 */
	static vdev_t *
	zdb_vdev_lookup(vdev_t *vdev, const char *path)
	{
		char *s, *p, *q;
		unsigned i;

		if (vdev == NULL)
			return (NULL);

		/* First, assume the x.x.x.x format */
		i = strtoul(path, &s, 10);
		if (s == path || (s && *s != '.' && *s != '\0'))
			goto name;
		if (i >= vdev->vdev_children)
			return (NULL);

		vdev = vdev->vdev_child[i];
		if (*s == '\0')
			return (vdev);
		return (zdb_vdev_lookup(vdev, s+1));

	name:
		for (i = 0; i < vdev->vdev_children; i++) {
			vdev_t *vc = vdev->vdev_child[i];
			size_t pathlen;

			if (vc->vdev_path == NULL) {
				vc = zdb_vdev_lookup(vc, path);
				if (vc == NULL)
					continue;
				else
					return (vc);
			}

			p = strrchr(vc->vdev_path, '/');
			p = p ? p + 1 : vc->vdev_path;

			if (strcmp(vc->vdev_path, path) == 0)
				return (vc);
			if (strcmp(p, path) == 0)
				return (vc);

			/*
			 * Only look for a trailing "s0" when the path has at
			 * least two characters; otherwise the index below
			 * would point before the start of the string.
			 */
			pathlen = strlen(vc->vdev_path);
			if (pathlen >= 2) {
				q = &vc->vdev_path[pathlen - 2];
				if (strcmp(q, "s0") == 0 &&
				    strncmp(p, path, q - p) == 0)
					return (vc);
			}
		}

		return (NULL);
	}

	/*
	 * Adapter that lets random_get_pseudo_bytes() be passed to
	 * abd_iterate_func(): fills 'len' bytes at 'buf' with pseudo-random
	 * data and returns that call's result.  'unused' is ignored.
	 */
	/* ARGSUSED */
	static int
	random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
	{
		return (random_get_pseudo_bytes(buf, len));
	}

	/*
	 * Read a block from a pool and print it out.  The syntax of the
	 * block descriptor is:
	 *
	 *	pool:vdev_specifier:offset:size[:flags]
	 *
	 *	pool           - The name of the pool you wish to read from
	 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
	 *	offset         - offset, in hex, in bytes
	 *	size           - Amount of data to read, in hex, in bytes
	 *	flags          - A string of characters specifying options
	 *		 b: Decode a blkptr at given offset within block
	 *		*c: Calculate and display checksums
	 *		 d: Decompress data before dumping
	 *		 e: Byteswap data before dumping
	 *		 g: Display data as a gang block header
	 *		 i: Display as an indirect block
	 *		 p: Do I/O to physical offset
	 *		 r: Dump raw data to stdout
	 *
	 *              * = not yet implemented
	 *
	 * 'thing' is the descriptor with the leading "pool:" already removed
	 * by the caller (its first field is taken as the vdev specifier);
	 * 'spa' is the open pool.  Errors are reported on stdout and the
	 * function simply returns.
	 */
	static void
	zdb_read_block(char *thing, spa_t *spa)
	{
		blkptr_t blk, *bp = &blk;
		dva_t *dva = bp->blk_dva;
		int flags = 0;
		uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
		zio_t *zio;
		vdev_t *vd;
		abd_t *pabd;
		void *lbuf, *buf;
		const char *s, *vdev;
		char *p, *dup, *flagstr;
		int i, error;

		/* Split a private copy of the descriptor on ':'. */
		dup = strdup(thing);
		s = strtok(dup, ":");
		vdev = s ? s : "";
		s = strtok(NULL, ":");
		offset = strtoull(s ? s : "", NULL, 16);
		s = strtok(NULL, ":");
		size = strtoull(s ? s : "", NULL, 16);
		s = strtok(NULL, ":");
		if (s)
			flagstr = strdup(s);
		else
			flagstr = strdup("");

		/* When several checks fail, the last message wins. */
		s = NULL;
		if (size == 0)
			s = "size must not be zero";
		if (!IS_P2ALIGNED(size, DEV_BSIZE))
			s = "size must be a multiple of sector size";
		if (!IS_P2ALIGNED(offset, DEV_BSIZE))
			s = "offset must be a multiple of sector size";
		if (s) {
			(void) printf("Invalid block specifier: %s - %s\n", thing, s);
			free(flagstr);
			free(dup);
			return;
		}

		for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
			for (i = 0; flagstr[i]; i++) {
				int bit = flagbits[(uchar_t)flagstr[i]];

				if (bit == 0) {
					(void) printf("***Invalid flag: %c\n",
					    flagstr[i]);
					continue;
				}
				flags |= bit;

				/* If it's not something with an argument, keep going */
				if ((bit & (ZDB_FLAG_CHECKSUM |
				    ZDB_FLAG_PRINT_BLKPTR)) == 0)
					continue;

				p = &flagstr[i + 1];
				if (bit == ZDB_FLAG_PRINT_BLKPTR)
					blkptr_offset = strtoull(p, &p, 16);
				if (*p != ':' && *p != '\0') {
					(void) printf("***Invalid flag arg: '%s'\n", s);
					free(flagstr);
					free(dup);
					return;
				}
				i += p - &flagstr[i + 1]; /* skip over the number */
			}
		}
		free(flagstr);

		vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
		if (vd == NULL) {
			(void) printf("***Invalid vdev: %s\n", vdev);
			free(dup);
			return;
		} else {
			if (vd->vdev_path)
				(void) fprintf(stderr, "Found vdev: %s\n",
				    vd->vdev_path);
			else
				(void) fprintf(stderr, "Found vdev type: %s\n",
				    vd->vdev_ops->vdev_op_type);
		}

		psize = size;
		lsize = size;

		pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
		lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

		/* Build an uncompressed, unchecksummed bp for the range. */
		BP_ZERO(bp);

		DVA_SET_VDEV(&dva[0], vd->vdev_id);
		DVA_SET_OFFSET(&dva[0], offset);
		DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
		DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));

		BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);

		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
		BP_SET_TYPE(bp, DMU_OT_NONE);
		BP_SET_LEVEL(bp, 0);
		BP_SET_DEDUP(bp, 0);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		zio = zio_root(spa, NULL, NULL, 0);

		if (vd == vd->vdev_top) {
			/*
			 * Treat this as a normal block read.
			 */
			zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
		} else {
			/*
			 * Treat this as a vdev child I/O.
			 */
			zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
			    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
			    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
			    NULL, NULL));
		}

		error = zio_wait(zio);
		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error) {
			(void) printf("Read of %s failed, error: %d\n", thing, error);
			goto out;
		}

		if (flags & ZDB_FLAG_DECOMPRESS) {
			/*
			 * We don't know how the data was compressed, so just try
			 * every decompress function at every inflated blocksize.
			 */
			enum zio_compress c;
			void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
			void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

			abd_copy_to_buf(pbuf2, pabd, psize);

			/*
			 * Pad both copies past psize with independent random
			 * bytes; a candidate is accepted only when both copies
			 * decompress to identical output.
			 */
			VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
			    random_get_pseudo_bytes_cb, NULL));

			VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
			    SPA_MAXBLOCKSIZE - psize));

			/*
			 * Step down one SPA_MINBLOCKSIZE per iteration.  The
			 * loop header is the only place lsize is decremented;
			 * a second decrement in the body used to skip every
			 * other candidate size.
			 */
			for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
			    lsize -= SPA_MINBLOCKSIZE) {
				for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
					if (zio_decompress_data(c, pabd,
					    lbuf, psize, lsize) == 0 &&
					    zio_decompress_data_buf(c, pbuf2,
					    lbuf2, psize, lsize) == 0 &&
					    bcmp(lbuf, lbuf2, lsize) == 0)
						break;
				}
				if (c != ZIO_COMPRESS_FUNCTIONS)
					break;
			}

			umem_free(pbuf2, SPA_MAXBLOCKSIZE);
			umem_free(lbuf2, SPA_MAXBLOCKSIZE);

			if (lsize <= psize) {
				(void) printf("Decompress of %s failed\n", thing);
				goto out;
			}
			buf = lbuf;
			size = lsize;
		} else {
			buf = abd_to_buf(pabd);
			size = psize;
		}

		/* Exactly one presentation is chosen, in this priority order. */
		if (flags & ZDB_FLAG_PRINT_BLKPTR)
			zdb_print_blkptr((blkptr_t *)(void *)
			    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
		else if (flags & ZDB_FLAG_RAW)
			zdb_dump_block_raw(buf, size, flags);
		else if (flags & ZDB_FLAG_INDIRECT)
			zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
			    flags);
		else if (flags & ZDB_FLAG_GBH)
			zdb_dump_gbh(buf, flags);
		else
			zdb_dump_block(thing, buf, size, flags);

	out:
		abd_free(pabd);
		umem_free(lbuf, SPA_MAXBLOCKSIZE);
		free(dup);
	}

	static void
	zdb_embedded_block(char *thing)
	{
	blkptr_t bp;
	unsigned long long words = (void )&bp;
	char *buf;
	int err;

	bzero(&bp, sizeof (bp));
	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
	"%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
	words + 0, words + 1, words + 2, words + 3,
	words + 4, words + 5, words + 6, words + 7,
	words + 8, words + 9, words + 10, words + 11,
	words + 12, words + 13, words + 14, words + 15);
	if (err != 16) {
	(void) printf("invalid input format\n");
	exit(1);
	}
	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
	buf = malloc(SPA_MAXBLOCKSIZE);
	if (buf == NULL) {
	(void) fprintf(stderr, "%s: failed to allocate %llu bytes\n",
	__func__, SPA_MAXBLOCKSIZE);
	exit(1);
	}
	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
	if (err != 0) {
	(void) printf("decode failed: %u\n", err);
	free(buf);
	exit(1);
	}
	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
	free(buf);
	}

	/*
	 * Decide whether pool config 'cfg' is the pool named by 'tgt'.
	 *
	 * If 'tgt' parses (strtoull, base auto-detected) to a non-zero number
	 * it is compared with the config's pool GUID; otherwise it is compared
	 * as a string with the config's pool name.  Returns B_FALSE when the
	 * needed config entry is missing.
	 */
	static boolean_t
	pool_match(nvlist_t *cfg, char *tgt)
	{
		uint64_t v, guid = strtoull(tgt, NULL, 0);
		char *s;

		if (guid != 0) {
			if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
				return (v == guid);
		} else {
			if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
				return (strcmp(s, tgt) == 0);
		}
		return (B_FALSE);
	}

	/*
	 * Search the importable pools for the one named by '*target'.
	 *
	 * '*target' may be a pool name or GUID, optionally followed by a
	 * "/dataset" or "@snapshot" suffix; only the part before the first
	 * '/' or '@' is matched (the string is cut there temporarily and then
	 * restored).  'dirc'/'dirv' give the directories to search.
	 *
	 * On a unique match the pool's name is returned and '*configp' is set
	 * to its config.  If the pool was selected by something other than a
	 * leading pool name (e.g. a GUID), '*target' is replaced with a newly
	 * allocated "name + suffix" string.  With no match, NULL is returned
	 * and '*configp' is set to NULL.  When more than one pool matches, all
	 * matching configs are printed and fatal() is called.
	 */
	static char *
	find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
	{
		nvlist_t *pools;
		nvlist_t *match = NULL;
		char *name = NULL;
		char *sepp = NULL;
		char sep = '\0';
		int count = 0;
		importargs_t args;

		bzero(&args, sizeof (args));
		args.paths = dirc;
		args.path = dirv;
		args.can_be_active = B_TRUE;

		if ((sepp = strpbrk(*target, "/@")) != NULL) {
			sep = *sepp;
			*sepp = '\0';
		}

		pools = zpool_search_import(g_zfs, &args);

		if (pools != NULL) {
			nvpair_t *elem = NULL;
			while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
				verify(nvpair_value_nvlist(elem, configp) == 0);
				if (pool_match(*configp, *target)) {
					count++;
					if (match != NULL) {
						/* print previously found config */
						if (name != NULL) {
							(void) printf("%s\n", name);
							dump_nvlist(match, 8);
							name = NULL;
						}
						(void) printf("%s\n",
						    nvpair_name(elem));
						dump_nvlist(*configp, 8);
					} else {
						match = *configp;
						name = nvpair_name(elem);
					}
				}
			}
		}
		if (count > 1)
			(void) fatal("\tMatched %d pools - use pool GUID "
			    "instead of pool name or \n"
			    "\tpool name part of a dataset name to select pool", count);

		if (sepp)
			*sepp = sep;
		/*
		 * If pool GUID was specified for pool id, replace it with pool name
		 */
		if (name && (strstr(*target, name) != *target)) {
			size_t sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);

			*target = umem_alloc(sz, UMEM_NOFAIL);
			(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
		}

		*configp = name ? match : NULL;

		return (name);
	}

	/*
	 * zdb entry point: parse options into dump_opt[] and a few globals,
	 * initialize the userland kernel context and libzfs, handle the modes
	 * that need no open pool (-E, -C without a target, -l, -O), then open
	 * the target pool or dataset and either dump it or, with -R, read the
	 * block descriptors given as the remaining arguments.
	 */
	int
	main(int argc, char **argv)
	{
	int c;
	struct rlimit rl = { 1024, 1024 };
	spa_t *spa = NULL;
	objset_t *os = NULL;
	int dump_all = 1;
	int verbose = 0;
	int error = 0;
	char **searchdirs = NULL;
	int nsearch = 0;
	char *target;
	nvlist_t *policy = NULL;
	uint64_t max_txg = UINT64_MAX;
	int flags = ZFS_IMPORT_MISSING_LOG;
	int rewind = ZPOOL_NEVER_REWIND;
	char *spa_config_path_env;
	boolean_t target_is_spa = B_TRUE;

	/* Set the open-file limit to 1024 (soft and hard); failure is ignored. */
	(void) setrlimit(RLIMIT_NOFILE, &rl);
	(void) enable_extended_FILE_stdio(-1, -1);

	dprintf_setup(&argc, argv);

	/*
	* If there is an environment variable SPA_CONFIG_PATH it overrides
	* default spa_config_path setting. If -U flag is specified it will
	* override this environment variable settings once again.
	*/
	spa_config_path_env = getenv("SPA_CONFIG_PATH");
	if (spa_config_path_env != NULL)
	spa_config_path = spa_config_path_env;

	while ((c = getopt(argc, argv,
	"AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
	switch (c) {
	/* Options that select a specific dump: count them and turn off "dump everything". */
	case 'b':
	case 'c':
	case 'C':
	case 'd':
	case 'D':
	case 'E':
	case 'G':
	case 'h':
	case 'i':
	case 'l':
	case 'm':
	case 'M':
	case 'O':
	case 'R':
	case 's':
	case 'S':
	case 'u':
	dump_opt[c]++;
	dump_all = 0;
	break;
	/* Modifier options: counted, but they leave dump_all alone. */
	case 'A':
	case 'e':
	case 'F':
	case 'L':
	case 'P':
	case 'q':
	case 'X':
	dump_opt[c]++;
	break;
	/* NB: Sort single match options below. */
	case 'I':
	max_inflight = strtoull(optarg, NULL, 0);
	if (max_inflight == 0) {
	(void) fprintf(stderr, "maximum number "
	"of inflight I/Os must be greater "
	"than 0\n");
	usage();
	}
	break;
	case 'o':
	error = set_global_var(optarg);
	if (error != 0)
	usage();
	break;
	case 'p':
	/* Grow the search-directory array by one slot per -p. */
	if (searchdirs == NULL) {
	searchdirs = umem_alloc(sizeof (char *),
	UMEM_NOFAIL);
	} else {
	/*
	 * NOTE(review): the next two lines look like they lost
	 * '*' tokens in extraction (a "char **tmp" declarator and
	 * the multiplication before sizeof) -- confirm against the
	 * real source.
	 */
	char *tmp = umem_alloc((nsearch + 1)
	sizeof (char *), UMEM_NOFAIL);
	bcopy(searchdirs, tmp, nsearch *
	sizeof (char *));
	umem_free(searchdirs,
	nsearch * sizeof (char *));
	searchdirs = tmp;
	}
	searchdirs[nsearch++] = optarg;
	break;
	case 't':
	max_txg = strtoull(optarg, NULL, 0);
	if (max_txg < TXG_INITIAL) {
	(void) fprintf(stderr, "incorrect txg "
	"specified: %s\n", optarg);
	usage();
	}
	break;
	case 'U':
	spa_config_path = optarg;
	if (spa_config_path[0] != '/') {
	(void) fprintf(stderr,
	"cachefile must be an absolute path "
	"(i.e. start with a slash)\n");
	usage();
	}
	break;
	case 'v':
	verbose++;
	break;
	case 'V':
	flags = ZFS_IMPORT_VERBATIM;
	break;
	case 'x':
	vn_dumpdir = optarg;
	break;
	default:
	usage();
	break;
	}
	}

	if (!dump_opt['e'] && searchdirs != NULL) {
	(void) fprintf(stderr, "-p option requires use of -e\n");
	usage();
	}

	/*
	* ZDB does not typically re-read blocks; therefore limit the ARC
	* to 256 MB, which can be used entirely for metadata.
	*/
	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;

	/*
	* "zdb -c" uses checksum-verifying scrub i/os which are async reads.
	* "zdb -b" uses traversal prefetch which uses async reads.
	* For good performance, let several of them be active at once.
	*/
	zfs_vdev_async_read_max_active = 10;

	/*
	* Disable reference tracking for better performance.
	*/
	reference_tracking_enable = B_FALSE;

	kernel_init(FREAD);
	g_zfs = libzfs_init();
	if (g_zfs == NULL)
	fatal("Fail to initialize zfs");

	if (dump_all)
	verbose = MAX(verbose, 1);

	/*
	 * With no specific dump selected, switch on every option character
	 * except those in the exclusion string; then raise each enabled
	 * option by the verbosity level.
	 */
	for (c = 0; c < 256; c++) {
	if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
	dump_opt[c] = 1;
	if (dump_opt[c])
	dump_opt[c] += verbose;
	}

	/* The -A level is tested after verbosity has been added in above. */
	aok = (dump_opt['A'] == 1) \|\| (dump_opt['A'] > 2);
	zfs_recover = (dump_opt['A'] > 1);

	argc -= optind;
	argv += optind;

	if (argc < 2 && dump_opt['R'])
	usage();

	/* -E: decode one embedded block pointer and exit; no pool is opened. */
	if (dump_opt['E']) {
	if (argc != 1)
	usage();
	zdb_embedded_block(argv[0]);
	return (0);
	}

	if (argc < 1) {
	if (!dump_opt['e'] && dump_opt['C']) {
	dump_cachefile(spa_config_path);
	return (0);
	}
	usage();
	}

	if (dump_opt['l'])
	return (dump_label(argv[0]));

	if (dump_opt['O']) {
	if (argc != 2)
	usage();
	dump_opt['v'] = verbose + 3;
	return (dump_path(argv[0], argv[1]));
	}

	if (dump_opt['X'] \|\| dump_opt['F'])
	rewind = ZPOOL_DO_REWIND \|
	(dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

	/* Rewind policy used when opening the pool (and attached to the -e import config). */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 \|\|
	nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 \|\|
	nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
	fatal("internal error: %s", strerror(ENOMEM));

	error = 0;
	target = argv[0];

	/* -e: locate the pool among importable pools and import it. */
	if (dump_opt['e']) {
	nvlist_t *cfg = NULL;
	char *name = find_zpool(&target, &cfg, nsearch, searchdirs);

	error = ENOENT;
	if (name) {
	if (dump_opt['C'] > 1) {
	(void) printf("\nConfiguration for import:\n");
	dump_nvlist(cfg, 8);
	}
	if (nvlist_add_nvlist(cfg,
	ZPOOL_REWIND_POLICY, policy) != 0) {
	fatal("can't open '%s': %s",
	target, strerror(ENOMEM));
	}
	error = spa_import(name, cfg, NULL, flags);
	}
	}

	/* A '/' or '@' in the target means a dataset or snapshot, not a whole pool. */
	if (strpbrk(target, "/@") != NULL) {
	size_t targetlen;

	target_is_spa = B_FALSE;
	/*
	* Remove any trailing slash. Later code would get confused
	* by it, but we want to allow it so that "pool/" can
	* indicate that we want to dump the topmost filesystem,
	* rather than the whole pool.
	*/
	targetlen = strlen(target);
	if (targetlen != 0 && target[targetlen - 1] == '/')
	target[targetlen - 1] = '\0';
	}

	if (error == 0) {
	if (target_is_spa \|\| dump_opt['R']) {
	error = spa_open_rewind(target, &spa, FTAG, policy,
	NULL);
	if (error) {
	/*
	* If we're missing the log device then
	* try opening the pool after clearing the
	* log state.
	*/
	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(target)) != NULL &&
	spa->spa_log_state == SPA_LOG_MISSING) {
	spa->spa_log_state = SPA_LOG_CLEAR;
	error = 0;
	}
	mutex_exit(&spa_namespace_lock);

	if (!error) {
	error = spa_open_rewind(target, &spa,
	FTAG, policy, NULL);
	}
	}
	} else {
	error = open_objset(target, DMU_OST_ANY, FTAG, &os);
	}
	}
	nvlist_free(policy);

	if (error)
	fatal("can't open '%s': %s", target, strerror(error));

	/* Skip past the target; what remains are object numbers, or block descriptors with -R. */
	argv++;
	argc--;
	if (!dump_opt['R']) {
	if (argc > 0) {
	zopt_objects = argc;
	/* NOTE(review): the calloc() result is used below without a NULL check. */
	zopt_object = calloc(zopt_objects, sizeof (uint64_t));
	for (unsigned i = 0; i < zopt_objects; i++) {
	errno = 0;
	zopt_object[i] = strtoull(argv[i], NULL, 0);
	if (zopt_object[i] == 0 && errno != 0)
	fatal("bad number %s: %s",
	argv[i], strerror(errno));
	}
	}
	if (os != NULL) {
	dump_dir(os);
	} else if (zopt_objects > 0 && !dump_opt['m']) {
	dump_dir(spa->spa_meta_objset);
	} else {
	dump_zpool(spa);
	}
	} else {
	/* -R: map flag characters to ZDB_FLAG_* bits, then read each descriptor. */
	flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
	flagbits['c'] = ZDB_FLAG_CHECKSUM;
	flagbits['d'] = ZDB_FLAG_DECOMPRESS;
	flagbits['e'] = ZDB_FLAG_BSWAP;
	flagbits['g'] = ZDB_FLAG_GBH;
	flagbits['i'] = ZDB_FLAG_INDIRECT;
	flagbits['p'] = ZDB_FLAG_PHYS;
	flagbits['r'] = ZDB_FLAG_RAW;

	for (int i = 0; i < argc; i++)
	zdb_read_block(argv[i], spa);
	}

	/* Exactly one of os/spa was opened above. */
	if (os != NULL)
	close_objset(os, FTAG);
	else
	spa_close(spa, FTAG);

	fuid_table_destroy();

	dump_debug_buffer();

	libzfs_fini(g_zfs);
	kernel_fini();

	return (0);
	}
	Index: stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 332525)
	@@ -1,7434 +1,7457 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright 2012 Milan Jurik. All rights reserved.
	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
	* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
	* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
	* Copyright 2016 Nexenta Systems, Inc.
	*/

	#include <assert.h>
	#include <ctype.h>
	#include <errno.h>
	#include <getopt.h>
	#include <libgen.h>
	#include <libintl.h>
	#include <libuutil.h>
	#include <libnvpair.h>
	#include <locale.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <strings.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <zone.h>
	#include <grp.h>
	#include <pwd.h>
	#include <signal.h>
	#include <sys/debug.h>
	#include <sys/list.h>
	#include <sys/mntent.h>
	#include <sys/mnttab.h>
	#include <sys/mount.h>
	#include <sys/stat.h>
	#include <sys/fs/zfs.h>
	#include <sys/types.h>
	#include <time.h>
	#include <err.h>
	#include <jail.h>

	#include <libzfs.h>
	#include <libzfs_core.h>
	#include <zfs_prop.h>
	#include <zfs_deleg.h>
	#include <libuutil.h>
	#ifdef illumos
	#include <aclutils.h>
	#include <directory.h>
	#include <idmap.h>
	#endif

	#include "zfs_iter.h"
	#include "zfs_util.h"
	#include "zfs_comutil.h"

	/* libzfs handle handed to the libzfs calls made throughout this file. */
	libzfs_handle_t *g_zfs;

	/* File-scope state; the code that uses these lies outside this section. */
	static FILE *mnttab_file;
	static char history_str[HIS_MAX_RECORD_LEN];
	static boolean_t log_history = B_TRUE;

	/*
	 * Subcommand handlers.  They are declared ahead of their definitions
	 * because command_table below refers to each of them by name.
	 */
	static int zfs_do_clone(int argc, char **argv);
	static int zfs_do_create(int argc, char **argv);
	static int zfs_do_destroy(int argc, char **argv);
	static int zfs_do_get(int argc, char **argv);
	static int zfs_do_inherit(int argc, char **argv);
	static int zfs_do_list(int argc, char **argv);
	static int zfs_do_mount(int argc, char **argv);
	static int zfs_do_rename(int argc, char **argv);
	static int zfs_do_rollback(int argc, char **argv);
	static int zfs_do_set(int argc, char **argv);
	static int zfs_do_upgrade(int argc, char **argv);
	static int zfs_do_snapshot(int argc, char **argv);
	static int zfs_do_unmount(int argc, char **argv);
	static int zfs_do_share(int argc, char **argv);
	static int zfs_do_unshare(int argc, char **argv);
	static int zfs_do_send(int argc, char **argv);
	static int zfs_do_receive(int argc, char **argv);
	static int zfs_do_promote(int argc, char **argv);
	static int zfs_do_userspace(int argc, char **argv);
	static int zfs_do_allow(int argc, char **argv);
	static int zfs_do_unallow(int argc, char **argv);
	static int zfs_do_hold(int argc, char **argv);
	static int zfs_do_holds(int argc, char **argv);
	static int zfs_do_release(int argc, char **argv);
	static int zfs_do_diff(int argc, char **argv);
	static int zfs_do_jail(int argc, char **argv);
	static int zfs_do_unjail(int argc, char **argv);
	static int zfs_do_bookmark(int argc, char **argv);
	+static int zfs_do_remap(int argc, char **argv);
	static int zfs_do_channel_program(int argc, char **argv);

	/*
	* Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
	*/

	#ifdef DEBUG
	/* Value to use for the $UMEM_DEBUG setting on DEBUG builds. */
	const char *
	_umem_debug_init(void)
	{
		static const char umem_debug[] = "default,verbose";

		return (umem_debug);
	}

	/* Value to use for the $UMEM_LOGGING setting on DEBUG builds. */
	const char *
	_umem_logging_init(void)
	{
		static const char umem_logging[] = "fail,contents";

		return (umem_logging);
	}
	#endif

	/*
	 * Identifies the usage message of one subcommand.  Each command_table
	 * entry carries one of these values and get_usage() turns it into the
	 * corresponding usage text.
	 */
	typedef enum {
	HELP_CLONE,
	HELP_CREATE,
	HELP_DESTROY,
	HELP_GET,
	HELP_INHERIT,
	HELP_UPGRADE,
	HELP_JAIL,
	HELP_UNJAIL,
	HELP_LIST,
	HELP_MOUNT,
	HELP_PROMOTE,
	HELP_RECEIVE,
	HELP_RENAME,
	HELP_ROLLBACK,
	HELP_SEND,
	HELP_SET,
	HELP_SHARE,
	HELP_SNAPSHOT,
	HELP_UNMOUNT,
	HELP_UNSHARE,
	HELP_ALLOW,
	HELP_UNALLOW,
	HELP_USERSPACE,
	HELP_GROUPSPACE,
	HELP_HOLD,
	HELP_HOLDS,
	HELP_RELEASE,
	HELP_DIFF,
	+ HELP_REMAP,
	HELP_BOOKMARK,
	HELP_CHANNEL_PROGRAM,
	} zfs_help_t;

	/*
	 * One entry of the command table.
	 *
	 * name  - subcommand name as typed by the user; NULL marks a separator
	 *         entry that usage() prints as an empty line.
	 * func  - handler for the subcommand; it has the same signature as the
	 *         zfs_do_*() functions declared at the top of the file.
	 * usage - selects the usage text returned by get_usage().
	 */
	typedef struct zfs_command {
		const char *name;
		int (*func)(int argc, char **argv);
		zfs_help_t usage;
	} zfs_command_t;

	/*
	* Master command table. Each ZFS command has a name, associated function, and
	* usage message. The usage messages need to be internationalized, so we have
	* to have a function to return the usage message based on a command index.
	*
	* These commands are organized according to how they are displayed in the usage
	* message. An empty command (one with a NULL name) indicates an empty line in
	* the generic usage message.
	*/
	static zfs_command_t command_table[] = {
	{ "create", zfs_do_create, HELP_CREATE },
	{ "destroy", zfs_do_destroy, HELP_DESTROY },
	{ NULL },
	{ "snapshot", zfs_do_snapshot, HELP_SNAPSHOT },
	{ "rollback", zfs_do_rollback, HELP_ROLLBACK },
	{ "clone", zfs_do_clone, HELP_CLONE },
	{ "promote", zfs_do_promote, HELP_PROMOTE },
	{ "rename", zfs_do_rename, HELP_RENAME },
	{ "bookmark", zfs_do_bookmark, HELP_BOOKMARK },
	{ "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
	{ NULL },
	{ "list", zfs_do_list, HELP_LIST },
	{ NULL },
	{ "set", zfs_do_set, HELP_SET },
	{ "get", zfs_do_get, HELP_GET },
	{ "inherit", zfs_do_inherit, HELP_INHERIT },
	{ "upgrade", zfs_do_upgrade, HELP_UPGRADE },
	{ "userspace", zfs_do_userspace, HELP_USERSPACE },
	/* groupspace shares the userspace handler; only the usage text differs */
	{ "groupspace", zfs_do_userspace, HELP_GROUPSPACE },
	{ NULL },
	{ "mount", zfs_do_mount, HELP_MOUNT },
	{ "unmount", zfs_do_unmount, HELP_UNMOUNT },
	{ "share", zfs_do_share, HELP_SHARE },
	{ "unshare", zfs_do_unshare, HELP_UNSHARE },
	{ NULL },
	{ "send", zfs_do_send, HELP_SEND },
	{ "receive", zfs_do_receive, HELP_RECEIVE },
	{ NULL },
	{ "allow", zfs_do_allow, HELP_ALLOW },
	{ NULL },
	{ "unallow", zfs_do_unallow, HELP_UNALLOW },
	{ NULL },
	{ "hold", zfs_do_hold, HELP_HOLD },
	{ "holds", zfs_do_holds, HELP_HOLDS },
	{ "release", zfs_do_release, HELP_RELEASE },
	{ "diff", zfs_do_diff, HELP_DIFF },
	{ NULL },
	{ "jail", zfs_do_jail, HELP_JAIL },
	{ "unjail", zfs_do_unjail, HELP_UNJAIL },
	+ { "remap", zfs_do_remap, HELP_REMAP },
	};

	/* Number of command_table entries, counting the NULL separator entries. */
	#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))

	/*
	 * Entry of the command being run.  While it is NULL, usage() prints the
	 * usage of every command; otherwise only this command's usage.
	 */
	zfs_command_t *current_command;

	/*
	 * Return the localized usage text for the command identified by idx.
	 *
	 * usage() prints these messages back to back with "%s" and adds no
	 * separator of its own, so every message must end in a newline.  An idx
	 * that matches none of the cases falls out of the switch and aborts.
	 */
	static const char *
	get_usage(zfs_help_t idx)
	{
		switch (idx) {
		case HELP_CLONE:
			return (gettext("\tclone [-p] [-o property=value] ... "
			    "<snapshot> <filesystem|volume>\n"));
		case HELP_CREATE:
			return (gettext("\tcreate [-pu] [-o property=value] ... "
			    "<filesystem>\n"
			    "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
			    "-V <size> <volume>\n"));
		case HELP_DESTROY:
			return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
			    "\tdestroy [-dnpRrv] "
			    "<filesystem|volume>@<snap>[%<snap>][,...]\n"
			    "\tdestroy <filesystem|volume>#<bookmark>\n"));
		case HELP_GET:
			return (gettext("\tget [-rHp] [-d max] "
			    "[-o \"all\" | field[,...]]\n"
			    "\t [-t type[,...]] [-s source[,...]]\n"
			    "\t <\"all\" | property[,...]> "
			    "[filesystem|volume|snapshot|bookmark] ...\n"));
		case HELP_INHERIT:
			return (gettext("\tinherit [-rS] <property> "
			    "<filesystem|volume|snapshot> ...\n"));
		case HELP_UPGRADE:
			return (gettext("\tupgrade [-v]\n"
			    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
		case HELP_JAIL:
			return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
		case HELP_UNJAIL:
			return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
		case HELP_LIST:
			return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
			    "[-s property]...\n\t [-S property]... [-t type[,...]] "
			    "[filesystem|volume|snapshot] ...\n"));
		case HELP_MOUNT:
			return (gettext("\tmount\n"
			    "\tmount [-vO] [-o opts] <-a | filesystem>\n"));
		case HELP_PROMOTE:
			return (gettext("\tpromote <clone-filesystem>\n"));
		case HELP_RECEIVE:
			return (gettext("\treceive|recv [-vnsFu] <filesystem|volume|"
			    "snapshot>\n"
			    "\treceive|recv [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
			    "<filesystem>\n"
			    "\treceive|recv -A <filesystem|volume>\n"));
		case HELP_RENAME:
			/* The last line needs its newline like every other message. */
			return (gettext("\trename [-f] <filesystem|volume|snapshot> "
			    "<filesystem|volume|snapshot>\n"
			    "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
			    "\trename -r <snapshot> <snapshot>\n"
			    "\trename -u [-p] <filesystem> <filesystem>\n"));
		case HELP_ROLLBACK:
			return (gettext("\trollback [-rRf] <snapshot>\n"));
		case HELP_SEND:
			return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
			    "<snapshot>\n"
			    "\tsend [-Le] [-i snapshot|bookmark] "
			    "<filesystem|volume|snapshot>\n"
			    "\tsend [-nvPe] -t <receive_resume_token>\n"));
		case HELP_SET:
			return (gettext("\tset <property=value> ... "
			    "<filesystem|volume|snapshot> ...\n"));
		case HELP_SHARE:
			return (gettext("\tshare <-a | filesystem>\n"));
		case HELP_SNAPSHOT:
			return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
			    "<filesystem|volume>@<snap> ...\n"));
		case HELP_UNMOUNT:
			return (gettext("\tunmount|umount [-f] "
			    "<-a | filesystem|mountpoint>\n"));
		case HELP_UNSHARE:
			return (gettext("\tunshare "
			    "<-a | filesystem|mountpoint>\n"));
		case HELP_ALLOW:
			return (gettext("\tallow <filesystem|volume>\n"
			    "\tallow [-ldug] "
			    "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
			    "\t <filesystem|volume>\n"
			    "\tallow [-ld] -e <perm|@setname>[,...] "
			    "<filesystem|volume>\n"
			    "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
			    "\tallow -s @setname <perm|@setname>[,...] "
			    "<filesystem|volume>\n"));
		case HELP_UNALLOW:
			return (gettext("\tunallow [-rldug] "
			    "<\"everyone\"|user|group>[,...]\n"
			    "\t [<perm|@setname>[,...]] <filesystem|volume>\n"
			    "\tunallow [-rld] -e [<perm|@setname>[,...]] "
			    "<filesystem|volume>\n"
			    "\tunallow [-r] -c [<perm|@setname>[,...]] "
			    "<filesystem|volume>\n"
			    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
			    "<filesystem|volume>\n"));
		case HELP_USERSPACE:
			return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
			    "[-s field] ...\n"
			    "\t [-S field] ... [-t type[,...]] "
			    "<filesystem|snapshot>\n"));
		case HELP_GROUPSPACE:
			return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
			    "[-s field] ...\n"
			    "\t [-S field] ... [-t type[,...]] "
			    "<filesystem|snapshot>\n"));
		case HELP_HOLD:
			return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
		case HELP_HOLDS:
			return (gettext("\tholds [-Hp] [-r|-d depth] "
			    "<filesystem|volume|snapshot> ...\n"));
		case HELP_RELEASE:
			return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
		case HELP_DIFF:
			return (gettext("\tdiff [-FHt] <snapshot> "
			    "[snapshot|filesystem]\n"));
	+	case HELP_REMAP:
	+		return (gettext("\tremap <filesystem | volume>\n"));
		case HELP_BOOKMARK:
			return (gettext("\tbookmark <snapshot> <bookmark>\n"));
		case HELP_CHANNEL_PROGRAM:
			return (gettext("\tprogram [-n] [-t <instruction limit>] "
			    "[-m <memory limit (b)>] <pool> <program file> "
			    "[lua args...]\n"));
		}

		abort();
		/* NOTREACHED */
	}

	/*
	 * Report an out-of-memory condition on stderr and terminate the process
	 * with exit status 1.  Does not return.
	 */
	void
	nomem(void)
	{
		(void) fputs(gettext("internal error: out of memory\n"), stderr);
		exit(1);
	}

	/*
	* Utility function to guarantee malloc() success.
	*/

	/*
	 * Allocate size bytes of zero-filled memory.  On allocation failure
	 * nomem() is called, which exits, so callers need not check for NULL.
	 */
	void *
	safe_malloc(size_t size)
	{
		void *data = calloc(1, size);

		if (data == NULL)
			nomem();

		return (data);
	}

	/*
	 * Resize the allocation at data to size bytes and return the new pointer.
	 * If realloc() fails, the old allocation is released and nomem() ends the
	 * process.
	 */
	void *
	safe_realloc(void *data, size_t size)
	{
		void *newp = realloc(data, size);

		if (newp != NULL)
			return (newp);

		free(data);
		nomem();

		return (newp);
	}

	/*
	 * Return a heap-allocated copy of str.  On allocation failure nomem() is
	 * called, which exits.
	 */
	static char *
	safe_strdup(char *str)
	{
		char *copy;

		if ((copy = strdup(str)) == NULL)
			nomem();

		return (copy);
	}

	/*
	* Callback routine that will print out information for each of
	* the properties.
	*/
	/*
	 * Property-iteration callback: write one table row for prop to the stream
	 * passed in cb.  The row holds the property name, whether it is editable,
	 * whether it is inheritable, and its accepted values ("-" when none are
	 * listed).  Always returns ZPROP_CONT.
	 */
	static int
	usage_prop_cb(int prop, void *cb)
	{
		FILE *out = cb;
		const char *values;

		(void) fprintf(out, "\t%-15s ", zfs_prop_to_name(prop));

		/* EDIT column: read-only properties cannot be edited. */
		(void) fprintf(out, "%s", zfs_prop_readonly(prop) ? " NO " : "YES ");

		/* INHERIT column */
		(void) fprintf(out, "%s",
		    zfs_prop_inheritable(prop) ? " YES " : " NO ");

		/* VALUES column */
		values = zfs_prop_values(prop);
		(void) fprintf(out, "%s\n", values != NULL ? values : "-");

		return (ZPROP_CONT);
	}

	/*
	* Display usage message. If we're inside a command, display only the usage for
	* that command. Otherwise, iterate over the entire command table and display
	* a complete usage message.
	*/
	/*
	 * Print a usage message and exit.
	 *
	 * requested - B_TRUE when the user asked for help: the text goes to stdout
	 *             and the exit status is 0.  Otherwise the text goes to stderr
	 *             and the exit status is 2.
	 *
	 * With no current command the usage of every command is listed.  For the
	 * set, get, inherit and list commands the property table is printed as
	 * well.  If ZFS_ABORT is set in the environment the process aborts instead
	 * of exiting.  This function does not return.
	 */
	static void
	usage(boolean_t requested)
	{
		int i;
		boolean_t show_properties = B_FALSE;
		FILE *fp = requested ? stdout : stderr;

		if (current_command == NULL) {

			(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
			(void) fprintf(fp,
			    gettext("where 'command' is one of the following:\n\n"));

			for (i = 0; i < NCOMMAND; i++) {
				/* A NULL name is a separator between command groups. */
				if (command_table[i].name == NULL)
					(void) fprintf(fp, "\n");
				else
					(void) fprintf(fp, "%s",
					    get_usage(command_table[i].usage));
			}

			(void) fprintf(fp, gettext("\nEach dataset is of the form: "
			    "pool/[dataset/]*dataset[@name]\n"));
		} else {
			(void) fprintf(fp, gettext("usage:\n"));
			(void) fprintf(fp, "%s", get_usage(current_command->usage));
		}

		if (current_command != NULL &&
		    (strcmp(current_command->name, "set") == 0 ||
		    strcmp(current_command->name, "get") == 0 ||
		    strcmp(current_command->name, "inherit") == 0 ||
		    strcmp(current_command->name, "list") == 0))
			show_properties = B_TRUE;

		if (show_properties) {
			(void) fprintf(fp,
			    gettext("\nThe following properties are supported:\n"));

			(void) fprintf(fp, "\n\t%-14s %s %s %s\n\n",
			    "PROPERTY", "EDIT", "INHERIT", "VALUES");

			/* Iterate over all properties */
			(void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
			    ZFS_TYPE_DATASET);

			(void) fprintf(fp, "\t%-15s ", "userused@...");
			(void) fprintf(fp, " NO NO <size>\n");
			(void) fprintf(fp, "\t%-15s ", "groupused@...");
			(void) fprintf(fp, " NO NO <size>\n");
			(void) fprintf(fp, "\t%-15s ", "userquota@...");
			(void) fprintf(fp, "YES NO <size> | none\n");
			(void) fprintf(fp, "\t%-15s ", "groupquota@...");
			(void) fprintf(fp, "YES NO <size> | none\n");
			(void) fprintf(fp, "\t%-15s ", "written@<snap>");
			(void) fprintf(fp, " NO NO <size>\n");

			(void) fprintf(fp, gettext("\nSizes are specified in bytes "
			    "with standard units such as K, M, G, etc.\n"));
			(void) fprintf(fp, gettext("\nUser-defined properties can "
			    "be specified by using a name containing a colon (:).\n"));
			(void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
			    "properties must be appended with\n"
			    "a user or group specifier of one of these forms:\n"
			    " POSIX name (eg: \"matt\")\n"
			    " POSIX id (eg: \"126829\")\n"
			    " SMB name@domain (eg: \"matt@sun\")\n"
			    " SMB SID (eg: \"S-1-234-567-89\")\n"));
		} else {
			(void) fprintf(fp,
			    gettext("\nFor the property list, run: %s\n"),
			    "zfs set|get");
			(void) fprintf(fp,
			    gettext("\nFor the delegated permission list, run: %s\n"),
			    "zfs allow|unallow");
		}

		/*
		 * See comments at end of main().
		 */
		if (getenv("ZFS_ABORT") != NULL) {
			(void) printf("dumping core by request\n");
			abort();
		}

		exit(requested ? 0 : 2);
	}

	/*
	* Take a property=value argument string and add it to the given nvlist.
	* Modifies the argument inplace.
	*/
	/*
	 * Split a "property=value" argument and add the pair to props.
	 *
	 * props    - nvlist that receives the property as a string value.
	 * propname - the argument text; the '=' is overwritten with a NUL so the
	 *            buffer is modified in place.
	 *
	 * Returns 0 on success.  Returns -1 after printing a message when the '='
	 * is missing or the property is already present in props.  Failure to add
	 * the pair is treated as out of memory.
	 */
	static int
	parseprop(nvlist_t *props, char *propname)
	{
		char *propval, *strval;

		if ((propval = strchr(propname, '=')) == NULL) {
			(void) fprintf(stderr, gettext("missing "
			    "'=' for property=value argument\n"));
			return (-1);
		}

		/* Terminate the name and step past the '=' to the value. */
		*propval = '\0';
		propval++;

		if (nvlist_lookup_string(props, propname, &strval) == 0) {
			(void) fprintf(stderr, gettext("property '%s' "
			    "specified multiple times\n"), propname);
			return (-1);
		}
		if (nvlist_add_string(props, propname, propval) != 0)
			nomem();
		return (0);
	}

	/*
	 * Parse the argument of a depth option.
	 *
	 * opt   - the option argument; base prefixes are accepted (strtol base 0).
	 * flags - iteration flags; ZFS_ITER_DEPTH_LIMIT and ZFS_ITER_RECURSE are
	 *         OR-ed in on success.
	 *
	 * Returns the depth.  An empty or non-numeric argument, a negative value,
	 * or a value that does not fit in an int prints a message and ends in
	 * usage(), which exits.
	 */
	static int
	parse_depth(char *opt, int *flags)
	{
		char *tmp;
		long depth;

		errno = 0;
		depth = strtol(opt, &tmp, 0);

		/* tmp == opt means no digits were consumed, e.g. an empty string. */
		if (tmp == opt || *tmp != '\0') {
			(void) fprintf(stderr,
			    gettext("%s is not an integer\n"), opt);
			usage(B_FALSE);
		}
		if (depth < 0) {
			(void) fprintf(stderr,
			    gettext("Depth can not be negative.\n"));
			usage(B_FALSE);
		}
		/* Reject values that would be truncated by the int return type. */
		if (errno == ERANGE || depth != (long)(int)depth) {
			(void) fprintf(stderr,
			    gettext("depth '%s' is out of range\n"), opt);
			usage(B_FALSE);
		}
		*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
		return ((int)depth);
	}

	/* Added to the current time by start_progress_timer() to set pt_begin. */
	#define PROGRESS_DELAY 2 /* seconds */

	/* Run of backspace characters that update_progress() passes to printf(). */
	static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
	/* Time after which update_progress() starts printing. */
	static time_t pt_begin;
	/* Heap copy of the header; set_progress_header() sets it, finish_progress() frees it. */
	static char *pt_header = NULL;
	/* Set to B_TRUE by update_progress() once it has printed. */
	static boolean_t pt_shown;

	/*
	 * Arm the progress timer: nothing has been shown yet, and output may start
	 * once PROGRESS_DELAY seconds have passed from now.
	 */
	static void
	start_progress_timer(void)
	{
		pt_shown = B_FALSE;
		pt_begin = time(NULL) + PROGRESS_DELAY;
	}

	/*
	 * Remember a copy of header for later progress output.  No header may be
	 * set already (asserted).  If progress output has already started, the new
	 * header is printed immediately.
	 */
	static void
	set_progress_header(char *header)
	{
		assert(pt_header == NULL);
		pt_header = safe_strdup(header);

		if (!pt_shown)
			return;

		(void) printf("%s: ", header);
		(void) fflush(stdout);
	}

	/*
	 * Print the progress text in update.
	 *
	 * Nothing is printed until the time in pt_begin has passed.  The first
	 * output is prefixed with the header; later calls print only the update.
	 * Each update is followed by pt_reverse, printed with field width and
	 * precision both equal to the length of the update, so that the next
	 * update is written over this one.
	 */
	static void
	update_progress(char *update)
	{
		int len;

		/* Still inside the initial delay: stay quiet. */
		if (!pt_shown && time(NULL) <= pt_begin)
			return;

		len = strlen(update);

		if (!pt_shown) {
			(void) printf("%s: %s%*.*s", pt_header, update, len, len,
			    pt_reverse);
			pt_shown = B_TRUE;
		} else {
			(void) printf("%s%*.*s", update, len, len, pt_reverse);
		}
		(void) fflush(stdout);
	}

	/*
	 * End the current progress report.  If any progress output was shown, the
	 * text in done is printed followed by a newline.  The stored header is
	 * released in either case.
	 */
	static void
	finish_progress(char *done)
	{
		if (pt_shown) {
			(void) printf("%s\n", done);
			(void) fflush(stdout);
		}

		free(pt_header);
		pt_header = NULL;
	}

	/*
	* Check if the dataset is mountable and should be automatically mounted.
	*/
	/*
	 * Return B_TRUE when the canmount property applies to this dataset's type
	 * and is set to "on"; B_FALSE otherwise.
	 */
	static boolean_t
	should_auto_mount(zfs_handle_t *zhp)
	{
		if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp))) {
			return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
			    ZFS_CANMOUNT_ON);
		}

		return (B_FALSE);
	}

	/*
	* zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
	*
	* Given an existing dataset, create a writable copy whose initial contents
	* are the same as the source. The newly created dataset maintains a
	* dependency on the original; the original cannot be destroyed so long as
	* the clone exists.
	*
	* The '-p' flag creates all the non-existing ancestors of the target first.
	*/
	/*
	 * Handler for 'zfs clone'.
	 *
	 * Options: -o property=value (repeatable), -p (create missing ancestors;
	 * an already existing target is then not an error).  Exactly two operands
	 * are required: the source snapshot and the target dataset.
	 *
	 * Returns 0 on success and 1 on failure, including a clone that was
	 * created but could not be mounted or shared.  Usage errors end in
	 * usage(), which exits.  The property list and the snapshot handle are
	 * released on every return path.
	 */
	static int
	zfs_do_clone(int argc, char **argv)
	{
		zfs_handle_t *zhp = NULL;
		boolean_t parents = B_FALSE;
		nvlist_t *props;
		int ret = 0;
		int c;

		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		/* check options */
		while ((c = getopt(argc, argv, "o:p")) != -1) {
			switch (c) {
			case 'o':
				if (parseprop(props, optarg) != 0) {
					ret = 1;
					goto out;
				}
				break;
			case 'p':
				parents = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				goto usage;
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing source dataset "
			    "argument\n"));
			goto usage;
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing target dataset "
			    "argument\n"));
			goto usage;
		}
		if (argc > 2) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			goto usage;
		}

		/* open the source dataset */
		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) {
			ret = 1;
			goto out;
		}

		if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
		    ZFS_TYPE_VOLUME)) {
			/*
			 * Now create the ancestors of the target dataset. If the
			 * target already exists and '-p' option was used we should not
			 * complain.
			 */
			if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
			    ZFS_TYPE_VOLUME))
				goto out;	/* ret is still 0 */
			if (zfs_create_ancestors(g_zfs, argv[1]) != 0) {
				ret = 1;
				goto out;
			}
		}

		/* pass to libzfs */
		ret = zfs_clone(zhp, argv[1], props);

		/* create the mountpoint if necessary */
		if (ret == 0) {
			zfs_handle_t *clone;

			clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
			if (clone != NULL) {
				/*
				 * If the user doesn't want the dataset
				 * automatically mounted, then skip the mount/share
				 * step.
				 */
				if (should_auto_mount(clone)) {
					if ((ret = zfs_mount(clone, NULL, 0)) != 0) {
						(void) fprintf(stderr, gettext("clone "
						    "successfully created, "
						    "but not mounted\n"));
					} else if ((ret = zfs_share(clone)) != 0) {
						(void) fprintf(stderr, gettext("clone "
						    "successfully created, "
						    "but not shared\n"));
					}
				}
				zfs_close(clone);
			}
		}

	out:
		/* Single exit path so that nothing acquired above is leaked. */
		if (zhp != NULL)
			zfs_close(zhp);
		nvlist_free(props);

		return (!!ret);

	usage:
		if (zhp)
			zfs_close(zhp);
		nvlist_free(props);
		usage(B_FALSE);
		return (-1);
	}

	/*
	* zfs create [-pu] [-o prop=value] ... fs
	* zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
	*
	* Create a new dataset. This command can be used to create filesystems
	* and volumes. Snapshot creation is handled by 'zfs snapshot'.
	* For volumes, the user must specify a size to be used.
	*
	* The '-s' flag applies only to volumes, and indicates that we should not try
	* to set the reservation for this volume. By default we set a reservation
	* equal to the size for any volume. For pools with SPA_VERSION >=
	* SPA_VERSION_REFRESERVATION, we set a refreservation instead.
	*
	* The '-p' flag creates all the non-existing ancestors of the target first.
	*
	* The '-u' flag prevents mounting of newly created file system.
	*/
	/*
	 * Handler for 'zfs create'.
	 *
	 * Options: -V size (create a volume), -b blocksize and -s (volumes only),
	 * -o property=value, -p (create missing ancestors; an existing target is
	 * then not an error), -u (file systems only; do not mount).
	 *
	 * Returns 0 on success and 1 on failure, including a dataset that was
	 * created but could not be mounted or shared.  Usage errors end in
	 * usage(), which exits.
	 */
	static int
	zfs_do_create(int argc, char **argv)
	{
		zfs_type_t type = ZFS_TYPE_FILESYSTEM;
		zfs_handle_t *zhp = NULL;
		uint64_t volsize = 0;
		int c;
		boolean_t noreserve = B_FALSE;
		boolean_t bflag = B_FALSE;
		boolean_t parents = B_FALSE;
		boolean_t nomount = B_FALSE;
		int ret = 1;
		nvlist_t *props;
		uint64_t intval;

		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		/* check options */
		while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) {
			switch (c) {
			case 'V':
				type = ZFS_TYPE_VOLUME;
				if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
					(void) fprintf(stderr, gettext("bad volume "
					    "size '%s': %s\n"), optarg,
					    libzfs_error_description(g_zfs));
					goto error;
				}

				if (nvlist_add_uint64(props,
				    zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
					nomem();
				volsize = intval;
				break;
			case 'p':
				parents = B_TRUE;
				break;
			case 'b':
				bflag = B_TRUE;
				if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
					(void) fprintf(stderr, gettext("bad volume "
					    "block size '%s': %s\n"), optarg,
					    libzfs_error_description(g_zfs));
					goto error;
				}

				if (nvlist_add_uint64(props,
				    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
				    intval) != 0)
					nomem();
				break;
			case 'o':
				if (parseprop(props, optarg) != 0)
					goto error;
				break;
			case 's':
				noreserve = B_TRUE;
				break;
			case 'u':
				nomount = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing size "
				    "argument\n"));
				goto badusage;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				goto badusage;
			}
		}

		if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
			(void) fprintf(stderr, gettext("'-s' and '-b' can only be "
			    "used when creating a volume\n"));
			goto badusage;
		}
		if (nomount && type != ZFS_TYPE_FILESYSTEM) {
			(void) fprintf(stderr, gettext("'-u' can only be "
			    "used when creating a file system\n"));
			goto badusage;
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc == 0) {
			(void) fprintf(stderr, gettext("missing %s argument\n"),
			    zfs_type_to_name(type));
			goto badusage;
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			goto badusage;
		}

		if (type == ZFS_TYPE_VOLUME && !noreserve) {
			zpool_handle_t *zpool_handle;
			nvlist_t *real_props = NULL;
			uint64_t spa_version;
			char *p;
			zfs_prop_t resv_prop;
			char *strval;
			char msg[1024];

			/* Cut the name at the first '/' to open the pool, then restore. */
			if ((p = strchr(argv[0], '/')) != NULL)
				*p = '\0';
			zpool_handle = zpool_open(g_zfs, argv[0]);
			if (p != NULL)
				*p = '/';
			if (zpool_handle == NULL)
				goto error;
			spa_version = zpool_get_prop_int(zpool_handle,
			    ZPOOL_PROP_VERSION, NULL);
			if (spa_version >= SPA_VERSION_REFRESERVATION)
				resv_prop = ZFS_PROP_REFRESERVATION;
			else
				resv_prop = ZFS_PROP_RESERVATION;

			(void) snprintf(msg, sizeof (msg),
			    gettext("cannot create '%s'"), argv[0]);
			if (props && (real_props = zfs_valid_proplist(g_zfs, type,
			    props, 0, NULL, zpool_handle, msg)) == NULL) {
				zpool_close(zpool_handle);
				goto error;
			}
			zpool_close(zpool_handle);

			volsize = zvol_volsize_to_reservation(volsize, real_props);
			nvlist_free(real_props);

			/* Add the reservation only if no string value was given for it. */
			if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
			    &strval) != 0) {
				if (nvlist_add_uint64(props,
				    zfs_prop_to_name(resv_prop), volsize) != 0) {
					nvlist_free(props);
					nomem();
				}
			}
		}

		if (parents && zfs_name_valid(argv[0], type)) {
			/*
			 * Now create the ancestors of target dataset. If the target
			 * already exists and '-p' option was used we should not
			 * complain.
			 */
			if (zfs_dataset_exists(g_zfs, argv[0], type)) {
				ret = 0;
				goto error;
			}
			if (zfs_create_ancestors(g_zfs, argv[0]) != 0)
				goto error;
		}

		/* pass to libzfs */
		if (zfs_create(g_zfs, argv[0], type, props) != 0)
			goto error;

		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
			goto error;

		ret = 0;

		/*
		 * Mount and/or share the new filesystem as appropriate. We provide a
		 * verbose error message to let the user know that their filesystem was
		 * in fact created, even if we failed to mount or share it.
		 * If the user doesn't want the dataset automatically mounted,
		 * then skip the mount/share step altogether.
		 */
		if (!nomount && should_auto_mount(zhp)) {
			if (zfs_mount(zhp, NULL, 0) != 0) {
				(void) fprintf(stderr, gettext("filesystem "
				    "successfully created, but not mounted\n"));
				ret = 1;
			} else if (zfs_share(zhp) != 0) {
				(void) fprintf(stderr, gettext("filesystem "
				    "successfully created, but not shared\n"));
				ret = 1;
			}
		}

	error:
		/* Common cleanup; also reached on success, with ret already 0. */
		if (zhp)
			zfs_close(zhp);
		nvlist_free(props);
		return (ret);
	badusage:
		nvlist_free(props);
		usage(B_FALSE);
		return (2);
	}

	/*
	* zfs destroy [-rRf] <fs, vol>
	* zfs destroy [-rRd] <snap>
	*
	* -r Recursively destroy all children
	* -R Recursively destroy all dependents, including clones
	* -f Force unmounting of any dependents
	* -d If we can't destroy now, mark for deferred destruction
	*
	* Destroys the given dataset. By default, it will unmount any filesystems,
	* and refuse to destroy a dataset that has any dependents. A dependent can
	* either be a child, or a clone of a child.
	*/
	/* State shared by zfs_do_destroy() and the callbacks it drives. */
	typedef struct destroy_cbdata {
	/* B_TRUE until destroy_check_dependent() has printed its header once */
	boolean_t cb_first;
	/* -f: destroy_callback() unmounts with MS_FORCE */
	boolean_t cb_force;
	/* -r or -R */
	boolean_t cb_recurse;
	/* set by destroy_check_dependent() when it reports blocking dependents */
	boolean_t cb_error;
	/* -R */
	boolean_t cb_doclones;
	/* dataset whose dependents are being checked */
	zfs_handle_t *cb_target;
	/* -d */
	boolean_t cb_defer_destroy;
	/* -v, also implied by -p */
	boolean_t cb_verbose;
	/* -p */
	boolean_t cb_parsable;
	/* -n; destroy_check_dependent() also sets it when clones are in the way */
	boolean_t cb_dryrun;
	/* names of the snapshots selected for destruction */
	nvlist_t *cb_nvl;
	/* snapshots queued by destroy_callback() to be destroyed in one call */
	nvlist_t *cb_batchedsnaps;

	/* first snap in contiguous run */
	char *cb_firstsnap;
	/* previous snap in contiguous run */
	char *cb_prevsnap;
	/* running total of the space reported by lzc_snaprange_space() */
	int64_t cb_snapused;
	/* snapshot specification: the text after '@' in the argument */
	char *cb_snapspec;
	char *cb_bookmark;
	} destroy_cbdata_t;

	/*
	* Check for any dependents based on the '-r' or '-R' flags.
	*/
	/*
	 * Dependent-iteration callback used to find out whether anything stands in
	 * the way of destroying cb_target.
	 *
	 * zhp  - a dependent of the target; it is closed before returning.
	 * data - the destroy_cbdata_t for this operation.
	 *
	 * A dependent is treated as a descendant when its name is the target's
	 * name followed by '/' or '@'; anything else is treated as a clone.
	 * Blocking dependents are listed on stderr and recorded in cb_error.
	 * Always returns 0 so that iteration continues.
	 */
	static int
	destroy_check_dependent(zfs_handle_t *zhp, void *data)
	{
		destroy_cbdata_t *cbp = data;
		const char *tname = zfs_get_name(cbp->cb_target);
		const char *name = zfs_get_name(zhp);

		if (strncmp(tname, name, strlen(tname)) == 0 &&
		    (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
			/*
			 * This is a direct descendant, not a clone somewhere else in
			 * the hierarchy.
			 */
			if (cbp->cb_recurse)
				goto out;

			if (cbp->cb_first) {
				(void) fprintf(stderr, gettext("cannot destroy '%s': "
				    "%s has children\n"),
				    zfs_get_name(cbp->cb_target),
				    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
				(void) fprintf(stderr, gettext("use '-r' to destroy "
				    "the following datasets:\n"));
				cbp->cb_first = B_FALSE;
				cbp->cb_error = B_TRUE;
			}

			(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
		} else {
			/*
			 * This is a clone. We only want to report this if the '-r'
			 * wasn't specified, or the target is a snapshot.
			 */
			if (!cbp->cb_recurse &&
			    zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
				goto out;

			if (cbp->cb_first) {
				(void) fprintf(stderr, gettext("cannot destroy '%s': "
				    "%s has dependent clones\n"),
				    zfs_get_name(cbp->cb_target),
				    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
				(void) fprintf(stderr, gettext("use '-R' to destroy "
				    "the following datasets:\n"));
				cbp->cb_first = B_FALSE;
				cbp->cb_error = B_TRUE;
				cbp->cb_dryrun = B_TRUE;
			}

			(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
		}

	out:
		zfs_close(zhp);
		return (0);
	}

	/*
	 * Destroy one dataset, or report it when running verbosely.
	 *
	 * zhp  - the dataset; it is closed before returning.
	 * data - the destroy_cbdata_t for this operation.
	 *
	 * Snapshots are only queued in cb_batchedsnaps.  When a non-snapshot
	 * arrives, the queued snapshots are destroyed first and the dataset is
	 * then unmounted and destroyed.  Returns 0 on success or when nothing is
	 * to be done (pool name, dry run), and -1 if any of those steps fails.
	 */
	static int
	destroy_callback(zfs_handle_t *zhp, void *data)
	{
		destroy_cbdata_t *cb = data;
		const char *name = zfs_get_name(zhp);

		if (cb->cb_verbose) {
			if (cb->cb_parsable) {
				(void) printf("destroy\t%s\n", name);
			} else if (cb->cb_dryrun) {
				(void) printf(gettext("would destroy %s\n"),
				    name);
			} else {
				(void) printf(gettext("will destroy %s\n"),
				    name);
			}
		}

		/*
		 * Ignore pools (which we've already flagged as an error before getting
		 * here).
		 */
		if (strchr(zfs_get_name(zhp), '/') == NULL &&
		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
			zfs_close(zhp);
			return (0);
		}
		if (cb->cb_dryrun) {
			zfs_close(zhp);
			return (0);
		}

		/*
		 * We batch up all contiguous snapshots (even of different
		 * filesystems) and destroy them with one ioctl. We can't
		 * simply do all snap deletions and then all fs deletions,
		 * because we must delete a clone before its origin.
		 */
		if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
			fnvlist_add_boolean(cb->cb_batchedsnaps, name);
		} else {
			int error = zfs_destroy_snaps_nvl(g_zfs,
			    cb->cb_batchedsnaps, B_FALSE);

			/* Start a fresh batch whether or not the flush succeeded. */
			fnvlist_free(cb->cb_batchedsnaps);
			cb->cb_batchedsnaps = fnvlist_alloc();

			if (error != 0 ||
			    zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
			    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
				zfs_close(zhp);
				return (-1);
			}
		}

		zfs_close(zhp);
		return (0);
	}

	/*
	 * Sorted-snapshot callback used for verbose output and space accounting.
	 *
	 * zhp - a snapshot; it is closed before returning.
	 * arg - the destroy_cbdata_t for this operation.
	 *
	 * Snapshots named in cb_nvl extend the current contiguous run, tracked in
	 * cb_firstsnap and cb_prevsnap.  The first snapshot not in cb_nvl ends the
	 * run: the space of the run is added to cb_snapused and the run is reset.
	 * Returns 0, or the error from lzc_snaprange_space().
	 */
	static int
	destroy_print_cb(zfs_handle_t *zhp, void *arg)
	{
		destroy_cbdata_t *cb = arg;
		const char *name = zfs_get_name(zhp);
		int err = 0;

		if (nvlist_exists(cb->cb_nvl, name)) {
			if (cb->cb_firstsnap == NULL)
				cb->cb_firstsnap = strdup(name);
			/* free(NULL) is a no-op, so no guard is needed */
			free(cb->cb_prevsnap);
			/* this snap continues the current range */
			cb->cb_prevsnap = strdup(name);
			if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
				nomem();
			if (cb->cb_verbose) {
				if (cb->cb_parsable) {
					(void) printf("destroy\t%s\n", name);
				} else if (cb->cb_dryrun) {
					(void) printf(gettext("would destroy %s\n"),
					    name);
				} else {
					(void) printf(gettext("will destroy %s\n"),
					    name);
				}
			}
		} else if (cb->cb_firstsnap != NULL) {
			/* end of this range */
			uint64_t used = 0;
			err = lzc_snaprange_space(cb->cb_firstsnap,
			    cb->cb_prevsnap, &used);
			cb->cb_snapused += used;
			free(cb->cb_firstsnap);
			cb->cb_firstsnap = NULL;
			free(cb->cb_prevsnap);
			cb->cb_prevsnap = NULL;
		}
		zfs_close(zhp);
		return (err);
	}

	/*
	 * Walk the snapshots of fs_zhp in sorted order through destroy_print_cb(),
	 * then close a run that is still open when the walk ends.
	 *
	 * fs_zhp - dataset whose snapshots are walked; not closed here.
	 * cb     - destroy state; no run may be open on entry (asserted).
	 *
	 * Returns 0, or the first error from the walk or from
	 * lzc_snaprange_space().
	 */
	static int
	destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
	{
		int err = 0;

		assert(cb->cb_firstsnap == NULL);
		assert(cb->cb_prevsnap == NULL);
		err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb);
		if (cb->cb_firstsnap != NULL) {
			uint64_t used = 0;

			/* After a failed walk, release the run without measuring it. */
			if (err == 0) {
				err = lzc_snaprange_space(cb->cb_firstsnap,
				    cb->cb_prevsnap, &used);
			}
			cb->cb_snapused += used;
			free(cb->cb_firstsnap);
			cb->cb_firstsnap = NULL;
			free(cb->cb_prevsnap);
			cb->cb_prevsnap = NULL;
		}
		return (err);
	}

	/*
	 * Snapshot-spec callback: record one matching snapshot in cb_nvl.
	 *
	 * zhp - the snapshot; it is closed before returning.
	 * arg - the destroy_cbdata_t for this operation.
	 *
	 * Unless clones are being destroyed too (cb_doclones) or destruction is
	 * deferred (cb_defer_destroy), the snapshot's dependents are first checked
	 * with destroy_check_dependent().  Returns 0, or the error from that
	 * check, in which case the snapshot is not recorded.
	 */
	static int
	snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
	{
		destroy_cbdata_t *cb = arg;
		int err = 0;

		/* Check for clones. */
		if (!cb->cb_doclones && !cb->cb_defer_destroy) {
			cb->cb_target = zhp;
			cb->cb_first = B_TRUE;
			err = zfs_iter_dependents(zhp, B_TRUE,
			    destroy_check_dependent, cb);
		}

		if (err == 0) {
			if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
				nomem();
		}
		zfs_close(zhp);
		return (err);
	}

	/*
	 * Collect into cb_nvl the snapshots of zhp that match cb_snapspec and,
	 * with cb_recurse set, those of its descendant file systems as well.
	 *
	 * zhp - the dataset to examine; it is closed before returning.
	 * arg - the destroy_cbdata_t for this operation.
	 *
	 * A dataset with no matching snapshot (ENOENT) is not an error.  With
	 * cb_verbose set, the selected snapshots are also printed.  Returns 0 or
	 * the first error encountered.
	 */
	static int
	gather_snapshots(zfs_handle_t *zhp, void *arg)
	{
		destroy_cbdata_t *cb = arg;
		int err = 0;

		err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
		if (err == ENOENT)
			err = 0;
		if (err != 0)
			goto out;

		if (cb->cb_verbose) {
			err = destroy_print_snapshots(zhp, cb);
			if (err != 0)
				goto out;
		}

		if (cb->cb_recurse)
			err = zfs_iter_filesystems(zhp, gather_snapshots, cb);

	out:
		zfs_close(zhp);
		return (err);
	}

	/*
	 * Run destroy_callback() over the dependents of every snapshot named in
	 * cb_nvl.  Names that cannot be opened as snapshots are skipped.  Returns
	 * 0, or the first non-zero result of the dependent iteration.
	 */
	static int
	destroy_clones(destroy_cbdata_t *cb)
	{
		nvpair_t *pair = NULL;

		while ((pair = nvlist_next_nvpair(cb->cb_nvl, pair)) != NULL) {
			zfs_handle_t *snap;
			boolean_t saved_defer;
			int err;

			snap = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT);
			if (snap == NULL)
				continue;

			/*
			 * Deferred destruction cannot be applied to non-snapshots,
			 * so it is switched off while the clones are destroyed and
			 * restored afterwards.
			 */
			saved_defer = cb->cb_defer_destroy;
			cb->cb_defer_destroy = B_FALSE;
			err = zfs_iter_dependents(snap, B_FALSE,
			    destroy_callback, cb);
			cb->cb_defer_destroy = saved_defer;

			zfs_close(snap);
			if (err != 0)
				return (err);
		}

		return (0);
	}

	/*
	 * Handler for 'zfs destroy'.  Exactly one target argument is accepted and
	 * its form selects the code path:
	 *
	 *	name@spec	gather the matching snapshots into cb_nvl and
	 *			destroy them (optionally their clones first)
	 *	name#mark	destroy a single bookmark
	 *	anything else	destroy the dataset and its dependents
	 *
	 * Options, as parsed below:
	 *	-v	verbose
	 *	-p	verbose, parsable output
	 *	-n	dry run
	 *	-d	deferred destroy; restricts the target type to snapshots
	 *	-f	force
	 *	-r	recurse
	 *	-R	recurse and also destroy clones
	 *
	 * Returns 0 on success and non-zero on failure.
	 */
	static int
	zfs_do_destroy(int argc, char **argv)
	{
		destroy_cbdata_t cb = { 0 };
		int rv = 0;
		int err = 0;
		int c;
		zfs_handle_t *zhp = NULL;
		char *at, *pound;
		zfs_type_t type = ZFS_TYPE_DATASET;

		/* check options */
		while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
			switch (c) {
			case 'v':
				cb.cb_verbose = B_TRUE;
				break;
			case 'p':
				cb.cb_verbose = B_TRUE;
				cb.cb_parsable = B_TRUE;
				break;
			case 'n':
				cb.cb_dryrun = B_TRUE;
				break;
			case 'd':
				cb.cb_defer_destroy = B_TRUE;
				type = ZFS_TYPE_SNAPSHOT;
				break;
			case 'f':
				cb.cb_force = B_TRUE;
				break;
			case 'r':
				cb.cb_recurse = B_TRUE;
				break;
			case 'R':
				cb.cb_recurse = B_TRUE;
				cb.cb_doclones = B_TRUE;
				break;
			case '?':
			default:
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc == 0) {
			(void) fprintf(stderr, gettext("missing dataset argument\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		at = strchr(argv[0], '@');
		pound = strchr(argv[0], '#');
		if (at != NULL) {

			/* Build the list of snaps to destroy in cb_nvl. */
			cb.cb_nvl = fnvlist_alloc();

			/* Split "dataset@spec" in place into its two halves. */
			*at = '\0';
			zhp = zfs_open(g_zfs, argv[0],
			    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
			if (zhp == NULL) {
				/* Go through 'out' so cb_nvl is released. */
				rv = 1;
				goto out;
			}

			cb.cb_snapspec = at + 1;
			/*
			 * gather_snapshots() closes the handle it is given, so
			 * hand it a duplicate and keep ours for cleanup.
			 */
			if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
			    cb.cb_error) {
				rv = 1;
				goto out;
			}

			if (nvlist_empty(cb.cb_nvl)) {
				(void) fprintf(stderr, gettext("could not find any "
				    "snapshots to destroy; check snapshot names.\n"));
				rv = 1;
				goto out;
			}

			if (cb.cb_verbose) {
				char buf[16];
				zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
				if (cb.cb_parsable) {
					(void) printf("reclaim\t%llu\n",
					    cb.cb_snapused);
				} else if (cb.cb_dryrun) {
					(void) printf(gettext("would reclaim %s\n"),
					    buf);
				} else {
					(void) printf(gettext("will reclaim %s\n"),
					    buf);
				}
			}

			if (!cb.cb_dryrun) {
				if (cb.cb_doclones) {
					cb.cb_batchedsnaps = fnvlist_alloc();
					err = destroy_clones(&cb);
					if (err == 0) {
						err = zfs_destroy_snaps_nvl(g_zfs,
						    cb.cb_batchedsnaps, B_FALSE);
					}
					if (err != 0) {
						rv = 1;
						goto out;
					}
				}
				if (err == 0) {
					err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
					    cb.cb_defer_destroy);
				}
			}

			if (err != 0)
				rv = 1;
		} else if (pound != NULL) {
			nvlist_t *nvl;

			if (cb.cb_dryrun) {
				(void) fprintf(stderr,
				    "dryrun is not supported with bookmark\n");
				return (-1);
			}

			if (cb.cb_defer_destroy) {
				(void) fprintf(stderr,
				    "defer destroy is not supported with bookmark\n");
				return (-1);
			}

			if (cb.cb_recurse) {
				(void) fprintf(stderr,
				    "recursive is not supported with bookmark\n");
				return (-1);
			}

			if (!zfs_bookmark_exists(argv[0])) {
				(void) fprintf(stderr, gettext("bookmark '%s' "
				    "does not exist.\n"), argv[0]);
				return (1);
			}

			nvl = fnvlist_alloc();
			fnvlist_add_boolean(nvl, argv[0]);

			err = lzc_destroy_bookmarks(nvl, NULL);
			if (err != 0) {
				(void) zfs_standard_error(g_zfs, err,
				    "cannot destroy bookmark");
			}

			/*
			 * Release the list allocated above.  cb.cb_nvl is never
			 * allocated on this path, so freeing it instead would
			 * leak nvl.
			 */
			nvlist_free(nvl);

			return (err);
		} else {
			/* Open the given dataset */
			if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
				return (1);

			cb.cb_target = zhp;

			/*
			 * Perform an explicit check for pools before going any
			 * further.
			 */
			if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
			    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
				(void) fprintf(stderr, gettext("cannot destroy '%s': "
				    "operation does not apply to pools\n"),
				    zfs_get_name(zhp));
				(void) fprintf(stderr, gettext("use 'zfs destroy -r "
				    "%s' to destroy all datasets in the pool\n"),
				    zfs_get_name(zhp));
				(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
				    "to destroy the pool itself\n"), zfs_get_name(zhp));
				rv = 1;
				goto out;
			}

			/*
			 * Check for any dependents and/or clones.
			 */
			cb.cb_first = B_TRUE;
			if (!cb.cb_doclones &&
			    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
			    &cb) != 0) {
				rv = 1;
				goto out;
			}

			if (cb.cb_error) {
				rv = 1;
				goto out;
			}

			cb.cb_batchedsnaps = fnvlist_alloc();
			if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
			    &cb) != 0) {
				rv = 1;
				goto out;
			}

			/*
			 * Do the real thing.  The callback will close the
			 * handle regardless of whether it succeeds or not.
			 */
			err = destroy_callback(zhp, &cb);
			zhp = NULL;
			if (err == 0) {
				err = zfs_destroy_snaps_nvl(g_zfs,
				    cb.cb_batchedsnaps, cb.cb_defer_destroy);
			}
			if (err != 0)
				rv = 1;
		}

	out:
		fnvlist_free(cb.cb_batchedsnaps);
		fnvlist_free(cb.cb_nvl);
		if (zhp != NULL)
			zfs_close(zhp);
		return (rv);
	}

	/*
	 * Report whether the column list in cbp contains GET_COL_RECVD.  The scan
	 * stops at the first GET_COL_NONE entry or after ZFS_GET_NCOLS entries,
	 * whichever comes first.
	 */
	static boolean_t
	is_recvd_column(zprop_get_cbdata_t *cbp)
	{
		int idx = 0;

		while (idx < ZFS_GET_NCOLS) {
			zfs_get_column_t current = cbp->cb_columns[idx];

			if (current == GET_COL_NONE)
				break;
			if (current == GET_COL_RECVD)
				return (B_TRUE);
			idx++;
		}

		return (B_FALSE);
	}

	/*
	* zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
	* < all | property[,property]... > < fs | snap | vol > ...
	*
	* -r recurse over any child datasets
	* -H scripted mode. Headers are stripped, and fields are separated
	* by tabs instead of spaces.
	* -o Set of fields to display. One of "name,property,value,
	* received,source". Default is "name,property,value,source".
	* "all" is an alias for all five.
	* -s Set of sources to allow. One of
	* "local,default,inherited,received,temporary,none". Default is
	* all six.
	* -p Display values in parsable (literal) format.
	*
	* Prints properties for the given datasets. The user can control which
	* columns to display as well as which property types to allow.
	*/

	/*
	 * Invoked to display the properties for a single dataset.
	 *
	 * zhp is the dataset handle; data is the zprop_get_cbdata_t describing
	 * which properties (cb_proplist) and columns to print.  Each entry in the
	 * property list is handled as one of: a native property, a userquota
	 * property, a "written" property, or a user property looked up in the
	 * dataset's user-property nvlist.  Always returns 0.
	 */
	static int
	get_callback(zfs_handle_t *zhp, void *data)
	{
		char buf[ZFS_MAXPROPLEN];
		char rbuf[ZFS_MAXPROPLEN];
		zprop_source_t sourcetype;
		/*
		 * Start out empty: several paths below hand this buffer to
		 * zprop_print_one_property() without having written to it.
		 */
		char source[ZFS_MAX_DATASET_NAME_LEN] = "";
		zprop_get_cbdata_t *cbp = data;
		nvlist_t *user_props = zfs_get_user_props(zhp);
		zprop_list_t *pl = cbp->cb_proplist;
		nvlist_t *propval;
		char *strval;
		char *sourceval;
		boolean_t received = is_recvd_column(cbp);

		for (; pl != NULL; pl = pl->pl_next) {
			char *recvdval = NULL;
			/*
			 * Skip the special fake placeholder.  This will also skip
			 * over the name property when 'all' is specified.
			 */
			if (pl->pl_prop == ZFS_PROP_NAME &&
			    pl == cbp->cb_proplist)
				continue;

			if (pl->pl_prop != ZPROP_INVAL) {
				/* Native property. */
				if (zfs_prop_get(zhp, pl->pl_prop, buf,
				    sizeof (buf), &sourcetype, source,
				    sizeof (source),
				    cbp->cb_literal) != 0) {
					if (pl->pl_all)
						continue;
					if (!zfs_prop_valid_for_type(pl->pl_prop,
					    ZFS_TYPE_DATASET)) {
						(void) fprintf(stderr,
						    gettext("No such property '%s'\n"),
						    zfs_prop_to_name(pl->pl_prop));
						continue;
					}
					sourcetype = ZPROP_SRC_NONE;
					(void) strlcpy(buf, "-", sizeof (buf));
				}

				if (received && (zfs_prop_get_recvd(zhp,
				    zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
				    cbp->cb_literal) == 0))
					recvdval = rbuf;

				zprop_print_one_property(zfs_get_name(zhp), cbp,
				    zfs_prop_to_name(pl->pl_prop),
				    buf, sourcetype, source, recvdval);
			} else if (zfs_prop_userquota(pl->pl_user_prop)) {
				sourcetype = ZPROP_SRC_LOCAL;

				if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
				    buf, sizeof (buf), cbp->cb_literal) != 0) {
					sourcetype = ZPROP_SRC_NONE;
					(void) strlcpy(buf, "-", sizeof (buf));
				}

				zprop_print_one_property(zfs_get_name(zhp), cbp,
				    pl->pl_user_prop, buf, sourcetype, source, NULL);
			} else if (zfs_prop_written(pl->pl_user_prop)) {
				sourcetype = ZPROP_SRC_LOCAL;

				if (zfs_prop_get_written(zhp, pl->pl_user_prop,
				    buf, sizeof (buf), cbp->cb_literal) != 0) {
					sourcetype = ZPROP_SRC_NONE;
					(void) strlcpy(buf, "-", sizeof (buf));
				}

				zprop_print_one_property(zfs_get_name(zhp), cbp,
				    pl->pl_user_prop, buf, sourcetype, source, NULL);
			} else {
				/* User property. */
				if (nvlist_lookup_nvlist(user_props,
				    pl->pl_user_prop, &propval) != 0) {
					if (pl->pl_all)
						continue;
					sourcetype = ZPROP_SRC_NONE;
					strval = "-";
				} else {
					verify(nvlist_lookup_string(propval,
					    ZPROP_VALUE, &strval) == 0);
					verify(nvlist_lookup_string(propval,
					    ZPROP_SOURCE, &sourceval) == 0);

					/*
					 * The recorded source decides the source
					 * type: our own name means local, the
					 * received marker means received, and
					 * anything else is treated as inherited.
					 */
					if (strcmp(sourceval,
					    zfs_get_name(zhp)) == 0) {
						sourcetype = ZPROP_SRC_LOCAL;
					} else if (strcmp(sourceval,
					    ZPROP_SOURCE_VAL_RECVD) == 0) {
						sourcetype = ZPROP_SRC_RECEIVED;
					} else {
						sourcetype = ZPROP_SRC_INHERITED;
						(void) strlcpy(source,
						    sourceval, sizeof (source));
					}
				}

				if (received && (zfs_prop_get_recvd(zhp,
				    pl->pl_user_prop, rbuf, sizeof (rbuf),
				    cbp->cb_literal) == 0))
					recvdval = rbuf;

				zprop_print_one_property(zfs_get_name(zhp), cbp,
				    pl->pl_user_prop, strval, sourcetype,
				    source, recvdval);
			}
		}

		return (0);
	}

	/*
	 * Handler for 'zfs get'.  Parses the options, builds the property list
	 * from the first non-option argument, and runs get_callback over every
	 * remaining argument via zfs_for_each().
	 *
	 * Options, as parsed below:
	 *	-p		literal (parsable) values
	 *	-d depth	depth limit, parsed by parse_depth()
	 *	-r		recurse
	 *	-H		scripted mode
	 *	-o cols		columns: name,property,value,received,source,all
	 *	-s sources	local,default,inherited,received,temporary,none
	 *	-t types	filesystem,volume,snapshot,bookmark,all
	 *
	 * Returns the result of zfs_for_each().
	 */
	static int
	zfs_do_get(int argc, char **argv)
	{
		zprop_get_cbdata_t cb = { 0 };
		int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
		int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK;
		char *value, *fields;
		int ret = 0;
		int limit = 0;
		zprop_list_t fake_name = { 0 };

		/*
		 * Set up default columns and sources.
		 */
		cb.cb_sources = ZPROP_SRC_ALL;
		cb.cb_columns[0] = GET_COL_NAME;
		cb.cb_columns[1] = GET_COL_PROPERTY;
		cb.cb_columns[2] = GET_COL_VALUE;
		cb.cb_columns[3] = GET_COL_SOURCE;
		cb.cb_type = ZFS_TYPE_DATASET;

		/* check options */
		while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
			switch (c) {
			case 'p':
				cb.cb_literal = B_TRUE;
				break;
			case 'd':
				limit = parse_depth(optarg, &flags);
				break;
			case 'r':
				flags |= ZFS_ITER_RECURSE;
				break;
			case 'H':
				cb.cb_scripted = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case 'o':
				/*
				 * Process the set of columns to display.  We zero
				 * out the structure to give us a blank slate.
				 */
				bzero(&cb.cb_columns, sizeof (cb.cb_columns));
				i = 0;
				while (*optarg != '\0') {
					static char *col_subopts[] =
					    { "name", "property", "value", "received",
					    "source", "all", NULL };

					if (i == ZFS_GET_NCOLS) {
						(void) fprintf(stderr, gettext("too "
						    "many fields given to -o "
						    "option\n"));
						usage(B_FALSE);
					}

					switch (getsubopt(&optarg, col_subopts,
					    &value)) {
					case 0:
						cb.cb_columns[i++] = GET_COL_NAME;
						break;
					case 1:
						cb.cb_columns[i++] = GET_COL_PROPERTY;
						break;
					case 2:
						cb.cb_columns[i++] = GET_COL_VALUE;
						break;
					case 3:
						cb.cb_columns[i++] = GET_COL_RECVD;
						flags |= ZFS_ITER_RECVD_PROPS;
						break;
					case 4:
						cb.cb_columns[i++] = GET_COL_SOURCE;
						break;
					case 5:
						if (i > 0) {
							(void) fprintf(stderr,
							    gettext("\"all\" conflicts "
							    "with specific fields "
							    "given to -o option\n"));
							usage(B_FALSE);
						}
						cb.cb_columns[0] = GET_COL_NAME;
						cb.cb_columns[1] = GET_COL_PROPERTY;
						cb.cb_columns[2] = GET_COL_VALUE;
						cb.cb_columns[3] = GET_COL_RECVD;
						cb.cb_columns[4] = GET_COL_SOURCE;
						flags |= ZFS_ITER_RECVD_PROPS;
						i = ZFS_GET_NCOLS;
						break;
					default:
						(void) fprintf(stderr,
						    gettext("invalid column name "
						    "'%s'\n"), suboptarg);
						usage(B_FALSE);
					}
				}
				break;

			case 's':
				cb.cb_sources = 0;
				while (*optarg != '\0') {
					static char *source_subopts[] = {
						"local", "default", "inherited",
						"received", "temporary", "none",
						NULL };

					switch (getsubopt(&optarg, source_subopts,
					    &value)) {
					case 0:
						cb.cb_sources |= ZPROP_SRC_LOCAL;
						break;
					case 1:
						cb.cb_sources |= ZPROP_SRC_DEFAULT;
						break;
					case 2:
						cb.cb_sources |= ZPROP_SRC_INHERITED;
						break;
					case 3:
						cb.cb_sources |= ZPROP_SRC_RECEIVED;
						break;
					case 4:
						cb.cb_sources |= ZPROP_SRC_TEMPORARY;
						break;
					case 5:
						cb.cb_sources |= ZPROP_SRC_NONE;
						break;
					default:
						(void) fprintf(stderr,
						    gettext("invalid source "
						    "'%s'\n"), suboptarg);
						usage(B_FALSE);
					}
				}
				break;

			case 't':
				types = 0;
				flags &= ~ZFS_ITER_PROP_LISTSNAPS;
				while (*optarg != '\0') {
					static char *type_subopts[] = { "filesystem",
					    "volume", "snapshot", "bookmark",
					    "all", NULL };

					switch (getsubopt(&optarg, type_subopts,
					    &value)) {
					case 0:
						types |= ZFS_TYPE_FILESYSTEM;
						break;
					case 1:
						types |= ZFS_TYPE_VOLUME;
						break;
					case 2:
						types |= ZFS_TYPE_SNAPSHOT;
						break;
					case 3:
						types |= ZFS_TYPE_BOOKMARK;
						break;
					case 4:
						types = ZFS_TYPE_DATASET |
						    ZFS_TYPE_BOOKMARK;
						break;

					default:
						(void) fprintf(stderr,
						    gettext("invalid type '%s'\n"),
						    suboptarg);
						usage(B_FALSE);
					}
				}
				break;

			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing property "
			    "argument\n"));
			usage(B_FALSE);
		}

		fields = argv[0];

		if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
		    != 0)
			usage(B_FALSE);

		argc--;
		argv++;

		/*
		 * As part of zfs_expand_proplist(), we keep track of the maximum
		 * column width for each property.  For the 'NAME' (and 'SOURCE')
		 * columns, we need to know the maximum name length.  However, the
		 * user likely did not specify 'name' as one of the properties to
		 * fetch, so we need to make sure we always include at least this
		 * property for print_get_headers() to work properly.
		 */
		if (cb.cb_proplist != NULL) {
			fake_name.pl_prop = ZFS_PROP_NAME;
			fake_name.pl_width = strlen(gettext("NAME"));
			fake_name.pl_next = cb.cb_proplist;
			cb.cb_proplist = &fake_name;
		}

		cb.cb_first = B_TRUE;

		/* run for each object */
		ret = zfs_for_each(argc, argv, flags, types, NULL,
		    &cb.cb_proplist, limit, get_callback, &cb);

		/*
		 * fake_name lives on the stack, so only the list hanging off it
		 * may be handed to zprop_free_list().
		 */
		if (cb.cb_proplist == &fake_name)
			zprop_free_list(fake_name.pl_next);
		else
			zprop_free_list(cb.cb_proplist);

		return (ret);
	}

	/*
	* inherit [-rS] <property> <fs|vol> ...
	*
	* -r Recurse over all children
	* -S Revert to received value, if any
	*
	* For each dataset specified on the command line, inherit the given property
	* from its parent. Inheriting a property at the pool level will cause it to
	* use the default value. The '-r' flag will recurse over all children, and is
	* useful for setting a property on a hierarchy-wide basis, regardless of any
	* local modifications for each dataset.
	*/

	/*
	 * Callback state shared by inherit_cb() and inherit_recurse_cb(); both
	 * pass the two fields straight through to zfs_prop_inherit().
	 */
	typedef struct inherit_cbdata {
	/* name of the property to inherit, as given on the command line */
	const char *cb_propname;
	/* B_TRUE when -S was given (set from 'received' in zfs_do_inherit) */
	boolean_t cb_received;
	} inherit_cbdata_t;

	/*
	 * Per-dataset callback used for recursive 'zfs inherit'.  data is an
	 * inherit_cbdata_t.  Returns 0 when the property is skipped or inherited
	 * successfully, 1 when zfs_prop_inherit() fails.
	 */
	static int
	inherit_recurse_cb(zfs_handle_t *zhp, void *data)
	{
		inherit_cbdata_t *cb = data;
		zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);

		/*
		 * If we're doing it recursively, then ignore properties that
		 * are not valid for this type of dataset.
		 */
		if (prop != ZPROP_INVAL &&
		    !zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
			return (0);

		return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
	}

	/*
	 * Per-dataset callback used for non-recursive 'zfs inherit'.  data is an
	 * inherit_cbdata_t.  Returns 0 on success and 1 when zfs_prop_inherit()
	 * fails.
	 */
	static int
	inherit_cb(zfs_handle_t *zhp, void *data)
	{
		inherit_cbdata_t *cb = data;

		return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
	}

	static int
	zfs_do_inherit(int argc, char **argv)
	{
	int c;
	zfs_prop_t prop;
	inherit_cbdata_t cb = { 0 };
	char *propname;
	int ret = 0;
	int flags = 0;
	boolean_t received = B_FALSE;

	/* check options */
	while ((c = getopt(argc, argv, "rS")) != -1) {
	switch (c) {
	case 'r':
	flags \|= ZFS_ITER_RECURSE;
	break;
	case 'S':
	received = B_TRUE;
	break;
	case '?':
	default:
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	/* check number of arguments */
	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing property argument\n"));
	usage(B_FALSE);
	}
	if (argc < 2) {
	(void) fprintf(stderr, gettext("missing dataset argument\n"));
	usage(B_FALSE);
	}

	propname = argv[0];
	argc--;
	argv++;

	if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
	if (zfs_prop_readonly(prop)) {
	(void) fprintf(stderr, gettext(
	"%s property is read-only\n"),
	propname);
	return (1);
	}
	if (!zfs_prop_inheritable(prop) && !received) {
	(void) fprintf(stderr, gettext("'%s' property cannot "
	"be inherited\n"), propname);
	if (prop == ZFS_PROP_QUOTA \|\|
	prop == ZFS_PROP_RESERVATION \|\|
	prop == ZFS_PROP_REFQUOTA \|\|
	prop == ZFS_PROP_REFRESERVATION) {
	(void) fprintf(stderr, gettext("use 'zfs set "
	"%s=none' to clear\n"), propname);
	(void) fprintf(stderr, gettext("use 'zfs "
	"inherit -S %s' to revert to received "
	"value\n"), propname);
	}
	return (1);
	}
	if (received && (prop == ZFS_PROP_VOLSIZE \|\|
	prop == ZFS_PROP_VERSION)) {
	(void) fprintf(stderr, gettext("'%s' property cannot "
	"be reverted to a received value\n"), propname);
	return (1);
	}
	} else if (!zfs_prop_user(propname)) {
	(void) fprintf(stderr, gettext("invalid property '%s'\n"),
	propname);
	usage(B_FALSE);
	}

	cb.cb_propname = propname;
	cb.cb_received = received;

	if (flags & ZFS_ITER_RECURSE) {
	ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
	NULL, NULL, 0, inherit_recurse_cb, &cb);
	} else {
	ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
	NULL, NULL, 0, inherit_cb, &cb);
	}

	return (ret);
	}

	/*
	 * Callback state for 'zfs upgrade', shared by upgrade_list_callback()
	 * and upgrade_set_callback().
	 */
	typedef struct upgrade_cbdata {
	/* filesystems whose "version" property was set successfully */
	uint64_t cb_numupgraded;
	/* filesystems already at cb_version */
	uint64_t cb_numsamegraded;
	/* filesystems that could not be upgraded or would be downgraded */
	uint64_t cb_numfailed;
	/* target version; zfs_do_upgrade defaults it to ZPL_VERSION */
	uint64_t cb_version;
	/* listing mode: B_TRUE lists newer-than-current, B_FALSE older */
	boolean_t cb_newer;
	/* set once the listing header has been printed */
	boolean_t cb_foundone;
	/* name of the last filesystem an upgrade was attempted on */
	char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
	} upgrade_cbdata_t;

	/*
	 * Return non-zero when the dataset behind zhp and the dataset called
	 * 'name' share the same leading component, i.e. the text before the
	 * first '/' or '@'.
	 */
	static int
	same_pool(zfs_handle_t *zhp, const char *name)
	{
		const char *zhname = zfs_get_name(zhp);
		/* strcspn() returns size_t; keep it unsigned for strncmp(). */
		size_t len1 = strcspn(name, "/@");
		size_t len2 = strcspn(zhname, "/@");

		if (len1 != len2)
			return (B_FALSE);
		return (strncmp(name, zhname, len1) == 0);
	}

	/*
	 * Listing callback for 'zfs upgrade' with no arguments.  Depending on
	 * cb_newer, prints filesystems whose version is above or below
	 * ZPL_VERSION; an explanatory header is printed before the first match.
	 * data is an upgrade_cbdata_t.  Always returns 0.
	 */
	static int
	upgrade_list_callback(zfs_handle_t *zhp, void *data)
	{
		upgrade_cbdata_t *cb = data;
		int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);

		/* list if it's old/new */
		if ((!cb->cb_newer && version < ZPL_VERSION) ||
		    (cb->cb_newer && version > ZPL_VERSION)) {
			char *str;
			if (cb->cb_newer) {
				str = gettext("The following filesystems are "
				    "formatted using a newer software version and\n"
				    "cannot be accessed on the current system.\n\n");
			} else {
				str = gettext("The following filesystems are "
				    "out of date, and can be upgraded. After being\n"
				    "upgraded, these filesystems (and any 'zfs send' "
				    "streams generated from\n"
				    "subsequent snapshots) will no longer be "
				    "accessible by older software versions.\n\n");
			}

			/* Header goes out once per listing pass. */
			if (!cb->cb_foundone) {
				(void) puts(str);
				(void) printf(gettext("VER FILESYSTEM\n"));
				(void) printf(gettext("--- ------------\n"));
				cb->cb_foundone = B_TRUE;
			}

			(void) printf("%2u %s\n", version, zfs_get_name(zhp));
		}

		return (0);
	}

	/*
	 * Upgrade callback for 'zfs upgrade <fs>|-a'.  Sets the "version"
	 * property of zhp to cb_version when the filesystem is older, and keeps
	 * the counters in the upgrade_cbdata_t (data) up to date.
	 *
	 * Returns -1 when the pool version cannot be determined or mapped, and
	 * 0 otherwise; per-filesystem failures are reported via cb_numfailed.
	 */
	static int
	upgrade_set_callback(zfs_handle_t *zhp, void *data)
	{
		upgrade_cbdata_t *cb = data;
		int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
		int needed_spa_version;
		int spa_version;

		if (zfs_spa_version(zhp, &spa_version) < 0)
			return (-1);

		needed_spa_version = zfs_spa_version_map(cb->cb_version);

		if (needed_spa_version < 0)
			return (-1);

		if (spa_version < needed_spa_version) {
			/* can't upgrade */
			(void) printf(gettext("%s: can not be "
			    "upgraded; the pool version needs to first "
			    "be upgraded\nto version %d\n\n"),
			    zfs_get_name(zhp), needed_spa_version);
			cb->cb_numfailed++;
			return (0);
		}

		/* upgrade */
		if (version < cb->cb_version) {
			char verstr[16];
			(void) snprintf(verstr, sizeof (verstr),
			    "%llu", cb->cb_version);
			if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
				/*
				 * If they did "zfs upgrade -a", then we could
				 * be doing ioctls to different pools.  We need
				 * to log this history once to each pool, and
				 * bypass the normal history logging that happens
				 * in main().
				 */
				(void) zpool_log_history(g_zfs, history_str);
				log_history = B_FALSE;
			}
			if (zfs_prop_set(zhp, "version", verstr) == 0)
				cb->cb_numupgraded++;
			else
				cb->cb_numfailed++;
			/* Bounded copy: cb_lastfs is a fixed-size buffer. */
			(void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp),
			    sizeof (cb->cb_lastfs));
		} else if (version > cb->cb_version) {
			/* can't downgrade */
			(void) printf(gettext("%s: can not be downgraded; "
			    "it is already at version %u\n"),
			    zfs_get_name(zhp), version);
			cb->cb_numfailed++;
		} else {
			cb->cb_numsamegraded++;
		}
		return (0);
	}

	/*
	 * zfs upgrade
	 * zfs upgrade -v
	 * zfs upgrade [-r] [-V <version>] <-a | filesystem>
	 *
	 * With no arguments, lists filesystems whose version differs from
	 * ZPL_VERSION (older first, then newer).  With -v, prints the table of
	 * supported versions.  Otherwise upgrades the named filesystems, or all
	 * of them with -a, to cb_version (ZPL_VERSION unless -V was given).
	 *
	 * Returns 0 on success and non-zero on failure.
	 */
	static int
	zfs_do_upgrade(int argc, char **argv)
	{
		boolean_t all = B_FALSE;
		boolean_t showversions = B_FALSE;
		int ret = 0;
		upgrade_cbdata_t cb = { 0 };
		int c;
		int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;

		/* check options */
		while ((c = getopt(argc, argv, "rvV:a")) != -1) {
			switch (c) {
			case 'r':
				flags |= ZFS_ITER_RECURSE;
				break;
			case 'v':
				showversions = B_TRUE;
				break;
			case 'V':
				if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
				    optarg, &cb.cb_version) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid version %s\n"), optarg);
					usage(B_FALSE);
				}
				break;
			case 'a':
				all = B_TRUE;
				break;
			case '?':
			default:
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* -r and -V only make sense together with a target or -a. */
		if ((!all && !argc) &&
		    ((flags & ZFS_ITER_RECURSE) != 0 || cb.cb_version != 0))
			usage(B_FALSE);
		if (showversions && (flags & ZFS_ITER_RECURSE || all ||
		    cb.cb_version || argc))
			usage(B_FALSE);
		if ((all || argc) && (showversions))
			usage(B_FALSE);
		if (all && argc)
			usage(B_FALSE);

		if (showversions) {
			/* Show info on available versions. */
			(void) printf(gettext("The following filesystem versions are "
			    "supported:\n\n"));
			(void) printf(gettext("VER DESCRIPTION\n"));
			(void) printf("--- -----------------------------------------"
			    "---------------\n");
			(void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
			(void) printf(gettext(" 2 Enhanced directory entries\n"));
			(void) printf(gettext(" 3 Case insensitive and filesystem "
			    "user identifier (FUID)\n"));
			(void) printf(gettext(" 4 userquota, groupquota "
			    "properties\n"));
			(void) printf(gettext(" 5 System attributes\n"));
			(void) printf(gettext("\nFor more information on a particular "
			    "version, including supported releases,\n"));
			(void) printf("see the ZFS Administration Guide.\n\n");
			ret = 0;
		} else if (argc || all) {
			/* Upgrade filesystems */
			if (cb.cb_version == 0)
				cb.cb_version = ZPL_VERSION;
			ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
			    NULL, NULL, 0, upgrade_set_callback, &cb);
			(void) printf(gettext("%llu filesystems upgraded\n"),
			    cb.cb_numupgraded);
			if (cb.cb_numsamegraded) {
				(void) printf(gettext("%llu filesystems already at "
				    "this version\n"),
				    cb.cb_numsamegraded);
			}
			if (cb.cb_numfailed != 0)
				ret = 1;
		} else {
			/* List old-version filesystems */
			boolean_t found;
			int newer_ret;

			(void) printf(gettext("This system is currently running "
			    "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);

			flags |= ZFS_ITER_RECURSE;
			ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
			    NULL, NULL, 0, upgrade_list_callback, &cb);

			found = cb.cb_foundone;
			cb.cb_foundone = B_FALSE;
			cb.cb_newer = B_TRUE;

			/*
			 * Second pass lists newer filesystems.  Keep a failure
			 * from the first pass instead of overwriting it.
			 */
			newer_ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
			    NULL, NULL, 0, upgrade_list_callback, &cb);
			if (ret == 0)
				ret = newer_ret;

			if (!cb.cb_foundone && !found) {
				(void) printf(gettext("All filesystems are "
				    "formatted with the current version.\n"));
			}
		}

		return (ret);
	}

	/*
	* zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
	* [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
	* zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
	* [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
	*
	* -H Scripted mode; elide headers and separate columns by tabs.
	* -i Translate SID to POSIX ID.
	* -n Print numeric ID instead of user/group name.
	* -o Control which fields to display.
	* -p Use exact (parsable) numeric output.
	* -s Specify sort columns, descending order.
	* -S Specify sort columns, ascending order.
	* -t Control which object types to display.
	*
	* Displays space consumed by, and quotas on, each user in the specified
	* filesystem or snapshot.
	*/

	/*
	 * Column identifiers for 'zfs userspace' / 'zfs groupspace' output.
	 * us_field_types, us_field_hdr and us_field_names are indexed alike and
	 * must be kept in sync; USFIELD_LAST is the number of columns.
	 */
	enum us_field_types {
		USFIELD_TYPE = 0,
		USFIELD_NAME = 1,
		USFIELD_USED = 2,
		USFIELD_QUOTA = 3
	};

	/* Column headers, indexed by enum us_field_types. */
	static char *us_field_hdr[] = {
		"TYPE",
		"NAME",
		"USED",
		"QUOTA"
	};

	/* Column names as accepted on the command line, same indexing. */
	static char *us_field_names[] = {
		"type",
		"name",
		"used",
		"quota"
	};

	#define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (us_field_names[0]))

	/*
	 * Bit flags for the kinds of entity 'zfs userspace' can report.
	 * us_type_bits and us_type_names are parallel tables: entry i of
	 * us_type_names is the name for mask us_type_bits[i].
	 */
	#define	USTYPE_PSX_GRP	(1 << 0)
	#define	USTYPE_PSX_USR	(1 << 1)
	#define	USTYPE_SMB_GRP	(1 << 2)
	#define	USTYPE_SMB_USR	(1 << 3)
	#define	USTYPE_ALL	\
		(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)

	static int us_type_bits[] = {
		USTYPE_PSX_GRP,
		USTYPE_PSX_USR,
		USTYPE_SMB_GRP,
		USTYPE_SMB_USR,
		USTYPE_ALL
	};
	static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
		"smbuser", "all" };

	/*
	 * One row of 'zfs userspace' output.  userspace_cb() allocates a node
	 * per type/name combination and inserts it into an AVL tree.
	 */
	typedef struct us_node {
	/* properties of this row: "type", "name", "used", "quota", ... */
	nvlist_t *usn_nvl;
	/* linkage for the AVL tree (initialised in userspace_cb) */
	uu_avl_node_t usn_avlnode;
	/* list linkage; not used in the visible part of the file */
	uu_list_node_t usn_listnode;
	} us_node_t;

	/*
	 * State passed to userspace_cb() for each reported user/group entry.
	 */
	typedef struct us_cbdata {
	/* NOTE(review): not referenced by userspace_cb; purpose unconfirmed */
	nvlist_t **cb_nvlp;
	/* AVL pool used to initialise new nodes */
	uu_avl_pool_t *cb_avl_pool;
	/* AVL tree the nodes are looked up in and inserted into */
	uu_avl_t *cb_avl;
	/* when set, names are not resolved via getpwuid()/getgrgid() */
	boolean_t cb_numname;
	/* format sizes with zfs_nicenum() when measuring column width */
	boolean_t cb_nicenum;
	/* treat SMB entities as POSIX ones (SID to POSIX ID translation) */
	boolean_t cb_sid2posix;
	/* property being walked (user/group used or quota) */
	zfs_userquota_prop_t cb_prop;
	/* sort columns, handed to us_compare() through us_sort_info_t */
	zfs_sort_column_t *cb_sortcol;
	/* widest value seen so far per column, indexed by us_field_types */
	size_t cb_width[USFIELD_LAST];
	} us_cbdata_t;

	/*
	 * While B_FALSE, us_compare() ignores the USED and QUOTA sort columns.
	 * NOTE(review): set outside the visible part of the file, presumably
	 * once all entries have been collected; confirm at the call site.
	 */
	static boolean_t us_populated = B_FALSE;

	/* Private argument handed to us_compare() through the AVL routines. */
	typedef struct {
	/* sort columns to apply, in order */
	zfs_sort_column_t *si_sortcol;
	/* compare the "name" property as a number rather than a string */
	boolean_t si_numname;
	} us_sort_info_t;

	/*
	 * Map a column name to its position in us_field_names.  Returns the
	 * index of the first exact match, or -1 when the name is unknown.
	 */
	static int
	us_field_index(char *field)
	{
		int pos = 0;

		while (pos < USFIELD_LAST) {
			if (strcmp(field, us_field_names[pos]) == 0)
				return (pos);
			pos++;
		}

		return (-1);
	}

	static int
	us_compare(const void larg, const void rarg, void *unused)
	{
	const us_node_t *l = larg;
	const us_node_t *r = rarg;
	us_sort_info_t si = (us_sort_info_t )unused;
	zfs_sort_column_t *sortcol = si->si_sortcol;
	boolean_t numname = si->si_numname;
	nvlist_t *lnvl = l->usn_nvl;
	nvlist_t *rnvl = r->usn_nvl;
	int rc = 0;
	boolean_t lvb, rvb;

	for (; sortcol != NULL; sortcol = sortcol->sc_next) {
	char *lvstr = "";
	char *rvstr = "";
	uint32_t lv32 = 0;
	uint32_t rv32 = 0;
	uint64_t lv64 = 0;
	uint64_t rv64 = 0;
	zfs_prop_t prop = sortcol->sc_prop;
	const char *propname = NULL;
	boolean_t reverse = sortcol->sc_reverse;

	switch (prop) {
	case ZFS_PROP_TYPE:
	propname = "type";
	(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
	(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
	if (rv32 != lv32)
	rc = (rv32 < lv32) ? 1 : -1;
	break;
	case ZFS_PROP_NAME:
	propname = "name";
	if (numname) {
	(void) nvlist_lookup_uint64(lnvl, propname,
	&lv64);
	(void) nvlist_lookup_uint64(rnvl, propname,
	&rv64);
	if (rv64 != lv64)
	rc = (rv64 < lv64) ? 1 : -1;
	} else {
	(void) nvlist_lookup_string(lnvl, propname,
	&lvstr);
	(void) nvlist_lookup_string(rnvl, propname,
	&rvstr);
	rc = strcmp(lvstr, rvstr);
	}
	break;
	case ZFS_PROP_USED:
	case ZFS_PROP_QUOTA:
	if (!us_populated)
	break;
	if (prop == ZFS_PROP_USED)
	propname = "used";
	else
	propname = "quota";
	(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
	(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
	if (rv64 != lv64)
	rc = (rv64 < lv64) ? 1 : -1;
	break;

	default:
	break;
	}

	if (rc != 0) {
	if (rc < 0)
	return (reverse ? 1 : -1);
	else
	return (reverse ? -1 : 1);
	}
	}

	/*
	* If entries still seem to be the same, check if they are of the same
	* type (smbentity is added only if we are doing SID to POSIX ID
	* translation where we can have duplicate type/name combinations).
	*/
	if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
	nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
	lvb != rvb)
	return (lvb < rvb ? -1 : 1);

	return (0);
	}

	/*
	 * Translate a single USTYPE_* value into its display label.  Anything
	 * that is not exactly one of the four known values yields "Undefined".
	 */
	static inline const char *
	us_type2str(unsigned field_type)
	{
		if (field_type == USTYPE_PSX_USR)
			return ("POSIX User");
		if (field_type == USTYPE_PSX_GRP)
			return ("POSIX Group");
		if (field_type == USTYPE_SMB_USR)
			return ("SMB User");
		if (field_type == USTYPE_SMB_GRP)
			return ("SMB Group");

		return ("Undefined");
	}

	static int
	userspace_cb(void arg, const char domain, uid_t rid, uint64_t space)
	{
	us_cbdata_t cb = (us_cbdata_t )arg;
	zfs_userquota_prop_t prop = cb->cb_prop;
	char *name = NULL;
	char *propname;
	char sizebuf[32];
	us_node_t *node;
	uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
	uu_avl_t *avl = cb->cb_avl;
	uu_avl_index_t idx;
	nvlist_t *props;
	us_node_t *n;
	zfs_sort_column_t *sortcol = cb->cb_sortcol;
	unsigned type = 0;
	const char *typestr;
	size_t namelen;
	size_t typelen;
	size_t sizelen;
	int typeidx, nameidx, sizeidx;
	us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
	boolean_t smbentity = B_FALSE;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
	nomem();
	node = safe_malloc(sizeof (us_node_t));
	uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
	node->usn_nvl = props;

	if (domain != NULL && domain[0] != '\0') {
	/* SMB */
	char sid[MAXNAMELEN + 32];
	uid_t id;
	#ifdef illumos
	int err;
	int flag = IDMAP_REQ_FLG_USE_CACHE;
	#endif

	smbentity = B_TRUE;

	(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);

	if (prop == ZFS_PROP_GROUPUSED \|\| prop == ZFS_PROP_GROUPQUOTA) {
	type = USTYPE_SMB_GRP;
	#ifdef illumos
	err = sid_to_id(sid, B_FALSE, &id);
	#endif
	} else {
	type = USTYPE_SMB_USR;
	#ifdef illumos
	err = sid_to_id(sid, B_TRUE, &id);
	#endif
	}

	#ifdef illumos
	if (err == 0) {
	rid = id;
	if (!cb->cb_sid2posix) {
	if (type == USTYPE_SMB_USR) {
	(void) idmap_getwinnamebyuid(rid, flag,
	&name, NULL);
	} else {
	(void) idmap_getwinnamebygid(rid, flag,
	&name, NULL);
	}
	if (name == NULL)
	name = sid;
	}
	}
	#endif
	}

	if (cb->cb_sid2posix \|\| domain == NULL \|\| domain[0] == '\0') {
	/* POSIX or -i */
	if (prop == ZFS_PROP_GROUPUSED \|\| prop == ZFS_PROP_GROUPQUOTA) {
	type = USTYPE_PSX_GRP;
	if (!cb->cb_numname) {
	struct group *g;

	if ((g = getgrgid(rid)) != NULL)
	name = g->gr_name;
	}
	} else {
	type = USTYPE_PSX_USR;
	if (!cb->cb_numname) {
	struct passwd *p;

	if ((p = getpwuid(rid)) != NULL)
	name = p->pw_name;
	}
	}
	}

	/*
	* Make sure that the type/name combination is unique when doing
	* SID to POSIX ID translation (hence changing the type from SMB to
	* POSIX).
	*/
	if (cb->cb_sid2posix &&
	nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
	nomem();

	/* Calculate/update width of TYPE field */
	typestr = us_type2str(type);
	typelen = strlen(gettext(typestr));
	typeidx = us_field_index("type");
	if (typelen > cb->cb_width[typeidx])
	cb->cb_width[typeidx] = typelen;
	if (nvlist_add_uint32(props, "type", type) != 0)
	nomem();

	/* Calculate/update width of NAME field */
	if ((cb->cb_numname && cb->cb_sid2posix) \|\| name == NULL) {
	if (nvlist_add_uint64(props, "name", rid) != 0)
	nomem();
	namelen = snprintf(NULL, 0, "%u", rid);
	} else {
	if (nvlist_add_string(props, "name", name) != 0)
	nomem();
	namelen = strlen(name);
	}
	nameidx = us_field_index("name");
	if (namelen > cb->cb_width[nameidx])
	cb->cb_width[nameidx] = namelen;

	/*
	* Check if this type/name combination is in the list and update it;
	* otherwise add new node to the list.
	*/
	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
	uu_avl_insert(avl, node, idx);
	} else {
	nvlist_free(props);
	free(node);
	node = n;
	props = node->usn_nvl;
	}

	/* Calculate/update width of USED/QUOTA fields */
	if (cb->cb_nicenum)
	zfs_nicenum(space, sizebuf, sizeof (sizebuf));
	else
	(void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space);
	sizelen = strlen(sizebuf);
	if (prop == ZFS_PROP_USERUSED \|\| prop == ZFS_PROP_GROUPUSED) {
	propname = "used";
	if (!nvlist_exists(props, "quota"))
	(void) nvlist_add_uint64(props, "quota", 0);
	} else {
	propname = "quota";
	if (!nvlist_exists(props, "used"))
	(void) nvlist_add_uint64(props, "used", 0);
	}
	sizeidx = us_field_index(propname);
	if (sizelen > cb->cb_width[sizeidx])
	cb->cb_width[sizeidx] = sizelen;

	if (nvlist_add_uint64(props, propname, space) != 0)
	nomem();

	return (0);
	}

	/*
	 * Print one row of 'zfs userspace' output.
	 *
	 * scripted	separate columns with tabs and do not pad
	 * parsable	print sizes as exact numbers instead of zfs_nicenum()
	 * fields	column indices to print, terminated by USFIELD_LAST
	 * types	USTYPE_* mask; rows of any other type are skipped
	 * width	per-column widths, indexed by enum us_field_types
	 * node		the row to print
	 *
	 * A column whose value is missing or has no printable form is shown
	 * as "-".
	 */
	static void
	print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
	    size_t *width, us_node_t *node)
	{
		nvlist_t *nvl = node->usn_nvl;
		char valstr[MAXNAMELEN];
		boolean_t first = B_TRUE;
		int cfield = 0;
		int field;
		uint32_t ustype = 0;

		/* Check type */
		(void) nvlist_lookup_uint32(nvl, "type", &ustype);
		if (!(ustype & types))
			return;

		while ((field = fields[cfield]) != USFIELD_LAST) {
			nvpair_t *nvp = NULL;
			uint32_t val32 = 0;
			uint64_t val64 = 0;
			char *strval = NULL;

			/* Find the pair that holds this column's value. */
			while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
				if (strcmp(nvpair_name(nvp),
				    us_field_names[field]) == 0)
					break;
			}

			/* nvp is NULL when the row has no value for this column. */
			if (nvp != NULL) {
				data_type_t type = nvpair_type(nvp);

				switch (type) {
				case DATA_TYPE_UINT32:
					(void) nvpair_value_uint32(nvp, &val32);
					break;
				case DATA_TYPE_UINT64:
					(void) nvpair_value_uint64(nvp, &val64);
					break;
				case DATA_TYPE_STRING:
					(void) nvpair_value_string(nvp, &strval);
					break;
				default:
					(void) fprintf(stderr, "invalid data type\n");
				}

				switch (field) {
				case USFIELD_TYPE:
					strval = (char *)us_type2str(val32);
					break;
				case USFIELD_NAME:
					if (type == DATA_TYPE_UINT64) {
						(void) snprintf(valstr,
						    sizeof (valstr), "%llu", val64);
						strval = valstr;
					}
					break;
				case USFIELD_USED:
				case USFIELD_QUOTA:
					if (type == DATA_TYPE_UINT64) {
						if (parsable) {
							(void) snprintf(valstr,
							    sizeof (valstr), "%llu",
							    val64);
						} else {
							zfs_nicenum(val64, valstr,
							    sizeof (valstr));
						}
						if (field == USFIELD_QUOTA &&
						    strcmp(valstr, "0") == 0)
							strval = "none";
						else
							strval = valstr;
					}
					break;
				default:
					break;
				}
			}

			/* Never hand a NULL pointer to "%s". */
			if (strval == NULL)
				strval = "-";

			if (!first) {
				if (scripted)
					(void) printf("\t");
				else
					(void) printf(" ");
			}
			/* The '*' width must be an int, not a size_t. */
			if (scripted)
				(void) printf("%s", strval);
			else if (field == USFIELD_TYPE || field == USFIELD_NAME)
				(void) printf("%-*s", (int)width[field], strval);
			else
				(void) printf("%*s", (int)width[field], strval);

			first = B_FALSE;
			cfield++;
		}

		(void) printf("\n");
	}

	/*
	 * Print the 'zfs userspace' table: a header line unless scripted, then
	 * one row per node in avl via print_us_node().
	 *
	 * fields	column indices to print, terminated by USFIELD_LAST
	 * types	USTYPE_* mask passed through to print_us_node()
	 * width	per-column widths, indexed by enum us_field_types
	 * rmnode	when set, each node's nvlist is freed after printing
	 */
	static void
	print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
	    size_t *width, boolean_t rmnode, uu_avl_t *avl)
	{
		us_node_t *node;
		const char *col;
		int cfield = 0;
		int field;

		if (!scripted) {
			boolean_t first = B_TRUE;

			while ((field = fields[cfield]) != USFIELD_LAST) {
				col = gettext(us_field_hdr[field]);
				/*
				 * TYPE and NAME are left-aligned, sizes are
				 * right-aligned.  The '*' takes its width from
				 * the argument list and needs an int.
				 */
				if (field == USFIELD_TYPE || field == USFIELD_NAME) {
					(void) printf(first ? "%-*s" : " %-*s",
					    (int)width[field], col);
				} else {
					(void) printf(first ? "%*s" : " %*s",
					    (int)width[field], col);
				}
				first = B_FALSE;
				cfield++;
			}
			(void) printf("\n");
		}

		for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
			print_us_node(scripted, parsable, fields, types, width, node);
			if (rmnode)
				nvlist_free(node->usn_nvl);
		}
	}

	/*
	 * Implements the "userspace" and "groupspace" subcommands (argv[0] selects
	 * which set of types is shown by default).
	 *
	 * Options: -n numeric names, -H scripted output, -p parsable numbers,
	 * -o field list, -s/-S sort columns, -t type list, -i sets sid2posix.
	 * Exactly one dataset name must follow the options.
	 *
	 * Returns 0 on success, 1 if the dataset cannot be opened, -1 for a bad
	 * -o/-t value, or the non-zero result of zfs_userspace().
	 */
	static int
	zfs_do_userspace(int argc, char **argv)
	{
		zfs_handle_t *zhp;
		zfs_userquota_prop_t p;

		uu_avl_pool_t *avl_pool;
		uu_avl_t *avl_tree;
		uu_avl_walk_t *walk;
		char *delim;
		char deffields[] = "type,name,used,quota";
		char *ofield = NULL;
		char *tfield = NULL;
		int cfield = 0;
		int fields[256];
		/* One slot is reserved for the USFIELD_LAST terminator. */
		const int maxfields = (int)(sizeof (fields) / sizeof (fields[0])) - 1;
		int i;
		boolean_t scripted = B_FALSE;
		boolean_t prtnum = B_FALSE;
		boolean_t parsable = B_FALSE;
		boolean_t sid2posix = B_FALSE;
		int ret = 0;
		int c;
		zfs_sort_column_t *sortcol = NULL;
		int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
		us_cbdata_t cb;
		us_node_t *node;
		us_node_t *rmnode;
		uu_list_pool_t *listpool;
		uu_list_t *list;
		uu_avl_index_t idx = 0;
		uu_list_index_t idx2 = 0;

		if (argc < 2)
			usage(B_FALSE);

		if (strcmp(argv[0], "groupspace") == 0)
			/* Toggle default group types */
			types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;

		while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
			switch (c) {
			case 'n':
				prtnum = B_TRUE;
				break;
			case 'H':
				scripted = B_TRUE;
				break;
			case 'p':
				parsable = B_TRUE;
				break;
			case 'o':
				ofield = optarg;
				break;
			case 's':
			case 'S':
				if (zfs_add_sort_column(&sortcol, optarg,
				    c == 's' ? B_FALSE : B_TRUE) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid field '%s'\n"), optarg);
					usage(B_FALSE);
				}
				break;
			case 't':
				tfield = optarg;
				break;
			case 'i':
				sid2posix = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing dataset name\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		/* Use default output fields if not specified using -o */
		if (ofield == NULL)
			ofield = deffields;
		do {
			if ((delim = strchr(ofield, ',')) != NULL)
				*delim = '\0';
			/* Refuse a field list that would overrun fields[]. */
			if (cfield >= maxfields) {
				(void) fprintf(stderr, gettext("too many fields "
				    "for -o option\n"));
				return (-1);
			}
			if ((fields[cfield++] = us_field_index(ofield)) == -1) {
				(void) fprintf(stderr, gettext("invalid type '%s' "
				    "for -o option\n"), ofield);
				return (-1);
			}
			if (delim != NULL)
				ofield = delim + 1;
		} while (delim != NULL);
		fields[cfield] = USFIELD_LAST;

		/* Override output types (-t option) */
		if (tfield != NULL) {
			types = 0;

			do {
				boolean_t found = B_FALSE;

				if ((delim = strchr(tfield, ',')) != NULL)
					*delim = '\0';
				for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
				    i++) {
					if (strcmp(tfield, us_type_names[i]) == 0) {
						found = B_TRUE;
						types |= us_type_bits[i];
						break;
					}
				}
				if (!found) {
					(void) fprintf(stderr, gettext("invalid type "
					    "'%s' for -t option\n"), tfield);
					return (-1);
				}
				if (delim != NULL)
					tfield = delim + 1;
			} while (delim != NULL);
		}

		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
			return (1);

		if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
		    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
			nomem();
		if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
			nomem();

		/* Always add default sorting columns */
		(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
		(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);

		cb.cb_sortcol = sortcol;
		cb.cb_numname = prtnum;
		cb.cb_nicenum = !parsable;
		cb.cb_avl_pool = avl_pool;
		cb.cb_avl = avl_tree;
		cb.cb_sid2posix = sid2posix;

		/* Start each column at the width of its (translated) header. */
		for (i = 0; i < USFIELD_LAST; i++)
			cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));

		for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
			/* Skip properties whose user/group class was not requested. */
			if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
			    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
			    ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
			    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
				continue;
			cb.cb_prop = p;
			if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) {
				zfs_close(zhp);
				return (ret);
			}
		}

		/* The dataset handle is not needed past this point. */
		zfs_close(zhp);

		/* Sort the list */
		if ((node = uu_avl_first(avl_tree)) == NULL)
			return (0);

		us_populated = B_TRUE;

		listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
		    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
		list = uu_list_create(listpool, NULL, UU_DEFAULT);
		uu_list_node_init(node, &node->usn_listnode, listpool);

		/* Move every node out of the tree into a temporary list ... */
		while (node != NULL) {
			rmnode = node;
			node = uu_avl_next(avl_tree, node);
			uu_avl_remove(avl_tree, rmnode);
			if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
				uu_list_insert(list, rmnode, idx2);
		}

		/* ... and re-insert them using the requested sort columns. */
		for (node = uu_list_first(list); node != NULL;
		    node = uu_list_next(list, node)) {
			us_sort_info_t sortinfo = { sortcol, cb.cb_numname };

			if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
				uu_avl_insert(avl_tree, node, idx);
		}

		uu_list_destroy(list);
		uu_list_pool_destroy(listpool);

		/* Print and free node nvlist memory */
		print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
		    cb.cb_avl);

		zfs_free_sort_columns(sortcol);

		/* Clean up the AVL tree */
		if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
			nomem();

		while ((node = uu_avl_walk_next(walk)) != NULL) {
			uu_avl_remove(cb.cb_avl, node);
			free(node);
		}

		uu_avl_walk_end(walk);
		uu_avl_destroy(avl_tree);
		uu_avl_pool_destroy(avl_pool);

		return (ret);
	}

	/*
	* list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ...
	* [-t type[,...]] [filesystem|volume|snapshot] ...
	*
	* -H Scripted mode; elide headers and separate columns by tabs.
	* -p Display values in parsable (literal) format.
	* -r Recurse over all children.
	* -d Limit recursion by depth.
	* -o Control which fields to display.
	* -s Specify sort columns, ascending order.
	* -S Specify sort columns, descending order.
	* -t Control which object types to display.
	*
	* When given no arguments, list all filesystems in the system.
	* Otherwise, list the specified datasets, optionally recursing down them if
	* '-r' is specified.
	*/
	/* State shared between zfs_do_list() and list_callback(). */
	typedef struct list_cbdata {
	/* B_TRUE until the first dataset has been seen (header not yet printed) */
	boolean_t cb_first;
	/* set by -p; passed as the literal flag to the property getters */
	boolean_t cb_literal;
	/* set by -H; suppresses the header and separates columns with tabs */
	boolean_t cb_scripted;
	/* columns to print, as filled in by zprop_get_list() */
	zprop_list_t *cb_proplist;
	} list_cbdata_t;

	/*
	 * Given a list of columns to display, output appropriate headers for each one.
	 *
	 * Native properties use their column name and alignment; any other entry
	 * is treated as a user property and its name is printed in upper case,
	 * left-justified.  The last column is not padded when left-justified.
	 */
	static void
	print_header(list_cbdata_t *cb)
	{
		zprop_list_t *pl = cb->cb_proplist;
		char headerbuf[ZFS_MAXPROPLEN];
		const char *header;
		size_t i;
		boolean_t first = B_TRUE;
		boolean_t right_justify;

		for (; pl != NULL; pl = pl->pl_next) {
			if (!first) {
				(void) printf(" ");
			} else {
				first = B_FALSE;
			}

			right_justify = B_FALSE;
			if (pl->pl_prop != ZPROP_INVAL) {
				header = zfs_prop_column_name(pl->pl_prop);
				right_justify = zfs_prop_align_right(pl->pl_prop);
			} else {
				/*
				 * Bound the copy by the buffer size, and hand
				 * toupper() an unsigned char value as <ctype.h>
				 * requires.
				 */
				for (i = 0; pl->pl_user_prop[i] != '\0' &&
				    i < sizeof (headerbuf) - 1; i++)
					headerbuf[i] = (char)toupper(
					    (unsigned char)pl->pl_user_prop[i]);
				headerbuf[i] = '\0';
				header = headerbuf;
			}

			/* The '*' width consumes an int. */
			if (pl->pl_next == NULL && !right_justify)
				(void) printf("%s", header);
			else if (right_justify)
				(void) printf("%*s", (int)pl->pl_width, header);
			else
				(void) printf("%-*s", (int)pl->pl_width, header);
		}

		(void) printf("\n");
	}

	/*
	 * Given a dataset and a list of fields, print out all the properties according
	 * to the described layout.
	 *
	 * Each column in cb->cb_proplist is resolved in order as: the dataset
	 * name, a native property, a userquota-style property, a written@
	 * property, or a user property looked up in the dataset's user-property
	 * nvlist.  A value that cannot be obtained is printed as "-".
	 */
	static void
	print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
	{
		zprop_list_t *pl = cb->cb_proplist;
		boolean_t first = B_TRUE;
		char property[ZFS_MAXPROPLEN];
		nvlist_t *userprops = zfs_get_user_props(zhp);
		nvlist_t *propval;
		char *propstr;
		boolean_t right_justify;

		for (; pl != NULL; pl = pl->pl_next) {
			if (!first) {
				if (cb->cb_scripted)
					(void) printf("\t");
				else
					(void) printf(" ");
			} else {
				first = B_FALSE;
			}

			if (pl->pl_prop == ZFS_PROP_NAME) {
				(void) strlcpy(property, zfs_get_name(zhp),
				    sizeof (property));
				propstr = property;
				right_justify = zfs_prop_align_right(pl->pl_prop);
			} else if (pl->pl_prop != ZPROP_INVAL) {
				if (zfs_prop_get(zhp, pl->pl_prop, property,
				    sizeof (property), NULL, NULL, 0,
				    cb->cb_literal) != 0)
					propstr = "-";
				else
					propstr = property;
				right_justify = zfs_prop_align_right(pl->pl_prop);
			} else if (zfs_prop_userquota(pl->pl_user_prop)) {
				if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
				    property, sizeof (property), cb->cb_literal) != 0)
					propstr = "-";
				else
					propstr = property;
				right_justify = B_TRUE;
			} else if (zfs_prop_written(pl->pl_user_prop)) {
				if (zfs_prop_get_written(zhp, pl->pl_user_prop,
				    property, sizeof (property), cb->cb_literal) != 0)
					propstr = "-";
				else
					propstr = property;
				right_justify = B_TRUE;
			} else {
				if (nvlist_lookup_nvlist(userprops,
				    pl->pl_user_prop, &propval) != 0)
					propstr = "-";
				else
					verify(nvlist_lookup_string(propval,
					    ZPROP_VALUE, &propstr) == 0);
				right_justify = B_FALSE;
			}

			/*
			 * If this is being called in scripted mode, or if this is the
			 * last column and it is left-justified, don't include a width
			 * format specifier.
			 */
			if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
				(void) printf("%s", propstr);
			else if (right_justify)
				(void) printf("%*s", (int)pl->pl_width, propstr);
			else
				(void) printf("%-*s", (int)pl->pl_width, propstr);
		}

		(void) printf("\n");
	}

	/*
	 * Generic callback function to list a dataset or snapshot.
	 *
	 * 'data' is the list_cbdata_t set up by zfs_do_list().  The header is
	 * printed once, before the first dataset, unless scripted mode is on.
	 * Always returns 0.
	 */
	static int
	list_callback(zfs_handle_t *zhp, void *data)
	{
		list_cbdata_t *cbp = data;

		if (cbp->cb_first) {
			if (!cbp->cb_scripted)
				print_header(cbp);
			cbp->cb_first = B_FALSE;
		}

		print_dataset(zhp, cbp);

		return (0);
	}

	/*
	 * Implements 'zfs list': parse the options, build the column list and
	 * iterate over the requested datasets with list_callback().
	 *
	 * Returns the result of zfs_for_each().  When nothing was listed and
	 * scripted mode is off, "no datasets available" is printed.
	 */
	static int
	zfs_do_list(int argc, char **argv)
	{
		int c;
		static char default_fields[] =
		    "name,used,available,referenced,mountpoint";
		int types = ZFS_TYPE_DATASET;
		boolean_t types_specified = B_FALSE;
		char *fields = NULL;
		list_cbdata_t cb = { 0 };
		char *value;
		int limit = 0;
		int ret = 0;
		zfs_sort_column_t *sortcol = NULL;
		int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;

		/* check options */
		while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
			switch (c) {
			case 'o':
				fields = optarg;
				break;
			case 'p':
				cb.cb_literal = B_TRUE;
				flags |= ZFS_ITER_LITERAL_PROPS;
				break;
			case 'd':
				limit = parse_depth(optarg, &flags);
				break;
			case 'r':
				flags |= ZFS_ITER_RECURSE;
				break;
			case 'H':
				cb.cb_scripted = B_TRUE;
				break;
			case 's':
				if (zfs_add_sort_column(&sortcol, optarg,
				    B_FALSE) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid property '%s'\n"), optarg);
					usage(B_FALSE);
				}
				break;
			case 'S':
				if (zfs_add_sort_column(&sortcol, optarg,
				    B_TRUE) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid property '%s'\n"), optarg);
					usage(B_FALSE);
				}
				break;
			case 't':
				/* An explicit type list replaces the defaults. */
				types = 0;
				types_specified = B_TRUE;
				flags &= ~ZFS_ITER_PROP_LISTSNAPS;
				while (*optarg != '\0') {
					static char *type_subopts[] = { "filesystem",
					    "volume", "snapshot", "snap", "bookmark",
					    "all", NULL };

					switch (getsubopt(&optarg, type_subopts,
					    &value)) {
					case 0:
						types |= ZFS_TYPE_FILESYSTEM;
						break;
					case 1:
						types |= ZFS_TYPE_VOLUME;
						break;
					case 2:
					case 3:
						types |= ZFS_TYPE_SNAPSHOT;
						break;
					case 4:
						types |= ZFS_TYPE_BOOKMARK;
						break;
					case 5:
						types = ZFS_TYPE_DATASET |
						    ZFS_TYPE_BOOKMARK;
						break;
					default:
						(void) fprintf(stderr,
						    gettext("invalid type '%s'\n"),
						    suboptarg);
						usage(B_FALSE);
					}
				}
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (fields == NULL)
			fields = default_fields;

		/*
		 * If we are only going to list snapshot names and sort by name,
		 * then we can use faster version.
		 */
		if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
			flags |= ZFS_ITER_SIMPLE;

		/*
		 * If "-o space" and no types were specified, don't display snapshots.
		 */
		if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
			types &= ~ZFS_TYPE_SNAPSHOT;

		/*
		 * If the user specifies '-o all', the zprop_get_list() doesn't
		 * normally include the name of the dataset.  For 'zfs list', we always
		 * want this property to be first.
		 */
		if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
		    != 0)
			usage(B_FALSE);

		cb.cb_first = B_TRUE;

		ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
		    limit, list_callback, &cb);

		zprop_free_list(cb.cb_proplist);
		zfs_free_sort_columns(sortcol);

		/* cb_first is still set only if the callback never ran. */
		if (ret == 0 && cb.cb_first && !cb.cb_scripted)
			(void) printf(gettext("no datasets available\n"));

		return (ret);
	}

	/*
	* zfs rename [-f] <fs \| snap \| vol> <fs \| snap \| vol>
	* zfs rename [-f] -p <fs \| vol> <fs \| vol>
	* zfs rename -r <snap> <snap>
	* zfs rename -u [-p] <fs> <fs>
	*
	* Renames the given dataset to another of the same type.
	*
	* The '-p' flag creates all the non-existing ancestors of the target first.
	*/
	/* ARGSUSED */
	static int
	zfs_do_rename(int argc, char **argv)
	{
	zfs_handle_t *zhp;
	renameflags_t flags = { 0 };
	int c;
	int ret = 0;
	int types;
	boolean_t parents = B_FALSE;
	char *snapshot = NULL;

	/* check options */
	while ((c = getopt(argc, argv, "fpru")) != -1) {
	switch (c) {
	case 'p':
	parents = B_TRUE;
	break;
	case 'r':
	flags.recurse = B_TRUE;
	break;
	case 'u':
	flags.nounmount = B_TRUE;
	break;
	case 'f':
	flags.forceunmount = B_TRUE;
	break;
	case '?':
	default:
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	/* check number of arguments */
	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing source dataset "
	"argument\n"));
	usage(B_FALSE);
	}
	if (argc < 2) {
	(void) fprintf(stderr, gettext("missing target dataset "
	"argument\n"));
	usage(B_FALSE);
	}
	if (argc > 2) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	if (flags.recurse && parents) {
	(void) fprintf(stderr, gettext("-p and -r options are mutually "
	"exclusive\n"));
	usage(B_FALSE);
	}

	if (flags.recurse && strchr(argv[0], '@') == 0) {
	(void) fprintf(stderr, gettext("source dataset for recursive "
	"rename must be a snapshot\n"));
	usage(B_FALSE);
	}

	if (flags.nounmount && parents) {
	(void) fprintf(stderr, gettext("-u and -p options are mutually "
	"exclusive\n"));
	usage(B_FALSE);
	}

	if (flags.nounmount)
	types = ZFS_TYPE_FILESYSTEM;
	else if (parents)
	types = ZFS_TYPE_FILESYSTEM \| ZFS_TYPE_VOLUME;
	else
	types = ZFS_TYPE_DATASET;

	if (flags.recurse) {
	/*
	* When we do recursive rename we are fine when the given
	* snapshot for the given dataset doesn't exist - it can
	* still exists below.
	*/

	snapshot = strchr(argv[0], '@');
	assert(snapshot != NULL);
	*snapshot = '\0';
	snapshot++;
	}

	if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
	return (1);

	/* If we were asked and the name looks good, try to create ancestors. */
	if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
	zfs_create_ancestors(g_zfs, argv[1]) != 0) {
	zfs_close(zhp);
	return (1);
	}

	ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0);

	zfs_close(zhp);
	return (ret);
	}

	/*
	 * zfs promote <fs>
	 *
	 * Promotes the given clone fs to be the parent
	 *
	 * Takes no options and exactly one filesystem or volume name.
	 * Returns 0 on success and 1 on failure.
	 */
	/* ARGSUSED */
	static int
	zfs_do_promote(int argc, char **argv)
	{
		zfs_handle_t *zhp;
		int ret = 0;

		/* check options */
		if (argc > 1 && argv[1][0] == '-') {
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    argv[1][1]);
			usage(B_FALSE);
		}

		/* check number of arguments */
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing clone filesystem"
			    " argument\n"));
			usage(B_FALSE);
		}
		if (argc > 2) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
		if (zhp == NULL)
			return (1);

		ret = (zfs_promote(zhp) != 0);

		zfs_close(zhp);
		return (ret);
	}

	/*
	* zfs rollback [-rRf] <snapshot>
	*
	* -r Delete any intervening snapshots before doing rollback
	* -R Delete any snapshots and their clones
	* -f ignored for backwards compatibility
	*
	* Given a filesystem, rollback to a specific snapshot, discarding any changes
	* since then and making it the active dataset. If more recent snapshots exist,
	* the command will complain unless the '-r' flag is given.
	*/
	/* State shared between zfs_do_rollback() and its rollback_check*() callbacks. */
	typedef struct rollback_cbdata {
	/* ZFS_PROP_CREATETXG of the snapshot being rolled back to */
	uint64_t cb_create;
	/* true until the explanatory error header has been printed */
	boolean_t cb_first;
	/* set by -R; rollback_check() then reports nothing */
	int cb_doclones;
	/* snapshot name from the command line, used in messages */
	char *cb_target;
	/* set to 1 once a blocking snapshot, bookmark or clone is reported */
	int cb_error;
	/* set by -r and -R */
	boolean_t cb_recurse;
	} rollback_cbdata_t;

	/*
	 * zfs_iter_dependents() callback used by rollback_check(): print the name
	 * of each dependent to stderr.  The first time through (when '-r' was
	 * given) an explanatory header is printed and cb_error is set.  Closes
	 * 'zhp' and always returns 0.
	 */
	static int
	rollback_check_dependent(zfs_handle_t *zhp, void *data)
	{
		rollback_cbdata_t *cbp = data;

		if (cbp->cb_first && cbp->cb_recurse) {
			(void) fprintf(stderr, gettext("cannot rollback to "
			    "'%s': clones of previous snapshots exist\n"),
			    cbp->cb_target);
			(void) fprintf(stderr, gettext("use '-R' to "
			    "force deletion of the following clones and "
			    "dependents:\n"));
			cbp->cb_first = 0;
			cbp->cb_error = 1;
		}

		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));

		zfs_close(zhp);
		return (0);
	}

	/*
	 * Report any snapshots or bookmarks more recent than the one specified.
	 *
	 * With '-R' (cb_doclones) nothing is checked.  Otherwise, for every
	 * entry created after the target: without '-r' its name is printed as a
	 * blocker (preceded once by an explanatory header, and cb_error is set);
	 * with '-r' its dependents are reported through
	 * rollback_check_dependent().
	 *
	 * Closes 'zhp'.  Returns 0, or -1 if iterating over dependents fails.
	 */
	static int
	rollback_check(zfs_handle_t *zhp, void *data)
	{
		rollback_cbdata_t *cbp = data;

		if (cbp->cb_doclones) {
			zfs_close(zhp);
			return (0);
		}

		if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
			if (cbp->cb_first && !cbp->cb_recurse) {
				(void) fprintf(stderr, gettext("cannot "
				    "rollback to '%s': more recent snapshots "
				    "or bookmarks exist\n"),
				    cbp->cb_target);
				(void) fprintf(stderr, gettext("use '-r' to "
				    "force deletion of the following "
				    "snapshots and bookmarks:\n"));
				cbp->cb_first = 0;
				cbp->cb_error = 1;
			}

			if (cbp->cb_recurse) {
				if (zfs_iter_dependents(zhp, B_TRUE,
				    rollback_check_dependent, cbp) != 0) {
					zfs_close(zhp);
					return (-1);
				}
			} else {
				(void) fprintf(stderr, "%s\n",
				    zfs_get_name(zhp));
			}
		}
		zfs_close(zhp);
		return (0);
	}

	/*
	 * Implements 'zfs rollback [-rRf] <snapshot>'.
	 *
	 * Opens the snapshot and its parent dataset, runs rollback_check() over
	 * the parent's snapshots and bookmarks to detect blockers, and then calls
	 * zfs_rollback().  Returns 0 on success and 1 on any failure.
	 */
	static int
	zfs_do_rollback(int argc, char **argv)
	{
		int ret = 0;
		int c;
		boolean_t force = B_FALSE;
		rollback_cbdata_t cb = { 0 };
		zfs_handle_t *zhp, *snap;
		char parentname[ZFS_MAX_DATASET_NAME_LEN];
		char *delim;

		/* check options */
		while ((c = getopt(argc, argv, "rRf")) != -1) {
			switch (c) {
			case 'r':
				cb.cb_recurse = 1;
				break;
			case 'R':
				cb.cb_recurse = 1;
				cb.cb_doclones = 1;
				break;
			case 'f':
				force = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing dataset argument\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		/* open the snapshot */
		if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
			return (1);

		/* open the parent dataset */
		(void) strlcpy(parentname, argv[0], sizeof (parentname));
		verify((delim = strrchr(parentname, '@')) != NULL);
		*delim = '\0';
		if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
			zfs_close(snap);
			return (1);
		}

		/*
		 * Check for more recent snapshots and/or clones based on the presence
		 * of '-r' and '-R'.
		 */
		cb.cb_target = argv[0];
		cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
		cb.cb_first = B_TRUE;
		cb.cb_error = 0;
		if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0)
			goto out;
		if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
			goto out;

		/* A callback reported a blocker; do not roll back. */
		if ((ret = cb.cb_error) != 0)
			goto out;

		/*
		 * Rollback parent to the given snapshot.
		 */
		ret = zfs_rollback(zhp, snap, force);

	out:
		zfs_close(snap);
		zfs_close(zhp);

		if (ret == 0)
			return (0);
		else
			return (1);
	}

	/*
	* zfs set property=value ... { fs \| snap \| vol } ...
	*
	* Sets the given properties for all datasets specified on the command line.
	*/

	/*
	 * zfs_for_each() callback for 'zfs set': apply the property nvlist passed
	 * in 'data' to 'zhp'.  On failure, a hint is printed when the error was a
	 * remount or reshare failure.  Returns 0 on success and 1 on failure.
	 */
	static int
	set_callback(zfs_handle_t *zhp, void *data)
	{
		nvlist_t *props = data;

		if (zfs_prop_set_list(zhp, props) != 0) {
			switch (libzfs_errno(g_zfs)) {
			case EZFS_MOUNTFAILED:
				(void) fprintf(stderr, gettext("property may be set "
				    "but unable to remount filesystem\n"));
				break;
			case EZFS_SHARENFSFAILED:
				(void) fprintf(stderr, gettext("property may be set "
				    "but unable to reshare filesystem\n"));
				break;
			}
			return (1);
		}
		return (0);
	}

	/*
	 * Implements 'zfs set property=value ... dataset ...'.
	 *
	 * All property=value arguments must precede the dataset arguments.  The
	 * settings are collected into one nvlist and applied to every dataset
	 * with set_callback().  Returns the parseprop() error if a setting cannot
	 * be parsed, otherwise the result of zfs_for_each().
	 */
	static int
	zfs_do_set(int argc, char **argv)
	{
		nvlist_t *props = NULL;
		int ds_start = -1; /* argv idx of first dataset arg */
		int ret = 0;

		/* check for options */
		if (argc > 1 && argv[1][0] == '-') {
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    argv[1][1]);
			usage(B_FALSE);
		}

		/* check number of arguments */
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing arguments\n"));
			usage(B_FALSE);
		}
		if (argc < 3) {
			if (strchr(argv[1], '=') == NULL) {
				(void) fprintf(stderr, gettext("missing property=value "
				    "argument(s)\n"));
			} else {
				(void) fprintf(stderr, gettext("missing dataset "
				    "name(s)\n"));
			}
			usage(B_FALSE);
		}

		/* validate argument order: prop=val args followed by dataset args */
		for (int i = 1; i < argc; i++) {
			if (strchr(argv[i], '=') != NULL) {
				if (ds_start > 0) {
					/* out-of-order prop=val argument */
					(void) fprintf(stderr, gettext("invalid "
					    "argument order\n"));
					usage(B_FALSE);
				}
			} else if (ds_start < 0) {
				ds_start = i;
			}
		}
		if (ds_start < 0) {
			(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
			usage(B_FALSE);
		}

		/* Populate a list of property settings */
		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
			nomem();
		for (int i = 1; i < ds_start; i++) {
			if ((ret = parseprop(props, argv[i])) != 0)
				goto error;
		}

		ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
		    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);

	error:
		nvlist_free(props);
		return (ret);
	}

	/* State passed from zfs_do_snapshot() to zfs_snapshot_cb(). */
	typedef struct snap_cbdata {
	/* accumulates the full "dataset@snap" names to be created */
	nvlist_t *sd_nvl;
	/* set by -r; descend into child filesystems */
	boolean_t sd_recursive;
	/* the part of the current argument after '@' */
	const char *sd_snapname;
	} snap_cbdata_t;

	/*
	 * Add "<name of zhp>@<sd_snapname>" to the snapshot list in 'arg' (a
	 * snap_cbdata_t) and, in recursive mode, do the same for each child
	 * filesystem.  In recursive mode a dataset whose ZFS_PROP_INCONSISTENT is
	 * non-zero is skipped.  Closes 'zhp'.  Returns 0, or the result of
	 * zfs_iter_filesystems() when recursing.
	 */
	static int
	zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
	{
		snap_cbdata_t *sd = arg;
		char *name;
		int rv = 0;
		int error;

		if (sd->sd_recursive &&
		    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
			zfs_close(zhp);
			return (0);
		}

		error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
		if (error == -1)
			nomem();
		fnvlist_add_boolean(sd->sd_nvl, name);
		free(name);

		if (sd->sd_recursive)
			rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
		zfs_close(zhp);
		return (rv);
	}

	/*
	 * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
	 *
	 * Creates a snapshot with the given name.  While functionally equivalent to
	 * 'zfs create', it is a separate command to differentiate intent.
	 *
	 * Every argument is split at '@'; the snapshot names (including those of
	 * child filesystems with -r) are collected first and then created with a
	 * single zfs_snapshot_nvl() call.  Returns 0 on success and 1 on failure.
	 */
	static int
	zfs_do_snapshot(int argc, char **argv)
	{
		int ret = 0;
		int c;
		nvlist_t *props;
		snap_cbdata_t sd = { 0 };
		boolean_t multiple_snaps = B_FALSE;

		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
			nomem();
		if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		/* check options */
		while ((c = getopt(argc, argv, "ro:")) != -1) {
			switch (c) {
			case 'o':
				if (parseprop(props, optarg) != 0) {
					/* Release both lists before bailing out. */
					nvlist_free(sd.sd_nvl);
					nvlist_free(props);
					return (1);
				}
				break;
			case 'r':
				sd.sd_recursive = B_TRUE;
				multiple_snaps = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				goto usage;
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing snapshot argument\n"));
			goto usage;
		}

		if (argc > 1)
			multiple_snaps = B_TRUE;
		for (; argc > 0; argc--, argv++) {
			char *atp;
			zfs_handle_t *zhp;

			atp = strchr(argv[0], '@');
			if (atp == NULL)
				goto usage;
			/* Split "fs@snap": argv[0] becomes the dataset name. */
			*atp = '\0';
			sd.sd_snapname = atp + 1;
			zhp = zfs_open(g_zfs, argv[0],
			    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
			if (zhp == NULL)
				goto usage;
			if (zfs_snapshot_cb(zhp, &sd) != 0)
				goto usage;
		}

		ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
		nvlist_free(sd.sd_nvl);
		nvlist_free(props);
		if (ret != 0 && multiple_snaps)
			(void) fprintf(stderr, gettext("no snapshots were created\n"));
		return (ret != 0);

	usage:
		nvlist_free(sd.sd_nvl);
		nvlist_free(props);
		usage(B_FALSE);
		return (-1);
	}

	/*
	 * Send a backup stream to stdout.
	 *
	 * Three paths are handled:
	 *  - '-t token' resumes an interrupted send via zfs_send_resume();
	 *  - a target without '@', or an incremental source containing '#', is
	 *    sent with zfs_send_one() and accepts only the -L, -e and -c flags;
	 *  - otherwise the snapshot is sent with zfs_send().
	 *
	 * Unless this is a dry run, stdout must not be a terminal.
	 * Returns 0 on success and 1 on failure.
	 */
	static int
	zfs_do_send(int argc, char **argv)
	{
		char *fromname = NULL;
		char *toname = NULL;
		char *resume_token = NULL;
		char *cp;
		zfs_handle_t *zhp;
		sendflags_t flags = { 0 };
		int c, err;
		nvlist_t *dbgnv = NULL;
		boolean_t extraverbose = B_FALSE;

		struct option long_options[] = {
			{"replicate", no_argument, NULL, 'R'},
			{"props", no_argument, NULL, 'p'},
			{"parsable", no_argument, NULL, 'P'},
			{"dedup", no_argument, NULL, 'D'},
			{"verbose", no_argument, NULL, 'v'},
			{"dryrun", no_argument, NULL, 'n'},
			{"large-block", no_argument, NULL, 'L'},
			{"embed", no_argument, NULL, 'e'},
			{"resume", required_argument, NULL, 't'},
			{"compressed", no_argument, NULL, 'c'},
			{0, 0, 0, 0}
		};

		/* check options */
		while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options,
		    NULL)) != -1) {
			switch (c) {
			case 'i':
				if (fromname)
					usage(B_FALSE);
				fromname = optarg;
				break;
			case 'I':
				if (fromname)
					usage(B_FALSE);
				fromname = optarg;
				flags.doall = B_TRUE;
				break;
			case 'R':
				flags.replicate = B_TRUE;
				break;
			case 'p':
				flags.props = B_TRUE;
				break;
			case 'P':
				flags.parsable = B_TRUE;
				flags.verbose = B_TRUE;
				break;
			case 'v':
				/* A second -v turns on extra-verbose output. */
				if (flags.verbose)
					extraverbose = B_TRUE;
				flags.verbose = B_TRUE;
				flags.progress = B_TRUE;
				break;
			case 'D':
				flags.dedup = B_TRUE;
				break;
			case 'n':
				flags.dryrun = B_TRUE;
				break;
			case 'L':
				flags.largeblock = B_TRUE;
				break;
			case 'e':
				flags.embed_data = B_TRUE;
				break;
			case 't':
				resume_token = optarg;
				break;
			case 'c':
				flags.compress = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				/*FALLTHROUGH*/
			default:
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (resume_token != NULL) {
			if (fromname != NULL || flags.replicate || flags.props ||
			    flags.dedup) {
				(void) fprintf(stderr,
				    gettext("invalid flags combined with -t\n"));
				usage(B_FALSE);
			}
			if (argc != 0) {
				(void) fprintf(stderr, gettext("no additional "
				    "arguments are permitted with -t\n"));
				usage(B_FALSE);
			}
		} else {
			if (argc < 1) {
				(void) fprintf(stderr,
				    gettext("missing snapshot argument\n"));
				usage(B_FALSE);
			}
			if (argc > 1) {
				(void) fprintf(stderr, gettext("too many arguments\n"));
				usage(B_FALSE);
			}
		}

		if (!flags.dryrun && isatty(STDOUT_FILENO)) {
			(void) fprintf(stderr,
			    gettext("Error: Stream can not be written to a terminal.\n"
			    "You must redirect standard output.\n"));
			return (1);
		}

		if (resume_token != NULL) {
			return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
			    resume_token));
		}

		/*
		 * Special case sending a filesystem, or from a bookmark.
		 */
		if (strchr(argv[0], '@') == NULL ||
		    (fromname && strchr(fromname, '#') != NULL)) {
			char frombuf[ZFS_MAX_DATASET_NAME_LEN];
			enum lzc_send_flags lzc_flags = 0;

			if (flags.replicate || flags.doall || flags.props ||
			    flags.dedup || flags.dryrun || flags.verbose ||
			    flags.progress) {
				(void) fprintf(stderr,
				    gettext("Error: "
				    "Unsupported flag with filesystem or bookmark.\n"));
				return (1);
			}

			zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
			if (zhp == NULL)
				return (1);

			if (flags.largeblock)
				lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
			if (flags.embed_data)
				lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
			if (flags.compress)
				lzc_flags |= LZC_SEND_FLAG_COMPRESS;

			if (fromname != NULL &&
			    (fromname[0] == '#' || fromname[0] == '@')) {
				/*
				 * Incremental source name begins with # or @.
				 * Default to same fs as target.
				 */
				/*
				 * strlcpy always terminates frombuf, which the
				 * strlcat below depends on.
				 */
				(void) strlcpy(frombuf, argv[0], sizeof (frombuf));
				cp = strchr(frombuf, '@');
				if (cp != NULL)
					*cp = '\0';
				(void) strlcat(frombuf, fromname, sizeof (frombuf));
				fromname = frombuf;
			}
			err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
			zfs_close(zhp);
			return (err != 0);
		}

		/* Split "fs@snap": argv[0] is the dataset, toname the snapshot. */
		cp = strchr(argv[0], '@');
		*cp = '\0';
		toname = cp + 1;
		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
		if (zhp == NULL)
			return (1);

		/*
		 * If they specified the full path to the snapshot, chop off
		 * everything except the short name of the snapshot, but special
		 * case if they specify the origin.
		 */
		if (fromname && (cp = strchr(fromname, '@')) != NULL) {
			char origin[ZFS_MAX_DATASET_NAME_LEN];
			zprop_source_t src;

			/* Keep the comparison defined if the lookup fails. */
			origin[0] = '\0';
			(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
			    origin, sizeof (origin), &src, NULL, 0, B_FALSE);

			if (strcmp(origin, fromname) == 0) {
				fromname = NULL;
				flags.fromorigin = B_TRUE;
			} else {
				*cp = '\0';
				if (cp != fromname && strcmp(argv[0], fromname)) {
					(void) fprintf(stderr,
					    gettext("incremental source must be "
					    "in same filesystem\n"));
					usage(B_FALSE);
				}
				fromname = cp + 1;
				if (strchr(fromname, '@') || strchr(fromname, '/')) {
					(void) fprintf(stderr,
					    gettext("invalid incremental source\n"));
					usage(B_FALSE);
				}
			}
		}

		if (flags.replicate && fromname == NULL)
			flags.doall = B_TRUE;

		err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
		    extraverbose ? &dbgnv : NULL);

		if (extraverbose && dbgnv != NULL) {
			/*
			 * dump_nvlist prints to stdout, but that's been
			 * redirected to a file.  Make it print to stderr
			 * instead.
			 */
			(void) dup2(STDERR_FILENO, STDOUT_FILENO);
			dump_nvlist(dbgnv, 0);
			nvlist_free(dbgnv);
		}
		zfs_close(zhp);

		return (err != 0);
	}

	/*
	 * Restore a backup stream from stdin.
	 *
	 * The only property accepted through -o is "origin".  With -A the
	 * partially received state of the named dataset is destroyed instead of
	 * receiving a stream; -A cannot be combined with -d, -e, -n, -s or -u.
	 * Otherwise stdin must not be a terminal and the stream is handed to
	 * zfs_receive().  Returns 0 on success and 1 on failure.
	 */
	static int
	zfs_do_receive(int argc, char **argv)
	{
		int c, err = 0;
		recvflags_t flags = { 0 };
		boolean_t abort_resumable = B_FALSE;

		nvlist_t *props;
		nvpair_t *nvp = NULL;

		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		/* check options */
		while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
			switch (c) {
			case 'o':
				if (parseprop(props, optarg) != 0) {
					nvlist_free(props);
					return (1);
				}
				break;
			case 'd':
				flags.isprefix = B_TRUE;
				break;
			case 'e':
				flags.isprefix = B_TRUE;
				flags.istail = B_TRUE;
				break;
			case 'n':
				flags.dryrun = B_TRUE;
				break;
			case 'u':
				flags.nomount = B_TRUE;
				break;
			case 'v':
				flags.verbose = B_TRUE;
				break;
			case 's':
				flags.resumable = B_TRUE;
				break;
			case 'F':
				flags.force = B_TRUE;
				break;
			case 'A':
				abort_resumable = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing snapshot argument\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		/* "origin" is the only property that may be given with -o. */
		while ((nvp = nvlist_next_nvpair(props, nvp))) {
			if (strcmp(nvpair_name(nvp), "origin") != 0) {
				(void) fprintf(stderr, gettext("invalid option"));
				usage(B_FALSE);
			}
		}

		if (abort_resumable) {
			zfs_handle_t *zhp;

			if (flags.isprefix || flags.istail || flags.dryrun ||
			    flags.resumable || flags.nomount) {
				(void) fprintf(stderr, gettext("invalid option"));
				usage(B_FALSE);
			}

			/* The properties are not used on the abort path. */
			nvlist_free(props);

			char namebuf[ZFS_MAX_DATASET_NAME_LEN];
			(void) snprintf(namebuf, sizeof (namebuf),
			    "%s/%%recv", argv[0]);

			if (zfs_dataset_exists(g_zfs, namebuf,
			    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
				zhp = zfs_open(g_zfs,
				    namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
				if (zhp == NULL)
					return (1);
				err = zfs_destroy(zhp, B_FALSE);
			} else {
				zhp = zfs_open(g_zfs,
				    argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
				if (zhp == NULL)
					usage(B_FALSE);
				if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
				    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
				    NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
					(void) fprintf(stderr,
					    gettext("'%s' does not have any "
					    "resumable receive state to abort\n"),
					    argv[0]);
					zfs_close(zhp);
					return (1);
				}
				err = zfs_destroy(zhp, B_FALSE);
			}
			zfs_close(zhp);

			return (err != 0);
		}

		if (isatty(STDIN_FILENO)) {
			(void) fprintf(stderr,
			    gettext("Error: Backup stream can not be read "
			    "from a terminal.\n"
			    "You must redirect standard input.\n"));
			nvlist_free(props);
			return (1);
		}
		err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
		nvlist_free(props);

		return (err != 0);
	}

	/*
	* Delegated administration ("zfs allow" / "zfs unallow") support.
	*/
	/* Permission-name strings, copied from zfs/sys/dsl_deleg.h. */
	#define ZFS_DELEG_PERM_CREATE "create"
	#define ZFS_DELEG_PERM_DESTROY "destroy"
	#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
	#define ZFS_DELEG_PERM_ROLLBACK "rollback"
	#define ZFS_DELEG_PERM_CLONE "clone"
	#define ZFS_DELEG_PERM_PROMOTE "promote"
	#define ZFS_DELEG_PERM_RENAME "rename"
	#define ZFS_DELEG_PERM_MOUNT "mount"
	#define ZFS_DELEG_PERM_SHARE "share"
	#define ZFS_DELEG_PERM_SEND "send"
	#define ZFS_DELEG_PERM_RECEIVE "receive"
	#define ZFS_DELEG_PERM_ALLOW "allow"
	#define ZFS_DELEG_PERM_USERPROP "userprop"
	#define ZFS_DELEG_PERM_VSCAN "vscan" /* has no entry in zfs_deleg_perm_tbl[] */
	#define ZFS_DELEG_PERM_USERQUOTA "userquota"
	#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
	#define ZFS_DELEG_PERM_USERUSED "userused"
	#define ZFS_DELEG_PERM_GROUPUSED "groupused"
	#define ZFS_DELEG_PERM_HOLD "hold"
	#define ZFS_DELEG_PERM_RELEASE "release"
	#define ZFS_DELEG_PERM_DIFF "diff"
	#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
	+#define ZFS_DELEG_PERM_REMAP "remap"

	/* Loop bound used by allow_usage() when walking zfs_deleg_perm_tbl[]. */
	#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE

	/*
	* Table pairing each delegatable permission name with its note id.
	* allow_usage() prints one help line per entry.  The first group is
	* reported as "subcommand" and the second group as "other" by
	* deleg_perm_type().  The list ends with a { NULL, ZFS_DELEG_NOTE_NONE }
	* sentinel.
	*/
	static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
	+ { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP },

	/* entries below are classified as "other" by deleg_perm_type() */
	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
	{ NULL, ZFS_DELEG_NOTE_NONE }
	};

	/*
	* A single delegated permission: its name, the type of principal it
	* was recorded for, and the localities it applies to.
	*/
	typedef struct deleg_perm {
	zfs_deleg_who_type_t dp_who_type; /* stored by deleg_perm_init() */
	const char *dp_name; /* permission name; key used by deleg_perm_compare() */
	boolean_t dp_local; /* set when seen with ZFS_DELEG_LOCAL */
	boolean_t dp_descend; /* set when seen with ZFS_DELEG_DESCENDENT */
	} deleg_perm_t;

	/* AVL-tree element wrapping one deleg_perm_t. */
	typedef struct deleg_perm_node {
	deleg_perm_t dpn_perm;

	/* linkage registered with the pool via uu_avl_node_init() */
	uu_avl_node_t dpn_avl_node;
	} deleg_perm_node_t;

	typedef struct fs_perm fs_perm_t;

	/* permissions set */
	typedef struct who_perm {
	zfs_deleg_who_type_t who_type;
	const char who_name; / id */
	char who_ug_name[256]; /* user/group name */
	fs_perm_t who_fsperm; / uplink */

	uu_avl_t who_deleg_perm_avl; / permissions */
	} who_perm_t;

	/* AVL-tree element wrapping one who_perm_t. */
	typedef struct who_perm_node {
	who_perm_t who_perm;
	/* linkage registered with the pool via uu_avl_node_init() */
	uu_avl_node_t who_avl_node;
	} who_perm_node_t;

	typedef struct fs_perm_set fs_perm_set_t;

	/*
	 * Permissions recorded on one file system, split into two AVL trees of
	 * who_perm_node_t by principal class.
	 */
	struct fs_perm {
		const char		*fsp_name;

		uu_avl_t		*fsp_sc_avl;	/* sets,create */
		uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */

		fs_perm_set_t		*fsp_set;	/* uplink */
	};

	/* List element wrapping one fs_perm_t; linked into fs_perm_set.fsps_list. */
	typedef struct fs_perm_node {
	fs_perm_t fspn_fsperm;
	/* NOTE(review): not referenced by the code visible here - confirm use */
	uu_avl_t *fspn_avl;

	/* linkage registered with the pool via uu_list_node_init() */
	uu_list_node_t fspn_list_node;
	} fs_perm_node_t;

	/* top level structure */
	struct fs_perm_set {
	uu_list_pool_t *fsps_list_pool;
	uu_list_t fsps_list; / list of fs_perms */

	uu_avl_pool_t *fsps_named_set_avl_pool;
	uu_avl_pool_t *fsps_who_perm_avl_pool;
	uu_avl_pool_t *fsps_deleg_perm_avl_pool;
	};

	/*
	 * Return the localized TYPE column text for a permission note:
	 * "other" for the quota/used/userprop notes, "subcommand" for
	 * everything else.
	 */
	static inline const char *
	deleg_perm_type(zfs_deleg_note_t note)
	{
		if (note == ZFS_DELEG_NOTE_GROUPQUOTA ||
		    note == ZFS_DELEG_NOTE_GROUPUSED ||
		    note == ZFS_DELEG_NOTE_USERPROP ||
		    note == ZFS_DELEG_NOTE_USERQUOTA ||
		    note == ZFS_DELEG_NOTE_USERUSED)
			return (gettext("other"));

		return (gettext("subcommand"));
	}

	/*
	 * Map a principal type to its sort weight: named sets (0), create
	 * time (1), users (2), groups (3), everyone (4).  Any other value
	 * yields -1.
	 */
	static int
	who_type2weight(zfs_deleg_who_type_t who_type)
	{
		switch (who_type) {
		case ZFS_DELEG_NAMED_SET_SETS:
		case ZFS_DELEG_NAMED_SET:
			return (0);
		case ZFS_DELEG_CREATE_SETS:
		case ZFS_DELEG_CREATE:
			return (1);
		case ZFS_DELEG_USER_SETS:
		case ZFS_DELEG_USER:
			return (2);
		case ZFS_DELEG_GROUP_SETS:
		case ZFS_DELEG_GROUP:
			return (3);
		case ZFS_DELEG_EVERYONE_SETS:
		case ZFS_DELEG_EVERYONE:
			return (4);
		default:
			return (-1);
		}
	}

	/*
	 * AVL comparator for who_perm_node_t elements.
	 *
	 * Orders first by principal class (who_type2weight()) and then by
	 * who_name, comparing at most ZFS_MAX_DELEG_NAME-1 characters.
	 * Returns -1, 0 or 1.  The third argument is ignored.
	 */
	/* ARGSUSED */
	static int
	who_perm_compare(const void *larg, const void *rarg, void *unused)
	{
		const who_perm_node_t *l = larg;
		const who_perm_node_t *r = rarg;
		zfs_deleg_who_type_t ltype = l->who_perm.who_type;
		zfs_deleg_who_type_t rtype = r->who_perm.who_type;
		int lweight = who_type2weight(ltype);
		int rweight = who_type2weight(rtype);
		int res = lweight - rweight;

		if (res == 0)
			res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
			    ZFS_MAX_DELEG_NAME-1);

		/* normalize to exactly -1/0/1 */
		if (res == 0)
			return (0);
		if (res > 0)
			return (1);
		else
			return (-1);
	}

	/*
	 * AVL comparator for deleg_perm_node_t elements: orders by permission
	 * name, comparing at most ZFS_MAX_DELEG_NAME-1 characters.  Returns
	 * -1, 0 or 1.  The third argument is ignored.
	 */
	/* ARGSUSED */
	static int
	deleg_perm_compare(const void *larg, const void *rarg, void *unused)
	{
		const deleg_perm_node_t *l = larg;
		const deleg_perm_node_t *r = rarg;
		int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
		    ZFS_MAX_DELEG_NAME-1);

		/* normalize to exactly -1/0/1 */
		if (res == 0)
			return (0);

		if (res > 0)
			return (1);
		else
			return (-1);
	}

	/*
	 * Zero *fspset and create its list pool, its list and the three AVL
	 * pools.  Any creation failure is handed to nomem().
	 */
	static inline void
	fs_perm_set_init(fs_perm_set_t *fspset)
	{
		bzero(fspset, sizeof (fs_perm_set_t));

		/* list of fs_perm_node_t */
		fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
		    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
		    NULL, UU_DEFAULT);
		if (fspset->fsps_list_pool == NULL)
			nomem();

		fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
		    UU_DEFAULT);
		if (fspset->fsps_list == NULL)
			nomem();

		/* both who_perm pools share the same node type and comparator */
		fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
		    "named_set_avl_pool", sizeof (who_perm_node_t),
		    offsetof(who_perm_node_t, who_avl_node), who_perm_compare,
		    UU_DEFAULT);
		if (fspset->fsps_named_set_avl_pool == NULL)
			nomem();

		fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
		    "who_perm_avl_pool", sizeof (who_perm_node_t),
		    offsetof(who_perm_node_t, who_avl_node), who_perm_compare,
		    UU_DEFAULT);
		if (fspset->fsps_who_perm_avl_pool == NULL)
			nomem();

		fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
		    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t),
		    offsetof(deleg_perm_node_t, dpn_avl_node), deleg_perm_compare,
		    UU_DEFAULT);
		if (fspset->fsps_deleg_perm_avl_pool == NULL)
			nomem();
	}

	static inline void fs_perm_fini(fs_perm_t *);
	static inline void who_perm_fini(who_perm_t *);

	/*
	 * Tear down every fs_perm on the set's list, freeing each list node,
	 * then destroy the three AVL pools.
	 */
	static inline void
	fs_perm_set_fini(fs_perm_set_t *fspset)
	{
		fs_perm_node_t *cur, *following;

		for (cur = uu_list_first(fspset->fsps_list); cur != NULL;
		    cur = following) {
			/* fetch the successor before cur is unlinked and freed */
			following = uu_list_next(fspset->fsps_list, cur);

			fs_perm_fini(&cur->fspn_fsperm);
			uu_list_remove(fspset->fsps_list, cur);
			free(cur);
		}

		uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
		uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
		uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
	}

	/*
	 * Record the permission name and principal type in *deleg_perm.  The
	 * name pointer is stored as is (not copied); the locality flags are
	 * left untouched.
	 */
	static inline void
	deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
	    const char *name)
	{
		deleg_perm->dp_name = name;
		deleg_perm->dp_who_type = type;
	}

	/*
	 * Zero *who_perm, create its (empty) permission AVL tree from the
	 * owning set's deleg_perm pool, and record type, name and the uplink
	 * to fsperm.  The name pointer is stored as is (not copied).
	 */
	static inline void
	who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
	    zfs_deleg_who_type_t type, const char *name)
	{
		uu_avl_pool_t *pool;
		pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;

		bzero(who_perm, sizeof (who_perm_t));

		if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
		    UU_DEFAULT)) == NULL)
			nomem();

		who_perm->who_type = type;
		who_perm->who_name = name;
		who_perm->who_fsperm = fsperm;
	}

	/*
	 * Remove and free every deleg_perm_node_t in who_perm's permission
	 * tree, then destroy the tree itself.
	 */
	static inline void
	who_perm_fini(who_perm_t *who_perm)
	{
		uu_avl_t *perm_avl = who_perm->who_deleg_perm_avl;
		deleg_perm_node_t *cur, *following;

		for (cur = uu_avl_first(perm_avl); cur != NULL; cur = following) {
			/* fetch the successor before cur is unlinked and freed */
			following = uu_avl_next(perm_avl, cur);

			uu_avl_remove(perm_avl, cur);
			free(cur);
		}

		uu_avl_destroy(perm_avl);
	}

	/*
	 * Zero *fsperm, create its two (empty) who_perm AVL trees from the
	 * pools owned by fspset, and record the uplink and file system name.
	 * The name pointer is stored as is (not copied).
	 */
	static inline void
	fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
	{
		uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool;
		uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool;

		bzero(fsperm, sizeof (fs_perm_t));

		if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
		    == NULL)
			nomem();

		if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
		    == NULL)
			nomem();

		fsperm->fsp_set = fspset;
		fsperm->fsp_name = fsname;
	}

	/*
	 * Empty both who_perm trees of fsperm (sets/create first, then
	 * user/group/everyone), finalizing and freeing each node, and then
	 * destroy the two trees.
	 */
	static inline void
	fs_perm_fini(fs_perm_t *fsperm)
	{
		uu_avl_t *trees[2];
		int t;

		trees[0] = fsperm->fsp_sc_avl;
		trees[1] = fsperm->fsp_uge_avl;

		for (t = 0; t < 2; t++) {
			who_perm_node_t *cur = uu_avl_first(trees[t]);

			while (cur != NULL) {
				/* fetch the successor before cur goes away */
				who_perm_node_t *following =
				    uu_avl_next(trees[t], cur);

				who_perm_fini(&cur->who_perm);
				uu_avl_remove(trees[t], cur);
				free(cur);
				cur = following;
			}
		}

		for (t = 0; t < 2; t++)
			uu_avl_destroy(trees[t]);
	}

	/*
	 * Record permission `name' with the given locality in `avl'.
	 *
	 * `node' is a caller-allocated, uu_avl_node_init()'ed element used as
	 * the lookup key.  This function takes ownership of it: it is either
	 * inserted into the tree or, when an entry with the same name already
	 * exists, freed (the original code leaked it in that case).  The
	 * locality flag is then set on whichever entry is in the tree.
	 */
	static void
	set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
	    zfs_deleg_who_type_t who_type, const char *name, char locality)
	{
		uu_avl_index_t idx = 0;

		deleg_perm_node_t *found_node = NULL;
		deleg_perm_t *deleg_perm = &node->dpn_perm;

		/* fill in the key fields before searching */
		deleg_perm_init(deleg_perm, who_type, name);

		if ((found_node = uu_avl_find(avl, node, NULL, &idx))
		    == NULL) {
			uu_avl_insert(avl, node, idx);
		} else {
			/* duplicate: drop the probe, update the existing entry */
			free(node);
			node = found_node;
			deleg_perm = &node->dpn_perm;
		}

		switch (locality) {
		case ZFS_DELEG_LOCAL:
			deleg_perm->dp_local = B_TRUE;
			break;
		case ZFS_DELEG_DESCENDENT:
			deleg_perm->dp_descend = B_TRUE;
			break;
		case ZFS_DELEG_NA:
			break;
		default:
			assert(B_FALSE); /* invalid locality */
		}
	}

	/*
	 * Add every pair of `nvl' (each expected to be a DATA_TYPE_BOOLEAN
	 * pair named after a permission) to who_perm's permission tree with
	 * the given locality.  Always returns 0.
	 */
	static inline int
	parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
	{
		nvpair_t *nvp = NULL;
		fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
		uu_avl_t *avl = who_perm->who_deleg_perm_avl;
		zfs_deleg_who_type_t who_type = who_perm->who_type;

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			const char *name = nvpair_name(nvp);
			data_type_t type = nvpair_type(nvp);
			uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
			deleg_perm_node_t *node =
			    safe_malloc(sizeof (deleg_perm_node_t));

			assert(type == DATA_TYPE_BOOLEAN);

			uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
			/* node is handed over to set_deleg_perm_node() here */
			set_deleg_perm_node(avl, node, who_type, name, locality);
		}

		return (0);
	}

	/*
	 * Populate fsperm from `nvl'.
	 *
	 * Each pair name has the form "<type><locality>$<who>", where <type>
	 * and <locality> are single characters; its value is a nested nvlist
	 * of permission names.  Named-set and create-time entries go into
	 * fsp_sc_avl, user/group/everyone entries into fsp_uge_avl.  For a
	 * newly seen user or group, <who> is treated as a numeric id and
	 * looked up so that a readable name can be stored in who_ug_name.
	 *
	 * Returns 0 on success, -1 if a pair's value is not an nvlist.
	 */
	static inline int
	parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
	{
		nvpair_t *nvp = NULL;
		fs_perm_set_t *fspset = fsperm->fsp_set;

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			nvlist_t *nvl2 = NULL;
			const char *name = nvpair_name(nvp);
			uu_avl_t *avl = NULL;
			uu_avl_pool_t *avl_pool = NULL;
			zfs_deleg_who_type_t perm_type = name[0];
			char perm_locality = name[1];
			const char *perm_name = name + 3;
			who_perm_node_t *found_node = NULL;
			who_perm_node_t *node = NULL;
			who_perm_t *who_perm = NULL;
			uu_avl_index_t idx = 0;

			assert('$' == name[2]);

			if (nvpair_value_nvlist(nvp, &nvl2) != 0)
				return (-1);

			switch (perm_type) {
			case ZFS_DELEG_CREATE:
			case ZFS_DELEG_CREATE_SETS:
			case ZFS_DELEG_NAMED_SET:
			case ZFS_DELEG_NAMED_SET_SETS:
				avl_pool = fspset->fsps_named_set_avl_pool;
				avl = fsperm->fsp_sc_avl;
				break;
			case ZFS_DELEG_USER:
			case ZFS_DELEG_USER_SETS:
			case ZFS_DELEG_GROUP:
			case ZFS_DELEG_GROUP_SETS:
			case ZFS_DELEG_EVERYONE:
			case ZFS_DELEG_EVERYONE_SETS:
				avl_pool = fspset->fsps_who_perm_avl_pool;
				avl = fsperm->fsp_uge_avl;
				break;

			default:
				assert(!"unhandled zfs_deleg_who_type_t");
			}

			/* build a probe node; it becomes the entry if none exists */
			node = safe_malloc(sizeof (who_perm_node_t));
			who_perm = &node->who_perm;
			uu_avl_node_init(node, &node->who_avl_node, avl_pool);
			who_perm_init(who_perm, fsperm, perm_type, perm_name);

			if ((found_node = uu_avl_find(avl, node, NULL, &idx))
			    == NULL) {
				if (avl == fsperm->fsp_uge_avl) {
					uid_t rid = 0;
					struct passwd *p = NULL;
					struct group *g = NULL;
					const char *nice_name = NULL;

					switch (perm_type) {
					case ZFS_DELEG_USER_SETS:
					case ZFS_DELEG_USER:
						rid = atoi(perm_name);
						p = getpwuid(rid);
						if (p)
							nice_name = p->pw_name;
						break;
					case ZFS_DELEG_GROUP_SETS:
					case ZFS_DELEG_GROUP:
						rid = atoi(perm_name);
						g = getgrgid(rid);
						if (g)
							nice_name = g->gr_name;
						break;

					default:
						break;
					}

					if (nice_name != NULL)
						(void) strlcpy(
						    node->who_perm.who_ug_name,
						    nice_name,
						    sizeof (node->who_perm.who_ug_name));
				}

				uu_avl_insert(avl, node, idx);
			} else {
				/*
				 * An entry for this principal already exists.
				 * Release the probe (and the tree created for
				 * it by who_perm_init()) instead of leaking it.
				 */
				who_perm_fini(who_perm);
				free(node);
				node = found_node;
				who_perm = &node->who_perm;
			}

			(void) parse_who_perm(who_perm, nvl2, perm_locality);
		}

		return (0);
	}

	/*
	 * Populate fspset from `nvl', whose pairs are named after file systems
	 * and carry a nested nvlist of permissions each.  One fs_perm_node_t
	 * is appended to fsps_list per pair.
	 *
	 * Returns 0 on success, -1 if a pair's value is not an nvlist.  The
	 * value is now fetched before the node is allocated, so the error
	 * path no longer leaks a half-initialized node.
	 */
	static inline int
	parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
	{
		nvpair_t *nvp = NULL;
		uu_avl_index_t idx = 0;

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			nvlist_t *nvl2 = NULL;
			const char *fsname = nvpair_name(nvp);
			data_type_t type = nvpair_type(nvp);
			fs_perm_t *fsperm = NULL;
			fs_perm_node_t *node = NULL;

			assert(DATA_TYPE_NVLIST == type);

			if (nvpair_value_nvlist(nvp, &nvl2) != 0)
				return (-1);

			node = safe_malloc(sizeof (fs_perm_node_t));
			if (node == NULL)
				nomem();

			fsperm = &node->fspn_fsperm;

			uu_list_node_init(node, &node->fspn_list_node,
			    fspset->fsps_list_pool);

			/*
			 * NOTE(review): the current node count is passed where
			 * uu_list_insert() takes a uu_list_index_t; kept as in
			 * the original - confirm this is a valid index value.
			 */
			idx = uu_list_numnodes(fspset->fsps_list);
			fs_perm_init(fsperm, fspset, fsname);

			(void) parse_fs_perm(fsperm, nvl2);

			uu_list_insert(fspset->fsps_list, node, idx);
		}

		return (0);
	}

	/*
	 * Return the localized NOTES column text for a permission note.
	 * Notes without a dedicated case yield the empty string.
	 */
	static inline const char *
	deleg_perm_comment(zfs_deleg_note_t note)
	{
		switch (note) {
		/* SUBCOMMANDS */
		case ZFS_DELEG_NOTE_ALLOW:
			return (gettext("Must also have the permission that is being"
			    "\n\t\t\t\tallowed"));
		case ZFS_DELEG_NOTE_CLONE:
			return (gettext("Must also have the 'create' ability and 'mount'"
			    "\n\t\t\t\tability in the origin file system"));
		case ZFS_DELEG_NOTE_CREATE:
			return (gettext("Must also have the 'mount' ability"));
		case ZFS_DELEG_NOTE_DESTROY:
			return (gettext("Must also have the 'mount' ability"));
		case ZFS_DELEG_NOTE_DIFF:
			return (gettext("Allows lookup of paths within a dataset;"
			    "\n\t\t\t\tgiven an object number. Ordinary users need this"
			    "\n\t\t\t\tin order to use zfs diff"));
		case ZFS_DELEG_NOTE_HOLD:
			return (gettext("Allows adding a user hold to a snapshot"));
		case ZFS_DELEG_NOTE_MOUNT:
			return (gettext("Allows mount/umount of ZFS datasets"));
		case ZFS_DELEG_NOTE_PROMOTE:
			return (gettext("Must also have the 'mount'\n\t\t\t\tand"
			    " 'promote' ability in the origin file system"));
		case ZFS_DELEG_NOTE_RECEIVE:
			return (gettext("Must also have the 'mount' and 'create'"
			    " ability"));
		case ZFS_DELEG_NOTE_RELEASE:
			return (gettext("Allows releasing a user hold which\n\t\t\t\t"
			    "might destroy the snapshot"));
		case ZFS_DELEG_NOTE_RENAME:
			return (gettext("Must also have the 'mount' and 'create'"
			    "\n\t\t\t\tability in the new parent"));
		case ZFS_DELEG_NOTE_ROLLBACK:
			return (gettext(""));
		case ZFS_DELEG_NOTE_SEND:
			return (gettext(""));
		case ZFS_DELEG_NOTE_SHARE:
			return (gettext("Allows sharing file systems over NFS or SMB"
			    "\n\t\t\t\tprotocols"));
		case ZFS_DELEG_NOTE_SNAPSHOT:
			return (gettext(""));
		/* ZFS_DELEG_NOTE_VSCAN intentionally has no case here */
		/* OTHER */
		case ZFS_DELEG_NOTE_GROUPQUOTA:
			return (gettext("Allows accessing any groupquota@... property"));
		case ZFS_DELEG_NOTE_GROUPUSED:
			return (gettext("Allows reading any groupused@... property"));
		case ZFS_DELEG_NOTE_USERPROP:
			return (gettext("Allows changing any user property"));
		case ZFS_DELEG_NOTE_USERQUOTA:
			return (gettext("Allows accessing any userquota@... property"));
		case ZFS_DELEG_NOTE_USERUSED:
			return (gettext("Allows reading any userused@... property"));
		default:
			return ("");
		}
	}

	/*
	* Parsed command line of "zfs allow" / "zfs unallow".  The flag members
	* are filled in by the getopt loop of zfs_do_allow_unallow_impl(); the
	* remaining members are derived by parse_allow_args().
	*/
	struct allow_opts {
	boolean_t local; /* -l */
	boolean_t descend; /* -d */
	boolean_t user; /* -u */
	boolean_t group; /* -g */
	boolean_t everyone; /* -e, or a literal "everyone" argument */
	boolean_t create; /* -c */
	boolean_t set; /* -s */
	boolean_t recursive; /* unallow only */
	boolean_t prt_usage; /* -h */

	boolean_t prt_perms; /* only a dataset was given: print its permissions */
	char *who; /* set name, or comma-separated users/groups */
	char *perms; /* comma-separated permission list; may be NULL for unallow */
	const char *dataset; /* last command line argument */
	};

	/*
	 * qsort() comparator for an array of C strings: each argument points
	 * to an array element (a `const char *'), and the pointed-to strings
	 * are ordered with strcmp().
	 */
	static inline int
	prop_cmp(const void *a, const void *b)
	{
		const char *str1 = *(const char **)a;
		const char *str2 = *(const char **)b;
		return (strcmp(str1, str2));
	}

	/*
	 * Print the allow/unallow usage text and exit.
	 *
	 * un		B_TRUE for "unallow" (also lists -r), B_FALSE for "allow"
	 * requested	B_TRUE: write to stdout and exit 0;
	 *		B_FALSE: write to stderr and exit 2
	 * msg		optional error text appended at the end; may be NULL
	 *
	 * The output lists the options, every entry of zfs_deleg_perm_tbl[],
	 * and the sorted names of all visible, non-read-only properties.
	 */
	static void
	allow_usage(boolean_t un, boolean_t requested, const char *msg)
	{
		/* option / description pairs */
		const char *opt_desc[] = {
			"-h", gettext("show this help message and exit"),
			"-l", gettext("set permission locally"),
			"-d", gettext("set permission for descents"),
			"-u", gettext("set permission for user"),
			"-g", gettext("set permission for group"),
			"-e", gettext("set permission for everyone"),
			"-c", gettext("set create time permission"),
			"-s", gettext("define permission set"),
			/* unallow only */
			"-r", gettext("remove permissions recursively"),
		};
		size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
		size_t allow_size = unallow_size - 2;
		size_t opt_limit;
		size_t k;
		const char *props[ZFS_NUM_PROPS];
		int i;
		size_t count = 0;
		FILE *fp = requested ? stdout : stderr;
		zprop_desc_t *pdtbl = zfs_prop_get_table();
		const char *fmt = gettext("%-16s %-14s\t%s\n");

		(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
		    HELP_ALLOW));
		(void) fprintf(fp, gettext("Options:\n"));

		/* walk the table two slots at a time: option, then description */
		opt_limit = un ? unallow_size : allow_size;
		for (k = 0; k < opt_limit; k += 2)
			(void) fprintf(fp, gettext(" %-10s %s\n"), opt_desc[k],
			    opt_desc[k + 1]);

		(void) fprintf(fp, gettext("\nThe following permissions are "
		    "supported:\n\n"));
		(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
		    gettext("NOTES"));
		for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
			zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
			const char *perm_type = deleg_perm_type(perm_note);
			const char *perm_comment = deleg_perm_comment(perm_note);

			(void) fprintf(fp, fmt, zfs_deleg_perm_tbl[i].z_perm,
			    perm_type, perm_comment);
		}

		/* collect the properties that can be delegated */
		for (i = 0; i < ZFS_NUM_PROPS; i++) {
			zprop_desc_t *pd = &pdtbl[i];

			if (pd->pd_visible == B_TRUE &&
			    pd->pd_attr != PROP_READONLY)
				props[count++] = pd->pd_name;
		}
		props[count] = NULL;

		qsort(props, count, sizeof (char *), prop_cmp);

		for (k = 0; k < count; k++)
			(void) fprintf(fp, fmt, props[k], gettext("property"), "");

		if (msg != NULL)
			(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);

		exit(requested ? 0 : 2);
	}

	/*
	 * Validate the positional argument count and pick out the permission
	 * list.
	 *
	 * With exactly expected_argc arguments, *permsp is set to the
	 * second-to-last one.  For unallow, one argument fewer is also
	 * accepted and *permsp is set to NULL.  Any other count is reported
	 * through allow_usage().  Returns the last argument.
	 */
	static inline const char *
	munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
	    char **permsp)
	{
		boolean_t perms_omitted = (un && argc == expected_argc - 1);

		if (perms_omitted) {
			*permsp = NULL;
		} else if (argc == expected_argc) {
			*permsp = argv[argc - 2];
		} else {
			allow_usage(un, B_FALSE,
			    gettext("wrong number of parameters\n"));
		}

		return (argv[argc - 1]);
	}

	/*
	 * Check the option combination in *opts against the remaining
	 * positional arguments and fill in opts->dataset, opts->who,
	 * opts->perms, opts->everyone and opts->prt_perms accordingly.
	 * Invalid combinations end in allow_usage() or usage().
	 *
	 * If neither -l nor -d was given, both are turned on.
	 */
	static void
	parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
	{
		/* boolean_t flags are summed to count how many were given */
		int uge_sum = opts->user + opts->group + opts->everyone;
		int csuge_sum = opts->create + opts->set + uge_sum;
		int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
		int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;

		if (uge_sum > 1)
			allow_usage(un, B_FALSE,
			    gettext("-u, -g, and -e are mutually exclusive\n"));

		if (opts->prt_usage) {
			/* -h is only honoured when it is the sole argument */
			if (argc == 0 && all_sum == 0)
				allow_usage(un, B_TRUE, NULL);
			else
				usage(B_FALSE);
		}

		if (opts->set) {
			if (csuge_sum > 1)
				allow_usage(un, B_FALSE,
				    gettext("invalid options combined with -s\n"));

			opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
			if (argv[0][0] != '@')
				allow_usage(un, B_FALSE,
				    gettext("invalid set name: missing '@' prefix\n"));
			opts->who = argv[0];
		} else if (opts->create) {
			if (ldcsuge_sum > 1)
				allow_usage(un, B_FALSE,
				    gettext("invalid options combined with -c\n"));
			opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
		} else if (opts->everyone) {
			if (csuge_sum > 1)
				allow_usage(un, B_FALSE,
				    gettext("invalid options combined with -e\n"));
			opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
		} else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
		    == 0) {
			/* a literal "everyone" argument acts like -e */
			opts->everyone = B_TRUE;
			argc--;
			argv++;
			opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
		} else if (argc == 1 && !un) {
			/* "zfs allow <dataset>": just print the permissions */
			opts->prt_perms = B_TRUE;
			opts->dataset = argv[argc-1];
		} else {
			opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
			opts->who = argv[0];
		}

		if (!opts->local && !opts->descend) {
			opts->local = B_TRUE;
			opts->descend = B_TRUE;
		}
	}

	/*
	 * Add the entries describing one principal to top_nvl.
	 *
	 * type		principal type; selects the base/set type characters
	 * local,	requested localities (only used for user, group and
	 * descend	everyone; sets and create-time use ZFS_DELEG_NA)
	 * who		principal name/id, or NULL for none
	 * perms	comma-separated permission list, or NULL
	 * top_nvl	list receiving the entries
	 *
	 * Entry names have the form "<type><locality>$<who>".  With a
	 * permission list, names starting with '@' are grouped under the
	 * set type and all others under the base type, each group being
	 * added as a nested nvlist.  Without a list, a boolean entry is added
	 * for both types.  `perms' is modified temporarily while it is split,
	 * and restored before returning.
	 */
	static void
	store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
	    const char *who, char *perms, nvlist_t *top_nvl)
	{
		int i;
		char ld[2] = { '\0', '\0' };
		char who_buf[MAXNAMELEN + 32];
		char base_type = '\0';
		char set_type = '\0';
		nvlist_t *base_nvl = NULL;
		nvlist_t *set_nvl = NULL;
		nvlist_t *nvl;
		/* "%s" with an empty string formats the same as no "%s" at all */
		const char *who_str = (who != NULL) ? who : "";

		if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
			nomem();
		if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		switch (type) {
		case ZFS_DELEG_NAMED_SET_SETS:
		case ZFS_DELEG_NAMED_SET:
			set_type = ZFS_DELEG_NAMED_SET_SETS;
			base_type = ZFS_DELEG_NAMED_SET;
			ld[0] = ZFS_DELEG_NA;
			break;
		case ZFS_DELEG_CREATE_SETS:
		case ZFS_DELEG_CREATE:
			set_type = ZFS_DELEG_CREATE_SETS;
			base_type = ZFS_DELEG_CREATE;
			ld[0] = ZFS_DELEG_NA;
			break;
		case ZFS_DELEG_USER_SETS:
		case ZFS_DELEG_USER:
			set_type = ZFS_DELEG_USER_SETS;
			base_type = ZFS_DELEG_USER;
			if (local)
				ld[0] = ZFS_DELEG_LOCAL;
			if (descend)
				ld[1] = ZFS_DELEG_DESCENDENT;
			break;
		case ZFS_DELEG_GROUP_SETS:
		case ZFS_DELEG_GROUP:
			set_type = ZFS_DELEG_GROUP_SETS;
			base_type = ZFS_DELEG_GROUP;
			if (local)
				ld[0] = ZFS_DELEG_LOCAL;
			if (descend)
				ld[1] = ZFS_DELEG_DESCENDENT;
			break;
		case ZFS_DELEG_EVERYONE_SETS:
		case ZFS_DELEG_EVERYONE:
			set_type = ZFS_DELEG_EVERYONE_SETS;
			base_type = ZFS_DELEG_EVERYONE;
			if (local)
				ld[0] = ZFS_DELEG_LOCAL;
			if (descend)
				ld[1] = ZFS_DELEG_DESCENDENT;
			break;

		default:
			assert(set_type != '\0' && base_type != '\0');
		}

		if (perms != NULL) {
			char *curr = perms;
			char *end = curr + strlen(perms);

			/* split on ',' in place, restoring each comma afterwards */
			while (curr < end) {
				char *delim = strchr(curr, ',');
				if (delim == NULL)
					delim = end;
				else
					*delim = '\0';

				if (curr[0] == '@')
					nvl = set_nvl;
				else
					nvl = base_nvl;

				(void) nvlist_add_boolean(nvl, curr);
				if (delim != end)
					*delim = ',';
				curr = delim + 1;
			}

			for (i = 0; i < 2; i++) {
				char locality = ld[i];
				if (locality == 0)
					continue;

				if (!nvlist_empty(base_nvl)) {
					(void) snprintf(who_buf, sizeof (who_buf),
					    "%c%c$%s", base_type, locality, who_str);
					(void) nvlist_add_nvlist(top_nvl, who_buf,
					    base_nvl);
				}

				if (!nvlist_empty(set_nvl)) {
					(void) snprintf(who_buf, sizeof (who_buf),
					    "%c%c$%s", set_type, locality, who_str);
					(void) nvlist_add_nvlist(top_nvl, who_buf,
					    set_nvl);
				}
			}
		} else {
			for (i = 0; i < 2; i++) {
				char locality = ld[i];
				if (locality == 0)
					continue;

				(void) snprintf(who_buf, sizeof (who_buf),
				    "%c%c$%s", base_type, locality, who_str);
				(void) nvlist_add_boolean(top_nvl, who_buf);

				(void) snprintf(who_buf, sizeof (who_buf),
				    "%c%c$%s", set_type, locality, who_str);
				(void) nvlist_add_boolean(top_nvl, who_buf);
			}
		}

		/*
		 * nvlist_add_nvlist() stores a copy, so the scratch lists are
		 * still owned here; the original code never released them.
		 */
		nvlist_free(base_nvl);
		nvlist_free(set_nvl);
	}

	/*
	 * Allocate *nvlp and fill it with the permission entries described by
	 * *opts (see store_allow_perm()).
	 *
	 * For -s, -c and -e a single principal is stored.  Otherwise opts->who
	 * is split on ',' and each element is resolved to a user or group:
	 * by name when it is not entirely numeric, by id otherwise.  Without
	 * -u/-g a user match is preferred over a group match.  The numeric id
	 * is what gets stored.  opts->who is modified while it is split.
	 *
	 * Always returns 0; an unresolvable name ends in allow_usage().
	 */
	static int
	construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
	{
		if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		if (opts->set) {
			store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
			    opts->descend, opts->who, opts->perms, *nvlp);
		} else if (opts->create) {
			store_allow_perm(ZFS_DELEG_CREATE, opts->local,
			    opts->descend, NULL, opts->perms, *nvlp);
		} else if (opts->everyone) {
			store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
			    opts->descend, NULL, opts->perms, *nvlp);
		} else {
			char *curr = opts->who;
			char *end = curr + strlen(curr);

			while (curr < end) {
				const char *who;
				zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
				char *endch;
				char *delim = strchr(curr, ',');
				char errbuf[256];
				char id[64];
				struct passwd *p = NULL;
				struct group *g = NULL;

				uid_t rid;
				if (delim == NULL)
					delim = end;
				else
					*delim = '\0';

				/* *endch == '\0' afterwards means "all digits" */
				rid = (uid_t)strtol(curr, &endch, 0);

				/*
				 * NOTE(review): the failures below pass B_TRUE as
				 * `requested', so allow_usage() exits with status
				 * 0 - confirm whether that is intended.
				 */
				if (opts->user) {
					who_type = ZFS_DELEG_USER;
					if (*endch != '\0')
						p = getpwnam(curr);
					else
						p = getpwuid(rid);

					if (p != NULL)
						rid = p->pw_uid;
					else {
						(void) snprintf(errbuf, sizeof (errbuf),
						    gettext("invalid user %s"), curr);
						allow_usage(un, B_TRUE, errbuf);
					}
				} else if (opts->group) {
					who_type = ZFS_DELEG_GROUP;
					if (*endch != '\0')
						g = getgrnam(curr);
					else
						g = getgrgid(rid);

					if (g != NULL)
						rid = g->gr_gid;
					else {
						(void) snprintf(errbuf, sizeof (errbuf),
						    gettext("invalid group %s"), curr);
						allow_usage(un, B_TRUE, errbuf);
					}
				} else {
					if (*endch != '\0') {
						p = getpwnam(curr);
					} else {
						p = getpwuid(rid);
					}

					/* fall back to a group lookup */
					if (p == NULL) {
						if (*endch != '\0') {
							g = getgrnam(curr);
						} else {
							g = getgrgid(rid);
						}
					}

					if (p != NULL) {
						who_type = ZFS_DELEG_USER;
						rid = p->pw_uid;
					} else if (g != NULL) {
						who_type = ZFS_DELEG_GROUP;
						rid = g->gr_gid;
					} else {
						(void) snprintf(errbuf, sizeof (errbuf),
						    gettext("invalid user/group %s"),
						    curr);
						allow_usage(un, B_TRUE, errbuf);
					}
				}

				(void) snprintf(id, sizeof (id), "%u", rid);
				who = id;

				store_allow_perm(who_type, opts->local,
				    opts->descend, who, opts->perms, *nvlp);
				curr = delim + 1;
			}
		}

		return (0);
	}

	/*
	 * Print the permission sets and create-time permissions held in
	 * who_avl, one line per entry with its permissions comma-separated.
	 * A section title is printed whenever the entry's weight (see
	 * who_type2weight()) differs from the previous entry's.
	 *
	 * The title is selected by weight.  The original code consumed the
	 * titles sequentially, so a dataset with create-time permissions but
	 * no permission sets was labelled "Permission sets:".
	 */
	static void
	print_set_creat_perms(uu_avl_t *who_avl)
	{
		/* indexed by who_type2weight(): 0 = sets, 1 = create time */
		const char *sc_title[] = {
			gettext("Permission sets:\n"),
			gettext("Create time permissions:\n"),
			NULL
		};
		who_perm_node_t *who_node = NULL;
		int prev_weight = -1;

		for (who_node = uu_avl_first(who_avl); who_node != NULL;
		    who_node = uu_avl_next(who_avl, who_node)) {
			uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
			zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
			const char *who_name = who_node->who_perm.who_name;
			int weight = who_type2weight(who_type);
			boolean_t first = B_TRUE;
			deleg_perm_node_t *deleg_node;

			if (prev_weight != weight) {
				/* only weights 0 and 1 have a title */
				if (weight >= 0 && weight < 2)
					(void) printf("%s", sc_title[weight]);
				prev_weight = weight;
			}

			if (who_name == NULL || strnlen(who_name, 1) == 0)
				(void) printf("\t");
			else
				(void) printf("\t%s ", who_name);

			for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
			    deleg_node = uu_avl_next(avl, deleg_node)) {
				if (first) {
					(void) printf("%s",
					    deleg_node->dpn_perm.dp_name);
					first = B_FALSE;
				} else
					(void) printf(",%s",
					    deleg_node->dpn_perm.dp_name);
			}

			(void) printf("\n");
		}
	}

	/*
	 * Print, for every user/group/everyone entry in who_avl, the
	 * permissions whose dp_local and dp_descend flags both equal the
	 * requested `local' and `descend' values.  `title' is printed once,
	 * before the first matching permission; nothing is printed when
	 * there is no match.
	 *
	 * Users and groups are shown by their resolved name when one was
	 * stored in who_ug_name, and by who_name (the id) otherwise.  The
	 * original test `if (nice_who_name)' compared an array address and
	 * was always true, so unresolved ids were printed as an empty name.
	 */
	static void
	print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
	    const char *title)
	{
		who_perm_node_t *who_node = NULL;
		boolean_t prt_title = B_TRUE;
		uu_avl_walk_t *walk;

		if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
			nomem();

		while ((who_node = uu_avl_walk_next(walk)) != NULL) {
			const char *who_name = who_node->who_perm.who_name;
			const char *nice_who_name = who_node->who_perm.who_ug_name;
			boolean_t have_nice_name = (nice_who_name[0] != '\0');
			uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
			zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
			char delim = ' ';
			deleg_perm_node_t *deleg_node;
			boolean_t prt_who = B_TRUE;

			for (deleg_node = uu_avl_first(avl);
			    deleg_node != NULL;
			    deleg_node = uu_avl_next(avl, deleg_node)) {
				if (local != deleg_node->dpn_perm.dp_local ||
				    descend != deleg_node->dpn_perm.dp_descend)
					continue;

				/* first match for this principal: print its label */
				if (prt_who) {
					const char *who = NULL;
					if (prt_title) {
						prt_title = B_FALSE;
						(void) printf("%s", title);
					}

					switch (who_type) {
					case ZFS_DELEG_USER_SETS:
					case ZFS_DELEG_USER:
						who = gettext("user");
						if (have_nice_name)
							who_name = nice_who_name;
						break;
					case ZFS_DELEG_GROUP_SETS:
					case ZFS_DELEG_GROUP:
						who = gettext("group");
						if (have_nice_name)
							who_name = nice_who_name;
						break;
					case ZFS_DELEG_EVERYONE_SETS:
					case ZFS_DELEG_EVERYONE:
						who = gettext("everyone");
						who_name = NULL;
						break;

					default:
						assert(who != NULL);
					}

					prt_who = B_FALSE;
					if (who_name == NULL)
						(void) printf("\t%s", who);
					else
						(void) printf("\t%s %s", who, who_name);
				}

				(void) printf("%c%s", delim,
				    deleg_node->dpn_perm.dp_name);
				delim = ',';
			}

			if (!prt_who)
				(void) printf("\n");
		}

		uu_avl_walk_end(walk);
	}

	/*
	 * Print every file system in fspset: a "---- Permissions on <name> "
	 * banner padded with '-' to 70 columns, followed by its permission
	 * sets, create-time permissions, and the local, descendent and
	 * local+descendent user/group/everyone permissions.
	 *
	 * The banner is printed through "%s"; the original passed the
	 * formatted buffer (which contains the dataset name) as the printf
	 * format string.
	 */
	static void
	print_fs_perms(fs_perm_set_t *fspset)
	{
		fs_perm_node_t *node = NULL;
		char buf[MAXNAMELEN + 32];

		for (node = uu_list_first(fspset->fsps_list); node != NULL;
		    node = uu_list_next(fspset->fsps_list, node)) {
			uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
			uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
			int left = 0;

			(void) snprintf(buf, sizeof (buf),
			    gettext("---- Permissions on %s "),
			    node->fspn_fsperm.fsp_name);
			(void) printf("%s", buf);
			/* signed arithmetic: a long banner gets no padding */
			left = 70 - (int)strlen(buf);
			while (left-- > 0)
				(void) printf("-");
			(void) printf("\n");

			print_set_creat_perms(sc_avl);
			print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
			    gettext("Local permissions:\n"));
			print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
			    gettext("Descendent permissions:\n"));
			print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
			    gettext("Local+Descendent permissions:\n"));
		}
	}

	/*
	* Permission set used by zfs_do_allow_unallow_impl(); it is (re)initialized
	* there with fs_perm_set_init() before use.
	*/
	static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };

	/* Argument bundle handed to set_deleg_perms() through its void pointer. */
	struct deleg_perms {
	boolean_t un; /* forwarded to zfs_set_fsacl() */
	nvlist_t *nvl; /* permission list forwarded to zfs_set_fsacl() */
	};

	/*
	 * Iterator callback: apply the permission change described by `data'
	 * (a struct deleg_perms) to zhp.  Handles that are neither a file
	 * system nor a volume are skipped with a return value of 0; otherwise
	 * the result of zfs_set_fsacl() is returned.
	 */
	static int
	set_deleg_perms(zfs_handle_t *zhp, void *data)
	{
		struct deleg_perms *perms = (struct deleg_perms *)data;
		zfs_type_t zfs_type = zfs_get_type(zhp);

		if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
			return (0);

		return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
	}

	/*
	 * Shared implementation of "zfs allow" (un == B_FALSE) and
	 * "zfs unallow" (un == B_TRUE).
	 *
	 * Parses the options, opens the dataset, reads and parses its current
	 * permissions, and then either prints them or applies the requested
	 * change.  For "unallow -r" the change is also applied through
	 * zfs_iter_filesystems().
	 *
	 * Returns 0 on success, -1 if the dataset cannot be opened and 1 on
	 * any other failure.
	 */
	static int
	zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
	{
		zfs_handle_t *zhp;
		nvlist_t *perm_nvl = NULL;
		nvlist_t *update_perm_nvl = NULL;
		int error = 1;
		int c;
		struct allow_opts opts = { 0 };

		const char *optstr = un ? "ldugecsrh" : "ldugecsh";

		/* check opts */
		while ((c = getopt(argc, argv, optstr)) != -1) {
			switch (c) {
			case 'l':
				opts.local = B_TRUE;
				break;
			case 'd':
				opts.descend = B_TRUE;
				break;
			case 'u':
				opts.user = B_TRUE;
				break;
			case 'g':
				opts.group = B_TRUE;
				break;
			case 'e':
				opts.everyone = B_TRUE;
				break;
			case 's':
				opts.set = B_TRUE;
				break;
			case 'c':
				opts.create = B_TRUE;
				break;
			case 'r':
				opts.recursive = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case 'h':
				opts.prt_usage = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check arguments */
		parse_allow_args(argc, argv, un, &opts);

		/* try to open the dataset */
		if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
		    ZFS_TYPE_VOLUME)) == NULL) {
			(void) fprintf(stderr, "Failed to open dataset: %s\n",
			    opts.dataset);
			return (-1);
		}

		if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
			goto cleanup2;

		fs_perm_set_init(&fs_perm_set);
		if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
			(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
			/*
			 * perm_nvl is already allocated here, so take the path
			 * that frees it; the original jumped past the frees.
			 */
			goto cleanup0;
		}

		if (opts.prt_perms)
			print_fs_perms(&fs_perm_set);
		else {
			(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
			if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
				goto cleanup0;

			if (un && opts.recursive) {
				struct deleg_perms data = { un, update_perm_nvl };
				if (zfs_iter_filesystems(zhp, set_deleg_perms,
				    &data) != 0)
					goto cleanup0;
			}
		}

		error = 0;

	cleanup0:
		/* update_perm_nvl may still be NULL here, as in the print path */
		nvlist_free(perm_nvl);
		nvlist_free(update_perm_nvl);
		fs_perm_set_fini(&fs_perm_set);
	cleanup2:
		zfs_close(zhp);

		return (error);
	}

	static int
	zfs_do_allow(int argc, char **argv)
	{
	return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
	}

	static int
	zfs_do_unallow(int argc, char **argv)
	{
	return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
	}

	/*
	 * Shared implementation of 'zfs hold' (holding == B_TRUE) and
	 * 'zfs release' (holding == B_FALSE).
	 *
	 * After option parsing the first operand is the tag and every remaining
	 * operand must be a snapshot name of the form <dataset>@<snap>. The
	 * dataset part is opened and the hold is placed or released on the
	 * snapshot part. Operands that fail are counted and the loop continues.
	 *
	 * Returns 0 when every operand succeeded and 1 otherwise.
	 */
	static int
	zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
	{
		int errors = 0;
		int i;
		const char *tag;
		boolean_t recursive = B_FALSE;
		/* 't' is accepted when holding but has no effect below */
		const char *opts = holding ? "rt" : "r";
		int c;

		/* check options */
		while ((c = getopt(argc, argv, opts)) != -1) {
			switch (c) {
			case 'r':
				recursive = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 2)
			usage(B_FALSE);

		tag = argv[0];
		--argc;
		++argv;

		if (holding && tag[0] == '.') {
			/* tags starting with '.' are reserved for libzfs */
			(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
			usage(B_FALSE);
		}

		for (i = 0; i < argc; ++i) {
			zfs_handle_t *zhp;
			char parent[ZFS_MAX_DATASET_NAME_LEN];
			const char *delim;
			char *path = argv[i];
			size_t parentlen;

			delim = strchr(path, '@');
			if (delim == NULL) {
				(void) fprintf(stderr,
				    gettext("'%s' is not a snapshot\n"), path);
				++errors;
				continue;
			}

			/*
			 * The dataset part comes straight from the command line;
			 * refuse anything that would not fit in 'parent' (including
			 * its terminator) instead of writing past the buffer.
			 */
			parentlen = (size_t)(delim - path);
			if (parentlen >= sizeof (parent)) {
				(void) fprintf(stderr,
				    gettext("'%s': dataset name is too long\n"), path);
				++errors;
				continue;
			}
			(void) memcpy(parent, path, parentlen);
			parent[parentlen] = '\0';

			zhp = zfs_open(g_zfs, parent,
			    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
			if (zhp == NULL) {
				++errors;
				continue;
			}
			if (holding) {
				if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
					++errors;
			} else {
				if (zfs_release(zhp, delim+1, tag, recursive) != 0)
					++errors;
			}
			zfs_close(zhp);
		}

		return (errors != 0);
	}

	/*
	* zfs hold [-r] [-t] <tag> <snap> ...
	*
	* -r Recursively hold
	*
	* Apply a user-hold with the given tag to the list of snapshots.
	*/
	static int
	zfs_do_hold(int argc, char **argv)
	{
	return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
	}

	/*
	* zfs release [-r] <tag> <snap> ...
	*
	* -r Recursively release
	*
	* Release a user-hold with the given tag from the list of snapshots.
	*/
	static int
	zfs_do_release(int argc, char **argv)
	{
	return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
	}

	/*
	 * State shared between zfs_do_holds() and holds_callback().
	 */
	typedef struct holds_cbdata {
	/* copy of the caller's "recursive" flag; enables the snapshot-name filter */
	boolean_t cb_recursive;
	/* snapshot component the callback compares against, or NULL */
	const char *cb_snapname;
	/* address of the nvlist the callback adds per-dataset holds to */
	nvlist_t **cb_nvlp;
	/* longest dataset name seen so far; later used as a column width */
	size_t cb_max_namelen;
	/* longest hold tag seen so far; later used as a column width */
	size_t cb_max_taglen;
	} holds_cbdata_t;

	#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
	#define DATETIME_BUF_LEN (32)
	/*
	 * Print the holds collected in 'nvl'.
	 *
	 * 'nvl' maps a dataset name to a nested nvlist whose pairs are
	 * (tag name, uint64 timestamp). One output line is produced per tag.
	 *
	 *	scripted	suppress the header and separate fields with tabs
	 *	literal		print the timestamp as a raw number instead of
	 *			formatting it with strftime()
	 *	nwidth		column width for the dataset name
	 *	tagwidth	column width for the tag
	 */
	static void
	print_holds(boolean_t scripted, boolean_t literal, size_t nwidth,
	    size_t tagwidth, nvlist_t *nvl)
	{
		int i;
		nvpair_t *nvp = NULL;
		char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
		const char *col;

		if (!scripted) {
			for (i = 0; i < 3; i++) {
				col = gettext(hdr_cols[i]);
				if (i < 2)
					/* a '*' field width must be an int */
					(void) printf("%-*s ",
					    (int)(i ? tagwidth : nwidth), col);
				else
					(void) printf("%s\n", col);
			}
		}

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			char *zname = nvpair_name(nvp);
			nvlist_t *nvl2;
			nvpair_t *nvp2 = NULL;
			(void) nvpair_value_nvlist(nvp, &nvl2);
			while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
				char tsbuf[DATETIME_BUF_LEN];
				char *tagname = nvpair_name(nvp2);
				uint64_t val = 0;
				time_t time;
				struct tm t;

				(void) nvpair_value_uint64(nvp2, &val);
				if (literal)
					/* cast so the argument matches %llu */
					(void) snprintf(tsbuf, sizeof (tsbuf), "%llu",
					    (unsigned long long)val);
				else {
					time = (time_t)val;
					(void) localtime_r(&time, &t);
					(void) strftime(tsbuf, DATETIME_BUF_LEN,
					    gettext(STRFTIME_FMT_STR), &t);
				}

				if (scripted) {
					(void) printf("%s\t%s\t%s\n", zname,
					    tagname, tsbuf);
				} else {
					/*
					 * Each width argument pairs with a '*' in
					 * the format; without it the width would
					 * be consumed as the string argument.
					 */
					(void) printf("%-*s %-*s %s\n", (int)nwidth,
					    zname, (int)tagwidth, tagname, tsbuf);
				}
			}
		}
	}

	/*
	 * Per-dataset callback used by zfs_do_holds().
	 *
	 * When a recursive listing was requested together with a snapshot name,
	 * datasets that are not a snapshot of that name are skipped (returning
	 * 0). Otherwise the dataset's holds are fetched, the widest dataset name
	 * and tag seen so far are recorded in the callback data, and the holds
	 * are added to the caller's result nvlist under the dataset name.
	 *
	 * Returns -1 if the holds cannot be fetched, otherwise the result of
	 * nvlist_add_nvlist().
	 */
	static int
	holds_callback(zfs_handle_t *zhp, void *data)
	{
		holds_cbdata_t *cbp = data;
		nvlist_t *top_nvl = *cbp->cb_nvlp;
		nvlist_t *nvl = NULL;
		nvpair_t *nvp = NULL;
		const char *zname = zfs_get_name(zhp);
		size_t znamelen = strlen(zname);
		int ret;

		if (cbp->cb_recursive && cbp->cb_snapname != NULL) {
			const char *snapname;
			const char *delim = strchr(zname, '@');
			if (delim == NULL)
				return (0);

			snapname = delim + 1;
			if (strcmp(cbp->cb_snapname, snapname))
				return (0);
		}

		if (zfs_get_holds(zhp, &nvl) != 0)
			return (-1);

		if (znamelen > cbp->cb_max_namelen)
			cbp->cb_max_namelen = znamelen;

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			const char *tag = nvpair_name(nvp);
			size_t taglen = strlen(tag);
			if (taglen > cbp->cb_max_taglen)
				cbp->cb_max_taglen = taglen;
		}

		/*
		 * nvlist_add_nvlist() stores a copy of 'nvl', so our own
		 * reference has to be released here or it is leaked for every
		 * dataset visited.
		 */
		ret = nvlist_add_nvlist(top_nvl, zname, nvl);
		nvlist_free(nvl);

		return (ret);
	}

	/*
	 * zfs holds [-Hp] [-r | -d max] <dataset|snap> ...
	 *
	 *	-H	Suppress header output
	 *	-p	Output literal values
	 *	-r	Recursively search for holds
	 *	-d max	Limit depth of recursive search
	 *
	 * Collects the holds of every operand into one nvlist (pass 1) and then
	 * prints them with column widths derived from the collected data
	 * (pass 2). Returns 0 if every operand was processed successfully and 1
	 * otherwise.
	 */
	static int
	zfs_do_holds(int argc, char **argv)
	{
		int errors = 0;
		int c;
		int i;
		boolean_t scripted = B_FALSE;
		boolean_t literal = B_FALSE;
		boolean_t recursive = B_FALSE;
		const char *opts = "d:rHp";
		nvlist_t *nvl;

		int types = ZFS_TYPE_SNAPSHOT;
		holds_cbdata_t cb = { 0 };

		int limit = 0;
		int ret = 0;
		int flags = 0;

		/* check options */
		while ((c = getopt(argc, argv, opts)) != -1) {
			switch (c) {
			case 'd':
				limit = parse_depth(optarg, &flags);
				recursive = B_TRUE;
				break;
			case 'r':
				recursive = B_TRUE;
				break;
			case 'H':
				scripted = B_TRUE;
				break;
			case 'p':
				literal = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		if (recursive) {
			types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
			flags |= ZFS_ITER_RECURSE;
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1)
			usage(B_FALSE);

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
			nomem();

		for (i = 0; i < argc; ++i) {
			char *snapshot = argv[i];
			const char *delim;
			const char *snapname = NULL;

			delim = strchr(snapshot, '@');
			if (delim != NULL) {
				snapname = delim + 1;
				/*
				 * For a recursive search, cut the operand at '@'
				 * so the walk starts at the dataset; the callback
				 * then filters on the snapshot name.
				 */
				if (recursive)
					snapshot[delim - snapshot] = '\0';
			}

			cb.cb_recursive = recursive;
			cb.cb_snapname = snapname;
			cb.cb_nvlp = &nvl;

			/*
			 * 1. collect holds data, set format options
			 *
			 * Only the current operand is walked: cb_snapname belongs
			 * to argv[i], so walking the whole argument vector here
			 * would apply this operand's snapshot filter to all the
			 * others (and repeat the walk argc times).
			 */
			ret = zfs_for_each(1, &argv[i], flags, types, NULL, NULL,
			    limit, holds_callback, &cb);
			if (ret != 0)
				++errors;
		}

		/*
		 * 2. print holds data
		 */
		print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen,
		    nvl);

		if (nvlist_empty(nvl))
			(void) printf(gettext("no datasets available\n"));

		nvlist_free(nvl);

		return (0 != errors);
	}

	#define CHECK_SPINNER 30
	#define SPINNER_TIME 3 /* seconds */
	#define MOUNT_TIME 5 /* seconds */

	/*
	 * Iterator callback that collects filesystem handles into the
	 * get_all_cb_t passed as 'data'.
	 *
	 * In verbose mode a spinner is advanced: the clock is consulted only
	 * once per CHECK_SPINNER callbacks and the spinner is redrawn only when
	 * more than SPINNER_TIME seconds have passed since the last redraw.
	 *
	 * Children are visited before the handle itself is considered. The
	 * handle is closed here when iteration fails or when its type is not a
	 * filesystem; otherwise it is handed to libzfs_add_handle().
	 *
	 * Returns 1 if iterating the children failed and 0 otherwise.
	 */
	static int
	get_one_dataset(zfs_handle_t *zhp, void *data)
	{
		static char *spin[] = { "-", "\\", "|", "/" };
		static int spinval = 0;
		static int spincheck = 0;
		static time_t last_spin_time = (time_t)0;
		get_all_cb_t *cbp = data;
		zfs_type_t type = zfs_get_type(zhp);

		if (cbp->cb_verbose) {
			if (--spincheck < 0) {
				time_t now = time(NULL);
				if (last_spin_time + SPINNER_TIME < now) {
					update_progress(spin[spinval++ % 4]);
					last_spin_time = now;
				}
				spincheck = CHECK_SPINNER;
			}
		}

		/*
		 * Iterate over any nested datasets.
		 */
		if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
			zfs_close(zhp);
			return (1);
		}

		/*
		 * Skip any datasets whose type does not match.
		 */
		if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
			zfs_close(zhp);
			return (0);
		}
		libzfs_add_handle(cbp, zhp);
		assert(cbp->cb_used <= cbp->cb_alloc);

		return (0);
	}

	/*
	 * Walk every dataset from the root with get_one_dataset() and hand the
	 * collected handles back to the caller.
	 *
	 *	dslist	out: array of collected handles (cb_handles)
	 *	count	out: number of entries in *dslist (cb_used)
	 *	verbose	show a progress header/spinner while scanning
	 *
	 * The result of zfs_iter_root() is deliberately ignored; whatever was
	 * collected before a failure is still returned.
	 */
	static void
	get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
	{
		get_all_cb_t cb = { 0 };
		cb.cb_verbose = verbose;
		cb.cb_getone = get_one_dataset;

		if (verbose)
			set_progress_header(gettext("Reading ZFS config"));
		(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);

		*dslist = cb.cb_handles;
		*count = cb.cb_used;

		if (verbose)
			finish_progress(gettext("done."));
	}

	/*
	 * Generic callback for sharing or mounting filesystems. Because the code is so
	 * similar, we have a common function with an extra parameter to determine which
	 * mode we are using.
	 */
	#define OP_SHARE 0x1
	#define OP_MOUNT 0x2

	/*
	 * Share or mount a dataset.
	 *
	 *	zhp		filesystem handle (asserted to be a filesystem)
	 *	op		OP_SHARE or OP_MOUNT
	 *	flags		passed to zfs_mount() for OP_MOUNT
	 *	protocol	for OP_SHARE: NULL (all), "nfs" or "smb"
	 *	explicit	B_TRUE when the user named this dataset; datasets
	 *			that cannot be handled are then reported as errors
	 *			instead of being skipped silently
	 *	options		mount options for OP_MOUNT, or NULL
	 *
	 * Returns 0 on success or when the dataset was skipped, 1 on error.
	 */
	static int
	share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
	    boolean_t explicit, const char *options)
	{
		char mountpoint[ZFS_MAXPROPLEN];
		char shareopts[ZFS_MAXPROPLEN];
		char smbshareopts[ZFS_MAXPROPLEN];
		const char *cmdname = op == OP_SHARE ? "share" : "mount";
		struct mnttab mnt;
		uint64_t zoned, canmount;
		boolean_t shared_nfs, shared_smb;

		assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);

		/*
		 * Check to make sure we can mount/share this dataset. If we
		 * are in the global zone and the filesystem is exported to a
		 * local zone, or if we are in a local zone and the
		 * filesystem is not exported, then it is an error.
		 */
		zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);

		if (zoned && getzoneid() == GLOBAL_ZONEID) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': "
			    "dataset is exported to a local zone\n"), cmdname,
			    zfs_get_name(zhp));
			return (1);

		} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': "
			    "permission denied\n"), cmdname,
			    zfs_get_name(zhp));
			return (1);
		}

		/*
		 * Ignore any filesystems which don't apply to us. This
		 * includes those with a legacy mountpoint, or those with
		 * legacy share options.
		 */
		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
		    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
		verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
		    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);

		if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
		    strcmp(smbshareopts, "off") == 0) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot share '%s': "
			    "legacy share\n"), zfs_get_name(zhp));
			(void) fprintf(stderr, gettext("to "
			    "share this filesystem set "
			    "sharenfs property on\n"));
			return (1);
		}

		/*
		 * We cannot share or mount legacy filesystems. If the
		 * shareopts is non-legacy but the mountpoint is legacy, we
		 * treat it as a legacy share.
		 */
		if (strcmp(mountpoint, "legacy") == 0) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': "
			    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
			(void) fprintf(stderr, gettext("use %s(8) to "
			    "%s this filesystem\n"), cmdname, cmdname);
			return (1);
		}

		if (strcmp(mountpoint, "none") == 0) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': no "
			    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
			return (1);
		}

		/*
		 * canmount	explicit	outcome
		 * on		no		pass through
		 * on		yes		pass through
		 * off		no		return 0
		 * off		yes		display error, return 1
		 * noauto	no		return 0
		 * noauto	yes		pass through
		 */
		canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
		if (canmount == ZFS_CANMOUNT_OFF) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': "
			    "'canmount' property is set to 'off'\n"), cmdname,
			    zfs_get_name(zhp));
			return (1);
		} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
			return (0);
		}

		/*
		 * If this filesystem is inconsistent and has a receive resume
		 * token, we can not mount it.
		 */
		if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
		    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
		    NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
			if (!explicit)
				return (0);

			(void) fprintf(stderr, gettext("cannot %s '%s': "
			    "Contains partially-completed state from "
			    "\"zfs receive -r\", which can be resumed with "
			    "\"zfs send -t\"\n"),
			    cmdname, zfs_get_name(zhp));
			return (1);
		}

		/*
		 * At this point, we have verified that the mountpoint and/or
		 * shareopts are appropriate for auto management. If the
		 * filesystem is already mounted or shared, return (failing
		 * for explicit requests); otherwise mount or share the
		 * filesystem.
		 */
		switch (op) {
		case OP_SHARE:

			shared_nfs = zfs_is_shared_nfs(zhp, NULL);
			shared_smb = zfs_is_shared_smb(zhp, NULL);

			if ((shared_nfs && shared_smb) ||
			    (shared_nfs && strcmp(shareopts, "on") == 0 &&
			    strcmp(smbshareopts, "off") == 0) ||
			    (shared_smb && strcmp(smbshareopts, "on") == 0 &&
			    strcmp(shareopts, "off") == 0)) {
				if (!explicit)
					return (0);

				(void) fprintf(stderr, gettext("cannot share "
				    "'%s': filesystem already shared\n"),
				    zfs_get_name(zhp));
				return (1);
			}

			/* a filesystem must be mounted before it can be shared */
			if (!zfs_is_mounted(zhp, NULL) &&
			    zfs_mount(zhp, NULL, 0) != 0)
				return (1);

			if (protocol == NULL) {
				if (zfs_shareall(zhp) != 0)
					return (1);
			} else if (strcmp(protocol, "nfs") == 0) {
				if (zfs_share_nfs(zhp))
					return (1);
			} else if (strcmp(protocol, "smb") == 0) {
				if (zfs_share_smb(zhp))
					return (1);
			} else {
				(void) fprintf(stderr, gettext("cannot share "
				    "'%s': invalid share type '%s' "
				    "specified\n"),
				    zfs_get_name(zhp), protocol);
				return (1);
			}

			break;

		case OP_MOUNT:
			/* only mnt_mntopts is filled in before hasmntopt() */
			if (options == NULL)
				mnt.mnt_mntopts = "";
			else
				mnt.mnt_mntopts = (char *)options;

			if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
			    zfs_is_mounted(zhp, NULL)) {
				if (!explicit)
					return (0);

				(void) fprintf(stderr, gettext("cannot mount "
				    "'%s': filesystem already mounted\n"),
				    zfs_get_name(zhp));
				return (1);
			}

			if (zfs_mount(zhp, options, flags) != 0)
				return (1);
			break;
		}

		return (0);
	}

	/*
	* Reports progress in the form "(current/total)". Not thread-safe.
	*/
	static void
	report_mount_progress(int current, int total)
	{
	static time_t last_progress_time = 0;
	time_t now = time(NULL);
	char info[32];

	/* report 1..n instead of 0..n-1 */
	++current;

	/* display header if we're here for the first time */
	if (current == 1) {
	set_progress_header(gettext("Mounting ZFS filesystems"));
	} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
	/* too soon to report again */
	return;
	}

	last_progress_time = now;

	(void) sprintf(info, "(%d/%d)", current, total);

	if (current == total)
	finish_progress(info);
	else
	update_progress(info);
	}

	/*
	 * Append 'newopts' to the comma-separated option string in 'mntopts'.
	 *
	 * A ',' separator is inserted when 'mntopts' is not empty. If the
	 * combined string would reach MNT_LINE_MAX characters an error is
	 * printed and usage() is called.
	 */
	static void
	append_options(char *mntopts, char *newopts)
	{
		size_t len = strlen(mntopts);

		/* original length plus new string to append plus 1 for the comma */
		if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
			/* %c needs a character argument, not the string "-o" */
			(void) fprintf(stderr, gettext("the opts argument for "
			    "'%c' option is too long (more than %d chars)\n"),
			    'o', MNT_LINE_MAX);
			usage(B_FALSE);
		}

		if (*mntopts)
			mntopts[len++] = ',';

		(void) strcpy(&mntopts[len], newopts);
	}

	/*
	 * Shared implementation of 'zfs mount' (OP_MOUNT) and 'zfs share'
	 * (OP_SHARE).
	 *
	 * Three modes, chosen from the arguments:
	 *   -a            act on every filesystem found by get_all_datasets();
	 *                 for OP_SHARE an optional "nfs"/"smb" operand selects
	 *                 the protocol
	 *   no operand    (mount only) list the ZFS entries of the mount table
	 *   one operand   act on the named filesystem
	 *
	 * Returns 0 on success and 1 if any filesystem failed.
	 */
	static int
	share_mount(int op, int argc, char **argv)
	{
		int do_all = 0;
		boolean_t verbose = B_FALSE;
		int c, ret = 0;
		char *options = NULL;
		int flags = 0;

		/* check options */
		while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
		    != -1) {
			switch (c) {
			case 'a':
				do_all = 1;
				break;
			case 'v':
				verbose = B_TRUE;
				break;
			case 'o':
				if (*optarg == '\0') {
					(void) fprintf(stderr, gettext("empty mount "
					    "options (-o) specified\n"));
					usage(B_FALSE);
				}

				/*
				 * NOTE(review): append_options() calls strlen()
				 * on this buffer, so this relies on safe_malloc()
				 * returning zeroed memory - confirm.
				 */
				if (options == NULL)
					options = safe_malloc(MNT_LINE_MAX + 1);

				/* option validation is done later */
				append_options(options, optarg);
				break;

			case 'O':
				warnx("no overlay mounts support on FreeBSD, ignoring");
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (do_all) {
			zfs_handle_t **dslist = NULL;
			size_t i, count = 0;
			char *protocol = NULL;

			if (op == OP_SHARE && argc > 0) {
				if (strcmp(argv[0], "nfs") != 0 &&
				    strcmp(argv[0], "smb") != 0) {
					(void) fprintf(stderr, gettext("share type "
					    "must be 'nfs' or 'smb'\n"));
					usage(B_FALSE);
				}
				protocol = argv[0];
				argc--;
				argv++;
			}

			if (argc != 0) {
				(void) fprintf(stderr, gettext("too many arguments\n"));
				usage(B_FALSE);
			}

			start_progress_timer();
			get_all_datasets(&dslist, &count, verbose);

			if (count == 0) {
				free(options);
				return (0);
			}

			qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);

			for (i = 0; i < count; i++) {
				if (verbose)
					report_mount_progress(i, count);

				if (share_mount_one(dslist[i], op, flags, protocol,
				    B_FALSE, options) != 0)
					ret = 1;
				zfs_close(dslist[i]);
			}

			free(dslist);
		} else if (argc == 0) {
			struct mnttab entry;

			if ((op == OP_SHARE) || (options != NULL)) {
				(void) fprintf(stderr, gettext("missing filesystem "
				    "argument (specify -a for all)\n"));
				usage(B_FALSE);
			}

			/*
			 * When mount is given no arguments, go through /etc/mnttab and
			 * display any active ZFS mounts. We hide any snapshots, since
			 * they are controlled automatically.
			 */
			rewind(mnttab_file);
			while (getmntent(mnttab_file, &entry) == 0) {
				if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
				    strchr(entry.mnt_special, '@') != NULL)
					continue;

				(void) printf("%-30s %s\n", entry.mnt_special,
				    entry.mnt_mountp);
			}

		} else {
			zfs_handle_t *zhp;

			if (argc > 1) {
				(void) fprintf(stderr,
				    gettext("too many arguments\n"));
				usage(B_FALSE);
			}

			if ((zhp = zfs_open(g_zfs, argv[0],
			    ZFS_TYPE_FILESYSTEM)) == NULL) {
				ret = 1;
			} else {
				ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
				    options);
				zfs_close(zhp);
			}
		}

		/* the -o buffer is owned by this function; free(NULL) is a no-op */
		free(options);

		return (ret);
	}

	/*
	 * zfs mount -a [nfs]
	 * zfs mount filesystem
	 *
	 * Mounts every filesystem or just the named one, by running the shared
	 * share/mount implementation in OP_MOUNT mode.
	 */
	static int
	zfs_do_mount(int argc, char **argv)
	{
		const int op = OP_MOUNT;

		return (share_mount(op, argc, argv));
	}

	/*
	 * zfs share -a [nfs | smb]
	 * zfs share filesystem
	 *
	 * Shares every filesystem or just the named one, by running the shared
	 * share/mount implementation in OP_SHARE mode.
	 */
	static int
	zfs_do_share(int argc, char **argv)
	{
		const int op = OP_SHARE;

		return (share_mount(op, argc, argv));
	}

	/*
	 * One mounted ZFS filesystem queued by unshare_unmount() for the -a case.
	 */
	typedef struct unshare_unmount_node {
	/* open handle for the dataset; closed when the node is discarded */
	zfs_handle_t *un_zhp;
	/* heap copy of the mountpoint; also the key the AVL tree compares */
	char *un_mountp;
	/* linkage into the uu_avl tree */
	uu_avl_node_t un_avlnode;
	} unshare_unmount_node_t;

	/*
	 * AVL comparator for unshare_unmount_node_t: orders nodes by their
	 * mountpoint string using strcmp(). 'unused' is ignored.
	 */
	/* ARGSUSED */
	static int
	unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
	{
		const unshare_unmount_node_t *l = larg;
		const unshare_unmount_node_t *r = rarg;

		return (strcmp(l->un_mountp, r->un_mountp));
	}

	/*
	 * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an
	 * absolute path, find the entry /etc/mnttab, verify that it's a ZFS filesystem,
	 * and unmount it appropriately.
	 *
	 *	op		OP_SHARE (unshare) or OP_MOUNT (unmount)
	 *	path		absolute path supplied by the user
	 *	flags		unmount flags (e.g. MS_FORCE)
	 *	is_manual	B_TRUE when called from manual_unmount(); the
	 *			filesystem is then unmounted with zfs_unmount()
	 *			without the legacy-mountpoint check
	 *
	 * Returns 0 on success and 1 on failure.
	 */
	static int
	unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
	{
		zfs_handle_t *zhp;
		int ret = 0;
		struct stat64 statbuf;
		struct extmnttab entry;
		const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
		ino_t path_inode;

		/*
		 * Search for the path in /etc/mnttab. Rather than looking for the
		 * specific path, which can be fooled by non-standard paths (i.e. ".."
		 * or "//"), we stat() the path and search for the corresponding
		 * (major,minor) device pair.
		 */
		if (stat64(path, &statbuf) != 0) {
			(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
			    cmdname, path, strerror(errno));
			return (1);
		}
		path_inode = statbuf.st_ino;

		/*
		 * Search for the given (major,minor) pair in the mount table.
		 */
	#ifdef illumos
		rewind(mnttab_file);
		while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
			if (entry.mnt_major == major(statbuf.st_dev) &&
			    entry.mnt_minor == minor(statbuf.st_dev))
				break;
		}
	#else
		{
			struct statfs sfs;

			if (statfs(path, &sfs) != 0) {
				(void) fprintf(stderr, "%s: %s\n", path,
				    strerror(errno));
				ret = -1;
			} else {
				/*
				 * Only convert when statfs() filled 'sfs' in;
				 * on failure its contents are indeterminate.
				 */
				statfs2mnttab(&sfs, &entry);
			}
		}
	#endif
		if (ret != 0) {
			if (op == OP_SHARE) {
				(void) fprintf(stderr, gettext("cannot %s '%s': not "
				    "currently mounted\n"), cmdname, path);
				return (1);
			}
			(void) fprintf(stderr, gettext("warning: %s not in mnttab\n"),
			    path);
			if ((ret = umount2(path, flags)) != 0)
				(void) fprintf(stderr, gettext("%s: %s\n"), path,
				    strerror(errno));
			return (ret != 0);
		}

		if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
			(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
			    "filesystem\n"), cmdname, path);
			return (1);
		}

		if ((zhp = zfs_open(g_zfs, entry.mnt_special,
		    ZFS_TYPE_FILESYSTEM)) == NULL)
			return (1);

		/* assume failure from here on; only the unshare/unmount calls clear it */
		ret = 1;
		if (stat64(entry.mnt_mountp, &statbuf) != 0) {
			(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
			    cmdname, path, strerror(errno));
			goto out;
		} else if (statbuf.st_ino != path_inode) {
			/* the path lies inside the filesystem but is not its root */
			(void) fprintf(stderr, gettext("cannot "
			    "%s '%s': not a mountpoint\n"), cmdname, path);
			goto out;
		}

		if (op == OP_SHARE) {
			char nfs_mnt_prop[ZFS_MAXPROPLEN];
			char smbshare_prop[ZFS_MAXPROPLEN];

			verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
			    sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
			verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
			    sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);

			if (strcmp(nfs_mnt_prop, "off") == 0 &&
			    strcmp(smbshare_prop, "off") == 0) {
				(void) fprintf(stderr, gettext("cannot unshare "
				    "'%s': legacy share\n"), path);
	#ifdef illumos
				(void) fprintf(stderr, gettext("use "
				    "unshare(1M) to unshare this filesystem\n"));
	#endif
			} else if (!zfs_is_shared(zhp)) {
				(void) fprintf(stderr, gettext("cannot unshare '%s': "
				    "not currently shared\n"), path);
			} else {
				ret = zfs_unshareall_bypath(zhp, path);
			}
		} else {
			char mtpt_prop[ZFS_MAXPROPLEN];

			verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
			    sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);

			if (is_manual) {
				ret = zfs_unmount(zhp, NULL, flags);
			} else if (strcmp(mtpt_prop, "legacy") == 0) {
				(void) fprintf(stderr, gettext("cannot unmount "
				    "'%s': legacy mountpoint\n"),
				    zfs_get_name(zhp));
				(void) fprintf(stderr, gettext("use umount(8) "
				    "to unmount this filesystem\n"));
			} else {
				ret = zfs_unmountall(zhp, flags);
			}
		}

	out:
		zfs_close(zhp);

		return (ret != 0);
	}

	/*
	 * Generic callback for unsharing or unmounting a filesystem.
	 *
	 * Shared implementation of 'zfs unmount' (OP_MOUNT) and 'zfs unshare'
	 * (OP_SHARE). With -a every eligible ZFS entry of the mount table is
	 * processed; otherwise exactly one operand is required, which may be an
	 * absolute path or a dataset name. -f (unmount only) sets MS_FORCE.
	 *
	 * Returns 0 on success and 1 on failure.
	 */
	static int
	unshare_unmount(int op, int argc, char **argv)
	{
		int do_all = 0;
		int flags = 0;
		int ret = 0;
		int c;
		zfs_handle_t *zhp;
		char nfs_mnt_prop[ZFS_MAXPROPLEN];
		char sharesmb[ZFS_MAXPROPLEN];

		/* check options */
		while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
			switch (c) {
			case 'a':
				do_all = 1;
				break;
			case 'f':
				flags = MS_FORCE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (do_all) {
			/*
			 * We could make use of zfs_for_each() to walk all datasets in
			 * the system, but this would be very inefficient, especially
			 * since we would have to linearly search /etc/mnttab for each
			 * one. Instead, do one pass through /etc/mnttab looking for
			 * zfs entries and call zfs_unmount() for each one.
			 *
			 * Things get a little tricky if the administrator has created
			 * mountpoints beneath other ZFS filesystems. In this case, we
			 * have to unmount the deepest filesystems first. To accomplish
			 * this, we place all the mountpoints in an AVL tree sorted by
			 * mountpoint path (see unshare_unmount_compare()), and walk
			 * the result in reverse so that nested mountpoints are
			 * handled before the directories that contain them.
			 */
			struct mnttab entry;
			uu_avl_pool_t *pool;
			uu_avl_t *tree = NULL;
			unshare_unmount_node_t *node;
			uu_avl_index_t idx;
			uu_avl_walk_t *walk;

			if (argc != 0) {
				(void) fprintf(stderr, gettext("too many arguments\n"));
				usage(B_FALSE);
			}

			if (((pool = uu_avl_pool_create("unmount_pool",
			    sizeof (unshare_unmount_node_t),
			    offsetof(unshare_unmount_node_t, un_avlnode),
			    unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
			    ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
				nomem();

			rewind(mnttab_file);
			while (getmntent(mnttab_file, &entry) == 0) {

				/* ignore non-ZFS entries */
				if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
					continue;

				/* ignore snapshots */
				if (strchr(entry.mnt_special, '@') != NULL)
					continue;

				if ((zhp = zfs_open(g_zfs, entry.mnt_special,
				    ZFS_TYPE_FILESYSTEM)) == NULL) {
					ret = 1;
					continue;
				}

				/*
				 * Ignore datasets that are excluded/restricted by
				 * parent pool name.
				 */
				if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
					zfs_close(zhp);
					continue;
				}

				/*
				 * Every 'continue' below skips this entry, so the
				 * handle opened above must be closed first.
				 */
				switch (op) {
				case OP_SHARE:
					verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
					    nfs_mnt_prop,
					    sizeof (nfs_mnt_prop),
					    NULL, NULL, 0, B_FALSE) == 0);
					if (strcmp(nfs_mnt_prop, "off") != 0)
						break;
					verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
					    nfs_mnt_prop,
					    sizeof (nfs_mnt_prop),
					    NULL, NULL, 0, B_FALSE) == 0);
					if (strcmp(nfs_mnt_prop, "off") == 0) {
						zfs_close(zhp);
						continue;
					}
					break;
				case OP_MOUNT:
					/* Ignore legacy mounts */
					verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
					    nfs_mnt_prop,
					    sizeof (nfs_mnt_prop),
					    NULL, NULL, 0, B_FALSE) == 0);
					if (strcmp(nfs_mnt_prop, "legacy") == 0) {
						zfs_close(zhp);
						continue;
					}
					/* Ignore canmount=noauto mounts */
					if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
					    ZFS_CANMOUNT_NOAUTO) {
						zfs_close(zhp);
						continue;
					}
					break;
				default:
					break;
				}

				node = safe_malloc(sizeof (unshare_unmount_node_t));
				node->un_zhp = zhp;
				node->un_mountp = safe_strdup(entry.mnt_mountp);

				uu_avl_node_init(node, &node->un_avlnode, pool);

				if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
					uu_avl_insert(tree, node, idx);
				} else {
					/* duplicate mountpoint: drop this node */
					zfs_close(node->un_zhp);
					free(node->un_mountp);
					free(node);
				}
			}

			/*
			 * Walk the AVL tree in reverse, unmounting each filesystem and
			 * removing it from the AVL tree in the process.
			 */
			if ((walk = uu_avl_walk_start(tree,
			    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
				nomem();

			while ((node = uu_avl_walk_next(walk)) != NULL) {
				uu_avl_remove(tree, node);

				switch (op) {
				case OP_SHARE:
					if (zfs_unshareall_bypath(node->un_zhp,
					    node->un_mountp) != 0)
						ret = 1;
					break;

				case OP_MOUNT:
					if (zfs_unmount(node->un_zhp,
					    node->un_mountp, flags) != 0)
						ret = 1;
					break;
				}

				zfs_close(node->un_zhp);
				free(node->un_mountp);
				free(node);
			}

			uu_avl_walk_end(walk);
			uu_avl_destroy(tree);
			uu_avl_pool_destroy(pool);

		} else {
			if (argc != 1) {
				if (argc == 0)
					(void) fprintf(stderr,
					    gettext("missing filesystem argument\n"));
				else
					(void) fprintf(stderr,
					    gettext("too many arguments\n"));
				usage(B_FALSE);
			}

			/*
			 * We have an argument, but it may be a full path or a ZFS
			 * filesystem. Pass full paths off to unmount_path() (shared by
			 * manual_unmount), otherwise open the filesystem and pass to
			 * zfs_unmount().
			 */
			if (argv[0][0] == '/')
				return (unshare_unmount_path(op, argv[0],
				    flags, B_FALSE));

			if ((zhp = zfs_open(g_zfs, argv[0],
			    ZFS_TYPE_FILESYSTEM)) == NULL)
				return (1);

			verify(zfs_prop_get(zhp, op == OP_SHARE ?
			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
			    nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
			    NULL, 0, B_FALSE) == 0);

			switch (op) {
			case OP_SHARE:
				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
				    nfs_mnt_prop,
				    sizeof (nfs_mnt_prop),
				    NULL, NULL, 0, B_FALSE) == 0);
				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
				    sharesmb, sizeof (sharesmb), NULL, NULL,
				    0, B_FALSE) == 0);

				if (strcmp(nfs_mnt_prop, "off") == 0 &&
				    strcmp(sharesmb, "off") == 0) {
					(void) fprintf(stderr, gettext("cannot "
					    "unshare '%s': legacy share\n"),
					    zfs_get_name(zhp));
	#ifdef illumos
					(void) fprintf(stderr, gettext("use "
					    "unshare(1M) to unshare this "
					    "filesystem\n"));
	#endif
					ret = 1;
				} else if (!zfs_is_shared(zhp)) {
					(void) fprintf(stderr, gettext("cannot "
					    "unshare '%s': not currently "
					    "shared\n"), zfs_get_name(zhp));
					ret = 1;
				} else if (zfs_unshareall(zhp) != 0) {
					ret = 1;
				}
				break;

			case OP_MOUNT:
				if (strcmp(nfs_mnt_prop, "legacy") == 0) {
					(void) fprintf(stderr, gettext("cannot "
					    "unmount '%s': legacy "
					    "mountpoint\n"), zfs_get_name(zhp));
					(void) fprintf(stderr, gettext("use "
					    "umount(8) to unmount this "
					    "filesystem\n"));
					ret = 1;
				} else if (!zfs_is_mounted(zhp, NULL)) {
					(void) fprintf(stderr, gettext("cannot "
					    "unmount '%s': not currently "
					    "mounted\n"),
					    zfs_get_name(zhp));
					ret = 1;
				} else if (zfs_unmountall(zhp, flags) != 0) {
					ret = 1;
				}
				break;
			}

			zfs_close(zhp);
		}

		return (ret);
	}

	/*
	 * zfs unmount -a
	 * zfs unmount filesystem
	 *
	 * Unmounts every ZFS filesystem or just the named one, by running the
	 * shared unshare/unmount implementation in OP_MOUNT mode.
	 */
	static int
	zfs_do_unmount(int argc, char **argv)
	{
		const int op = OP_MOUNT;

		return (unshare_unmount(op, argc, argv));
	}

	/*
	 * zfs unshare -a
	 * zfs unshare filesystem
	 *
	 * Unshares every ZFS filesystem or just the named one, by running the
	 * shared unshare/unmount implementation in OP_SHARE mode.
	 */
	static int
	zfs_do_unshare(int argc, char **argv)
	{
		const int op = OP_SHARE;

		return (unshare_unmount(op, argc, argv));
	}

	/*
	 * Attach the dataset named by argv[2] to the jail named by argv[1], or
	 * detach it, depending on 'attach'. Exactly three arguments are
	 * required. Returns 0 on success and 1 on failure.
	 */
	/* ARGSUSED */
	static int
	do_jail(int argc, char **argv, int attach)
	{
		zfs_handle_t *fs;
		int jid;
		int failed;

		/* exactly three arguments are expected */
		if (argc < 3) {
			(void) fprintf(stderr, gettext("missing argument(s)\n"));
			usage(B_FALSE);
		} else if (argc > 3) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		if ((jid = jail_getid(argv[1])) < 0) {
			(void) fprintf(stderr, gettext("invalid jail id or name\n"));
			usage(B_FALSE);
		}

		if ((fs = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM)) == NULL)
			return (1);

		failed = (zfs_jail(fs, jid, attach) != 0);
		zfs_close(fs);

		return (failed);
	}

	/*
	 * zfs jail jailid filesystem
	 *
	 * Attaches the dataset to the jail by calling do_jail() in attach mode.
	 */
	/* ARGSUSED */
	static int
	zfs_do_jail(int argc, char **argv)
	{
		const int attach = 1;

		return (do_jail(argc, argv, attach));
	}

	/*
	 * zfs unjail jailid filesystem
	 *
	 * Detaches the dataset from the jail by calling do_jail() in detach mode.
	 */
	/* ARGSUSED */
	static int
	zfs_do_unjail(int argc, char **argv)
	{
		const int attach = 0;

		return (do_jail(argc, argv, attach));
	}

	/*
	* Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is
	* 'legacy'. Otherwise, complain that use should be using 'zfs mount'.
	*/
	static int
	manual_mount(int argc, char **argv)
	{
	zfs_handle_t *zhp;
	char mountpoint[ZFS_MAXPROPLEN];
	char mntopts[MNT_LINE_MAX] = { '\0' };
	int ret = 0;
	int c;
	int flags = 0;
	char dataset, path;

	/* check options */
	while ((c = getopt(argc, argv, ":mo:O")) != -1) {
	switch (c) {
	case 'o':
	(void) strlcpy(mntopts, optarg, sizeof (mntopts));
	break;
	case 'O':
	flags \|= MS_OVERLAY;
	break;
	case 'm':
	flags \|= MS_NOMNTTAB;
	break;
	case ':':
	(void) fprintf(stderr, gettext("missing argument for "
	"'%c' option\n"), optopt);
	usage(B_FALSE);
	break;
	case '?':
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	(void) fprintf(stderr, gettext("usage: mount [-o opts] "
	"<path>\n"));
	return (2);
	}
	}

	argc -= optind;
	argv += optind;

	/* check that we only have two arguments */
	if (argc != 2) {
	if (argc == 0)
	(void) fprintf(stderr, gettext("missing dataset "
	"argument\n"));
	else if (argc == 1)
	(void) fprintf(stderr,
	gettext("missing mountpoint argument\n"));
	else
	(void) fprintf(stderr, gettext("too many arguments\n"));
	(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
	return (2);
	}

	dataset = argv[0];
	path = argv[1];

	/* try to open the dataset */
	if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
	return (1);

	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
	sizeof (mountpoint), NULL, NULL, 0, B_FALSE);

	/* check for legacy mountpoint and complain appropriately */
	ret = 0;
	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
	if (zmount(dataset, path, flags, MNTTYPE_ZFS,
	NULL, 0, mntopts, sizeof (mntopts)) != 0) {
	(void) fprintf(stderr, gettext("mount failed: %s\n"),
	strerror(errno));
	ret = 1;
	}
	} else {
	(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
	"mounted using 'mount -t zfs'\n"), dataset);
	(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
	"instead.\n"), path);
	(void) fprintf(stderr, gettext("If you must use 'mount -t zfs' "
	"or /etc/fstab, use 'zfs set mountpoint=legacy'.\n"));
	(void) fprintf(stderr, gettext("See zfs(8) for more "
	"information.\n"));
	ret = 1;
	}

	return (ret);
	}

	/*
	 * Called when invoked as /etc/fs/zfs/umount. In contrast to a manual
	 * mount, unmounting a non-legacy filesystem is permitted here, since this
	 * is the dominant administrative interface.
	 *
	 * Accepts -f (force) and exactly one path operand. Returns 2 on a usage
	 * error, otherwise the result of unshare_unmount_path().
	 */
	static int
	manual_unmount(int argc, char **argv)
	{
		int unmount_flags = 0;
		int opt;

		/* check options */
		while ((opt = getopt(argc, argv, "f")) != -1) {
			if (opt == 'f') {
				unmount_flags = MS_FORCE;
			} else if (opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				(void) fprintf(stderr, gettext("usage: unmount [-f] "
				    "<path>\n"));
				return (2);
			}
		}

		argc -= optind;
		argv += optind;

		/* check arguments */
		if (argc == 1) {
			return (unshare_unmount_path(OP_MOUNT, argv[0],
			    unmount_flags, B_TRUE));
		}

		if (argc == 0) {
			(void) fprintf(stderr, gettext("missing path "
			    "argument\n"));
		} else {
			(void) fprintf(stderr, gettext("too many arguments\n"));
		}
		(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));

		return (2);
	}

	/*
	 * Look up 'command' by name in command_table.
	 *
	 * On a match the table index is stored through 'idx' and 0 is returned.
	 * If no entry matches, 1 is returned and '*idx' is left untouched.
	 * Entries whose name is NULL are skipped.
	 */
	static int
	find_command_idx(char *command, int *idx)
	{
		int i;

		for (i = 0; i < NCOMMAND; i++) {
			if (command_table[i].name == NULL)
				continue;

			if (strcmp(command, command_table[i].name) == 0) {
				*idx = i;
				return (0);
			}
		}
		return (1);
	}

	/*
	 * Handler for the diff subcommand.  Accepts the flags -F, -H and -t and
	 * one or two operands (fromsnap and, optionally, tosnap).  The dataset
	 * name is taken from fromsnap unless fromsnap starts with '@', in which
	 * case it is taken from tosnap; everything from the first '@' on is cut
	 * off before the filesystem is opened.  The differences are produced by
	 * zfs_show_diffs() on standard output.
	 *
	 * Returns 1 if the filesystem cannot be opened or zfs_show_diffs()
	 * reports an error, 0 otherwise.
	 */
	static int
	zfs_do_diff(int argc, char **argv)
	{
	zfs_handle_t *zhp;
	int flags = 0;
	char *tosnap = NULL;
	char *fromsnap = NULL;
	/*
	 * NOTE(review): 'atp' and 'copy' receive the results of strchr() and
	 * strdup() below, so they are presumably pointers whose declarators
	 * were lost in this rendering; the escaped or-assignment operators in
	 * the option switch look like the same kind of rendering damage.
	 */
	char atp, copy;
	int err = 0;
	int c;

	while ((c = getopt(argc, argv, "FHt")) != -1) {
	switch (c) {
	case 'F':
	flags \|= ZFS_DIFF_CLASSIFY;
	break;
	case 'H':
	flags \|= ZFS_DIFF_PARSEABLE;
	break;
	case 't':
	flags \|= ZFS_DIFF_TIMESTAMP;
	break;
	default:
	(void) fprintf(stderr,
	gettext("invalid option '%c'\n"), optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	if (argc < 1) {
	(void) fprintf(stderr,
	- gettext("must provide at least one snapshot name\n"));
	+ gettext("must provide at least one snapshot name\n"));
	usage(B_FALSE);
	}

	if (argc > 2) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	fromsnap = argv[0];
	tosnap = (argc == 2) ? argv[1] : NULL;

	/* Pick whichever operand carries the dataset name and duplicate it. */
	copy = NULL;
	if (*fromsnap != '@')
	copy = strdup(fromsnap);
	else if (tosnap)
	copy = strdup(tosnap);
	if (copy == NULL)
	usage(B_FALSE);

	/* Truncate at '@' so that only the filesystem name remains. */
	if ((atp = strchr(copy, '@')) != NULL)
	*atp = '\0';

	/* NOTE(review): 'copy' is not freed on this early-return path. */
	if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
	return (1);

	free(copy);

	/*
	* Ignore SIGPIPE so that the library can give us
	* information on any failure
	*/
	(void) sigignore(SIGPIPE);

	err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);

	zfs_close(zhp);

	return (err != 0);
	+}
	+
	/*
	 * Handler for the remap subcommand (added by this revision).  Expects
	 * argc == 2, i.e. the subcommand name plus exactly one operand, and
	 * passes argv[1] to zfs_remap_indirects().  Returns that call's result
	 * unchanged; a wrong argument count is reported through usage().
	 */
	+static int
	+zfs_do_remap(int argc, char **argv)
	+{
	+ const char *fsname;
	+ int err = 0;
	+ if (argc != 2) {
	+ (void) fprintf(stderr, gettext("wrong number of arguments\n"));
	+ usage(B_FALSE);
	+ }
	+
	+ fsname = argv[1];
	+ err = zfs_remap_indirects(g_zfs, fsname);
	+
	+ return (err);
	}

	/*
	* zfs bookmark <fs@snap> <fs#bmark>
	*
	* Creates a bookmark with the given name from the given snapshot.
	*/
	static int
	zfs_do_bookmark(int argc, char **argv)
	{
		char snapname[ZFS_MAX_DATASET_NAME_LEN];
		zfs_handle_t *zhp;
		nvlist_t *nvl;
		int ret = 0;
		int c;

		/*
		 * Returns 0 when the bookmark was created and 1 when
		 * lzc_bookmark() failed (a diagnostic is printed).  Argument
		 * errors, and a snapshot that cannot be opened, go through
		 * usage().
		 */

		/* check options */
		while ((c = getopt(argc, argv, "")) != -1) {
			switch (c) {
			case '?':
				(void) fprintf(stderr,
				    gettext("invalid option '%c'\n"), optopt);
				goto usage;
			}
		}

		argc -= optind;
		argv += optind;

		/* check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing snapshot argument\n"));
			goto usage;
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing bookmark argument\n"));
			goto usage;
		}

		if (strchr(argv[1], '#') == NULL) {
			(void) fprintf(stderr,
			    gettext("invalid bookmark name '%s' -- "
			    "must contain a '#'\n"), argv[1]);
			goto usage;
		}

		if (argv[0][0] == '@') {
			char *hash;

			/*
			 * Snapshot name begins with @.
			 * Default to same fs as bookmark.
			 *
			 * strlcpy() always NUL-terminates, unlike strncpy().  If
			 * the bookmark name is so long that its '#' was cut off,
			 * there is nothing to truncate; the over-long name is
			 * then rejected when the snapshot is opened below.
			 */
			(void) strlcpy(snapname, argv[1], sizeof (snapname));
			hash = strchr(snapname, '#');
			if (hash != NULL)
				*hash = '\0';
			(void) strlcat(snapname, argv[0], sizeof (snapname));
		} else {
			(void) strlcpy(snapname, argv[0], sizeof (snapname));
		}

		/* The snapshot must exist; it is opened only to verify that. */
		zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
		if (zhp == NULL)
			goto usage;
		zfs_close(zhp);

		/* The request maps the bookmark name to its source snapshot. */
		nvl = fnvlist_alloc();
		fnvlist_add_string(nvl, argv[1], snapname);
		ret = lzc_bookmark(nvl, NULL);
		fnvlist_free(nvl);

		if (ret != 0) {
			const char *err_msg;
			char errbuf[1024];

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN,
			    "cannot create bookmark '%s'"), argv[1]);

			switch (ret) {
			case EXDEV:
				err_msg = "bookmark is in a different pool";
				break;
			case EEXIST:
				err_msg = "bookmark exists";
				break;
			case EINVAL:
				err_msg = "invalid argument";
				break;
			case ENOTSUP:
				err_msg = "bookmark feature not enabled";
				break;
			case ENOSPC:
				err_msg = "out of space";
				break;
			default:
				err_msg = "unknown error";
				break;
			}
			(void) fprintf(stderr, "%s: %s\n", errbuf,
			    dgettext(TEXT_DOMAIN, err_msg));
		}

		return (ret != 0);

	usage:
		usage(B_FALSE);
		return (-1);
	}

	/*
	 * Handler for the channel-program subcommand.
	 *
	 * Options: -n runs the program through lzc_channel_program_nosync(),
	 * -t <limit> sets the instruction limit and -m <limit> the memory limit.
	 * Operands: the pool name, then the script file ("-" reads standard
	 * input); any further operands are handed to the script as its argv.
	 *
	 * Returns 0 when the program ran successfully and 1 on any failure;
	 * malformed invocations go through usage().
	 */
	static int
	zfs_do_channel_program(int argc, char **argv)
	{
		int ret, fd;
		/* getopt() returns int; a plain char cannot reliably hold -1. */
		int c;
		char *progbuf, *filename, *poolname;
		size_t progsize, progread;
		/* Stays NULL in case the library fails before producing output. */
		nvlist_t *outnvl = NULL;
		uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
		uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
		boolean_t sync_flag = B_TRUE;
		zpool_handle_t *zhp;

		/* check options */
		while (-1 !=
		    (c = getopt(argc, argv, "nt:(instr-limit)m:(memory-limit)"))) {
			switch (c) {
			case 't':
			case 'm': {
				uint64_t arg;
				char *endp;

				errno = 0;
				arg = strtoull(optarg, &endp, 0);
				if (errno != 0 || *endp != '\0') {
					(void) fprintf(stderr, gettext(
					    "invalid argument "
					    "'%s': expected integer\n"), optarg);
					goto usage;
				}

				if (c == 't') {
					if (arg > ZCP_MAX_INSTRLIMIT || arg == 0) {
						(void) fprintf(stderr, gettext(
						    "Invalid instruction limit: "
						    "%s\n"), optarg);
						return (1);
					} else {
						instrlimit = arg;
					}
				} else {
					ASSERT3U(c, ==, 'm');
					if (arg > ZCP_MAX_MEMLIMIT || arg == 0) {
						(void) fprintf(stderr, gettext(
						    "Invalid memory limit: "
						    "%s\n"), optarg);
						return (1);
					} else {
						memlimit = arg;
					}
				}
				break;
			}
			case 'n': {
				sync_flag = B_FALSE;
				break;
			}
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				goto usage;
			}
		}

		argc -= optind;
		argv += optind;

		if (argc < 2) {
			(void) fprintf(stderr,
			    gettext("invalid number of arguments\n"));
			goto usage;
		}

		poolname = argv[0];
		filename = argv[1];
		if (strcmp(filename, "-") == 0) {
			fd = 0;
			filename = "standard input";
		} else if ((fd = open(filename, O_RDONLY)) < 0) {
			(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
			    filename, strerror(errno));
			return (1);
		}

		/* The pool is opened only to verify that it exists. */
		if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
			(void) fprintf(stderr, gettext("cannot open pool '%s'\n"),
			    poolname);
			/* Do not leak the script descriptor on this path. */
			if (fd != 0)
				(void) close(fd);
			return (1);
		}
		zpool_close(zhp);

		/*
		 * Read in the channel program, expanding the program buffer as
		 * necessary.  The buffer is doubled as soon as it fills up, so
		 * there is always room left for the terminating NUL below.
		 */
		progread = 0;
		progsize = 1024;
		progbuf = safe_malloc(progsize);
		do {
			ret = read(fd, progbuf + progread, progsize - progread);
			/* Only a positive count may be added to the size_t total. */
			if (ret > 0) {
				progread += ret;
				if (progread == progsize) {
					progsize *= 2;
					progbuf = safe_realloc(progbuf, progsize);
				}
			}
		} while (ret > 0);

		if (fd != 0)
			(void) close(fd);
		if (ret < 0) {
			free(progbuf);
			(void) fprintf(stderr,
			    gettext("cannot read '%s': %s\n"),
			    filename, strerror(errno));
			return (1);
		}
		progbuf[progread] = '\0';

		/*
		 * Any remaining arguments are passed as arguments to the lua script as
		 * a string array:
		 * {
		 *	"argv" -> [ "arg 1", ... "arg n" ],
		 * }
		 */
		nvlist_t *argnvl = fnvlist_alloc();
		fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);

		if (sync_flag) {
			ret = lzc_channel_program(poolname, progbuf,
			    instrlimit, memlimit, argnvl, &outnvl);
		} else {
			ret = lzc_channel_program_nosync(poolname, progbuf,
			    instrlimit, memlimit, argnvl, &outnvl);
		}

		if (ret != 0) {
			/*
			 * On error, report the error message handed back by lua if one
			 * exists. Otherwise, generate an appropriate error message,
			 * falling back on strerror() for an unexpected return code.
			 */
			char *errstring = NULL;
			if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) {
				(void) nvlist_lookup_string(outnvl,
				    ZCP_RET_ERROR, &errstring);
				if (errstring == NULL)
					errstring = strerror(ret);
			} else {
				switch (ret) {
				case EINVAL:
					errstring =
					    "Invalid instruction or memory limit.";
					break;
				case ENOMEM:
					errstring = "Return value too large.";
					break;
				case ENOSPC:
					errstring = "Memory limit exhausted.";
					break;
	#ifdef illumos
				case ETIME:
	#else
				case ETIMEDOUT:
	#endif
					errstring = "Timed out.";
					break;
				case EPERM:
					errstring = "Permission denied. Channel "
					    "programs must be run as root.";
					break;
				default:
					errstring = strerror(ret);
				}
			}
			(void) fprintf(stderr,
			    gettext("Channel program execution failed:\n%s\n"),
			    errstring);
		} else {
			(void) printf("Channel program fully executed ");
			if (nvlist_empty(outnvl)) {
				(void) printf("with no return value.\n");
			} else {
				(void) printf("with return value:\n");
				dump_nvlist(outnvl, 4);
			}
		}

		free(progbuf);
		if (outnvl != NULL)
			fnvlist_free(outnvl);
		fnvlist_free(argnvl);
		return (ret != 0);

	usage:
		usage(B_FALSE);
		return (-1);
	}

	/*
	 * Program entry point.  Initializes libzfs, records the command line for
	 * the pool history, and then either acts as the mount/umount helper
	 * (chosen by the basename of argv[0]) or dispatches argv[1] through
	 * command_table.  The return value of the selected handler becomes the
	 * exit status.
	 */
	int
	main(int argc, char **argv)
	{
		char *invoked_as;
		char *cmd;
		int cmd_idx;
		int status = 0;

		(void) setlocale(LC_ALL, "");
		(void) textdomain(TEXT_DOMAIN);

		opterr = 0;

		g_zfs = libzfs_init();
		if (g_zfs == NULL) {
			(void) fprintf(stderr, gettext("internal error: failed to "
			    "initialize ZFS library\n"));
			return (1);
		}

		zfs_save_arguments(argc, argv, history_str, sizeof (history_str));

		libzfs_print_on_error(g_zfs, B_TRUE);

		mnttab_file = fopen(MNTTAB, "r");
		if (mnttab_file == NULL) {
			(void) fprintf(stderr, gettext("internal error: unable to "
			    "open %s\n"), MNTTAB);
			return (1);
		}

		/*
		 * This binary also serves as the /etc/fs mount and unmount
		 * program; the name it was invoked under selects that behavior.
		 */
		invoked_as = basename(argv[0]);
		if (strcmp(invoked_as, "mount") == 0) {
			status = manual_mount(argc, argv);
		} else if (strcmp(invoked_as, "umount") == 0) {
			status = manual_unmount(argc, argv);
		} else {
			/* A subcommand is mandatory. */
			if (argc < 2) {
				(void) fprintf(stderr, gettext("missing command\n"));
				usage(B_FALSE);
			}

			cmd = argv[1];

			/*
			 * Aliases: 'umount' -> 'unmount', 'recv' -> 'receive',
			 * 'snap' -> 'snapshot'.
			 */
			if (strcmp(cmd, "umount") == 0)
				cmd = "unmount";
			else if (strcmp(cmd, "recv") == 0)
				cmd = "receive";
			else if (strcmp(cmd, "snap") == 0)
				cmd = "snapshot";

			/* '-?' is an explicit request for the usage message. */
			if (strcmp(cmd, "-?") == 0)
				usage(B_TRUE);

			/*
			 * Dispatch.  An unknown word containing '=' is treated as
			 * an implicit 'set'; in that case the handler receives the
			 * argument vector unshifted.
			 */
			libzfs_mnttab_cache(g_zfs, B_TRUE);
			if (find_command_idx(cmd, &cmd_idx) == 0) {
				current_command = &command_table[cmd_idx];
				status = command_table[cmd_idx].func(argc - 1,
				    argv + 1);
			} else if (strchr(cmd, '=') != NULL) {
				verify(find_command_idx("set", &cmd_idx) == 0);
				current_command = &command_table[cmd_idx];
				status = command_table[cmd_idx].func(argc, argv);
			} else {
				(void) fprintf(stderr, gettext("unrecognized "
				    "command '%s'\n"), cmd);
				usage(B_FALSE);
			}
			libzfs_mnttab_cache(g_zfs, B_FALSE);
		}

		(void) fclose(mnttab_file);

		/* Only successful commands are written to the pool history. */
		if (status == 0 && log_history)
			(void) zpool_log_history(g_zfs, history_str);

		libzfs_fini(g_zfs);

		/*
		 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
		 * for the purposes of running ::findleaks.
		 */
		if (getenv("ZFS_ABORT") != NULL) {
			(void) printf("dumping core by request\n");
			abort();
		}

		return (status);
	}
	Index: stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c (revision 332525)
	@@ -1,5761 +1,5940 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
	* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
	* Copyright 2016 Nexenta Systems, Inc.
	* Copyright (c) 2017 Datto Inc.
	*/

	#include <solaris.h>
	#include <assert.h>
	#include <ctype.h>
	#include <dirent.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <libgen.h>
	#include <libintl.h>
	#include <libuutil.h>
	#include <locale.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <strings.h>
	#include <unistd.h>
	#include <priv.h>
	#include <pwd.h>
	#include <zone.h>
	#include <sys/time.h>
	#include <zfs_prop.h>
	#include <sys/fs/zfs.h>
	#include <sys/stat.h>

	#include <libzfs.h>

	#include "zpool_util.h"
	#include "zfs_comutil.h"
	#include "zfeature_common.h"

	#include "statcommon.h"

	static int zpool_do_create(int, char **);
	static int zpool_do_destroy(int, char **);

	static int zpool_do_add(int, char **);
	static int zpool_do_remove(int, char **);
	static int zpool_do_labelclear(int, char **);

	static int zpool_do_list(int, char **);
	static int zpool_do_iostat(int, char **);
	static int zpool_do_status(int, char **);

	static int zpool_do_online(int, char **);
	static int zpool_do_offline(int, char **);
	static int zpool_do_clear(int, char **);
	static int zpool_do_reopen(int, char **);

	static int zpool_do_reguid(int, char **);

	static int zpool_do_attach(int, char **);
	static int zpool_do_detach(int, char **);
	static int zpool_do_replace(int, char **);
	static int zpool_do_split(int, char **);

	static int zpool_do_scrub(int, char **);

	static int zpool_do_import(int, char **);
	static int zpool_do_export(int, char **);

	static int zpool_do_upgrade(int, char **);

	static int zpool_do_history(int, char **);

	static int zpool_do_get(int, char **);
	static int zpool_do_set(int, char **);

	/*
	* These libumem hooks provide a reasonable set of defaults for the allocator's
	* debugging facilities.
	*/

	#ifdef DEBUG
	/* libumem hook: supplies the default value for the $UMEM_DEBUG setting. */
	const char *
	_umem_debug_init(void)
	{
		static const char debug_setting[] = "default,verbose";

		return (debug_setting);
	}

	/* libumem hook: supplies the default value for the $UMEM_LOGGING setting. */
	const char *
	_umem_logging_init(void)
	{
		static const char logging_setting[] = "fail,contents";

		return (logging_setting);
	}
	#endif

	/* Index of the usage message that get_usage() returns for a subcommand. */
	typedef enum {
		HELP_ADD,
		HELP_ATTACH,
		HELP_CLEAR,
		HELP_CREATE,
		HELP_DESTROY,
		HELP_DETACH,
		HELP_EXPORT,
		HELP_HISTORY,
		HELP_IMPORT,
		HELP_IOSTAT,
		HELP_LABELCLEAR,
		HELP_LIST,
		HELP_OFFLINE,
		HELP_ONLINE,
		HELP_REPLACE,
		HELP_REMOVE,
		HELP_SCRUB,
		HELP_STATUS,
		HELP_UPGRADE,
		HELP_GET,
		HELP_SET,
		HELP_SPLIT,
		HELP_REGUID,
		HELP_REOPEN
	} zpool_help_t;


	/*
	 * One row of the command table: the subcommand name, the handler that
	 * implements it (same signature as the zpool_do_*() functions), and the
	 * index of its usage message.
	 */
	typedef struct zpool_command {
		const char *name;
		int (*func)(int, char **);
		zpool_help_t usage;
	} zpool_command_t;

	/*
	* Master command table. Each ZFS command has a name, associated function, and
	* usage message. The usage messages need to be internationalized, so we have
	* to have a function to return the usage message based on a command index.
	*
	* These commands are organized according to how they are displayed in the usage
	* message. An empty command (one with a NULL name) indicates an empty line in
	* the generic usage message.
	*/
	/*
	 * usage() walks this table in order when printing the full command list;
	 * a { NULL } row is printed there as an empty line.
	 */
	static zpool_command_t command_table[] = {
	{ "create", zpool_do_create, HELP_CREATE },
	{ "destroy", zpool_do_destroy, HELP_DESTROY },
	{ NULL },
	{ "add", zpool_do_add, HELP_ADD },
	{ "remove", zpool_do_remove, HELP_REMOVE },
	{ NULL },
	{ "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
	{ NULL },
	{ "list", zpool_do_list, HELP_LIST },
	{ "iostat", zpool_do_iostat, HELP_IOSTAT },
	{ "status", zpool_do_status, HELP_STATUS },
	{ NULL },
	{ "online", zpool_do_online, HELP_ONLINE },
	{ "offline", zpool_do_offline, HELP_OFFLINE },
	{ "clear", zpool_do_clear, HELP_CLEAR },
	{ "reopen", zpool_do_reopen, HELP_REOPEN },
	{ NULL },
	{ "attach", zpool_do_attach, HELP_ATTACH },
	{ "detach", zpool_do_detach, HELP_DETACH },
	{ "replace", zpool_do_replace, HELP_REPLACE },
	{ "split", zpool_do_split, HELP_SPLIT },
	{ NULL },
	{ "scrub", zpool_do_scrub, HELP_SCRUB },
	{ NULL },
	{ "import", zpool_do_import, HELP_IMPORT },
	{ "export", zpool_do_export, HELP_EXPORT },
	{ "upgrade", zpool_do_upgrade, HELP_UPGRADE },
	{ "reguid", zpool_do_reguid, HELP_REGUID },
	{ NULL },
	{ "history", zpool_do_history, HELP_HISTORY },
	{ "get", zpool_do_get, HELP_GET },
	{ "set", zpool_do_set, HELP_SET },
	};

	/* Number of rows in command_table, the { NULL } separator rows included. */
	#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))

	/*
	 * While this is NULL, usage() prints the whole command list; otherwise it
	 * prints only the usage message of the command pointed to.
	 */
	static zpool_command_t *current_command;
	/* NOTE(review): presumably the command line saved for the pool history. */
	static char history_str[HIS_MAX_RECORD_LEN];
	static boolean_t log_history = B_TRUE;
	static uint_t timestamp_fmt = NODATE;

	/*
	 * Return the usage message for the given help index, passed through
	 * gettext().  An index that matches none of the cases falls out of the
	 * switch and aborts the program.
	 *
	 * NOTE(review): the backslash in front of each '|' inside the string
	 * literals below looks like escaping added by the page rendering; the
	 * real literals presumably contain a plain '|'.
	 */
	static const char *
	get_usage(zpool_help_t idx)
	{
	switch (idx) {
	case HELP_ADD:
	return (gettext("\tadd [-fn] <pool> <vdev> ...\n"));
	case HELP_ATTACH:
	return (gettext("\tattach [-f] <pool> <device> "
	"<new-device>\n"));
	case HELP_CLEAR:
	return (gettext("\tclear [-nF] <pool> [device]\n"));
	case HELP_CREATE:
	return (gettext("\tcreate [-fnd] [-B] "
	"[-o property=value] ... \n"
	"\t [-O file-system-property=value] ... \n"
	"\t [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
	case HELP_DESTROY:
	return (gettext("\tdestroy [-f] <pool>\n"));
	case HELP_DETACH:
	return (gettext("\tdetach <pool> <device>\n"));
	case HELP_EXPORT:
	return (gettext("\texport [-f] <pool> ...\n"));
	case HELP_HISTORY:
	return (gettext("\thistory [-il] [<pool>] ...\n"));
	case HELP_IMPORT:
	return (gettext("\timport [-d dir] [-D]\n"
	"\timport [-d dir \| -c cachefile] [-F [-n]] <pool \| id>\n"
	"\timport [-o mntopts] [-o property=value] ... \n"
	"\t [-d dir \| -c cachefile] [-D] [-f] [-m] [-N] "
	"[-R root] [-F [-n]] -a\n"
	"\timport [-o mntopts] [-o property=value] ... \n"
	"\t [-d dir \| -c cachefile] [-D] [-f] [-m] [-N] "
	"[-R root] [-F [-n]]\n"
	"\t <pool \| id> [newpool]\n"));
	case HELP_IOSTAT:
	return (gettext("\tiostat [-v] [-T d\|u] [pool] ... [interval "
	"[count]]\n"));
	case HELP_LABELCLEAR:
	return (gettext("\tlabelclear [-f] <vdev>\n"));
	case HELP_LIST:
	return (gettext("\tlist [-Hpv] [-o property[,...]] "
	"[-T d\|u] [pool] ... [interval [count]]\n"));
	case HELP_OFFLINE:
	return (gettext("\toffline [-t] <pool> <device> ...\n"));
	case HELP_ONLINE:
	return (gettext("\tonline [-e] <pool> <device> ...\n"));
	case HELP_REPLACE:
	return (gettext("\treplace [-f] <pool> <device> "
	"[new-device]\n"));
	case HELP_REMOVE:
	- return (gettext("\tremove <pool> <device> ...\n"));
	+ return (gettext("\tremove [-nps] <pool> <device> ...\n"));
	case HELP_REOPEN:
	return (gettext("\treopen <pool>\n"));
	case HELP_SCRUB:
	return (gettext("\tscrub [-s \| -p] <pool> ...\n"));
	case HELP_STATUS:
	return (gettext("\tstatus [-vx] [-T d\|u] [pool] ... [interval "
	"[count]]\n"));
	case HELP_UPGRADE:
	return (gettext("\tupgrade [-v]\n"
	"\tupgrade [-V version] <-a \| pool ...>\n"));
	case HELP_GET:
	return (gettext("\tget [-Hp] [-o \"all\" \| field[,...]] "
	"<\"all\" \| property[,...]> <pool> ...\n"));
	case HELP_SET:
	return (gettext("\tset <property=value> <pool> \n"));
	case HELP_SPLIT:
	return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n"
	"\t [-o property=value] <pool> <newpool> "
	"[<device> ...]\n"));
	case HELP_REGUID:
	return (gettext("\treguid <pool>\n"));
	}

	/* Only reached when idx matched none of the cases above. */
	abort();
	/* NOTREACHED */
	}


	/*
	* Callback routine that will print out a pool property value.
	*/
	static int
	print_prop_cb(int prop, void *cb)
	{
		FILE *out = cb;
		const char *values;

		/* Column 1: the property name. */
		(void) fprintf(out, "\t%-15s ", zpool_prop_to_name(prop));

		/* Column 2: whether the property can be edited. */
		if (!zpool_prop_readonly(prop))
			(void) fprintf(out, " YES ");
		else
			(void) fprintf(out, " NO ");

		/* Column 3: the accepted values, or "-" when none are listed. */
		values = zpool_prop_values(prop);
		if (values != NULL)
			(void) fprintf(out, "%s\n", values);
		else
			(void) fprintf(out, "-\n");

		/* Tell the iterator to continue with the next property. */
		return (ZPROP_CONT);
	}

	/*
	* Display usage message. If we're inside a command, display only the usage for
	* that command. Otherwise, iterate over the entire command table and display
	* a complete usage message.
	*/
	/*
	 * Print the usage message and exit.
	 *
	 * requested: B_TRUE when the user asked for help; the text then goes to
	 * stdout and the exit status is 0.  Otherwise the text goes to stderr and
	 * the exit status is 2.
	 *
	 * This function does not return: it ends in exit(), or in abort() when
	 * the ZFS_ABORT environment variable is set.
	 */
	void
	usage(boolean_t requested)
	{
		FILE *fp = requested ? stdout : stderr;

		if (current_command == NULL) {
			int i;

			(void) fprintf(fp, gettext("usage: zpool command args ...\n"));
			(void) fprintf(fp,
			    gettext("where 'command' is one of the following:\n\n"));

			/* A NULL name marks a blank separator line in the list. */
			for (i = 0; i < NCOMMAND; i++) {
				if (command_table[i].name == NULL)
					(void) fprintf(fp, "\n");
				else
					(void) fprintf(fp, "%s",
					    get_usage(command_table[i].usage));
			}
		} else {
			(void) fprintf(fp, gettext("usage:\n"));
			(void) fprintf(fp, "%s", get_usage(current_command->usage));
		}

		/* The property-related commands also get the property table. */
		if (current_command != NULL &&
		    ((strcmp(current_command->name, "set") == 0) ||
		    (strcmp(current_command->name, "get") == 0) ||
		    (strcmp(current_command->name, "list") == 0))) {

			(void) fprintf(fp,
			    gettext("\nthe following properties are supported:\n"));

			(void) fprintf(fp, "\n\t%-15s %s %s\n\n",
			    "PROPERTY", "EDIT", "VALUES");

			/* Iterate over all properties */
			(void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
			    ZFS_TYPE_POOL);

			(void) fprintf(fp, "\t%-15s ", "feature@...");
			(void) fprintf(fp, "YES disabled | enabled | active\n");

			(void) fprintf(fp, gettext("\nThe feature@ properties must be "
			    "appended with a feature name.\nSee zpool-features(7).\n"));
		}

		/*
		 * See comments at end of main().
		 */
		if (getenv("ZFS_ABORT") != NULL) {
			(void) printf("dumping core by request\n");
			abort();
		}

		exit(requested ? 0 : 2);
	}

	/*
	 * Print a vdev tree to stdout, one name per line.
	 *
	 * zhp:        pool handle, used to turn child vdevs into display names.
	 * name:       label for this node, or NULL to print only the children.
	 * nv:         configuration nvlist of this node.
	 * indent:     column offset for 'name'; children get two more columns.
	 * print_logs: selects which direct children are shown - only the ones
	 *             marked as log devices when B_TRUE, only the others when
	 *             B_FALSE.  Below the first level nothing marked as a log
	 *             is printed, since the recursion always passes B_FALSE.
	 */
	void
	print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
	    boolean_t print_logs)
	{
		nvlist_t **child;
		uint_t c, children;
		char *vname;

		if (name != NULL)
			(void) printf("\t%*s%s\n", indent, "", name);

		/* A node without a children array is a leaf. */
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			return;

		for (c = 0; c < children; c++) {
			uint64_t is_log = B_FALSE;

			/* is_log keeps its default if the key is absent. */
			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
			    &is_log);
			if ((is_log && !print_logs) || (!is_log && print_logs))
				continue;

			vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
			print_vdev_tree(zhp, vname, child[c], indent + 2,
			    B_FALSE);
			free(vname);
		}
	}

	/*
	 * Report whether any pair in 'proplist' is named like a feature property,
	 * as judged by zpool_prop_feature().
	 */
	static boolean_t
	prop_list_contains_feature(nvlist_t *proplist)
	{
		nvpair_t *pair = nvlist_next_nvpair(proplist, NULL);

		while (pair != NULL) {
			if (zpool_prop_feature(nvpair_name(pair)))
				return (B_TRUE);
			pair = nvlist_next_nvpair(proplist, pair);
		}

		return (B_FALSE);
	}

	/*
	* Add a property pair (name, string-value) into a property nvlist.
	*/
	/*
	 * propname: property name as typed by the user.
	 * propval:  string value to store.
	 * props:    in/out list; allocated here when '*props' is still NULL.
	 * poolprop: B_TRUE to validate 'propname' as a pool property (or a
	 *           feature@ property); B_FALSE to treat it as a dataset
	 *           property, in which case unknown names are stored as given.
	 *
	 * Returns 0 on success, 1 when memory runs out, and 2 when the property
	 * is not valid, conflicts with an earlier one, or was already given.
	 */
	static int
	add_prop_list(const char *propname, char *propval, nvlist_t **props,
	    boolean_t poolprop)
	{
		zpool_prop_t prop = ZPROP_INVAL;
		zfs_prop_t fprop;
		nvlist_t *proplist;
		const char *normnm;
		char *strval;

		if (*props == NULL &&
		    nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) {
			(void) fprintf(stderr,
			    gettext("internal error: out of memory\n"));
			return (1);
		}

		proplist = *props;

		if (poolprop) {
			const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);

			if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL &&
			    !zpool_prop_feature(propname)) {
				(void) fprintf(stderr, gettext("property '%s' is "
				    "not a valid pool property\n"), propname);
				return (2);
			}

			/*
			 * feature@ properties and version should not be specified
			 * at the same time.
			 */
			if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) &&
			    nvlist_exists(proplist, vname)) ||
			    (prop == ZPOOL_PROP_VERSION &&
			    prop_list_contains_feature(proplist))) {
				(void) fprintf(stderr, gettext("'feature@' and "
				    "'version' properties cannot be specified "
				    "together\n"));
				return (2);
			}


			/* Feature names are stored as typed. */
			if (zpool_prop_feature(propname))
				normnm = propname;
			else
				normnm = zpool_prop_to_name(prop);
		} else {
			if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
				normnm = zfs_prop_to_name(fprop);
			} else {
				normnm = propname;
			}
		}

		/* Only the cachefile property may be given more than once. */
		if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
		    prop != ZPOOL_PROP_CACHEFILE) {
			(void) fprintf(stderr, gettext("property '%s' "
			    "specified multiple times\n"), propname);
			return (2);
		}

		if (nvlist_add_string(proplist, normnm, propval) != 0) {
			(void) fprintf(stderr, gettext("internal "
			    "error: out of memory\n"));
			return (1);
		}

		return (0);
	}

	/*
	* zpool add [-fn] <pool> <vdev> ...
	*
	* -f Force addition of devices, even if they appear in use
	* -n Do not add the devices, but display the resulting layout if
	* they were to be added.
	*
	* Adds the given vdevs to 'pool'. As with create, the bulk of this work is
	* handled by get_vdev_spec(), which constructs the nvlist needed to pass to
	* libzfs.
	*/
	int
	zpool_do_add(int argc, char **argv)
	{
		boolean_t force_add = B_FALSE;
		boolean_t dry_run = B_FALSE;
		zpool_boot_label_t label_type;
		uint64_t bootsize;
		zpool_handle_t *pool;
		nvlist_t *pool_config;
		nvlist_t *new_root;
		char *pool_name;
		int opt;
		int status;

		/* Parse the -f (force) and -n (dry run) options. */
		while ((opt = getopt(argc, argv, "fn")) != -1) {
			switch (opt) {
			case 'f':
				force_add = B_TRUE;
				break;
			case 'n':
				dry_run = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* A pool name and at least one vdev word are required. */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name argument\n"));
			usage(B_FALSE);
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing vdev specification\n"));
			usage(B_FALSE);
		}

		/* Consume the pool name; what is left is the vdev specification. */
		pool_name = argv[0];
		argc--;
		argv++;

		pool = zpool_open(g_zfs, pool_name);
		if (pool == NULL)
			return (1);

		pool_config = zpool_get_config(pool, NULL);
		if (pool_config == NULL) {
			(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
			    pool_name);
			zpool_close(pool);
			return (1);
		}

		label_type = zpool_is_bootable(pool) ?
		    ZPOOL_COPY_BOOT_LABEL : ZPOOL_NO_BOOT_LABEL;

		/* Build the nvlist describing the vdevs to be added. */
		bootsize = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL);
		new_root = make_root_vdev(pool, force_add, !force_add, B_FALSE,
		    dry_run, label_type, bootsize, argc, argv);
		if (new_root == NULL) {
			zpool_close(pool);
			return (1);
		}

		if (!dry_run) {
			status = (zpool_add(pool, new_root) != 0);
		} else {
			nvlist_t *cur_root;

			verify(nvlist_lookup_nvlist(pool_config,
			    ZPOOL_CONFIG_VDEV_TREE, &cur_root) == 0);

			(void) printf(gettext("would update '%s' to the following "
			    "configuration:\n"), zpool_get_name(pool));

			/* Existing non-log vdevs first, then the new ones. */
			print_vdev_tree(pool, pool_name, cur_root, 0, B_FALSE);
			print_vdev_tree(pool, NULL, new_root, 0, B_FALSE);

			/* Then the same for log devices, under a "logs" header. */
			if (num_logs(cur_root) > 0) {
				print_vdev_tree(pool, "logs", cur_root, 0, B_TRUE);
				print_vdev_tree(pool, NULL, new_root, 0, B_TRUE);
			} else if (num_logs(new_root) > 0) {
				print_vdev_tree(pool, "logs", new_root, 0, B_TRUE);
			}

			status = 0;
		}

		nvlist_free(new_root);
		zpool_close(pool);

		return (status);
	}

	/*
	* zpool remove <pool> <vdev> ...
	*
	- * Removes the given vdev from the pool. Currently, this supports removing
	- * spares, cache, and log devices from the pool.
	+ * Removes the given vdev from the pool.
	*/
	/*
	 * As of this revision the function parses three options:
	 *   -n  do not remove anything; for each device print the value that
	 *       zpool_vdev_indirect_size() reports
	 *   -p  with -n, print that value as a plain number instead of passing
	 *       it through zfs_nicenum()
	 *   -s  cancel via zpool_vdev_remove_cancel(); no device operand is
	 *       accepted in that case
	 * Returns 0 on success and 1 if the pool cannot be opened or any
	 * operation fails.
	 *
	 * NOTE(review): none of the visible paths calls zpool_close() on 'zhp'.
	 */
	int
	zpool_do_remove(int argc, char **argv)
	{
	char *poolname;
	int i, ret = 0;
	zpool_handle_t *zhp;
	+ boolean_t stop = B_FALSE;
	+ boolean_t noop = B_FALSE;
	+ boolean_t parsable = B_FALSE;
	/*
	 * NOTE(review): getopt() returns int; storing it in a plain char makes
	 * the comparison with -1 unreliable where char is unsigned.
	 */
	+ char c;

	- argc--;
	- argv++;
	+ /* check options */
	+ while ((c = getopt(argc, argv, "nps")) != -1) {
	+ switch (c) {
	+ case 'n':
	+ noop = B_TRUE;
	+ break;
	+ case 'p':
	+ parsable = B_TRUE;
	+ break;
	+ case 's':
	+ stop = B_TRUE;
	+ break;
	+ case '?':
	+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
	+ optopt);
	+ usage(B_FALSE);
	+ }
	+ }

	+ argc -= optind;
	+ argv += optind;
	+
	/* get pool name and check number of arguments */
	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing pool name argument\n"));
	usage(B_FALSE);
	}
	- if (argc < 2) {
	- (void) fprintf(stderr, gettext("missing device\n"));
	- usage(B_FALSE);
	- }

	poolname = argv[0];

	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
	return (1);

	- for (i = 1; i < argc; i++) {
	- if (zpool_vdev_remove(zhp, argv[i]) != 0)
	/* Combining -s with -n does nothing beyond printing a notice. */
	+ if (stop && noop) {
	+ (void) fprintf(stderr, gettext("stop request ignored\n"));
	+ return (0);
	+ }
	+
	+ if (stop) {
	+ if (argc > 1) {
	+ (void) fprintf(stderr, gettext("too many arguments\n"));
	+ usage(B_FALSE);
	+ }
	+ if (zpool_vdev_remove_cancel(zhp) != 0)
	ret = 1;
	+ } else {
	+ if (argc < 2) {
	+ (void) fprintf(stderr, gettext("missing device\n"));
	+ usage(B_FALSE);
	+ }
	+
	/* A failed size query stops the loop; a failed removal does not. */
	+ for (i = 1; i < argc; i++) {
	+ if (noop) {
	+ uint64_t size;
	+
	+ if (zpool_vdev_indirect_size(zhp, argv[i],
	+ &size) != 0) {
	+ ret = 1;
	+ break;
	+ }
	+ if (parsable) {
	+ (void) printf("%s %llu\n",
	+ argv[i], size);
	+ } else {
	+ char valstr[32];
	+ zfs_nicenum(size, valstr,
	+ sizeof (valstr));
	+ (void) printf("Memory that will be "
	+ "used after removing %s: %s\n",
	+ argv[i], valstr);
	+ }
	+ } else {
	+ if (zpool_vdev_remove(zhp, argv[i]) != 0)
	+ ret = 1;
	+ }
	+ }
	}

	return (ret);
	}

	/*
	* zpool labelclear [-f] <vdev>
	*
	* -f Force clearing the label for the vdevs which are members of
	* the exported or foreign pools.
	*
	* Verifies that the vdev is not active and zeros out the label information
	* on the device.
	*/
	/*
	 * Returns 0 when the label was cleared and non-zero otherwise.  Once the
	 * device has been opened, every exit goes through 'errout' so that the
	 * descriptor and the pool name string are always released.
	 */
	int
	zpool_do_labelclear(int argc, char **argv)
	{
		char vdev[MAXPATHLEN];
		char *name = NULL;
		struct stat st;
		int c, fd, ret = 0;
		nvlist_t *config;
		pool_state_t state;
		boolean_t inuse = B_FALSE;
		boolean_t force = B_FALSE;

		/* check options */
		while ((c = getopt(argc, argv, "f")) != -1) {
			switch (c) {
			case 'f':
				force = B_TRUE;
				break;
			default:
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* get vdev name */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing vdev name\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		/*
		 * Check if we were given absolute path and use it as is.
		 * Otherwise if the provided vdev name doesn't point to a file,
		 * try prepending dsk path and appending s0.
		 */
		(void) strlcpy(vdev, argv[0], sizeof (vdev));
		if (vdev[0] != '/' && stat(vdev, &st) != 0) {
			char *s;

			(void) snprintf(vdev, sizeof (vdev), "%s/%s",
	#ifdef illumos
			    ZFS_DISK_ROOT, argv[0]);
			if ((s = strrchr(argv[0], 's')) == NULL ||
			    !isdigit(*(s + 1)))
				(void) strlcat(vdev, "s0", sizeof (vdev));
	#else
			    "/dev", argv[0]);
	#endif
			if (stat(vdev, &st) != 0) {
				(void) fprintf(stderr, gettext(
				    "failed to find device %s, try specifying absolute "
				    "path instead\n"), argv[0]);
				return (1);
			}
		}

		if ((fd = open(vdev, O_RDWR)) < 0) {
			(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
			    vdev, strerror(errno));
			return (1);
		}

		/* The label is read only to confirm that one is present. */
		if (zpool_read_label(fd, &config) != 0) {
			(void) fprintf(stderr,
			    gettext("failed to read label from %s\n"), vdev);
			ret = 1;
			goto errout;
		}
		nvlist_free(config);

		ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse);
		if (ret != 0) {
			(void) fprintf(stderr,
			    gettext("failed to check state for %s\n"), vdev);
			ret = 1;
			goto errout;
		}

		if (!inuse)
			goto wipe_label;

		switch (state) {
		default:
		case POOL_STATE_ACTIVE:
		case POOL_STATE_SPARE:
		case POOL_STATE_L2CACHE:
			(void) fprintf(stderr, gettext(
			    "%s is a member (%s) of pool \"%s\"\n"),
			    vdev, zpool_pool_state_to_name(state), name);
			ret = 1;
			goto errout;

		case POOL_STATE_EXPORTED:
			if (force)
				break;
			(void) fprintf(stderr, gettext(
			    "use '-f' to override the following error:\n"
			    "%s is a member of exported pool \"%s\"\n"),
			    vdev, name);
			ret = 1;
			goto errout;

		case POOL_STATE_POTENTIALLY_ACTIVE:
			if (force)
				break;
			(void) fprintf(stderr, gettext(
			    "use '-f' to override the following error:\n"
			    "%s is a member of potentially active pool \"%s\"\n"),
			    vdev, name);
			ret = 1;
			goto errout;

		case POOL_STATE_DESTROYED:
			/* inuse should never be set for a destroyed pool */
			assert(0);
			break;
		}

	wipe_label:
		ret = zpool_clear_label(fd);
		if (ret != 0) {
			(void) fprintf(stderr,
			    gettext("failed to clear label for %s\n"), vdev);
		}

	errout:
		free(name);
		(void) close(fd);

		return (ret);
	}

	/*
	* zpool create [-fnd] [-B] [-o property=value] ...
	* [-O file-system-property=value] ...
	* [-R root] [-m mountpoint] <pool> <dev> ...
	*
	* -B Create boot partition.
	* -f Force creation, even if devices appear in use
	* -n Do not create the pool, but display the resulting layout if it
	* were to be created.
	* -R Create a pool under an alternate root
	* -m Set default mountpoint for the root dataset. By default it's
	* '/<pool>'
	* -o Set property=value.
	* -d Don't automatically enable all supported pool features
	* (individual features can be enabled with -o).
	* -O Set fsproperty=value in the pool's root file system
	*
	* Creates the named pool according to the given vdev specification. The
	* bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once
	* we get the nvlist back from get_vdev_spec(), we either print out the contents
	* (if '-n' was specified), or pass it to libzfs to do the creation.
	*/

	/* Boot partition size used by -B when no explicit bootsize was given. */
	#define SYSTEM256 (256 * 1024 * 1024)

	/*
	 * Implements 'zpool create'.
	 *
	 * Parses the options, validates the pool name, the boot-size settings,
	 * the alternate root and the mountpoint, builds the vdev tree with
	 * make_root_vdev() and then either prints the resulting layout (-n) or
	 * hands everything to zpool_create() and mounts/shares the root dataset.
	 *
	 * Returns 0 for a successful dry run; after a successful create and mount
	 * it returns the result of zfs_shareall(); 1 on any other failure; and 2
	 * after a usage error (reached only if usage() returns).
	 */
	int
	zpool_do_create(int argc, char **argv)
	{
		boolean_t force = B_FALSE;
		boolean_t dryrun = B_FALSE;
		boolean_t enable_all_pool_feat = B_TRUE;
		zpool_boot_label_t boot_type = ZPOOL_NO_BOOT_LABEL;
		uint64_t boot_size = 0;
		int c;
		nvlist_t *nvroot = NULL;
		char *poolname;
		int ret = 1;
		char *altroot = NULL;
		char *mountpoint = NULL;
		nvlist_t *fsprops = NULL;
		nvlist_t *props = NULL;
		char *propval;

		/* check options */
		while ((c = getopt(argc, argv, ":fndBR:m:o:O:")) != -1) {
			switch (c) {
			case 'f':
				force = B_TRUE;
				break;
			case 'n':
				dryrun = B_TRUE;
				break;
			case 'd':
				enable_all_pool_feat = B_FALSE;
				break;
			case 'B':
	#ifdef illumos
				/*
				 * We should create the system partition.
				 * Also make sure the size is set.
				 */
				boot_type = ZPOOL_CREATE_BOOT_LABEL;
				if (boot_size == 0)
					boot_size = SYSTEM256;
				break;
	#else
				/*
				 * Report the option through 'c': getopt() is only
				 * required to set optopt when it detects an error.
				 */
				(void) fprintf(stderr,
				    gettext("option '%c' is not supported\n"),
				    c);
				goto badusage;
	#endif
			case 'R':
				altroot = optarg;
				if (add_prop_list(zpool_prop_to_name(
				    ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
					goto errout;
				/* Keep a cachefile the user already specified. */
				if (nvlist_lookup_string(props,
				    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
				    &propval) == 0)
					break;
				if (add_prop_list(zpool_prop_to_name(
				    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
					goto errout;
				break;
			case 'm':
				/* Equivalent to -O mountpoint=optarg */
				mountpoint = optarg;
				break;
			case 'o':
				if ((propval = strchr(optarg, '=')) == NULL) {
					(void) fprintf(stderr, gettext("missing "
					    "'=' for -o option\n"));
					goto errout;
				}
				/* Split "name=value" in place. */
				*propval = '\0';
				propval++;

				if (add_prop_list(optarg, propval, &props, B_TRUE))
					goto errout;

				/*
				 * Get bootsize value for make_root_vdev().
				 */
				if (zpool_name_to_prop(optarg) == ZPOOL_PROP_BOOTSIZE) {
					if (zfs_nicestrtonum(g_zfs, propval,
					    &boot_size) < 0 || boot_size == 0) {
						(void) fprintf(stderr,
						    gettext("bad boot partition size "
						    "'%s': %s\n"), propval,
						    libzfs_error_description(g_zfs));
						goto errout;
					}
				}

				/*
				 * If the user is creating a pool that doesn't support
				 * feature flags, don't enable any features.
				 */
				if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
					char *end;
					u_longlong_t ver;

					ver = strtoull(propval, &end, 10);
					if (*end == '\0' &&
					    ver < SPA_VERSION_FEATURES) {
						enable_all_pool_feat = B_FALSE;
					}
				}
				if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
					altroot = propval;
				break;
			case 'O':
				if ((propval = strchr(optarg, '=')) == NULL) {
					(void) fprintf(stderr, gettext("missing "
					    "'=' for -O option\n"));
					goto errout;
				}
				*propval = '\0';
				propval++;

				/*
				 * Mountpoints are checked and then added later.
				 * Uniquely among properties, they can be specified
				 * more than once, to avoid conflict with -m.
				 */
				if (0 == strcmp(optarg,
				    zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
					mountpoint = propval;
				} else if (add_prop_list(optarg, propval, &fsprops,
				    B_FALSE)) {
					goto errout;
				}
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				goto badusage;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				goto badusage;
			}
		}

		argc -= optind;
		argv += optind;

		/* get pool name and check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name argument\n"));
			goto badusage;
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing vdev specification\n"));
			goto badusage;
		}

		poolname = argv[0];

		/*
		 * As a special case, check for use of '/' in the name, and direct the
		 * user to use 'zfs create' instead.
		 */
		if (strchr(poolname, '/') != NULL) {
			(void) fprintf(stderr, gettext("cannot create '%s': invalid "
			    "character '/' in pool name\n"), poolname);
			(void) fprintf(stderr, gettext("use 'zfs create' to "
			    "create a dataset\n"));
			goto errout;
		}

		/*
		 * Make sure the bootsize is set when ZPOOL_CREATE_BOOT_LABEL is used,
		 * and not set otherwise.
		 */
		if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
			const char *propname;
			char *strptr, *buf = NULL;
			int rv;

			propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
			if (nvlist_lookup_string(props, propname, &strptr) != 0) {
				/*
				 * Test the return value: the contents of 'buf' are
				 * not guaranteed after a failed asprintf().
				 */
				if (asprintf(&buf, "%" PRIu64, boot_size) < 0) {
					(void) fprintf(stderr,
					    gettext("internal error: out of memory\n"));
					goto errout;
				}
				rv = add_prop_list(propname, buf, &props, B_TRUE);
				free(buf);
				if (rv != 0)
					goto errout;
			}
		} else {
			const char *propname;
			char *strptr;

			propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE);
			if (nvlist_lookup_string(props, propname, &strptr) == 0) {
				(void) fprintf(stderr, gettext("error: setting boot "
				    "partition size requires option '-B'\n"));
				goto errout;
			}
		}

		/* pass off to get_vdev_spec for bulk processing */
		nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun,
		    boot_type, boot_size, argc - 1, argv + 1);
		if (nvroot == NULL)
			goto errout;

		/* make_root_vdev() allows 0 toplevel children if there are spares */
		if (!zfs_allocatable_devs(nvroot)) {
			(void) fprintf(stderr, gettext("invalid vdev "
			    "specification: at least one toplevel vdev must be "
			    "specified\n"));
			goto errout;
		}

		if (altroot != NULL && altroot[0] != '/') {
			(void) fprintf(stderr, gettext("invalid alternate root '%s': "
			    "must be an absolute path\n"), altroot);
			goto errout;
		}

		/*
		 * Check the validity of the mountpoint and direct the user to use the
		 * '-m' mountpoint option if it looks like it's in use.
		 * Ignore the checks if the '-f' option is given.
		 */
		if (!force && (mountpoint == NULL ||
		    (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
		    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) {
			char buf[MAXPATHLEN];
			DIR *dirp;

			if (mountpoint && mountpoint[0] != '/') {
				(void) fprintf(stderr, gettext("invalid mountpoint "
				    "'%s': must be an absolute path, 'legacy', or "
				    "'none'\n"), mountpoint);
				goto errout;
			}

			/* Compose the path the root dataset would be mounted on. */
			if (mountpoint == NULL) {
				if (altroot != NULL)
					(void) snprintf(buf, sizeof (buf), "%s/%s",
					    altroot, poolname);
				else
					(void) snprintf(buf, sizeof (buf), "/%s",
					    poolname);
			} else {
				if (altroot != NULL)
					(void) snprintf(buf, sizeof (buf), "%s%s",
					    altroot, mountpoint);
				else
					(void) snprintf(buf, sizeof (buf), "%s",
					    mountpoint);
			}

			if ((dirp = opendir(buf)) == NULL && errno != ENOENT) {
				(void) fprintf(stderr, gettext("mountpoint '%s' : "
				    "%s\n"), buf, strerror(errno));
				(void) fprintf(stderr, gettext("use '-m' "
				    "option to provide a different default\n"));
				goto errout;
			} else if (dirp) {
				int count = 0;

				/*
				 * More than two entries means the directory is
				 * treated as non-empty.
				 */
				while (count < 3 && readdir(dirp) != NULL)
					count++;
				(void) closedir(dirp);

				if (count > 2) {
					(void) fprintf(stderr, gettext("mountpoint "
					    "'%s' exists and is not empty\n"), buf);
					(void) fprintf(stderr, gettext("use '-m' "
					    "option to provide a "
					    "different default\n"));
					goto errout;
				}
			}
		}

		/*
		 * Now that the mountpoint's validity has been checked, ensure that
		 * the property is set appropriately prior to creating the pool.
		 */
		if (mountpoint != NULL) {
			ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
			    mountpoint, &fsprops, B_FALSE);
			if (ret != 0)
				goto errout;
		}

		ret = 1;
		if (dryrun) {
			/*
			 * For a dry run invocation, print out a basic message and run
			 * through all the vdevs in the list and print out in an
			 * appropriate hierarchy.
			 */
			(void) printf(gettext("would create '%s' with the "
			    "following layout:\n\n"), poolname);

			print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE);
			if (num_logs(nvroot) > 0)
				print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE);

			ret = 0;
		} else {
			/*
			 * Hand off to libzfs.
			 */
			if (enable_all_pool_feat) {
				spa_feature_t i;
				for (i = 0; i < SPA_FEATURES; i++) {
					char propname[MAXPATHLEN];
					zfeature_info_t *feat = &spa_feature_table[i];

					(void) snprintf(propname, sizeof (propname),
					    "feature@%s", feat->fi_uname);

					/*
					 * Skip feature if user specified it manually
					 * on the command line.
					 */
					if (nvlist_exists(props, propname))
						continue;

					ret = add_prop_list(propname,
					    ZFS_FEATURE_ENABLED, &props, B_TRUE);
					if (ret != 0)
						goto errout;
				}
			}

			ret = 1;
			if (zpool_create(g_zfs, poolname,
			    nvroot, props, fsprops) == 0) {
				zfs_handle_t *pool = zfs_open(g_zfs, poolname,
				    ZFS_TYPE_FILESYSTEM);
				if (pool != NULL) {
					if (zfs_mount(pool, NULL, 0) == 0)
						ret = zfs_shareall(pool);
					zfs_close(pool);
				}
			} else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
				(void) fprintf(stderr, gettext("pool name may have "
				    "been omitted\n"));
			}
		}

	errout:
		nvlist_free(nvroot);
		nvlist_free(fsprops);
		nvlist_free(props);
		return (ret);
	badusage:
		nvlist_free(fsprops);
		nvlist_free(props);
		usage(B_FALSE);
		return (2);
	}

	/*
	 * zpool destroy [-f] <pool>
	 *
	 * -f Forcefully unmount any datasets
	 *
	 * Destroy the given pool. Automatically unmounts any datasets in the pool.
	 *
	 * Returns 0 when the pool was destroyed and 1 when the pool could not be
	 * opened, its datasets could not be unmounted, or zpool_destroy() failed.
	 */
	int
	zpool_do_destroy(int argc, char **argv)
	{
		boolean_t force = B_FALSE;
		int c;
		char *pool;
		zpool_handle_t *zhp;
		int ret;

		/* check options */
		while ((c = getopt(argc, argv, "f")) != -1) {
			switch (c) {
			case 'f':
				force = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool argument\n"));
			usage(B_FALSE);
		}
		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		pool = argv[0];

		if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
			/*
			 * As a special case, check for use of '/' in the name, and
			 * direct the user to use 'zfs destroy' instead.
			 */
			if (strchr(pool, '/') != NULL)
				(void) fprintf(stderr, gettext("use 'zfs destroy' to "
				    "destroy a dataset\n"));
			return (1);
		}

		if (zpool_disable_datasets(zhp, force) != 0) {
			(void) fprintf(stderr, gettext("could not destroy '%s': "
			    "could not unmount datasets\n"), zpool_get_name(zhp));
			/* Release the handle on this error path as well. */
			zpool_close(zhp);
			return (1);
		}

		/*
		 * The history string is handed to zpool_destroy() directly, so turn
		 * off the generic history logging for this command.
		 */
		log_history = B_FALSE;

		ret = (zpool_destroy(zhp, history_str) != 0);

		zpool_close(zhp);

		return (ret);
	}

	/*
	 * zpool export [-f] <pool> ...
	 *
	 * -f Forcefully unmount datasets
	 *
	 * Exports every pool named on the command line. Datasets of each pool are
	 * unmounted first (forcefully with '-f'). The 'F' option is also accepted
	 * and routes the export through zpool_export_force().
	 *
	 * Returns 0 if every pool was exported and 1 if any of them failed; a
	 * failure on one pool does not stop the remaining pools being processed.
	 */
	int
	zpool_do_export(int argc, char **argv)
	{
		boolean_t force = B_FALSE;
		boolean_t hardforce = B_FALSE;
		zpool_handle_t *zhp;
		int opt;
		int idx;
		int rc;
		int ret = 0;

		/* check options */
		while ((opt = getopt(argc, argv, "fF")) != -1) {
			if (opt == 'f') {
				force = B_TRUE;
			} else if (opt == 'F') {
				hardforce = B_TRUE;
			} else if (opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* check arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool argument\n"));
			usage(B_FALSE);
		}

		for (idx = 0; idx < argc; idx++) {
			zhp = zpool_open_canfail(g_zfs, argv[idx]);
			if (zhp == NULL) {
				ret = 1;
				continue;
			}

			if (zpool_disable_datasets(zhp, force) == 0) {
				/* The history must be logged as part of the export */
				log_history = B_FALSE;

				if (hardforce)
					rc = zpool_export_force(zhp, history_str);
				else
					rc = zpool_export(zhp, force, history_str);

				if (rc != 0)
					ret = 1;
			} else {
				ret = 1;
			}

			zpool_close(zhp);
		}

		return (ret);
	}

	/*
	 * Given a vdev configuration, determine the maximum width needed for the
	 * device name column.
	 *
	 * zhp    pool handle passed through to zpool_vdev_name() (callers in this
	 *        file pass NULL when no pool is open)
	 * nv     vdev nvlist to measure; its spares, l2cache and children arrays
	 *        are walked recursively
	 * depth  indentation already consumed at this level (grows by 2 per level)
	 * max    widest name seen so far
	 *
	 * Returns the larger of 'max' and the widest (name length + depth) found
	 * in the subtree.
	 */
	static int
	max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
	{
		/* Arrays to descend into, in the order they are visited. */
		static const char *const lists[] = {
			ZPOOL_CONFIG_SPARES,
			ZPOOL_CONFIG_L2CACHE,
			ZPOOL_CONFIG_CHILDREN
		};
		char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE);
		nvlist_t **child;
		uint_t c, children;
		size_t i;
		int width, ret;

		width = (int)strlen(name) + depth;
		if (width > max)
			max = width;

		free(name);

		for (i = 0; i < sizeof (lists) / sizeof (lists[0]); i++) {
			if (nvlist_lookup_nvlist_array(nv, lists[i],
			    &child, &children) != 0)
				continue;

			for (c = 0; c < children; c++) {
				ret = max_width(zhp, child[c], depth + 2, max);
				if (ret > max)
					max = ret;
			}
		}

		return (max);
	}

	/*
	 * State handed to find_spare() through zpool_iter() when looking for the
	 * pool that contains a given vdev.
	 */
	typedef struct spare_cbdata {
	/* GUID of the vdev being searched for (input to find_spare()). */
	uint64_t cb_guid;
	/* Set by find_spare() to the handle of the pool containing cb_guid. */
	zpool_handle_t *cb_zhp;
	} spare_cbdata_t;

	/*
	 * Depth-first search of the vdev tree rooted at 'nv' for a vdev whose
	 * ZPOOL_CONFIG_GUID equals 'search'. Only the ZPOOL_CONFIG_CHILDREN array
	 * is followed. Returns B_TRUE on a match, B_FALSE otherwise.
	 */
	static boolean_t
	find_vdev(nvlist_t *nv, uint64_t search)
	{
		uint64_t vdev_guid;
		nvlist_t **kids;
		uint_t nkids;
		uint_t idx;

		/* Does this node itself match? */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
			if (vdev_guid == search)
				return (B_TRUE);
		}

		/* A node without children cannot contain the vdev. */
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &kids, &nkids) != 0)
			return (B_FALSE);

		idx = 0;
		while (idx < nkids) {
			if (find_vdev(kids[idx], search))
				return (B_TRUE);
			idx++;
		}

		return (B_FALSE);
	}

	/*
	 * zpool_iter() callback: checks whether the pool 'zhp' contains the vdev
	 * whose GUID is stored in the spare_cbdata_t pointed to by 'data'.
	 *
	 * On a match the handle is stored in cb_zhp (the caller then owns and must
	 * close it) and 1 is returned. Otherwise the handle is closed here and 0
	 * is returned.
	 */
	static int
	find_spare(zpool_handle_t *zhp, void *data)
	{
		spare_cbdata_t *cbp = data;
		nvlist_t *config, *nvroot;

		config = zpool_get_config(zhp, NULL);
		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);

		if (find_vdev(nvroot, cbp->cb_guid)) {
			cbp->cb_zhp = zhp;
			return (1);
		}

		zpool_close(zhp);
		return (0);
	}

	/*
	 * Print out configuration state as requested by status_callback.
	 *
	 * Emits one line for the vdev 'nv' (name, state, error counters unless it
	 * is a spare, and any auxiliary explanation), then recurses into its
	 * children, skipping log and hole vdevs. Vdevs of type VDEV_TYPE_INDIRECT
	 * are not printed at all.
	 *
	 * zhp        pool the vdev belongs to
	 * name       display name for this vdev
	 * nv         vdev nvlist; must carry ZPOOL_CONFIG_VDEV_STATS and
	 *            ZPOOL_CONFIG_TYPE
	 * namewidth  total width of the name column
	 * depth      indentation of this level (children are printed at depth + 2)
	 * isspare    B_TRUE when printing hot spares: state is shown as
	 *            INUSE/AVAIL and error counters are omitted
	 */
	void
	print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
	    int namewidth, int depth, boolean_t isspare)
	{
		nvlist_t **child;
		uint_t c, vsc, children;
		pool_scan_stat_t *ps = NULL;
		vdev_stat_t *vs;
		char rbuf[6], wbuf[6], cbuf[6];
		char *vname;
		uint64_t notpresent;
		spare_cbdata_t cb;
		const char *state;
		char *type;

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			children = 0;

		verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &vsc) == 0);

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

		if (strcmp(type, VDEV_TYPE_INDIRECT) == 0)
			return;

		state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
		if (isspare) {
			/*
			 * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
			 * online drives.
			 */
			if (vs->vs_aux == VDEV_AUX_SPARED)
				state = "INUSE";
			else if (vs->vs_state == VDEV_STATE_HEALTHY)
				state = "AVAIL";
		}

		/*
		 * "%*s" with an empty string produces 'depth' spaces of indentation;
		 * "%-*s" left-justifies the name in the rest of the column.
		 */
		(void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
		    name, state);

		if (!isspare) {
			zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
			zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
			zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
			(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
		}

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
		    &notpresent) == 0 ||
		    vs->vs_state <= VDEV_STATE_CANT_OPEN) {
			char *path;
			if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0)
				(void) printf(" was %s", path);
		} else if (vs->vs_aux != 0) {
			(void) printf(" ");

			switch (vs->vs_aux) {
			case VDEV_AUX_OPEN_FAILED:
				(void) printf(gettext("cannot open"));
				break;

			case VDEV_AUX_BAD_GUID_SUM:
				(void) printf(gettext("missing device"));
				break;

			case VDEV_AUX_NO_REPLICAS:
				(void) printf(gettext("insufficient replicas"));
				break;

			case VDEV_AUX_VERSION_NEWER:
				(void) printf(gettext("newer version"));
				break;

			case VDEV_AUX_UNSUP_FEAT:
				(void) printf(gettext("unsupported feature(s)"));
				break;

			case VDEV_AUX_ASHIFT_TOO_BIG:
				(void) printf(gettext("unsupported minimum blocksize"));
				break;

			case VDEV_AUX_SPARED:
				/*
				 * Find out which pool is using this spare;
				 * find_spare() leaves the matching handle in
				 * cb.cb_zhp, which must be closed here.
				 */
				verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
				    &cb.cb_guid) == 0);
				if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
					if (strcmp(zpool_get_name(cb.cb_zhp),
					    zpool_get_name(zhp)) == 0)
						(void) printf(gettext("currently in "
						    "use"));
					else
						(void) printf(gettext("in use by "
						    "pool '%s'"),
						    zpool_get_name(cb.cb_zhp));
					zpool_close(cb.cb_zhp);
				} else {
					(void) printf(gettext("currently in use"));
				}
				break;

			case VDEV_AUX_ERR_EXCEEDED:
				(void) printf(gettext("too many errors"));
				break;

			case VDEV_AUX_IO_FAILURE:
				(void) printf(gettext("experienced I/O failures"));
				break;

			case VDEV_AUX_BAD_LOG:
				(void) printf(gettext("bad intent log"));
				break;

			case VDEV_AUX_EXTERNAL:
				(void) printf(gettext("external device fault"));
				break;

			case VDEV_AUX_SPLIT_POOL:
				(void) printf(gettext("split into new pool"));
				break;

			default:
				(void) printf(gettext("corrupted data"));
				break;
			}
		} else if (children == 0 && !isspare &&
		    VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
		    vs->vs_configured_ashift < vs->vs_physical_ashift) {
			/* Leaf vdev configured with a smaller block size than native. */
			(void) printf(
			    gettext(" block size: %dB configured, %dB native"),
			    1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift);
		}

		/* Scan stats are optional; 'ps' stays NULL when they are absent. */
		(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
		    (uint64_t **)&ps, &c);

		if (ps && ps->pss_state == DSS_SCANNING &&
		    vs->vs_scan_processed != 0 && children == 0) {
			(void) printf(gettext(" (%s)"),
			    (ps->pss_func == POOL_SCAN_RESILVER) ?
			    "resilvering" : "repairing");
		}

		(void) printf("\n");

		for (c = 0; c < children; c++) {
			uint64_t islog = B_FALSE, ishole = B_FALSE;

			/* Don't print logs or holes here */
			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
			    &islog);
			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
			    &ishole);
			if (islog || ishole)
				continue;
			vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
			print_status_config(zhp, vname, child[c],
			    namewidth, depth + 2, isspare);
			free(vname);
		}
	}


	/*
	 * Print the configuration of an exported pool. Iterate over all vdevs in the
	 * pool, printing out the name and status for each one.
	 *
	 * Vdevs of type VDEV_TYPE_MISSING or VDEV_TYPE_HOLE are skipped, as are log
	 * children. After the children, any l2cache and spare devices attached to
	 * 'nv' are listed under "cache" and "spares" headings.
	 *
	 * name       display name for this vdev
	 * nv         vdev nvlist; must carry ZPOOL_CONFIG_TYPE and
	 *            ZPOOL_CONFIG_VDEV_STATS
	 * namewidth  total width of the name column
	 * depth      indentation of this level (children are printed at depth + 2)
	 */
	void
	print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
	{
		nvlist_t **child;
		uint_t c, children;
		vdev_stat_t *vs;
		char *type, *vname;

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
		if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
		    strcmp(type, VDEV_TYPE_HOLE) == 0)
			return;

		verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &c) == 0);

		/*
		 * "%*s" with an empty string produces 'depth' spaces of indentation;
		 * "%-*s" left-justifies the name in the rest of the column.
		 */
		(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
		(void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux));

		if (vs->vs_aux != 0) {
			(void) printf(" ");

			switch (vs->vs_aux) {
			case VDEV_AUX_OPEN_FAILED:
				(void) printf(gettext("cannot open"));
				break;

			case VDEV_AUX_BAD_GUID_SUM:
				(void) printf(gettext("missing device"));
				break;

			case VDEV_AUX_NO_REPLICAS:
				(void) printf(gettext("insufficient replicas"));
				break;

			case VDEV_AUX_VERSION_NEWER:
				(void) printf(gettext("newer version"));
				break;

			case VDEV_AUX_UNSUP_FEAT:
				(void) printf(gettext("unsupported feature(s)"));
				break;

			case VDEV_AUX_ERR_EXCEEDED:
				(void) printf(gettext("too many errors"));
				break;

			default:
				(void) printf(gettext("corrupted data"));
				break;
			}
		}
		(void) printf("\n");

		/* Nothing more to print for a vdev without a children array. */
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			return;

		for (c = 0; c < children; c++) {
			uint64_t is_log = B_FALSE;

			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
			    &is_log);
			if (is_log)
				continue;

			vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE);
			print_import_config(vname, child[c], namewidth, depth + 2);
			free(vname);
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
		    &child, &children) == 0) {
			(void) printf(gettext("\tcache\n"));
			for (c = 0; c < children; c++) {
				vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
				(void) printf("\t %s\n", vname);
				free(vname);
			}
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &child, &children) == 0) {
			(void) printf(gettext("\tspares\n"));
			for (c = 0; c < children; c++) {
				vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
				(void) printf("\t %s\n", vname);
				free(vname);
			}
		}
	}

	/*
	 * Print log vdevs.
	 * Logs are recorded as top level vdevs in the main pool child array
	 * but with "is_log" set to 1. We use either print_status_config() or
	 * print_import_config() to print the top level logs then any log
	 * children (eg mirrored slogs) are printed recursively - which
	 * works because only the top level vdev is marked "is_log"
	 *
	 * zhp        pool handle, passed to zpool_vdev_name() and
	 *            print_status_config()
	 * nv         root vdev nvlist whose children are scanned for logs
	 * namewidth  total width of the name column
	 * verbose    B_TRUE selects print_status_config(), B_FALSE selects
	 *            print_import_config()
	 */
	static void
	print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
	{
		uint_t c, children;
		nvlist_t **child;

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
		    &children) != 0)
			return;

		(void) printf(gettext("\tlogs\n"));

		for (c = 0; c < children; c++) {
			uint64_t is_log = B_FALSE;
			char *name;

			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
			    &is_log);
			if (!is_log)
				continue;
			name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
			if (verbose)
				print_status_config(zhp, name, child[c], namewidth,
				    2, B_FALSE);
			else
				print_import_config(name, child[c], namewidth, 2);
			free(name);
		}
	}

	/*
	 * Display the status for the given pool.
	 *
	 * 'config' is the configuration nvlist of an importable pool. Prints the
	 * pool name, id and state, a status line derived from
	 * zpool_import_status(), the recommended action, the pool comment (if
	 * any), a reference URL when a message id is available, and finally the
	 * vdev configuration including log devices.
	 */
	static void
	show_import(nvlist_t *config)
	{
		uint64_t pool_state;
		vdev_stat_t *vs;
		char *name;
		uint64_t guid;
		char *msgid;
		nvlist_t *nvroot;
		int reason;
		const char *health;
		uint_t vsc;
		int namewidth;
		char *comment;

		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &pool_state) == 0);
		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);

		/* The pool's health is the state of its root vdev. */
		verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &vsc) == 0);
		health = zpool_state_to_name(vs->vs_state, vs->vs_aux);

		reason = zpool_import_status(config, &msgid);

		(void) printf(gettext(" pool: %s\n"), name);
		(void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid);
		(void) printf(gettext(" state: %s"), health);
		if (pool_state == POOL_STATE_DESTROYED)
			(void) printf(gettext(" (DESTROYED)"));
		(void) printf("\n");

		switch (reason) {
		case ZPOOL_STATUS_MISSING_DEV_R:
		case ZPOOL_STATUS_MISSING_DEV_NR:
		case ZPOOL_STATUS_BAD_GUID_SUM:
			(void) printf(gettext(" status: One or more devices are "
			    "missing from the system.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_LABEL_R:
		case ZPOOL_STATUS_CORRUPT_LABEL_NR:
			(void) printf(gettext(" status: One or more devices contains "
			    "corrupted data.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_DATA:
			(void) printf(
			    gettext(" status: The pool data is corrupted.\n"));
			break;

		case ZPOOL_STATUS_OFFLINE_DEV:
			(void) printf(gettext(" status: One or more devices "
			    "are offlined.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_POOL:
			(void) printf(gettext(" status: The pool metadata is "
			    "corrupted.\n"));
			break;

		case ZPOOL_STATUS_VERSION_OLDER:
			(void) printf(gettext(" status: The pool is formatted using a "
			    "legacy on-disk version.\n"));
			break;

		case ZPOOL_STATUS_VERSION_NEWER:
			(void) printf(gettext(" status: The pool is formatted using an "
			    "incompatible version.\n"));
			break;

		case ZPOOL_STATUS_FEAT_DISABLED:
			(void) printf(gettext(" status: Some supported features are "
			    "not enabled on the pool.\n"));
			break;

		case ZPOOL_STATUS_UNSUP_FEAT_READ:
			(void) printf(gettext("status: The pool uses the following "
			    "feature(s) not supported on this system:\n"));
			zpool_print_unsup_feat(config);
			break;

		case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
			(void) printf(gettext("status: The pool can only be accessed "
			    "in read-only mode on this system. It\n\tcannot be "
			    "accessed in read-write mode because it uses the "
			    "following\n\tfeature(s) not supported on this system:\n"));
			zpool_print_unsup_feat(config);
			break;

		case ZPOOL_STATUS_HOSTID_MISMATCH:
			(void) printf(gettext(" status: The pool was last accessed by "
			    "another system.\n"));
			break;

		case ZPOOL_STATUS_FAULTED_DEV_R:
		case ZPOOL_STATUS_FAULTED_DEV_NR:
			(void) printf(gettext(" status: One or more devices are "
			    "faulted.\n"));
			break;

		case ZPOOL_STATUS_BAD_LOG:
			(void) printf(gettext(" status: An intent log record cannot be "
			    "read.\n"));
			break;

		case ZPOOL_STATUS_RESILVERING:
			(void) printf(gettext(" status: One or more devices were being "
			    "resilvered.\n"));
			break;

		case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
			(void) printf(gettext("status: One or more devices were "
			    "configured to use a non-native block size.\n"
			    "\tExpect reduced performance.\n"));
			break;

		default:
			/*
			 * No other status can be seen when importing pools.
			 */
			assert(reason == ZPOOL_STATUS_OK);
		}

		/*
		 * Print out an action according to the overall state of the pool.
		 */
		if (vs->vs_state == VDEV_STATE_HEALTHY) {
			if (reason == ZPOOL_STATUS_VERSION_OLDER ||
			    reason == ZPOOL_STATUS_FEAT_DISABLED) {
				(void) printf(gettext(" action: The pool can be "
				    "imported using its name or numeric identifier, "
				    "though\n\tsome features will not be available "
				    "without an explicit 'zpool upgrade'.\n"));
			} else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
				(void) printf(gettext(" action: The pool can be "
				    "imported using its name or numeric "
				    "identifier and\n\tthe '-f' flag.\n"));
			} else {
				(void) printf(gettext(" action: The pool can be "
				    "imported using its name or numeric "
				    "identifier.\n"));
			}
		} else if (vs->vs_state == VDEV_STATE_DEGRADED) {
			(void) printf(gettext(" action: The pool can be imported "
			    "despite missing or damaged devices. The\n\tfault "
			    "tolerance of the pool may be compromised if imported.\n"));
		} else {
			switch (reason) {
			case ZPOOL_STATUS_VERSION_NEWER:
				(void) printf(gettext(" action: The pool cannot be "
				    "imported. Access the pool on a system running "
				    "newer\n\tsoftware, or recreate the pool from "
				    "backup.\n"));
				break;
			case ZPOOL_STATUS_UNSUP_FEAT_READ:
				(void) printf(gettext("action: The pool cannot be "
				    "imported. Access the pool on a system that "
				    "supports\n\tthe required feature(s), or recreate "
				    "the pool from backup.\n"));
				break;
			case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
				(void) printf(gettext("action: The pool cannot be "
				    "imported in read-write mode. Import the pool "
				    "with\n"
				    "\t\"-o readonly=on\", access the pool on a system "
				    "that supports the\n\trequired feature(s), or "
				    "recreate the pool from backup.\n"));
				break;
			case ZPOOL_STATUS_MISSING_DEV_R:
			case ZPOOL_STATUS_MISSING_DEV_NR:
			case ZPOOL_STATUS_BAD_GUID_SUM:
				(void) printf(gettext(" action: The pool cannot be "
				    "imported. Attach the missing\n\tdevices and try "
				    "again.\n"));
				break;
			default:
				(void) printf(gettext(" action: The pool cannot be "
				    "imported due to damaged devices or data.\n"));
			}
		}

		/* Print the comment attached to the pool. */
		if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
			(void) printf(gettext("comment: %s\n"), comment);

		/*
		 * If the state is "closed" or "can't open", and the aux state
		 * is "corrupt data":
		 */
		if (((vs->vs_state == VDEV_STATE_CLOSED) ||
		    (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
		    (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
			if (pool_state == POOL_STATE_DESTROYED)
				(void) printf(gettext("\tThe pool was destroyed, "
				    "but can be imported using the '-Df' flags.\n"));
			else if (pool_state != POOL_STATE_EXPORTED)
				(void) printf(gettext("\tThe pool may be active on "
				    "another system, but can be imported using\n\t"
				    "the '-f' flag.\n"));
		}

		if (msgid != NULL)
			(void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
			    msgid);

		(void) printf(gettext(" config:\n\n"));

		/* Size the name column to the widest vdev name, but at least 10. */
		namewidth = max_width(NULL, nvroot, 0, 0);
		if (namewidth < 10)
			namewidth = 10;

		print_import_config(name, nvroot, namewidth, 0);
		if (num_logs(nvroot) > 0)
			print_logs(NULL, nvroot, namewidth, B_FALSE);

		if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
			(void) printf(gettext("\n\tAdditional devices are known to "
			    "be part of this pool, though their\n\texact "
			    "configuration cannot be determined.\n"));
		}
	}

	/*
	 * Perform the import for the given configuration. This passes the heavy
	 * lifting off to zpool_import_props(), and then mounts the datasets contained
	 * within the pool.
	 *
	 * config   configuration nvlist of the pool to import
	 * newname  name to import the pool under, or NULL to keep its own name
	 * mntopts  mount options passed to zpool_enable_datasets()
	 * props    properties passed to zpool_import_props()
	 * flags    ZFS_IMPORT_* flags; ZFS_IMPORT_ANY_HOST skips the host
	 *          ownership check and ZFS_IMPORT_ONLY skips mounting datasets
	 *
	 * Returns 0 on success and 1 on any failure.
	 */
	static int
	do_import(nvlist_t *config, const char *newname, const char *mntopts,
	    nvlist_t *props, int flags)
	{
		zpool_handle_t *zhp;
		char *name;
		uint64_t state;
		uint64_t version;

		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);

		verify(nvlist_lookup_uint64(config,
		    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
		verify(nvlist_lookup_uint64(config,
		    ZPOOL_CONFIG_VERSION, &version) == 0);
		if (!SPA_VERSION_IS_SUPPORTED(version)) {
			(void) fprintf(stderr, gettext("cannot import '%s': pool "
			    "is formatted using an unsupported ZFS version\n"), name);
			return (1);
		} else if (state != POOL_STATE_EXPORTED &&
		    !(flags & ZFS_IMPORT_ANY_HOST)) {
			uint64_t hostid;

			/*
			 * The pool was not cleanly exported: refuse the import
			 * unless it was last used by this host.
			 */
			if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
			    &hostid) == 0) {
				if ((unsigned long)hostid != gethostid()) {
					char *hostname;
					uint64_t timestamp;
					time_t t;

					verify(nvlist_lookup_string(config,
					    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
					verify(nvlist_lookup_uint64(config,
					    ZPOOL_CONFIG_TIMESTAMP, &timestamp) == 0);
					t = timestamp;
					/* asctime() output already ends in a newline. */
					(void) fprintf(stderr, gettext("cannot import "
					    "'%s': pool may be in use from other "
					    "system, it was last accessed by %s "
					    "(hostid: 0x%lx) on %s"), name, hostname,
					    (unsigned long)hostid,
					    asctime(localtime(&t)));
					(void) fprintf(stderr, gettext("use '-f' to "
					    "import anyway\n"));
					return (1);
				}
			} else {
				(void) fprintf(stderr, gettext("cannot import '%s': "
				    "pool may be in use from other system\n"), name);
				(void) fprintf(stderr, gettext("use '-f' to import "
				    "anyway\n"));
				return (1);
			}
		}

		if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
			return (1);

		/* From here on the pool is known under its new name, if any. */
		if (newname != NULL)
			name = (char *)newname;

		if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
			return (1);

		if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
		    !(flags & ZFS_IMPORT_ONLY) &&
		    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
			zpool_close(zhp);
			return (1);
		}

		zpool_close(zhp);
		return (0);
	}

	/*
	* zpool import [-d dir] [-D]
	* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
	* [-d dir \| -c cachefile] [-f] -a
	* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
	* [-d dir \| -c cachefile] [-f] [-n] [-F] <pool \| id> [newpool]
	*
	* -c Read pool information from a cachefile instead of searching
	* devices.
	*
	* -d Scan in a specific directory, other than /dev/dsk. More than
	* one directory can be specified using multiple '-d' options.
	*
	* -D Scan for previously destroyed pools or import all or only
	* specified destroyed pools.
	*
	* -R Temporarily import the pool, with all mountpoints relative to
	* the given root. The pool will remain exported when the machine
	* is rebooted.
	*
	* -V Import even in the presence of faulted vdevs. This is an
	* intentionally undocumented option for testing purposes, and
	* treats the pool configuration as complete, leaving any bad
	* vdevs in the FAULTED state. In other words, it does verbatim
	* import.
	*
	* -f Force import, even if it appears that the pool is active.
	*
	* -F Attempt rewind if necessary.
	*
	* -n See if rewind would work, but don't actually rewind.
	*
	* -N Import the pool but don't mount datasets.
	*
	* -T Specify a starting txg to use for import. This option is
	* intentionally undocumented option for testing purposes.
	*
	* -a Import all pools found.
	*
	* -o Set property=value and/or temporary mount options (without '=').
	*
	* The import command scans for pools to import, and import pools based on pool
	* name and GUID. The pool can also be renamed as part of the import process.
	*/
	int
	zpool_do_import(int argc, char **argv)
	{
	char **searchdirs = NULL;
	int nsearch = 0;
	int c;
	int err = 0;
	nvlist_t *pools = NULL;
	boolean_t do_all = B_FALSE;
	boolean_t do_destroyed = B_FALSE;
	char *mntopts = NULL;
	nvpair_t *elem;
	nvlist_t *config;
	uint64_t searchguid = 0;
	char *searchname = NULL;
	char *propval;
	nvlist_t *found_config;
	nvlist_t *policy = NULL;
	nvlist_t *props = NULL;
	boolean_t first;
	int flags = ZFS_IMPORT_NORMAL;
	uint32_t rewind_policy = ZPOOL_NO_REWIND;
	boolean_t dryrun = B_FALSE;
	boolean_t do_rewind = B_FALSE;
	boolean_t xtreme_rewind = B_FALSE;
	uint64_t pool_state, txg = -1ULL;
	char *cachefile = NULL;
	importargs_t idata = { 0 };
	char *endptr;

	/* check options */
	while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:T:VX")) != -1) {
	switch (c) {
	case 'a':
	do_all = B_TRUE;
	break;
	case 'c':
	cachefile = optarg;
	break;
	case 'd':
	if (searchdirs == NULL) {
	searchdirs = safe_malloc(sizeof (char *));
	} else {
	char *tmp = safe_malloc((nsearch + 1)
	sizeof (char *));
	bcopy(searchdirs, tmp, nsearch *
	sizeof (char *));
	free(searchdirs);
	searchdirs = tmp;
	}
	searchdirs[nsearch++] = optarg;
	break;
	case 'D':
	do_destroyed = B_TRUE;
	break;
	case 'f':
	flags \|= ZFS_IMPORT_ANY_HOST;
	break;
	case 'F':
	do_rewind = B_TRUE;
	break;
	case 'm':
	flags \|= ZFS_IMPORT_MISSING_LOG;
	break;
	case 'n':
	dryrun = B_TRUE;
	break;
	case 'N':
	flags \|= ZFS_IMPORT_ONLY;
	break;
	case 'o':
	if ((propval = strchr(optarg, '=')) != NULL) {
	*propval = '\0';
	propval++;
	if (add_prop_list(optarg, propval,
	&props, B_TRUE))
	goto error;
	} else {
	mntopts = optarg;
	}
	break;
	case 'R':
	if (add_prop_list(zpool_prop_to_name(
	ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
	goto error;
	if (nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	&propval) == 0)
	break;
	if (add_prop_list(zpool_prop_to_name(
	ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
	goto error;
	break;
	case 'T':
	errno = 0;
	txg = strtoull(optarg, &endptr, 0);
	if (errno != 0 \|\| *endptr != '\0') {
	(void) fprintf(stderr,
	gettext("invalid txg value\n"));
	usage(B_FALSE);
	}
	rewind_policy = ZPOOL_DO_REWIND \| ZPOOL_EXTREME_REWIND;
	break;
	case 'V':
	flags \|= ZFS_IMPORT_VERBATIM;
	break;
	case 'X':
	xtreme_rewind = B_TRUE;
	break;
	case ':':
	(void) fprintf(stderr, gettext("missing argument for "
	"'%c' option\n"), optopt);
	usage(B_FALSE);
	break;
	case '?':
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	if (cachefile && nsearch != 0) {
	(void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
	usage(B_FALSE);
	}

	if ((dryrun \|\| xtreme_rewind) && !do_rewind) {
	(void) fprintf(stderr,
	gettext("-n or -X only meaningful with -F\n"));
	usage(B_FALSE);
	}
	if (dryrun)
	rewind_policy = ZPOOL_TRY_REWIND;
	else if (do_rewind)
	rewind_policy = ZPOOL_DO_REWIND;
	if (xtreme_rewind)
	rewind_policy \|= ZPOOL_EXTREME_REWIND;

	/* In the future, we can capture further policy and include it here */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 \|\|
	nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 \|\|
	nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
	goto error;

	if (searchdirs == NULL) {
	searchdirs = safe_malloc(sizeof (char *));
	searchdirs[0] = "/dev";
	nsearch = 1;
	}

	/* check argument count */
	if (do_all) {
	if (argc != 0) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}
	} else {
	if (argc > 2) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	/*
	* Check for the SYS_CONFIG privilege. We do this explicitly
	* here because otherwise any attempt to discover pools will
	* silently fail.
	*/
	if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
	(void) fprintf(stderr, gettext("cannot "
	"discover pools: permission denied\n"));
	free(searchdirs);
	nvlist_free(policy);
	return (1);
	}
	}

	/*
	* Depending on the arguments given, we do one of the following:
	*
	* <none> Iterate through all pools and display information about
	* each one.
	*
	* -a Iterate through all pools and try to import each one.
	*
	* <id> Find the pool that corresponds to the given GUID/pool
	* name and import that one.
	*
	* -D Above options applies only to destroyed pools.
	*/
	if (argc != 0) {
	char *endptr;

	errno = 0;
	searchguid = strtoull(argv[0], &endptr, 10);
	if (errno != 0 \|\| *endptr != '\0') {
	searchname = argv[0];
	searchguid = 0;
	}
	found_config = NULL;

	/*
	* User specified a name or guid. Ensure it's unique.
	*/
	idata.unique = B_TRUE;
	}


	idata.path = searchdirs;
	idata.paths = nsearch;
	idata.poolname = searchname;
	idata.guid = searchguid;
	idata.cachefile = cachefile;

	pools = zpool_search_import(g_zfs, &idata);

	if (pools != NULL && idata.exists &&
	(argc == 1 \|\| strcmp(argv[0], argv[1]) == 0)) {
	(void) fprintf(stderr, gettext("cannot import '%s': "
	"a pool with that name already exists\n"),
	argv[0]);
	(void) fprintf(stderr, gettext("use the form '%s "
	"<pool \| id> <newpool>' to give it a new name\n"),
	"zpool import");
	err = 1;
	} else if (pools == NULL && idata.exists) {
	(void) fprintf(stderr, gettext("cannot import '%s': "
	"a pool with that name is already created/imported,\n"),
	argv[0]);
	(void) fprintf(stderr, gettext("and no additional pools "
	"with that name were found\n"));
	err = 1;
	} else if (pools == NULL) {
	if (argc != 0) {
	(void) fprintf(stderr, gettext("cannot import '%s': "
	"no such pool available\n"), argv[0]);
	}
	err = 1;
	}

	if (err == 1) {
	free(searchdirs);
	nvlist_free(policy);
	return (1);
	}

	/*
	* At this point we have a list of import candidate configs. Even if
	* we were searching by pool name or guid, we still need to
	* post-process the list to deal with pool state and possible
	* duplicate names.
	*/
	err = 0;
	elem = NULL;
	first = B_TRUE;
	while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {

	verify(nvpair_value_nvlist(elem, &config) == 0);

	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	&pool_state) == 0);
	if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
	continue;
	if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
	continue;

	verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
	policy) == 0);

	if (argc == 0) {
	if (first)
	first = B_FALSE;
	else if (!do_all)
	(void) printf("\n");

	if (do_all) {
	err \|= do_import(config, NULL, mntopts,
	props, flags);
	} else {
	show_import(config);
	}
	} else if (searchname != NULL) {
	char *name;

	/*
	* We are searching for a pool based on name.
	*/
	verify(nvlist_lookup_string(config,
	ZPOOL_CONFIG_POOL_NAME, &name) == 0);

	if (strcmp(name, searchname) == 0) {
	if (found_config != NULL) {
	(void) fprintf(stderr, gettext(
	"cannot import '%s': more than "
	"one matching pool\n"), searchname);
	(void) fprintf(stderr, gettext(
	"import by numeric ID instead\n"));
	err = B_TRUE;
	}
	found_config = config;
	}
	} else {
	uint64_t guid;

	/*
	* Search for a pool by guid.
	*/
	verify(nvlist_lookup_uint64(config,
	ZPOOL_CONFIG_POOL_GUID, &guid) == 0);

	if (guid == searchguid)
	found_config = config;
	}
	}

	/*
	* If we were searching for a specific pool, verify that we found a
	* pool, and then do the import.
	*/
	if (argc != 0 && err == 0) {
	if (found_config == NULL) {
	(void) fprintf(stderr, gettext("cannot import '%s': "
	"no such pool available\n"), argv[0]);
	err = B_TRUE;
	} else {
	err \|= do_import(found_config, argc == 1 ? NULL :
	argv[1], mntopts, props, flags);
	}
	}

	/*
	* If we were just looking for pools, report an error if none were
	* found.
	*/
	if (argc == 0 && first)
	(void) fprintf(stderr,
	gettext("no pools available to import\n"));

	error:
	nvlist_free(props);
	nvlist_free(pools);
	nvlist_free(policy);
	free(searchdirs);

	return (err ? 1 : 0);
	}

	/*
	* State handed to the 'zpool iostat' pool_list_iter() callbacks.
	*
	* get_namewidth() is also handed a list_cbdata_t and reads it through this
	* type, so the first two members must stay in this order.
	*/
	struct iostat_cbdata {
		boolean_t cb_verbose;	/* -v given: report individual vdevs */
		int cb_namewidth;	/* width of the pool/device name column */
		int cb_iteration;	/* number of the report being printed */
		zpool_list_t *cb_list;	/* pools being reported on */
	};
	typedef struct iostat_cbdata iostat_cbdata_t;

	/*
	* Print the rule drawn under the iostat heading: one dash for every
	* character of the name column followed by the fixed rule for the six
	* statistic columns.
	*/
	static void
	print_iostat_separator(iostat_cbdata_t *cb)
	{
		int col = 0;

		while (col < cb->cb_namewidth) {
			(void) printf("-");
			col++;
		}
		(void) printf(" ----- ----- ----- ----- ----- -----\n");
	}

	/*
	* Print the two heading lines of the iostat table, padded to the current
	* name column width, followed by the separator rule.
	*/
	static void
	print_iostat_header(iostat_cbdata_t *cb)
	{
		/* First line: column groups, with an empty name column. */
		(void) printf("%*s capacity operations bandwidth\n",
		    cb->cb_namewidth, "");

		/* Second line: individual column titles. */
		(void) printf("%-*s alloc free read write read write\n",
		    cb->cb_namewidth, "pool");

		print_iostat_separator(cb);
	}

	/*
	* Print one statistic, formatted by zfs_nicenum(), in a column that is at
	* least five characters wide and preceded by a space.
	*/
	static void
	print_one_stat(uint64_t value)
	{
		char nice[64];

		zfs_nicenum(value, nice, sizeof (nice));
		(void) printf(" %5s", nice);
	}

	/*
	* Print one line of iostat output for the vdev described by 'newnv' and,
	* in verbose mode, recurse into its children, then its log devices, then
	* its level 2 ARC (cache) devices. 'name' is the label printed in the first
	* column, indented by 'depth' columns. 'oldnv' is the same vdev taken from
	* the previous sample, or NULL, in which case the deltas are computed
	* against an all-zero sample. Nothing is printed for a vdev whose name
	* equals VDEV_TYPE_INDIRECT.
	*
	* 'name' is passed to strcmp() and strlen() unconditionally, so it must not
	* be NULL.
	*
	* NOTE(review): several '*' characters appear to have been lost in this
	* rendering of the diff (pointer declarators in the signature and locals,
	* and the "%*s" conversions of the padded printf() below, which is given
	* int width arguments). Confirm against the repository copy.
	*/
	void
	print_vdev_stats(zpool_handle_t zhp, const char name, nvlist_t *oldnv,
	nvlist_t newnv, iostat_cbdata_t cb, int depth)
	{
	nvlist_t oldchild, newchild;
	uint_t c, children;
	vdev_stat_t oldvs, newvs;
	vdev_stat_t zerovs = { 0 };
	uint64_t tdelta;
	double scale;
	char *vname;

	+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
	+ return;
	+
	/* Without a previous sample, diff against zeroed statistics. */
	if (oldnv != NULL) {
	verify(nvlist_lookup_uint64_array(oldnv,
	ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
	} else {
	oldvs = &zerovs;
	}

	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
	(uint64_t **)&newvs, &c) == 0);

	/* Names too long for the column are printed unpadded. */
	if (strlen(name) + depth > cb->cb_namewidth)
	(void) printf("%*s%s", depth, "", name);
	else
	(void) printf("%s%s%s", depth, "", name,
	(int)(cb->cb_namewidth - strlen(name) - depth), "");

	tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;

	/*
	* Factor applied to every counter delta below; presumably this turns
	* them into per-second rates (assumes vs_timestamp is in nanoseconds;
	* TODO confirm).
	*/
	if (tdelta == 0)
	scale = 1.0;
	else
	scale = (double)NANOSEC / tdelta;

	/* only toplevel vdevs have capacity stats */
	if (newvs->vs_space == 0) {
	(void) printf(" - -");
	} else {
	print_one_stat(newvs->vs_alloc);
	print_one_stat(newvs->vs_space - newvs->vs_alloc);
	}

	print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
	oldvs->vs_ops[ZIO_TYPE_READ])));

	print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
	oldvs->vs_ops[ZIO_TYPE_WRITE])));

	print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
	oldvs->vs_bytes[ZIO_TYPE_READ])));

	print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
	oldvs->vs_bytes[ZIO_TYPE_WRITE])));

	(void) printf("\n");

	/* Everything below is the per-vdev breakdown shown only with -v. */
	if (!cb->cb_verbose)
	return;

	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
	&newchild, &children) != 0)
	return;

	/*
	* 'c' only receives the old child count as scratch here; the loops
	* below index oldchild[] by the new child count.
	*/
	if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
	&oldchild, &c) != 0)
	return;

	/* Regular children first; holes are skipped, logs printed later. */
	for (c = 0; c < children; c++) {
	uint64_t ishole = B_FALSE, islog = B_FALSE;

	(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
	&ishole);

	(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
	&islog);

	if (ishole \|\| islog)
	continue;

	vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
	print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
	newchild[c], cb, depth + 2);
	free(vname);
	}

	/*
	* Log device section
	*/

	if (num_logs(newnv) > 0) {
	(void) printf("%-*s - - - - - "
	"-\n", cb->cb_namewidth, "logs");

	for (c = 0; c < children; c++) {
	uint64_t islog = B_FALSE;
	(void) nvlist_lookup_uint64(newchild[c],
	ZPOOL_CONFIG_IS_LOG, &islog);

	if (islog) {
	vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
	B_FALSE);
	print_vdev_stats(zhp, vname, oldnv ?
	oldchild[c] : NULL, newchild[c],
	cb, depth + 2);
	free(vname);
	}
	}

	}

	/*
	* Include level 2 ARC devices in iostat output
	*/
	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
	&newchild, &children) != 0)
	return;

	if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
	&oldchild, &c) != 0)
	return;

	if (children > 0) {
	(void) printf("%-*s - - - - - "
	"-\n", cb->cb_namewidth, "cache");
	for (c = 0; c < children; c++) {
	vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
	B_FALSE);
	print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
	newchild[c], cb, depth + 2);
	free(vname);
	}
	}
	}

	/*
	* pool_list_iter() callback that refreshes the statistics of one pool.
	* 'data' is the iostat_cbdata_t of the running command. Returns -1 when
	* zpool_refresh_stats() fails and 0 otherwise.
	*/
	static int
	refresh_iostat(zpool_handle_t *zhp, void *data)
	{
		iostat_cbdata_t *cb = data;
		boolean_t missing;

		if (zpool_refresh_stats(zhp, &missing) != 0)
			return (-1);

		/*
		* If the pool has disappeared, remove it from the list and continue.
		*/
		if (missing)
			pool_list_remove(cb->cb_list, zhp);

		return (0);
	}

	/*
	* pool_list_iter() callback that prints the iostat lines of one pool.
	* 'data' is the iostat_cbdata_t of the running command. The first report
	* (cb_iteration == 1) ignores the previous configuration, so its figures
	* are not deltas against an earlier sample. Always returns 0.
	*/
	int
	print_iostat(zpool_handle_t *zhp, void *data)
	{
		iostat_cbdata_t *cb = data;
		nvlist_t *oldconfig, *newconfig;
		nvlist_t *oldnvroot, *newnvroot;

		newconfig = zpool_get_config(zhp, &oldconfig);

		if (cb->cb_iteration == 1)
			oldconfig = NULL;

		verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE,
		    &newnvroot) == 0);

		if (oldconfig == NULL)
			oldnvroot = NULL;
		else
			verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
			    &oldnvroot) == 0);

		/*
		* Print out the statistics for the pool.
		*/
		print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0);

		/* In verbose mode each pool's vdev listing ends with a rule. */
		if (cb->cb_verbose)
			print_iostat_separator(cb);

		return (0);
	}

	/*
	* pool_list_iter() callback that widens cb_namewidth so that it can hold
	* the name of this pool (or, in verbose mode, whatever max_width() reports
	* for its vdev tree). Callers reset cb_namewidth to 0 before iterating, so
	* after the iteration it holds the maximum over all pools, clamped to
	* [10,38]. Always returns 0.
	*
	* 'data' is read as an iostat_cbdata_t; 'zpool list' passes a
	* list_cbdata_t, which shares the leading cb_verbose/cb_namewidth members.
	*/
	int
	get_namewidth(zpool_handle_t *zhp, void *data)
	{
		iostat_cbdata_t *cb = data;
		nvlist_t *config, *nvroot;

		if ((config = zpool_get_config(zhp, NULL)) != NULL) {
			verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
			    &nvroot) == 0);
			if (!cb->cb_verbose) {
				/*
				* Keep the widest name seen so far rather than
				* letting the last pool visited decide the width.
				*/
				int len = (int)strlen(zpool_get_name(zhp));

				if (len > cb->cb_namewidth)
					cb->cb_namewidth = len;
			} else {
				cb->cb_namewidth = max_width(zhp, nvroot, 0,
				    cb->cb_namewidth);
			}
		}

		/*
		* The width must fall into the range [10,38]. The upper limit is the
		* maximum we can have and still fit in 80 columns.
		*/
		if (cb->cb_namewidth < 10)
			cb->cb_namewidth = 10;
		if (cb->cb_namewidth > 38)
			cb->cb_namewidth = 38;

		return (0);
	}

	/*
	* Parse the trailing 'interval' and 'count' arguments, if there are any.
	*
	* argcp in: number of entries in argv; out: number of entries left once
	* the trailing numeric arguments have been taken off
	* argv the remaining command line arguments
	* iv out: the interval, or 0 when none was given
	* cnt out: the count, or 0 when none was given
	*
	* A single trailing number is the interval; two trailing numbers are the
	* interval followed by the count. An interval of zero is a usage error.
	*/
	static void
	get_interval_count(int *argcp, char **argv, unsigned long *iv,
	    unsigned long *cnt)
	{
		unsigned long interval = 0, count = 0;
		int argc = *argcp;

		/*
		* Determine if the last argument is an integer or a pool name
		*/
		if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
			char *end;

			/* errno must be the real one for strtoul() range errors. */
			errno = 0;
			interval = strtoul(argv[argc - 1], &end, 10);

			if (*end == '\0' && errno == 0) {
				if (interval == 0) {
					(void) fprintf(stderr, gettext("interval "
					    "cannot be zero\n"));
					usage(B_FALSE);
				}
				/*
				* Ignore the last parameter
				*/
				argc--;
			} else {
				/*
				* If this is not a valid number, just plow on. The
				* user will get a more informative error message later
				* on.
				*/
				interval = 0;
			}
		}

		/*
		* If the last argument is also an integer, then we have both a count
		* and an interval.
		*/
		if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
			char *end;

			errno = 0;
			/* What was parsed above was the count, not the interval. */
			count = interval;
			interval = strtoul(argv[argc - 1], &end, 10);

			if (*end == '\0' && errno == 0) {
				if (interval == 0) {
					(void) fprintf(stderr, gettext("interval "
					    "cannot be zero\n"));
					usage(B_FALSE);
				}

				/*
				* Ignore the last parameter
				*/
				argc--;
			} else {
				interval = 0;
			}
		}

		*iv = interval;
		*cnt = count;
		*argcp = argc;
	}

	/*
	* Select the timestamp format from the argument of -T: 'u' selects UDATE,
	* 'd' selects DDATE, and anything else is a usage error.
	*/
	static void
	get_timestamp_arg(char c)
	{
		switch (c) {
		case 'u':
			timestamp_fmt = UDATE;
			break;
		case 'd':
			timestamp_fmt = DDATE;
			break;
		default:
			usage(B_FALSE);
			break;
		}
	}

	/*
	* zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
	*
	* -v Display statistics for individual vdevs
	* -T Display a timestamp in date(1) or Unix format
	*
	* This command can be tricky because we want to be able to deal with pool
	* creation/destruction as well as vdev configuration changes. The bulk of this
	* processing is handled by the pool_list_* routines in zpool_iter.c. We rely
	* on pool_list_update() to detect the addition of new pools. Configuration
	* changes are all handled within libzfs.
	*
	* Returns 1 when no usable pool list could be built, otherwise the error
	* indication left by pool_list_get().
	*/
	int
	zpool_do_iostat(int argc, char **argv)
	{
		int c;
		int ret;
		int npools;
		unsigned long interval = 0, count = 0;
		zpool_list_t *list;
		boolean_t verbose = B_FALSE;
		iostat_cbdata_t cb;

		/* check options */
		while ((c = getopt(argc, argv, "T:v")) != -1) {
			switch (c) {
			case 'T':
				get_timestamp_arg(*optarg);
				break;
			case 'v':
				verbose = B_TRUE;
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		get_interval_count(&argc, argv, &interval, &count);

		/*
		* Construct the list of all interesting pools.
		*/
		ret = 0;
		if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
			return (1);

		/* Pools were named, but none of them could be used. */
		if (pool_list_count(list) == 0 && argc != 0) {
			pool_list_free(list);
			return (1);
		}

		/* With an interval we keep waiting for pools to show up. */
		if (pool_list_count(list) == 0 && interval == 0) {
			pool_list_free(list);
			(void) fprintf(stderr, gettext("no pools available\n"));
			return (1);
		}

		/*
		* Enter the main iostat loop.
		*/
		cb.cb_list = list;
		cb.cb_verbose = verbose;
		cb.cb_iteration = 0;
		cb.cb_namewidth = 0;

		for (;;) {
			pool_list_update(list);

			if ((npools = pool_list_count(list)) == 0)
				break;

			/*
			* Refresh all statistics. This is done as an explicit step
			* before calculating the maximum name width, so that any
			* configuration changes are properly accounted for.
			*/
			(void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb);

			/*
			* Iterate over all pools to determine the maximum width
			* for the pool / device name column across all pools.
			*/
			cb.cb_namewidth = 0;
			(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);

			if (timestamp_fmt != NODATE)
				print_timestamp(timestamp_fmt);

			/*
			* If it's the first time, or verbose mode, print the header.
			*/
			if (++cb.cb_iteration == 1 || verbose)
				print_iostat_header(&cb);

			(void) pool_list_iter(list, B_FALSE, print_iostat, &cb);

			/*
			* If there's more than one pool, and we're not in verbose mode
			* (which prints a separator for us), then print a separator.
			*/
			if (npools > 1 && !verbose)
				print_iostat_separator(&cb);

			if (verbose)
				(void) printf("\n");

			/*
			* Flush the output so that redirection to a file isn't buffered
			* indefinitely.
			*/
			(void) fflush(stdout);

			if (interval == 0)
				break;

			if (count != 0 && --count == 0)
				break;

			(void) sleep(interval);
		}

		pool_list_free(list);

		return (ret);
	}

	/*
	* State handed to the 'zpool list' callbacks.
	*
	* get_namewidth() receives this structure but reads it as an
	* iostat_cbdata_t, so cb_verbose and cb_namewidth must remain the first two
	* members, in this order.
	*/
	struct list_cbdata {
		boolean_t cb_verbose;		/* -v given: list vdevs as well */
		int cb_namewidth;		/* width of the name column */
		boolean_t cb_scripted;		/* -H given: tab separated output */
		zprop_list_t *cb_proplist;	/* columns to display */
		boolean_t cb_literal;		/* -p given: passed to zpool_get_prop */
	};
	typedef struct list_cbdata list_cbdata_t;

	/*
	* Given a list of columns to display, output appropriate headers for each one.
	*
	* Native properties use the column name and alignment reported by libzfs;
	* user supplied property names are printed in upper case. In verbose mode
	* the first column is as wide as the name column computed for the vdev
	* listing. The last column is not padded when it is left-justified.
	*/
	static void
	print_header(list_cbdata_t *cb)
	{
		zprop_list_t *pl = cb->cb_proplist;
		char headerbuf[ZPOOL_MAXPROPLEN];
		const char *header;
		boolean_t first = B_TRUE;
		boolean_t right_justify;
		size_t width = 0;

		for (; pl != NULL; pl = pl->pl_next) {
			width = pl->pl_width;
			if (first && cb->cb_verbose) {
				/*
				* Reset the width to accommodate the verbose listing
				* of devices.
				*/
				width = cb->cb_namewidth;
			}

			if (!first)
				(void) printf(" ");
			else
				first = B_FALSE;

			right_justify = B_FALSE;
			if (pl->pl_prop != ZPROP_INVAL) {
				header = zpool_prop_column_name(pl->pl_prop);
				right_justify = zpool_prop_align_right(pl->pl_prop);
			} else {
				size_t i;

				/* Upper-case copy, never writing past headerbuf. */
				for (i = 0; i < sizeof (headerbuf) - 1 &&
				    pl->pl_user_prop[i] != '\0'; i++) {
					headerbuf[i] = (char)toupper(
					    (unsigned char)pl->pl_user_prop[i]);
				}
				headerbuf[i] = '\0';
				header = headerbuf;
			}

			/* A '*' field width must be passed to printf() as an int. */
			if (pl->pl_next == NULL && !right_justify)
				(void) printf("%s", header);
			else if (right_justify)
				(void) printf("%*s", (int)width, header);
			else
				(void) printf("%-*s", (int)width, header);

		}

		(void) printf("\n");
	}

	/*
	* Given a pool and a list of properties, print out all the properties according
	* to the described layout.
	*
	* Columns are separated by a tab in scripted mode and by a space otherwise.
	* A property whose value cannot be obtained is shown as "-".
	*/
	static void
	print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
	{
		zprop_list_t *pl = cb->cb_proplist;
		boolean_t first = B_TRUE;
		char property[ZPOOL_MAXPROPLEN];
		char *propstr;
		boolean_t right_justify;
		size_t width;

		for (; pl != NULL; pl = pl->pl_next) {

			width = pl->pl_width;
			if (first && cb->cb_verbose) {
				/*
				* Reset the width to accommodate the verbose listing
				* of devices.
				*/
				width = cb->cb_namewidth;
			}

			if (!first) {
				if (cb->cb_scripted)
					(void) printf("\t");
				else
					(void) printf(" ");
			} else {
				first = B_FALSE;
			}

			right_justify = B_FALSE;
			if (pl->pl_prop != ZPROP_INVAL) {
				/* Native pool property. */
				if (zpool_get_prop(zhp, pl->pl_prop, property,
				    sizeof (property), NULL, cb->cb_literal) != 0)
					propstr = "-";
				else
					propstr = property;

				right_justify = zpool_prop_align_right(pl->pl_prop);
			} else if ((zpool_prop_feature(pl->pl_user_prop) ||
			    zpool_prop_unsupported(pl->pl_user_prop)) &&
			    zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
			    sizeof (property)) == 0) {
				/* Feature (or unsupported feature) property. */
				propstr = property;
			} else {
				propstr = "-";
			}


			/*
			* If this is being called in scripted mode, or if this is the
			* last column and it is left-justified, don't include a width
			* format specifier.
			*/
			if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
				(void) printf("%s", propstr);
			else if (right_justify)
				(void) printf("%*s", (int)width, propstr);
			else
				(void) printf("%-*s", (int)width, propstr);
		}

		(void) printf("\n");
	}

	/*
	* Print one numeric column of the verbose 'zpool list' vdev listing.
	*
	* prop selects the formatting and the column width
	* value the number to display
	* scripted print a tab and the bare value instead of a padded column
	* valid when B_FALSE, "-" is printed in place of the value
	*
	* An expandsize of 0 and a fragmentation of ZFS_FRAG_INVALID are also
	* shown as "-"; fragmentation and capacity are printed as percentages and
	* everything else goes through zfs_nicenum().
	*/
	static void
	print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
	    boolean_t valid)
	{
		char propval[64];
		boolean_t fixed;
		size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);

		switch (prop) {
		case ZPOOL_PROP_EXPANDSZ:
			if (value == 0)
				(void) strlcpy(propval, "-", sizeof (propval));
			else
				zfs_nicenum(value, propval, sizeof (propval));
			break;
		case ZPOOL_PROP_FRAGMENTATION:
			if (value == ZFS_FRAG_INVALID) {
				(void) strlcpy(propval, "-", sizeof (propval));
			} else {
				/* %llu needs an unsigned long long argument. */
				(void) snprintf(propval, sizeof (propval), "%llu%%",
				    (unsigned long long)value);
			}
			break;
		case ZPOOL_PROP_CAPACITY:
			(void) snprintf(propval, sizeof (propval), "%llu%%",
			    (unsigned long long)value);
			break;
		default:
			zfs_nicenum(value, propval, sizeof (propval));
			break;
		}

		if (!valid)
			(void) strlcpy(propval, "-", sizeof (propval));

		/* A '*' field width must be passed to printf() as an int. */
		if (scripted)
			(void) printf("\t%s", propval);
		else
			(void) printf(" %*s", (int)width, propval);
	}

	/*
	* Print the verbose 'zpool list' rows for the vdev described by 'nv' and
	* everything below it. When 'name' is not NULL a row is printed for the
	* vdev itself (unless the name equals VDEV_TYPE_INDIRECT, in which case the
	* function returns at once); list_callback() passes NULL for the root vdev,
	* so only its descendants get rows. 'depth' is the number of columns the
	* name is indented by and grows by 2 per level. Regular children are
	* printed first, followed by "log", "cache" and "spare" sections when such
	* devices exist.
	*
	* NOTE(review): several '*' characters appear to have been lost in this
	* rendering of the diff (pointer declarators in the signature and for
	* 'dashes', the "%-*s" in the dashes format, and the "%*s" conversions of
	* the padded printf() below). Confirm against the repository copy.
	*/
	void
	print_list_stats(zpool_handle_t zhp, const char name, nvlist_t *nv,
	list_cbdata_t *cb, int depth)
	{
	nvlist_t **child;
	vdev_stat_t *vs;
	uint_t c, children;
	char *vname;
	boolean_t scripted = cb->cb_scripted;
	uint64_t islog = B_FALSE;
	boolean_t haslog = B_FALSE;
	/* Row printed as the heading of the log, cache and spare sections. */
	char dashes = "%-s - - - - - -\n";

	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
	(uint64_t **)&vs, &c) == 0);

	if (name != NULL) {
	/* A vdev reporting no space is treated as not being toplevel. */
	boolean_t toplevel = (vs->vs_space != 0);
	uint64_t cap;

	+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
	+ return;
	+
	/* Names too long for the column are printed unpadded. */
	if (scripted)
	(void) printf("\t%s", name);
	else if (strlen(name) + depth > cb->cb_namewidth)
	(void) printf("%*s%s", depth, "", name);
	else
	(void) printf("%s%s%s", depth, "", name,
	(int)(cb->cb_namewidth - strlen(name) - depth), "");

	/*
	* Print the properties for the individual vdevs. Some
	* properties are only applicable to toplevel vdevs. The
	* 'toplevel' boolean value is passed to the print_one_column()
	* to indicate that the value is valid.
	*/
	print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted,
	toplevel);
	print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted,
	toplevel);
	print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
	scripted, toplevel);
	print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
	B_TRUE);
	print_one_column(ZPOOL_PROP_FRAGMENTATION,
	vs->vs_fragmentation, scripted,
	(vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel));
	/* Percentage of space allocated; 0 when there is no space at all. */
	cap = (vs->vs_space == 0) ? 0 :
	(vs->vs_alloc * 100 / vs->vs_space);
	print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel);
	(void) printf("\n");
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	&child, &children) != 0)
	return;

	/* Regular children; holes are skipped and logs deferred. */
	for (c = 0; c < children; c++) {
	uint64_t ishole = B_FALSE;

	if (nvlist_lookup_uint64(child[c],
	ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
	continue;

	if (nvlist_lookup_uint64(child[c],
	ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) {
	haslog = B_TRUE;
	continue;
	}

	vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
	print_list_stats(zhp, vname, child[c], cb, depth + 2);
	free(vname);
	}

	/* Second pass over the children for the log devices. */
	if (haslog == B_TRUE) {
	/* LINTED E_SEC_PRINTF_VAR_FMT */
	(void) printf(dashes, cb->cb_namewidth, "log");
	for (c = 0; c < children; c++) {
	if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
	&islog) != 0 \|\| !islog)
	continue;
	vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
	print_list_stats(zhp, vname, child[c], cb, depth + 2);
	free(vname);
	}
	}

	/* Level 2 ARC devices, if any. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	&child, &children) == 0 && children > 0) {
	/* LINTED E_SEC_PRINTF_VAR_FMT */
	(void) printf(dashes, cb->cb_namewidth, "cache");
	for (c = 0; c < children; c++) {
	vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
	print_list_stats(zhp, vname, child[c], cb, depth + 2);
	free(vname);
	}
	}

	/* Spare devices, if any. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
	&children) == 0 && children > 0) {
	/* LINTED E_SEC_PRINTF_VAR_FMT */
	(void) printf(dashes, cb->cb_namewidth, "spare");
	for (c = 0; c < children; c++) {
	vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
	print_list_stats(zhp, vname, child[c], cb, depth + 2);
	free(vname);
	}
	}
	}


	/*
	* Generic callback function to list a pool.
	*
	* Prints the property row of the pool and, in verbose mode, the rows of
	* the vdevs below its root vdev. 'data' is the list_cbdata_t of the running
	* command. Always returns 0.
	*/
	int
	list_callback(zpool_handle_t *zhp, void *data)
	{
		list_cbdata_t *cbp = data;
		nvlist_t *config;
		nvlist_t *nvroot;

		config = zpool_get_config(zhp, NULL);

		print_pool(zhp, cbp);
		if (!cbp->cb_verbose)
			return (0);

		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);
		/* NULL name: the root vdev itself gets no row of its own. */
		print_list_stats(zhp, NULL, nvroot, cbp, 0);

		return (0);
	}

	/*
	* zpool list [-Hpv] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
	*
	* -H Scripted mode. Don't display headers, and separate properties
	* by a single tab.
	* -o List of properties to display. Defaults to
	* "name,size,allocated,free,expandsize,fragmentation,capacity,"
	* "dedupratio,health,altroot"
	* -p Display values in parsable (exact) format.
	* -T Display a timestamp in date(1) or Unix format
	* -v Also list the vdevs of each pool.
	*
	* List all pools in the system, whether or not they're healthy. Output space
	* statistics for each one, as well as health status summary.
	*/
	int
	zpool_do_list(int argc, char **argv)
	{
		int c;
		/*
		* Starts at 0: not every path below assigns it before the final
		* return (for example -H with no pools at all).
		*/
		int ret = 0;
		list_cbdata_t cb = { 0 };
		static char default_props[] =
		    "name,size,allocated,free,expandsize,fragmentation,capacity,"
		    "dedupratio,health,altroot";
		char *props = default_props;
		unsigned long interval = 0, count = 0;
		zpool_list_t *list;
		boolean_t first = B_TRUE;

		/* check options */
		while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) {
			switch (c) {
			case 'H':
				cb.cb_scripted = B_TRUE;
				break;
			case 'o':
				props = optarg;
				break;
			case 'p':
				cb.cb_literal = B_TRUE;
				break;
			case 'T':
				get_timestamp_arg(*optarg);
				break;
			case 'v':
				cb.cb_verbose = B_TRUE;
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		get_interval_count(&argc, argv, &interval, &count);

		if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
			usage(B_FALSE);

		/* The pool list is rebuilt for every report. */
		for (;;) {
			if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
			    &ret)) == NULL) {
				zprop_free_list(cb.cb_proplist);
				return (1);
			}

			if (pool_list_count(list) == 0)
				break;

			cb.cb_namewidth = 0;
			(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);

			if (timestamp_fmt != NODATE)
				print_timestamp(timestamp_fmt);

			/* Headers: once, or before every report in verbose mode. */
			if (!cb.cb_scripted && (first || cb.cb_verbose)) {
				print_header(&cb);
				first = B_FALSE;
			}
			ret = pool_list_iter(list, B_TRUE, list_callback, &cb);

			if (interval == 0)
				break;

			if (count != 0 && --count == 0)
				break;

			pool_list_free(list);
			(void) sleep(interval);
		}

		if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
			(void) printf(gettext("no pools available\n"));
			ret = 0;
		}

		pool_list_free(list);
		zprop_free_list(cb.cb_proplist);
		return (ret);
	}

	static int
	zpool_do_attach_or_replace(int argc, char **argv, int replacing)
	{
	boolean_t force = B_FALSE;
	int c;
	nvlist_t *nvroot;
	char poolname, old_disk, *new_disk;
	zpool_handle_t *zhp;
	zpool_boot_label_t boot_type;
	uint64_t boot_size;
	int ret;

	/* check options */
	while ((c = getopt(argc, argv, "f")) != -1) {
	switch (c) {
	case 'f':
	force = B_TRUE;
	break;
	case '?':
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	/* get pool name and check number of arguments */
	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing pool name argument\n"));
	usage(B_FALSE);
	}

	poolname = argv[0];

	if (argc < 2) {
	(void) fprintf(stderr,
	gettext("missing <device> specification\n"));
	usage(B_FALSE);
	}

	old_disk = argv[1];

	if (argc < 3) {
	if (!replacing) {
	(void) fprintf(stderr,
	gettext("missing <new_device> specification\n"));
	usage(B_FALSE);
	}
	new_disk = old_disk;
	argc -= 1;
	argv += 1;
	} else {
	new_disk = argv[2];
	argc -= 2;
	argv += 2;
	}

	if (argc > 1) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
	return (1);

	if (zpool_get_config(zhp, NULL) == NULL) {
	(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
	poolname);
	zpool_close(zhp);
	return (1);
	}

	if (zpool_is_bootable(zhp))
	boot_type = ZPOOL_COPY_BOOT_LABEL;
	else
	boot_type = ZPOOL_NO_BOOT_LABEL;

	boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
	nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE,
	boot_type, boot_size, argc, argv);
	if (nvroot == NULL) {
	zpool_close(zhp);
	return (1);
	}

	ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);

	nvlist_free(nvroot);
	zpool_close(zhp);

	return (ret);
	}

	/*
	* zpool replace [-f] <pool> <device> <new_device>
	*
	* -f Force attach, even if <new_device> appears to be in use.
	*
	* Replace <device> with <new_device>. All of the work is done by
	* zpool_do_attach_or_replace() in its replacing mode.
	*/
	/* ARGSUSED */
	int
	zpool_do_replace(int argc, char **argv)
	{
		int status;

		status = zpool_do_attach_or_replace(argc, argv, B_TRUE);
		return (status);
	}

	/*
	* zpool attach [-f] <pool> <device> <new_device>
	*
	* -f Force attach, even if <new_device> appears to be in use.
	*
	* Attach <new_device> to the mirror containing <device>. If <device> is not
	* part of a mirror, then <device> will be transformed into a mirror of
	* <device> and <new_device>. In either case, <new_device> will begin life
	* with a DTL of [0, now], and will immediately begin to resilver itself.
	*
	* All of the work is done by zpool_do_attach_or_replace() in its
	* non-replacing mode.
	*/
	int
	zpool_do_attach(int argc, char **argv)
	{
		int status;

		status = zpool_do_attach_or_replace(argc, argv, B_FALSE);
		return (status);
	}

	/*
	* zpool detach [-f] <pool> <device>
	*
	* -f Force detach of <device>, even if DTLs argue against it
	* (not supported yet)
	*
	* Detach a device from a mirror. The operation will be refused if <device>
	* is the last device in the mirror, or if the DTLs indicate that this device
	* has the only valid copy of some data.
	*
	* Returns 1 when the pool cannot be opened, otherwise the result of
	* zpool_vdev_detach().
	*/
	/* ARGSUSED */
	int
	zpool_do_detach(int argc, char **argv)
	{
		int c;
		char *poolname, *path;
		zpool_handle_t *zhp;
		int ret;

		/* check options */
		while ((c = getopt(argc, argv, "f")) != -1) {
			switch (c) {
			case 'f':
				/* -f is parsed but rejected like an unknown option. */
				/* FALLTHROUGH */
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* get pool name and check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name argument\n"));
			usage(B_FALSE);
		}

		if (argc < 2) {
			(void) fprintf(stderr,
			    gettext("missing <device> specification\n"));
			usage(B_FALSE);
		}

		poolname = argv[0];
		path = argv[1];

		if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
			return (1);

		ret = zpool_vdev_detach(zhp, path);

		zpool_close(zhp);

		return (ret);
	}

	/*
	* zpool split [-n] [-o prop=val] ...
	* [-o mntopt] ...
	* [-R altroot] <pool> <newpool> [<device> ...]
	*
	* -n Do not split the pool, but display the resulting layout if
	* it were to be split.
	* -o Set property=value, or set mount options.
	* -R Mount the split-off pool under an alternate root.
	*
	* Splits the named pool and gives it the new pool name. Devices to be split
	* off may be listed, provided that no more than one device is specified
	* per top-level vdev mirror. The newly split pool is left in an exported
	* state unless -R is specified.
	*
	* Restrictions: the top-level of the pool must only be made up of
	* mirrors; all devices in the pool must be healthy; no device may be
	* undergoing a resilvering operation.
	*/
	int
	zpool_do_split(int argc, char **argv)
	{
		char *srcpool, *newpool, *propval;
		char *mntopts = NULL;
		splitflags_t flags;
		int c, ret = 0;
		zpool_handle_t *zhp;
		nvlist_t *config, *props = NULL;

		flags.dryrun = B_FALSE;
		flags.import = B_FALSE;

		/* check options */
		while ((c = getopt(argc, argv, ":R:no:")) != -1) {
			switch (c) {
			case 'R':
				/* -R both sets the altroot property and requests import. */
				flags.import = B_TRUE;
				if (add_prop_list(
				    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
				    &props, B_TRUE) != 0) {
					nvlist_free(props);
					usage(B_FALSE);
				}
				break;
			case 'n':
				flags.dryrun = B_TRUE;
				break;
			case 'o':
				/*
				 * "name=value" is a pool property; anything without
				 * an '=' is kept as the mount options string.
				 */
				if ((propval = strchr(optarg, '=')) != NULL) {
					*propval = '\0';
					propval++;
					if (add_prop_list(optarg, propval,
					    &props, B_TRUE) != 0) {
						nvlist_free(props);
						usage(B_FALSE);
					}
				} else {
					mntopts = optarg;
				}
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
				break;
			}
		}

		if (!flags.import && mntopts != NULL) {
			(void) fprintf(stderr, gettext("setting mntopts is only "
			    "valid when importing the pool\n"));
			usage(B_FALSE);
		}

		argc -= optind;
		argv += optind;

		if (argc < 1) {
			(void) fprintf(stderr, gettext("Missing pool name\n"));
			usage(B_FALSE);
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("Missing new pool name\n"));
			usage(B_FALSE);
		}

		srcpool = argv[0];
		newpool = argv[1];

		/* Whatever remains is the optional list of devices to split off. */
		argc -= 2;
		argv += 2;

		if ((zhp = zpool_open(g_zfs, srcpool)) == NULL)
			return (1);

		/*
		 * NOTE(review): props is not freed on any return path below;
		 * whether split_mirror_vdev() takes ownership is not visible
		 * here - confirm before adding a free.
		 */
		config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
		if (config == NULL) {
			ret = 1;
		} else {
			if (flags.dryrun) {
				(void) printf(gettext("would create '%s' with the "
				    "following layout:\n\n"), newpool);
				print_vdev_tree(NULL, newpool, config, 0, B_FALSE);
			}
			nvlist_free(config);
		}

		zpool_close(zhp);

		if (ret != 0 || flags.dryrun || !flags.import)
			return (ret);

		/*
		 * The split was successful. Now we need to open the new
		 * pool and import it.
		 */
		if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL)
			return (1);
		if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
		    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
			ret = 1;
			(void) fprintf(stderr, gettext("Split was successful, but "
			    "the datasets could not all be mounted\n"));
			(void) fprintf(stderr, gettext("Try doing '%s' with a "
			    "different altroot\n"), "zpool import");
		}
		zpool_close(zhp);

		return (ret);
	}



	/*
	* zpool online <pool> <device> ...
	*/
	int
	zpool_do_online(int argc, char **argv)
	{
		int c, i;
		char *poolname;
		zpool_handle_t *zhp;
		int ret = 0;
		vdev_state_t newstate;
		int flags = 0;

		/*
		 * check options: -e requests expansion; 't' is listed in the
		 * option string but shares the '?' arm and is reported as invalid.
		 */
		while ((c = getopt(argc, argv, "et")) != -1) {
			switch (c) {
			case 'e':
				flags |= ZFS_ONLINE_EXPAND;
				break;
			case 't':
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		/* get pool name and check number of arguments */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name\n"));
			usage(B_FALSE);
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing device name\n"));
			usage(B_FALSE);
		}

		poolname = argv[0];

		if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
			return (1);

		/*
		 * Online every named device; a failure on one device does not stop
		 * the loop but makes the command return 1.
		 */
		for (i = 1; i < argc; i++) {
			if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
				if (newstate != VDEV_STATE_HEALTHY) {
					(void) printf(gettext("warning: device '%s' "
					    "onlined, but remains in faulted state\n"),
					    argv[i]);
					if (newstate == VDEV_STATE_FAULTED)
						(void) printf(gettext("use 'zpool "
						    "clear' to restore a faulted "
						    "device\n"));
					else
						(void) printf(gettext("use 'zpool "
						    "replace' to replace devices "
						    "that are no longer present\n"));
				}
			} else {
				ret = 1;
			}
		}

		zpool_close(zhp);

		return (ret);
	}

	/*
	* zpool offline [-ft] <pool> <device> ...
	*
	* -f Force the device into the offline state, even if doing
	* so would appear to compromise pool availability.
	* (not supported yet)
	*
	* -t Only take the device off-line temporarily. The offline
	* state will not be persistent across reboots.
	*/
	/* ARGSUSED */
	int
	zpool_do_offline(int argc, char **argv)
	{
		zpool_handle_t *zhp;
		boolean_t temporary = B_FALSE;
		char **devp;
		int opt;
		int status = 0;

		/* Parse options; only -t is honoured, -f is rejected like '?'. */
		while ((opt = getopt(argc, argv, "ft")) != -1) {
			if (opt == 't') {
				temporary = B_TRUE;
			} else if (opt == 'f' || opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argv += optind;
		argc -= optind;

		/* A pool name followed by at least one device is required. */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name\n"));
			usage(B_FALSE);
		}
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing device name\n"));
			usage(B_FALSE);
		}

		zhp = zpool_open(g_zfs, argv[0]);
		if (zhp == NULL)
			return (1);

		/* Try every device; remember whether any of them failed. */
		for (devp = &argv[1]; devp < &argv[argc]; devp++) {
			if (zpool_vdev_offline(zhp, *devp, temporary) != 0)
				status = 1;
		}

		zpool_close(zhp);

		return (status);
	}

	/*
	* zpool clear <pool> [device]
	*
	* Clear all errors associated with a pool or a particular device.
	*/
	int
	zpool_do_clear(int argc, char **argv)
	{
	int c;
	int ret = 0;
	boolean_t dryrun = B_FALSE;
	boolean_t do_rewind = B_FALSE;
	boolean_t xtreme_rewind = B_FALSE;
	uint32_t rewind_policy = ZPOOL_NO_REWIND;
	nvlist_t *policy = NULL;
	zpool_handle_t *zhp;
	char pool, device;

	/* check options */
	while ((c = getopt(argc, argv, "FnX")) != -1) {
	switch (c) {
	case 'F':
	do_rewind = B_TRUE;
	break;
	case 'n':
	dryrun = B_TRUE;
	break;
	case 'X':
	xtreme_rewind = B_TRUE;
	break;
	case '?':
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc -= optind;
	argv += optind;

	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing pool name\n"));
	usage(B_FALSE);
	}

	if (argc > 2) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	if ((dryrun \|\| xtreme_rewind) && !do_rewind) {
	(void) fprintf(stderr,
	gettext("-n or -X only meaningful with -F\n"));
	usage(B_FALSE);
	}
	if (dryrun)
	rewind_policy = ZPOOL_TRY_REWIND;
	else if (do_rewind)
	rewind_policy = ZPOOL_DO_REWIND;
	if (xtreme_rewind)
	rewind_policy \|= ZPOOL_EXTREME_REWIND;

	/* In future, further rewind policy choices can be passed along here */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 \|\|
	nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
	return (1);

	pool = argv[0];
	device = argc == 2 ? argv[1] : NULL;

	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
	nvlist_free(policy);
	return (1);
	}

	if (zpool_clear(zhp, device, policy) != 0)
	ret = 1;

	zpool_close(zhp);

	nvlist_free(policy);

	return (ret);
	}

	/*
	* zpool reguid <pool>
	*/
	int
	zpool_do_reguid(int argc, char **argv)
	{
		zpool_handle_t *zhp;
		int opt;
		int status = 0;

		/* No options are defined, so anything getopt() flags is an error. */
		while ((opt = getopt(argc, argv, "")) != -1) {
			if (opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argv += optind;
		argc -= optind;

		/* Exactly one operand, the pool name, is expected. */
		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name\n"));
			usage(B_FALSE);
		}

		if (argc > 1) {
			(void) fprintf(stderr, gettext("too many arguments\n"));
			usage(B_FALSE);
		}

		zhp = zpool_open(g_zfs, argv[0]);
		if (zhp == NULL)
			return (1);

		status = zpool_reguid(zhp);

		zpool_close(zhp);
		return (status);
	}


	/*
	* zpool reopen <pool>
	*
	* Reopen the pool so that the kernel can update the sizes of all vdevs.
	*/
	int
	zpool_do_reopen(int argc, char **argv)
	{
	int c;
	int ret = 0;
	zpool_handle_t *zhp;
	char *pool;

	/* check options */
	while ((c = getopt(argc, argv, "")) != -1) {
	switch (c) {
	case '?':
	(void) fprintf(stderr, gettext("invalid option '%c'\n"),
	optopt);
	usage(B_FALSE);
	}
	}

	argc--;
	argv++;

	if (argc < 1) {
	(void) fprintf(stderr, gettext("missing pool name\n"));
	usage(B_FALSE);
	}

	if (argc > 1) {
	(void) fprintf(stderr, gettext("too many arguments\n"));
	usage(B_FALSE);
	}

	pool = argv[0];
	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
	return (1);

	ret = zpool_reopen(zhp);
	zpool_close(zhp);
	return (ret);
	}

	/*
	 * State shared between zpool_do_scrub() and scrub_callback(); a pointer
	 * to it is handed to for_each_pool() as the callback argument.
	 */
	typedef struct scrub_cbdata {
	int cb_type;	/* scan function given to zpool_scan(); POOL_SCAN_NONE for -s */
	int cb_argc;	/* argc as received by zpool_do_scrub(), before options are skipped */
	char **cb_argv;	/* argv as received by zpool_do_scrub(), before options are skipped */
	pool_scrub_cmd_t cb_scrub_cmd;	/* POOL_SCRUB_NORMAL, or POOL_SCRUB_PAUSE for -p */
	} scrub_cbdata_t;

	/*
	 * Per-pool callback for 'zpool scrub'. 'data' is the scrub_cbdata_t set
	 * up by the caller. Returns 1 if the pool is unavailable or zpool_scan()
	 * fails, and 0 otherwise.
	 */
	int
	scrub_callback(zpool_handle_t *zhp, void *data)
	{
		scrub_cbdata_t *cb = data;
		int err;

		/*
		 * Ignore faulted pools.
		 */
		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
			    "currently unavailable\n"), zpool_get_name(zhp));
			return (1);
		}

		err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);

		/* Collapse any non-zero error into 1. */
		return (err != 0);
	}

	/*
	* zpool scrub [-s | -p] <pool> ...
	*
	* -s Stop. Stops any in-progress scrub.
	* -p Pause. Pause in-progress scrub.
	*/
	int
	zpool_do_scrub(int argc, char **argv)
	{
		scrub_cbdata_t scrub;
		int opt;

		/* Default request: start (or resume) a normal scrub. */
		scrub.cb_scrub_cmd = POOL_SCRUB_NORMAL;
		scrub.cb_type = POOL_SCAN_SCRUB;

		/* -s switches the scan type off, -p turns the command into a pause. */
		while ((opt = getopt(argc, argv, "sp")) != -1) {
			if (opt == 's') {
				scrub.cb_type = POOL_SCAN_NONE;
			} else if (opt == 'p') {
				scrub.cb_scrub_cmd = POOL_SCRUB_PAUSE;
			} else if (opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		/* Stopping and pausing cannot be requested together. */
		if (scrub.cb_scrub_cmd == POOL_SCRUB_PAUSE &&
		    scrub.cb_type == POOL_SCAN_NONE) {
			(void) fprintf(stderr, gettext("invalid option combination: "
			    "-s and -p are mutually exclusive\n"));
			usage(B_FALSE);
		}

		/* The callback data records the argument vector as received. */
		scrub.cb_argv = argv;
		scrub.cb_argc = argc;

		argv += optind;
		argc -= optind;

		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing pool name argument\n"));
			usage(B_FALSE);
		}

		return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback,
		    &scrub));
	}

	/*
	 * State shared between zpool_do_status() and status_callback().
	 */
	typedef struct status_cbdata {
	int cb_count;	/* incremented once for every pool status_callback() visits */
	boolean_t cb_allpools;	/* set when no pool names were given on the command line */
	boolean_t cb_verbose;	/* -v: print the complete error log */
	boolean_t cb_explain;	/* -x: only report pools with problems */
	boolean_t cb_first;	/* true until the callback clears it for the first pool it reports */
	boolean_t cb_dedup_stats;	/* -D: also print dedup statistics */
	} status_cbdata_t;

	/*
	* Print out detailed scrub status.
	*/
	-void
	+static void
	print_scan_status(pool_scan_stat_t *ps)
	{
	time_t start, end, pause;
	uint64_t elapsed, mins_left, hours_left;
	uint64_t pass_exam, examined, total;
	uint_t rate;
	double fraction_done;
	char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];

	(void) printf(gettext(" scan: "));

	/* If there's never been a scan, there's not much to say. */
	if (ps == NULL \|\| ps->pss_func == POOL_SCAN_NONE \|\|
	ps->pss_func >= POOL_SCAN_FUNCS) {
	(void) printf(gettext("none requested\n"));
	return;
	}

	start = ps->pss_start_time;
	end = ps->pss_end_time;
	pause = ps->pss_pass_scrub_pause;
	zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));

	assert(ps->pss_func == POOL_SCAN_SCRUB \|\|
	ps->pss_func == POOL_SCAN_RESILVER);
	/*
	* Scan is finished or canceled.
	*/
	if (ps->pss_state == DSS_FINISHED) {
	uint64_t minutes_taken = (end - start) / 60;
	char *fmt = NULL;

	if (ps->pss_func == POOL_SCAN_SCRUB) {
	fmt = gettext("scrub repaired %s in %lluh%um with "
	"%llu errors on %s");
	} else if (ps->pss_func == POOL_SCAN_RESILVER) {
	fmt = gettext("resilvered %s in %lluh%um with "
	"%llu errors on %s");
	}
	/* LINTED */
	(void) printf(fmt, processed_buf,
	(u_longlong_t)(minutes_taken / 60),
	(uint_t)(minutes_taken % 60),
	(u_longlong_t)ps->pss_errors,
	ctime((time_t *)&end));
	return;
	} else if (ps->pss_state == DSS_CANCELED) {
	if (ps->pss_func == POOL_SCAN_SCRUB) {
	(void) printf(gettext("scrub canceled on %s"),
	ctime(&end));
	} else if (ps->pss_func == POOL_SCAN_RESILVER) {
	(void) printf(gettext("resilver canceled on %s"),
	ctime(&end));
	}
	return;
	}

	assert(ps->pss_state == DSS_SCANNING);

	/*
	* Scan is in progress.
	*/
	if (ps->pss_func == POOL_SCAN_SCRUB) {
	if (pause == 0) {
	(void) printf(gettext("scrub in progress since %s"),
	ctime(&start));
	} else {
	char buf[32];
	struct tm *p = localtime(&pause);
	(void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
	(void) printf(gettext("scrub paused since %s\n"), buf);
	(void) printf(gettext("\tscrub started on %s"),
	ctime(&start));
	}
	} else if (ps->pss_func == POOL_SCAN_RESILVER) {
	(void) printf(gettext("resilver in progress since %s"),
	ctime(&start));
	}

	examined = ps->pss_examined ? ps->pss_examined : 1;
	total = ps->pss_to_examine;
	fraction_done = (double)examined / total;

	/* elapsed time for this pass */
	elapsed = time(NULL) - ps->pss_pass_start;
	elapsed -= ps->pss_pass_scrub_spent_paused;
	elapsed = elapsed ? elapsed : 1;
	pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
	rate = pass_exam / elapsed;
	rate = rate ? rate : 1;
	mins_left = ((total - examined) / rate) / 60;
	hours_left = mins_left / 60;

	zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
	zfs_nicenum(total, total_buf, sizeof (total_buf));

	/*
	* do not print estimated time if hours_left is more than 30 days
	* or we have a paused scrub
	*/
	if (pause == 0) {
	zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
	(void) printf(gettext("\t%s scanned out of %s at %s/s"),
	examined_buf, total_buf, rate_buf);
	if (hours_left < (30 * 24)) {
	(void) printf(gettext(", %lluh%um to go\n"),
	(u_longlong_t)hours_left, (uint_t)(mins_left % 60));
	} else {
	(void) printf(gettext(
	", (scan is slow, no estimated time)\n"));
	}
	} else {
	(void) printf(gettext("\t%s scanned out of %s\n"),
	examined_buf, total_buf);
	}

	if (ps->pss_func == POOL_SCAN_RESILVER) {
	(void) printf(gettext(" %s resilvered, %.2f%% done\n"),
	processed_buf, 100 * fraction_done);
	} else if (ps->pss_func == POOL_SCAN_SCRUB) {
	(void) printf(gettext(" %s repaired, %.2f%% done\n"),
	processed_buf, 100 * fraction_done);
	}
	}

	+/*
	+ * Print out detailed removal status.
	+ */
	static void
	+print_removal_status(zpool_handle_t zhp, pool_removal_stat_t prs)
	+{
	+ char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
	+ time_t start, end;
	+ nvlist_t config, nvroot;
	+ nvlist_t **child;
	+ uint_t children;
	+ char *vdev_name;
	+
	+ if (prs == NULL \|\| prs->prs_state == DSS_NONE)
	+ return;
	+
	+ /*
	+ * Determine name of vdev.
	+ */
	+ config = zpool_get_config(zhp, NULL);
	+ nvroot = fnvlist_lookup_nvlist(config,
	+ ZPOOL_CONFIG_VDEV_TREE);
	+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	+ &child, &children) == 0);
	+ assert(prs->prs_removing_vdev < children);
	+ vdev_name = zpool_vdev_name(g_zfs, zhp,
	+ child[prs->prs_removing_vdev], B_TRUE);
	+
	+ (void) printf(gettext("remove: "));
	+
	+ start = prs->prs_start_time;
	+ end = prs->prs_end_time;
	+ zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf));
	+
	+ /*
	+ * Removal is finished or canceled.
	+ */
	+ if (prs->prs_state == DSS_FINISHED) {
	+ uint64_t minutes_taken = (end - start) / 60;
	+
	+ (void) printf(gettext("Removal of vdev %llu copied %s "
	+ "in %lluh%um, completed on %s"),
	+ (longlong_t)prs->prs_removing_vdev,
	+ copied_buf,
	+ (u_longlong_t)(minutes_taken / 60),
	+ (uint_t)(minutes_taken % 60),
	+ ctime((time_t *)&end));
	+ } else if (prs->prs_state == DSS_CANCELED) {
	+ (void) printf(gettext("Removal of %s canceled on %s"),
	+ vdev_name, ctime(&end));
	+ } else {
	+ uint64_t copied, total, elapsed, mins_left, hours_left;
	+ double fraction_done;
	+ uint_t rate;
	+
	+ assert(prs->prs_state == DSS_SCANNING);
	+
	+ /*
	+ * Removal is in progress.
	+ */
	+ (void) printf(gettext(
	+ "Evacuation of %s in progress since %s"),
	+ vdev_name, ctime(&start));
	+
	+ copied = prs->prs_copied > 0 ? prs->prs_copied : 1;
	+ total = prs->prs_to_copy;
	+ fraction_done = (double)copied / total;
	+
	+ /* elapsed time for this pass */
	+ elapsed = time(NULL) - prs->prs_start_time;
	+ elapsed = elapsed > 0 ? elapsed : 1;
	+ rate = copied / elapsed;
	+ rate = rate > 0 ? rate : 1;
	+ mins_left = ((total - copied) / rate) / 60;
	+ hours_left = mins_left / 60;
	+
	+ zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
	+ zfs_nicenum(total, total_buf, sizeof (total_buf));
	+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
	+
	+ /*
	+ * do not print estimated time if hours_left is more than
	+ * 30 days
	+ */
	+ (void) printf(gettext(" %s copied out of %s at %s/s, "
	+ "%.2f%% done"),
	+ examined_buf, total_buf, rate_buf, 100 * fraction_done);
	+ if (hours_left < (30 * 24)) {
	+ (void) printf(gettext(", %lluh%um to go\n"),
	+ (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
	+ } else {
	+ (void) printf(gettext(
	+ ", (copy is slow, no estimated time)\n"));
	+ }
	+ }
	+
	+ if (prs->prs_mapping_memory > 0) {
	+ char mem_buf[7];
	+ zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf));
	+ (void) printf(gettext(" %s memory used for "
	+ "removed device mappings\n"),
	+ mem_buf);
	+ }
	+}
	+
	+static void
	print_error_log(zpool_handle_t *zhp)
	{
	nvlist_t *nverrlist = NULL;
	nvpair_t *elem;
	char *pathname;
	size_t len = MAXPATHLEN * 2;

	if (zpool_get_errlog(zhp, &nverrlist) != 0) {
	(void) printf("errors: List of errors unavailable "
	"(insufficient privileges)\n");
	return;
	}

	(void) printf("errors: Permanent errors have been "
	"detected in the following files:\n\n");

	pathname = safe_malloc(len);
	elem = NULL;
	while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) {
	nvlist_t *nv;
	uint64_t dsobj, obj;

	verify(nvpair_value_nvlist(elem, &nv) == 0);
	verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET,
	&dsobj) == 0);
	verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT,
	&obj) == 0);
	zpool_obj_to_path(zhp, dsobj, obj, pathname, len);
	(void) printf("%7s %s\n", "", pathname);
	}
	free(pathname);
	nvlist_free(nverrlist);
	}

	/*
	 * Print the "spares" section of the status output: a heading followed by
	 * one print_status_config() entry per spare. Prints nothing when nspares
	 * is 0.
	 */
	static void
	print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
	    int namewidth)
	{
		uint_t i;
		char *name;

		if (nspares == 0)
			return;

		(void) printf(gettext("\tspares\n"));

		for (i = 0; i < nspares; i++) {
			/* The name is allocated per device and freed after use. */
			name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE);
			print_status_config(zhp, name, spares[i],
			    namewidth, 2, B_TRUE);
			free(name);
		}
	}

	/*
	 * Print the "cache" section of the status output: a heading followed by
	 * one print_status_config() entry per cache device. Prints nothing when
	 * nl2cache is 0.
	 */
	static void
	print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
	    int namewidth)
	{
		uint_t i;
		char *name;

		if (nl2cache == 0)
			return;

		(void) printf(gettext("\tcache\n"));

		for (i = 0; i < nl2cache; i++) {
			/* The name is allocated per device and freed after use. */
			name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE);
			print_status_config(zhp, name, l2cache[i],
			    namewidth, 2, B_FALSE);
			free(name);
		}
	}

	static void
	print_dedup_stats(nvlist_t *config)
	{
		ddt_object_t *obj_stats;
		ddt_stat_t *totals;
		ddt_histogram_t *histogram;
		uint_t nelem;
		int err;

		/*
		 * A faulted pool may not have yielded a usable config; with no
		 * DDT object statistics in it there is nothing to report.
		 */
		err = nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
		    (uint64_t **)&obj_stats, &nelem);
		if (err != 0)
			return;

		(void) printf("\n");
		(void) printf(gettext(" dedup: "));

		/* An empty dedup table gets a one-line summary only. */
		if (obj_stats->ddo_count == 0) {
			(void) printf(gettext("no DDT entries\n"));
			return;
		}

		(void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
		    (u_longlong_t)obj_stats->ddo_count,
		    (u_longlong_t)obj_stats->ddo_dspace,
		    (u_longlong_t)obj_stats->ddo_mspace);

		/* Totals first, then the histogram, then dump them together. */
		verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
		    (uint64_t **)&totals, &nelem) == 0);
		verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
		    (uint64_t **)&histogram, &nelem) == 0);
		zpool_dump_ddt(totals, histogram);
	}

	/*
	* Display a summary of pool status. Displays a summary such as:
	*
	* pool: tank
	* status: DEGRADED
	* reason: One or more devices ...
	* see: http://illumos.org/msg/ZFS-xxxx-01
	* config:
	* mirror DEGRADED
	* c1t0d0 OK
	* c2t0d0 UNAVAIL
	*
	* When given the '-v' option, we print out the complete config. If the '-e'
	* option is specified, then we print out error rate information as well.
	*/
	int
	status_callback(zpool_handle_t *zhp, void *data)
	{
		status_cbdata_t *cbp = data;
		nvlist_t *config, *nvroot;
		char *msgid;
		int reason;
		const char *health;
		uint_t c;
		vdev_stat_t *vs;

		config = zpool_get_config(zhp, NULL);
		reason = zpool_get_status(zhp, &msgid);

		/* Count every pool visited, including ones -x will not report. */
		cbp->cb_count++;

		/*
		 * If we were given 'zpool status -x', only report those pools with
		 * problems.
		 */
		if (cbp->cb_explain &&
		    (reason == ZPOOL_STATUS_OK ||
		    reason == ZPOOL_STATUS_VERSION_OLDER ||
		    reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT ||
		    reason == ZPOOL_STATUS_FEAT_DISABLED)) {
			if (!cbp->cb_allpools) {
				(void) printf(gettext("pool '%s' is healthy\n"),
				    zpool_get_name(zhp));
				if (cbp->cb_first)
					cbp->cb_first = B_FALSE;
			}
			return (0);
		}

		/* Separate consecutive pool reports with a blank line. */
		if (cbp->cb_first)
			cbp->cb_first = B_FALSE;
		else
			(void) printf("\n");

		nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
		verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &c) == 0);
		health = zpool_state_to_name(vs->vs_state, vs->vs_aux);

		(void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp));
		(void) printf(gettext(" state: %s\n"), health);

		switch (reason) {
		case ZPOOL_STATUS_MISSING_DEV_R:
			(void) printf(gettext("status: One or more devices could not "
			    "be opened. Sufficient replicas exist for\n\tthe pool to "
			    "continue functioning in a degraded state.\n"));
			(void) printf(gettext("action: Attach the missing device and "
			    "online it using 'zpool online'.\n"));
			break;

		case ZPOOL_STATUS_MISSING_DEV_NR:
			(void) printf(gettext("status: One or more devices could not "
			    "be opened. There are insufficient\n\treplicas for the "
			    "pool to continue functioning.\n"));
			(void) printf(gettext("action: Attach the missing device and "
			    "online it using 'zpool online'.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_LABEL_R:
			(void) printf(gettext("status: One or more devices could not "
			    "be used because the label is missing or\n\tinvalid. "
			    "Sufficient replicas exist for the pool to continue\n\t"
			    "functioning in a degraded state.\n"));
			(void) printf(gettext("action: Replace the device using "
			    "'zpool replace'.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_LABEL_NR:
			(void) printf(gettext("status: One or more devices could not "
			    "be used because the label is missing \n\tor invalid. "
			    "There are insufficient replicas for the pool to "
			    "continue\n\tfunctioning.\n"));
			zpool_explain_recover(zpool_get_handle(zhp),
			    zpool_get_name(zhp), reason, config);
			break;

		case ZPOOL_STATUS_FAILING_DEV:
			(void) printf(gettext("status: One or more devices has "
			    "experienced an unrecoverable error. An\n\tattempt was "
			    "made to correct the error. Applications are "
			    "unaffected.\n"));
			(void) printf(gettext("action: Determine if the device needs "
			    "to be replaced, and clear the errors\n\tusing "
			    "'zpool clear' or replace the device with 'zpool "
			    "replace'.\n"));
			break;

		case ZPOOL_STATUS_OFFLINE_DEV:
			(void) printf(gettext("status: One or more devices has "
			    "been taken offline by the administrator.\n\tSufficient "
			    "replicas exist for the pool to continue functioning in "
			    "a\n\tdegraded state.\n"));
			(void) printf(gettext("action: Online the device using "
			    "'zpool online' or replace the device with\n\t'zpool "
			    "replace'.\n"));
			break;

		case ZPOOL_STATUS_REMOVED_DEV:
			(void) printf(gettext("status: One or more devices has "
			    "been removed by the administrator.\n\tSufficient "
			    "replicas exist for the pool to continue functioning in "
			    "a\n\tdegraded state.\n"));
			(void) printf(gettext("action: Online the device using "
			    "'zpool online' or replace the device with\n\t'zpool "
			    "replace'.\n"));
			break;

		case ZPOOL_STATUS_RESILVERING:
			(void) printf(gettext("status: One or more devices is "
			    "currently being resilvered. The pool will\n\tcontinue "
			    "to function, possibly in a degraded state.\n"));
			(void) printf(gettext("action: Wait for the resilver to "
			    "complete.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_DATA:
			(void) printf(gettext("status: One or more devices has "
			    "experienced an error resulting in data\n\tcorruption. "
			    "Applications may be affected.\n"));
			(void) printf(gettext("action: Restore the file in question "
			    "if possible. Otherwise restore the\n\tentire pool from "
			    "backup.\n"));
			break;

		case ZPOOL_STATUS_CORRUPT_POOL:
			(void) printf(gettext("status: The pool metadata is corrupted "
			    "and the pool cannot be opened.\n"));
			zpool_explain_recover(zpool_get_handle(zhp),
			    zpool_get_name(zhp), reason, config);
			break;

		case ZPOOL_STATUS_VERSION_OLDER:
			(void) printf(gettext("status: The pool is formatted using a "
			    "legacy on-disk format. The pool can\n\tstill be used, "
			    "but some features are unavailable.\n"));
			(void) printf(gettext("action: Upgrade the pool using 'zpool "
			    "upgrade'. Once this is done, the\n\tpool will no longer "
			    "be accessible on software that does not support feature\n"
			    "\tflags.\n"));
			break;

		case ZPOOL_STATUS_VERSION_NEWER:
			(void) printf(gettext("status: The pool has been upgraded to a "
			    "newer, incompatible on-disk version.\n\tThe pool cannot "
			    "be accessed on this system.\n"));
			(void) printf(gettext("action: Access the pool from a system "
			    "running more recent software, or\n\trestore the pool from "
			    "backup.\n"));
			break;

		case ZPOOL_STATUS_FEAT_DISABLED:
			(void) printf(gettext("status: Some supported features are not "
			    "enabled on the pool. The pool can\n\tstill be used, but "
			    "some features are unavailable.\n"));
			(void) printf(gettext("action: Enable all features using "
			    "'zpool upgrade'. Once this is done,\n\tthe pool may no "
			    "longer be accessible by software that does not support\n\t"
			    "the features. See zpool-features(7) for details.\n"));
			break;

		case ZPOOL_STATUS_UNSUP_FEAT_READ:
			(void) printf(gettext("status: The pool cannot be accessed on "
			    "this system because it uses the\n\tfollowing feature(s) "
			    "not supported on this system:\n"));
			zpool_print_unsup_feat(config);
			(void) printf("\n");
			(void) printf(gettext("action: Access the pool from a system "
			    "that supports the required feature(s),\n\tor restore the "
			    "pool from backup.\n"));
			break;

		case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
			(void) printf(gettext("status: The pool can only be accessed "
			    "in read-only mode on this system. It\n\tcannot be "
			    "accessed in read-write mode because it uses the "
			    "following\n\tfeature(s) not supported on this system:\n"));
			zpool_print_unsup_feat(config);
			(void) printf("\n");
			(void) printf(gettext("action: The pool cannot be accessed in "
			    "read-write mode. Import the pool with\n"
			    "\t\"-o readonly=on\", access the pool from a system that "
			    "supports the\n\trequired feature(s), or restore the "
			    "pool from backup.\n"));
			break;

		case ZPOOL_STATUS_FAULTED_DEV_R:
			(void) printf(gettext("status: One or more devices are "
			    "faulted in response to persistent errors.\n\tSufficient "
			    "replicas exist for the pool to continue functioning "
			    "in a\n\tdegraded state.\n"));
			(void) printf(gettext("action: Replace the faulted device, "
			    "or use 'zpool clear' to mark the device\n\trepaired.\n"));
			break;

		case ZPOOL_STATUS_FAULTED_DEV_NR:
			(void) printf(gettext("status: One or more devices are "
			    "faulted in response to persistent errors. There are "
			    "insufficient replicas for the pool to\n\tcontinue "
			    "functioning.\n"));
			(void) printf(gettext("action: Destroy and re-create the pool "
			    "from a backup source. Manually marking the device\n"
			    "\trepaired using 'zpool clear' may allow some data "
			    "to be recovered.\n"));
			break;

		case ZPOOL_STATUS_IO_FAILURE_WAIT:
		case ZPOOL_STATUS_IO_FAILURE_CONTINUE:
			(void) printf(gettext("status: One or more devices are "
			    "faulted in response to IO failures.\n"));
			(void) printf(gettext("action: Make sure the affected devices "
			    "are connected, then run 'zpool clear'.\n"));
			break;

		case ZPOOL_STATUS_BAD_LOG:
			(void) printf(gettext("status: An intent log record "
			    "could not be read.\n"
			    "\tWaiting for administrator intervention to fix the "
			    "faulted pool.\n"));
			(void) printf(gettext("action: Either restore the affected "
			    "device(s) and run 'zpool online',\n"
			    "\tor ignore the intent log records by running "
			    "'zpool clear'.\n"));
			break;

		case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
			(void) printf(gettext("status: One or more devices are "
			    "configured to use a non-native block size.\n"
			    "\tExpect reduced performance.\n"));
			(void) printf(gettext("action: Replace affected devices with "
			    "devices that support the\n\tconfigured block size, or "
			    "migrate data to a properly configured\n\tpool.\n"));
			break;

		default:
			/*
			 * The remaining errors can't actually be generated, yet.
			 */
			assert(reason == ZPOOL_STATUS_OK);
		}

		if (msgid != NULL)
			(void) printf(gettext(" see: http://illumos.org/msg/%s\n"),
			    msgid);

		if (config != NULL) {
			int namewidth;
			uint64_t nerr;
			nvlist_t **spares, **l2cache;
			uint_t nspares, nl2cache;
			pool_scan_stat_t *ps = NULL;
			pool_removal_stat_t *prs = NULL;

			/*
			 * Both lookups are best-effort: when a stats array is
			 * absent the pointer stays NULL and the printer copes.
			 */
			(void) nvlist_lookup_uint64_array(nvroot,
			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
			print_scan_status(ps);

			(void) nvlist_lookup_uint64_array(nvroot,
			    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
			print_removal_status(zhp, prs);

			/* The NAME column is never narrower than 10 characters. */
			namewidth = max_width(zhp, nvroot, 0, 0);
			if (namewidth < 10)
				namewidth = 10;

			(void) printf(gettext("config:\n\n"));
			(void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth,
			    "NAME", "STATE", "READ", "WRITE", "CKSUM");
			print_status_config(zhp, zpool_get_name(zhp), nvroot,
			    namewidth, 0, B_FALSE);

			if (num_logs(nvroot) > 0)
				print_logs(zhp, nvroot, namewidth, B_TRUE);
			if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
			    &l2cache, &nl2cache) == 0)
				print_l2cache(zhp, l2cache, nl2cache, namewidth);

			if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
			    &spares, &nspares) == 0)
				print_spares(zhp, spares, nspares, namewidth);

			if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
			    &nerr) == 0) {
				nvlist_t *nverrlist = NULL;

				/*
				 * If the approximate error count is small, get a
				 * precise count by fetching the entire log and
				 * uniquifying the results.
				 */
				if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
				    zpool_get_errlog(zhp, &nverrlist) == 0) {
					nvpair_t *elem;

					elem = NULL;
					nerr = 0;
					while ((elem = nvlist_next_nvpair(nverrlist,
					    elem)) != NULL) {
						nerr++;
					}
				}
				nvlist_free(nverrlist);

				(void) printf("\n");

				if (nerr == 0)
					(void) printf(gettext("errors: No known data "
					    "errors\n"));
				else if (!cbp->cb_verbose)
					(void) printf(gettext("errors: %llu data "
					    "errors, use '-v' for a list\n"),
					    (u_longlong_t)nerr);
				else
					print_error_log(zhp);
			}

			if (cbp->cb_dedup_stats)
				print_dedup_stats(config);
		} else {
			(void) printf(gettext("config: The configuration cannot be "
			    "determined.\n"));
		}

		/* Always 0: a pool's condition is not treated as a callback error. */
		return (0);
	}

	/*
	* zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
	*
	* -v Display complete error logs
	* -x Display only pools with potential problems
	* -D Display dedup status (undocumented)
	* -T Display a timestamp in date(1) or Unix format
	*
	* Describes the health status of all pools or some subset.
	*/
	int
	zpool_do_status(int argc, char **argv)
	{
		status_cbdata_t cb = { 0 };
		unsigned long interval = 0, count = 0;
		int opt;
		int err;

		/* Translate command-line flags into callback settings. */
		while ((opt = getopt(argc, argv, "vxDT:")) != -1) {
			if (opt == 'v') {
				cb.cb_verbose = B_TRUE;
			} else if (opt == 'x') {
				cb.cb_explain = B_TRUE;
			} else if (opt == 'D') {
				cb.cb_dedup_stats = B_TRUE;
			} else if (opt == 'T') {
				get_timestamp_arg(*optarg);
			} else if (opt == '?') {
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argv += optind;
		argc -= optind;

		/* Peel a trailing "[interval [count]]" off the operand list. */
		get_interval_count(&argc, argv, &interval, &count);

		/* With no pool operands left, every pool is reported. */
		cb.cb_allpools = (argc == 0) ? B_TRUE : B_FALSE;
		cb.cb_first = B_TRUE;

		while (B_TRUE) {
			if (timestamp_fmt != NODATE)
				print_timestamp(timestamp_fmt);

			err = for_each_pool(argc, argv, B_TRUE, NULL,
			    status_callback, &cb);

			if (argc == 0 && cb.cb_count == 0) {
				(void) printf(gettext("no pools available\n"));
			} else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) {
				(void) printf(gettext("all pools are healthy\n"));
			}

			if (err != 0)
				return (err);

			/*
			 * Stop after a single pass when no interval was given, or
			 * once a non-zero count has been used up.
			 */
			if (interval == 0 || (count != 0 && --count == 0))
				break;

			(void) sleep(interval);
		}

		return (0);
	}

	/*
	 * State shared between zpool_do_upgrade() and its per-pool callbacks.
	 */
	typedef struct upgrade_cbdata {
	/* B_TRUE until a callback prints its table heading or upgrades a pool */
	boolean_t cb_first;
	/* set once any visited pool is found in POOL_STATE_UNAVAIL */
	boolean_t cb_unavail;
	/* FreeBSD: name of the upgraded root pool; empty string when none */
	char cb_poolname[ZFS_MAX_DATASET_NAME_LEN];
	/* argc as received by zpool_do_upgrade(), saved before optind is applied */
	int cb_argc;
	/* target pool version; zpool_do_upgrade() defaults it to SPA_VERSION */
	uint64_t cb_version;
	/* argv as received by zpool_do_upgrade(), saved before optind is applied */
	char **cb_argv;
	} upgrade_cbdata_t;

	#ifdef __FreeBSD__
	/*
	 * Return non-zero when 'zhp' names the pool that backs the root file
	 * system.  The root file system is examined with statfs() on the first
	 * call only; the outcome is remembered in function-local statics, so a
	 * failed or non-ZFS probe makes every later call return 0 as well.
	 */
	static int
	is_root_pool(zpool_handle_t *zhp)
	{
		static struct statfs root_fs;
		static char *root_pool = NULL;
		static boolean_t probed = B_FALSE;

		if (!probed) {
			char *sep;

			/* Mark as probed up front so failures are not retried. */
			probed = B_TRUE;

			if (statfs("/", &root_fs) == -1) {
				(void) fprintf(stderr,
				    "Unable to stat root file system: %s.\n",
				    strerror(errno));
				return (0);
			}

			if (strcmp(root_fs.f_fstypename, "zfs") != 0)
				return (0);

			/* The pool name is the mount source up to the first '/'. */
			root_pool = root_fs.f_mntfromname;
			sep = strchr(root_pool, '/');
			if (sep != NULL)
				*sep = '\0';
		}

		if (root_pool == NULL)
			return (0);

		return (strcmp(root_pool, zpool_get_name(zhp)) == 0);
	}

	/*
	 * Record the name of the root pool the first time it is upgraded.
	 *
	 * 'poolname' is a caller-owned buffer of 'size' bytes.  It is filled in
	 * only while it is still empty and only if 'zhp' is the root pool, so
	 * the first matching pool wins and later calls are no-ops.
	 */
	static void
	root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size)
	{

		if (poolname[0] == '\0' && is_root_pool(zhp))
			(void) strlcpy(poolname, zpool_get_name(zhp), size);
	}
	#endif /* FreeBSD */

	/*
	 * Upgrade pool 'zhp' from its current on-disk version to 'version' and
	 * report the result on stdout.
	 *
	 * The pool's current version must be supported and strictly lower than
	 * 'version' (both are asserted).  Returns 0 on success or the non-zero
	 * result of zpool_upgrade(), in which case nothing is printed here.
	 */
	static int
	upgrade_version(zpool_handle_t *zhp, uint64_t version)
	{
		int ret;
		nvlist_t *config;
		uint64_t oldversion;

		config = zpool_get_config(zhp, NULL);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &oldversion) == 0);

		assert(SPA_VERSION_IS_SUPPORTED(oldversion));
		assert(oldversion < version);

		ret = zpool_upgrade(zhp, version);
		if (ret != 0)
			return (ret);

		/*
		 * uint64_t is not necessarily 'unsigned long long', so cast
		 * explicitly to match the %llu conversions.
		 */
		if (version >= SPA_VERSION_FEATURES) {
			(void) printf(gettext("Successfully upgraded "
			    "'%s' from version %llu to feature flags.\n"),
			    zpool_get_name(zhp), (u_longlong_t)oldversion);
		} else {
			(void) printf(gettext("Successfully upgraded "
			    "'%s' from version %llu to version %llu.\n"),
			    zpool_get_name(zhp), (u_longlong_t)oldversion,
			    (u_longlong_t)version);
		}

		return (0);
	}

	/*
	 * Enable every feature in spa_feature_table that is not yet enabled on
	 * pool 'zhp', printing the name of each feature as it is enabled.
	 *
	 * On success returns 0 and, if 'countp' is not NULL, stores the number
	 * of features that were enabled.  If setting a feature property fails,
	 * the error from zpool_set_prop() is returned immediately and *countp
	 * is left untouched.
	 */
	static int
	upgrade_enable_all(zpool_handle_t *zhp, int *countp)
	{
		int i, ret, count;
		boolean_t firstff = B_TRUE;
		nvlist_t *enabled = zpool_get_features(zhp);

		count = 0;
		for (i = 0; i < SPA_FEATURES; i++) {
			const char *fname = spa_feature_table[i].fi_uname;
			const char *fguid = spa_feature_table[i].fi_guid;
			char *propname;

			if (nvlist_exists(enabled, fguid))
				continue;

			verify(-1 != asprintf(&propname, "feature@%s", fname));
			ret = zpool_set_prop(zhp, propname,
			    ZFS_FEATURE_ENABLED);
			/* The property name is only needed for the call above. */
			free(propname);
			if (ret != 0)
				return (ret);
			count++;

			/* Print the heading once, before the first feature. */
			if (firstff) {
				(void) printf(gettext("Enabled the "
				    "following features on '%s':\n"),
				    zpool_get_name(zhp));
				firstff = B_FALSE;
			}
			(void) printf(gettext(" %s\n"), fname);
		}

		if (countp != NULL)
			*countp = count;
		return (0);
	}

	/*
	 * zpool_iter() callback for "zpool upgrade -a".
	 *
	 * 'arg' points to the upgrade_cbdata_t of the command.  Unavailable
	 * pools are reported on stderr and skipped.  Otherwise the pool is
	 * brought up to cb_version and, when that version uses feature flags,
	 * every supported feature is enabled.  cb_first is cleared as soon as
	 * any pool is actually changed.
	 *
	 * Returns 0 so that iteration continues, or the first error from
	 * upgrade_version() / upgrade_enable_all().
	 */
	static int
	upgrade_cb(zpool_handle_t *zhp, void *arg)
	{
		upgrade_cbdata_t *cbp = arg;
		nvlist_t *config;
		uint64_t version;
		boolean_t printnl = B_FALSE;
		int ret;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			(void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
			    "currently unavailable.\n\n"), zpool_get_name(zhp));
			cbp->cb_unavail = B_TRUE;
			/* Allow iteration to continue. */
			return (0);
		}

		config = zpool_get_config(zhp, NULL);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &version) == 0);

		assert(SPA_VERSION_IS_SUPPORTED(version));

		if (version < cbp->cb_version) {
			cbp->cb_first = B_FALSE;
			ret = upgrade_version(zhp, cbp->cb_version);
			if (ret != 0)
				return (ret);
	#ifdef __FreeBSD__
			root_pool_upgrade_check(zhp, cbp->cb_poolname,
			    sizeof(cbp->cb_poolname));
	#endif /* __FreeBSD__ */
			printnl = B_TRUE;

	#ifdef illumos
			/*
			 * If they did "zpool upgrade -a", then we could
			 * be doing ioctls to different pools. We need
			 * to log this history once to each pool, and bypass
			 * the normal history logging that happens in main().
			 */
			(void) zpool_log_history(g_zfs, history_str);
			log_history = B_FALSE;
	#endif
		}

		if (cbp->cb_version >= SPA_VERSION_FEATURES) {
			int count;
			ret = upgrade_enable_all(zhp, &count);
			if (ret != 0)
				return (ret);

			if (count > 0) {
				cbp->cb_first = B_FALSE;
				printnl = B_TRUE;
	#ifdef __FreeBSD__
				root_pool_upgrade_check(zhp, cbp->cb_poolname,
				    sizeof(cbp->cb_poolname));
	#endif /* __FreeBSD__ */
				/*
				 * If they did "zpool upgrade -a", then we could
				 * be doing ioctls to different pools. We need
				 * to log this history once to each pool, and bypass
				 * the normal history logging that happens in main().
				 */
				(void) zpool_log_history(g_zfs, history_str);
				log_history = B_FALSE;
			}
		}

		/* Separate this pool's output from the next pool's. */
		if (printnl) {
			(void) printf(gettext("\n"));
		}

		return (0);
	}

	/*
	 * zpool_iter() callback that lists pools which cannot be upgraded
	 * because they are in POOL_STATE_UNAVAIL.
	 *
	 * 'arg' points to the upgrade_cbdata_t of the command.  The table
	 * heading is written to stderr before the first such pool (tracked by
	 * cb_first); each pool name is written to stdout.  cb_unavail is set
	 * when at least one unavailable pool is seen.  Always returns 0.
	 */
	static int
	upgrade_list_unavail(zpool_handle_t *zhp, void *arg)
	{
		upgrade_cbdata_t *cbp = arg;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			if (cbp->cb_first) {
				(void) fprintf(stderr, gettext("The following pools "
				    "are unavailable and cannot be upgraded at this "
				    "time.\n\n"));
				(void) fprintf(stderr, gettext("POOL\n"));
				(void) fprintf(stderr, gettext("------------\n"));
				cbp->cb_first = B_FALSE;
			}
			(void) printf(gettext("%s\n"), zpool_get_name(zhp));
			cbp->cb_unavail = B_TRUE;
		}
		return (0);
	}

	/*
	 * zpool_iter() callback that lists pools still formatted with a legacy
	 * version number (below SPA_VERSION_FEATURES).
	 *
	 * 'arg' points to the upgrade_cbdata_t of the command.  The explanatory
	 * heading is printed before the first matching pool (tracked by
	 * cb_first).  Unavailable pools only set cb_unavail.  Always returns 0.
	 */
	static int
	upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
	{
		upgrade_cbdata_t *cbp = arg;
		nvlist_t *config;
		uint64_t version;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			/*
			 * This will have been reported by upgrade_list_unavail so
			 * just allow iteration to continue.
			 */
			cbp->cb_unavail = B_TRUE;
			return (0);
		}

		config = zpool_get_config(zhp, NULL);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &version) == 0);

		assert(SPA_VERSION_IS_SUPPORTED(version));

		if (version < SPA_VERSION_FEATURES) {
			if (cbp->cb_first) {
				(void) printf(gettext("The following pools are "
				    "formatted with legacy version numbers and can\n"
				    "be upgraded to use feature flags. After "
				    "being upgraded, these pools\nwill no "
				    "longer be accessible by software that does not "
				    "support feature\nflags.\n\n"));
				(void) printf(gettext("VER POOL\n"));
				(void) printf(gettext("--- ------------\n"));
				cbp->cb_first = B_FALSE;
			}

			(void) printf("%2llu %s\n", (u_longlong_t)version,
			    zpool_get_name(zhp));
		}

		return (0);
	}

	/*
	 * zpool_iter() callback that lists, for each feature-flags pool, the
	 * supported features that are not enabled on it.
	 *
	 * 'arg' points to the upgrade_cbdata_t of the command.  The table
	 * heading is printed once per command (tracked by cb_first) and the
	 * pool name once per pool.  Unavailable pools only set cb_unavail.
	 * Always returns 0.
	 */
	static int
	upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
	{
		upgrade_cbdata_t *cbp = arg;
		nvlist_t *config;
		uint64_t version;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			/*
			 * This will have been reported by upgrade_list_unavail so
			 * just allow iteration to continue.
			 */
			cbp->cb_unavail = B_TRUE;
			return (0);
		}

		config = zpool_get_config(zhp, NULL);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &version) == 0);

		if (version >= SPA_VERSION_FEATURES) {
			int i;
			boolean_t poolfirst = B_TRUE;
			nvlist_t *enabled = zpool_get_features(zhp);

			for (i = 0; i < SPA_FEATURES; i++) {
				const char *fguid = spa_feature_table[i].fi_guid;
				const char *fname = spa_feature_table[i].fi_uname;
				if (!nvlist_exists(enabled, fguid)) {
					if (cbp->cb_first) {
						(void) printf(gettext("\nSome "
						    "supported features are not "
						    "enabled on the following pools. "
						    "Once a\nfeature is enabled the "
						    "pool may become incompatible with "
						    "software\nthat does not support "
						    "the feature. See "
						    "zpool-features(7) for "
						    "details.\n\n"));
						(void) printf(gettext("POOL "
						    "FEATURE\n"));
						(void) printf(gettext("------"
						    "---------\n"));
						cbp->cb_first = B_FALSE;
					}

					if (poolfirst) {
						(void) printf(gettext("%s\n"),
						    zpool_get_name(zhp));
						poolfirst = B_FALSE;
					}

					(void) printf(gettext(" %s\n"), fname);
				}
			}
		}

		return (0);
	}

	/*
	 * for_each_pool() callback for "zpool upgrade <pool> ...".
	 *
	 * 'data' points to the upgrade_cbdata_t of the command.  Returns 1 when
	 * the pool is unavailable or is named "log", 0 when the pool is already
	 * at (or beyond) the requested version or was upgraded successfully,
	 * and otherwise the error from upgrade_version() / upgrade_enable_all().
	 */
	/* ARGSUSED */
	static int
	upgrade_one(zpool_handle_t *zhp, void *data)
	{
		boolean_t printnl = B_FALSE;
		upgrade_cbdata_t *cbp = data;
		uint64_t cur_version;
		int ret;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			(void) fprintf(stderr, gettext("cannot upgrade '%s': pool "
			    "is currently unavailable.\n\n"), zpool_get_name(zhp));
			cbp->cb_unavail = B_TRUE;
			return (1);
		}

		if (strcmp("log", zpool_get_name(zhp)) == 0) {
			(void) printf(gettext("'log' is now a reserved word\n"
			    "Pool 'log' must be renamed using export and import"
			    " to upgrade.\n\n"));
			return (1);
		}

		/*
		 * uint64_t is not necessarily 'unsigned long long', so the
		 * version numbers are cast explicitly for the %llu conversions
		 * below.
		 */
		cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
		if (cur_version > cbp->cb_version) {
			(void) printf(gettext("Pool '%s' is already formatted "
			    "using more current version '%llu'.\n\n"),
			    zpool_get_name(zhp), (u_longlong_t)cur_version);
			return (0);
		}

		if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
			(void) printf(gettext("Pool '%s' is already formatted "
			    "using version %llu.\n\n"), zpool_get_name(zhp),
			    (u_longlong_t)cbp->cb_version);
			return (0);
		}

		if (cur_version != cbp->cb_version) {
			printnl = B_TRUE;
			ret = upgrade_version(zhp, cbp->cb_version);
			if (ret != 0)
				return (ret);
	#ifdef __FreeBSD__
			root_pool_upgrade_check(zhp, cbp->cb_poolname,
			    sizeof(cbp->cb_poolname));
	#endif /* __FreeBSD__ */
		}

		if (cbp->cb_version >= SPA_VERSION_FEATURES) {
			int count = 0;
			ret = upgrade_enable_all(zhp, &count);
			if (ret != 0)
				return (ret);

			if (count != 0) {
				printnl = B_TRUE;
	#ifdef __FreeBSD__
				root_pool_upgrade_check(zhp, cbp->cb_poolname,
				    sizeof(cbp->cb_poolname));
	#endif /* __FreeBSD__ */
			} else if (cur_version == SPA_VERSION) {
				(void) printf(gettext("Pool '%s' already has all "
				    "supported features enabled.\n\n"),
				    zpool_get_name(zhp));
			}
		}

		if (printnl) {
			(void) printf(gettext("\n"));
		}

		return (0);
	}

	/*
	 * zpool upgrade
	 * zpool upgrade -v
	 * zpool upgrade [-V version] <-a | pool ...>
	 *
	 * With no arguments, display downrev'd ZFS pool available for upgrade.
	 * Individual pools can be upgraded by specifying the pool, and '-a' will
	 * upgrade all pools.
	 *
	 * Returns the result of the pool iteration that was run (0 for the
	 * listing and '-v' modes).  If a root pool was upgraded (cb_poolname is
	 * non-empty) a reminder to update the boot code is printed at the end.
	 */
	int
	zpool_do_upgrade(int argc, char **argv)
	{
		int c;
		upgrade_cbdata_t cb = { 0 };
		int ret = 0;
		boolean_t showversions = B_FALSE;
		boolean_t upgradeall = B_FALSE;
		char *end;


		/* check options */
		while ((c = getopt(argc, argv, ":avV:")) != -1) {
			switch (c) {
			case 'a':
				upgradeall = B_TRUE;
				break;
			case 'v':
				showversions = B_TRUE;
				break;
			case 'V':
				cb.cb_version = strtoll(optarg, &end, 10);
				if (*end != '\0' ||
				    !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
					(void) fprintf(stderr,
					    gettext("invalid version '%s'\n"), optarg);
					usage(B_FALSE);
				}
				break;
			case ':':
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		/* Keep the untouched argument vector before skipping options. */
		cb.cb_argc = argc;
		cb.cb_argv = argv;
		argc -= optind;
		argv += optind;

		if (cb.cb_version == 0) {
			cb.cb_version = SPA_VERSION;
		} else if (!upgradeall && argc == 0) {
			(void) fprintf(stderr, gettext("-V option is "
			    "incompatible with other arguments\n"));
			usage(B_FALSE);
		}

		if (showversions) {
			if (upgradeall || argc != 0) {
				(void) fprintf(stderr, gettext("-v option is "
				    "incompatible with other arguments\n"));
				usage(B_FALSE);
			}
		} else if (upgradeall) {
			if (argc != 0) {
				(void) fprintf(stderr, gettext("-a option should not "
				    "be used along with a pool name\n"));
				usage(B_FALSE);
			}
		}

		(void) printf(gettext("This system supports ZFS pool feature "
		    "flags.\n\n"));
		if (showversions) {
			int i;

			(void) printf(gettext("The following features are "
			    "supported:\n\n"));
			(void) printf(gettext("FEAT DESCRIPTION\n"));
			(void) printf("----------------------------------------------"
			    "---------------\n");
			for (i = 0; i < SPA_FEATURES; i++) {
				zfeature_info_t *fi = &spa_feature_table[i];
				const char *ro =
				    (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
				    " (read-only compatible)" : "";

				(void) printf("%-37s%s\n", fi->fi_uname, ro);
				(void) printf(" %s\n", fi->fi_desc);
			}
			(void) printf("\n");

			(void) printf(gettext("The following legacy versions are also "
			    "supported:\n\n"));
			(void) printf(gettext("VER DESCRIPTION\n"));
			(void) printf("--- -----------------------------------------"
			    "---------------\n");
			(void) printf(gettext(" 1 Initial ZFS version\n"));
			(void) printf(gettext(" 2 Ditto blocks "
			    "(replicated metadata)\n"));
			(void) printf(gettext(" 3 Hot spares and double parity "
			    "RAID-Z\n"));
			(void) printf(gettext(" 4 zpool history\n"));
			(void) printf(gettext(" 5 Compression using the gzip "
			    "algorithm\n"));
			(void) printf(gettext(" 6 bootfs pool property\n"));
			(void) printf(gettext(" 7 Separate intent log devices\n"));
			(void) printf(gettext(" 8 Delegated administration\n"));
			(void) printf(gettext(" 9 refquota and refreservation "
			    "properties\n"));
			(void) printf(gettext(" 10 Cache devices\n"));
			(void) printf(gettext(" 11 Improved scrub performance\n"));
			(void) printf(gettext(" 12 Snapshot properties\n"));
			(void) printf(gettext(" 13 snapused property\n"));
			(void) printf(gettext(" 14 passthrough-x aclinherit\n"));
			(void) printf(gettext(" 15 user/group space accounting\n"));
			(void) printf(gettext(" 16 stmf property support\n"));
			(void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
			(void) printf(gettext(" 18 Snapshot user holds\n"));
			(void) printf(gettext(" 19 Log device removal\n"));
			(void) printf(gettext(" 20 Compression using zle "
			    "(zero-length encoding)\n"));
			(void) printf(gettext(" 21 Deduplication\n"));
			(void) printf(gettext(" 22 Received properties\n"));
			(void) printf(gettext(" 23 Slim ZIL\n"));
			(void) printf(gettext(" 24 System attributes\n"));
			(void) printf(gettext(" 25 Improved scrub stats\n"));
			(void) printf(gettext(" 26 Improved snapshot deletion "
			    "performance\n"));
			(void) printf(gettext(" 27 Improved snapshot creation "
			    "performance\n"));
			(void) printf(gettext(" 28 Multiple vdev replacements\n"));
			(void) printf(gettext("\nFor more information on a particular "
			    "version, including supported releases,\n"));
			(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
		} else if (argc == 0 && upgradeall) {
			cb.cb_first = B_TRUE;
			ret = zpool_iter(g_zfs, upgrade_cb, &cb);
			/* cb_first still set means no pool needed any change. */
			if (ret == 0 && cb.cb_first) {
				if (cb.cb_version == SPA_VERSION) {
					(void) printf(gettext("All %spools are already "
					    "formatted using feature flags.\n\n"),
					    cb.cb_unavail ? gettext("available ") : "");
					(void) printf(gettext("Every %sfeature flags "
					    "pool already has all supported features "
					    "enabled.\n"),
					    cb.cb_unavail ? gettext("available ") : "");
				} else {
					/*
					 * uint64_t is not necessarily 'unsigned
					 * long long'; cast to match %llu.
					 */
					(void) printf(gettext("All pools are already "
					    "formatted with version %llu or higher.\n"),
					    (u_longlong_t)cb.cb_version);
				}
			}
		} else if (argc == 0) {
			/* Listing mode: three passes, each with its own heading. */
			cb.cb_first = B_TRUE;
			ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb);
			assert(ret == 0);

			if (!cb.cb_first) {
				(void) fprintf(stderr, "\n");
			}

			cb.cb_first = B_TRUE;
			ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
			assert(ret == 0);

			if (cb.cb_first) {
				(void) printf(gettext("All %spools are formatted using "
				    "feature flags.\n\n"), cb.cb_unavail ?
				    gettext("available ") : "");
			} else {
				(void) printf(gettext("\nUse 'zpool upgrade -v' "
				    "for a list of available legacy versions.\n"));
			}

			cb.cb_first = B_TRUE;
			ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
			assert(ret == 0);

			if (cb.cb_first) {
				(void) printf(gettext("Every %sfeature flags pool has "
				    "all supported features enabled.\n"),
				    cb.cb_unavail ? gettext("available ") : "");
			} else {
				(void) printf(gettext("\n"));
			}
		} else {
			ret = for_each_pool(argc, argv, B_TRUE, NULL,
			    upgrade_one, &cb);
		}

		if (cb.cb_poolname[0] != '\0') {
			(void) printf(
			    "If you boot from pool '%s', don't forget to update boot code.\n"
			    "Assuming you use GPT partitioning and da0 is your boot disk\n"
			    "the following command will do it:\n"
			    "\n"
			    "\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n",
			    cb.cb_poolname);
		}

		return (ret);
	}

	/*
	 * Options and state handed from zpool_do_history() to get_history_one().
	 */
	typedef struct hist_cbdata {
	/* B_TRUE until get_history_one() has run for at least one pool */
	boolean_t first;
	/* -l: append the "[user ... on host:zone]" suffix to each record */
	boolean_t longfmt;
	/* -i: also print internal event, ioctl and unrecognized records */
	boolean_t internal;
	} hist_cbdata_t;

	/*
	 * Print out the command history for a specific pool.
	 *
	 * for_each_pool() callback; 'data' points to a hist_cbdata_t.  Command
	 * records are always printed.  Internal event, ioctl and unrecognized
	 * records are printed only when cb->internal is set.  With cb->longfmt
	 * each printed record is followed by the user, host and zone, when
	 * those are present in the record.
	 *
	 * Returns 0, or the non-zero result of zpool_get_history().
	 */
	static int
	get_history_one(zpool_handle_t *zhp, void *data)
	{
		nvlist_t *nvhis;
		nvlist_t **records;
		uint_t numrecords;
		uint_t i;
		int ret;
		hist_cbdata_t *cb = (hist_cbdata_t *)data;

		/* Lets the caller tell that at least one pool was visited. */
		cb->first = B_FALSE;

		(void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));

		if ((ret = zpool_get_history(zhp, &nvhis)) != 0)
			return (ret);

		verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
		    &records, &numrecords) == 0);
		for (i = 0; i < numrecords; i++) {
			nvlist_t *rec = records[i];
			char tbuf[30] = "";

			if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
				time_t tsec;
				struct tm t;

				tsec = fnvlist_lookup_uint64(records[i],
				    ZPOOL_HIST_TIME);
				(void) localtime_r(&tsec, &t);
				(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
			}

			if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
				(void) printf("%s %s", tbuf,
				    fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
			} else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
				/*
				 * Keep the full 64-bit value: narrowing to int
				 * could turn a corrupt event number negative,
				 * slip past the bounds check and index
				 * zfs_history_event_names[] out of range.
				 */
				uint64_t ievent =
				    fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
				if (!cb->internal)
					continue;
				if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
					(void) printf("%s unrecognized record:\n",
					    tbuf);
					dump_nvlist(rec, 4);
					continue;
				}
				(void) printf("%s [internal %s txg:%lld] %s", tbuf,
				    zfs_history_event_names[ievent],
				    (long long)fnvlist_lookup_uint64(rec,
				    ZPOOL_HIST_TXG),
				    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
			} else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
				if (!cb->internal)
					continue;
				(void) printf("%s [txg:%lld] %s", tbuf,
				    (long long)fnvlist_lookup_uint64(rec,
				    ZPOOL_HIST_TXG),
				    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
				if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
					(void) printf(" %s (%llu)",
					    fnvlist_lookup_string(rec,
					    ZPOOL_HIST_DSNAME),
					    (u_longlong_t)fnvlist_lookup_uint64(rec,
					    ZPOOL_HIST_DSID));
				}
				(void) printf(" %s", fnvlist_lookup_string(rec,
				    ZPOOL_HIST_INT_STR));
			} else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
				if (!cb->internal)
					continue;
				(void) printf("%s ioctl %s\n", tbuf,
				    fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
				if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
					(void) printf(" input:\n");
					dump_nvlist(fnvlist_lookup_nvlist(rec,
					    ZPOOL_HIST_INPUT_NVL), 8);
				}
				if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
					(void) printf(" output:\n");
					dump_nvlist(fnvlist_lookup_nvlist(rec,
					    ZPOOL_HIST_OUTPUT_NVL), 8);
				}
				if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) {
					(void) printf(" errno: %lld\n",
					    (long long)fnvlist_lookup_int64(rec,
					    ZPOOL_HIST_ERRNO));
				}
			} else {
				if (!cb->internal)
					continue;
				(void) printf("%s unrecognized record:\n", tbuf);
				dump_nvlist(rec, 4);
			}

			if (!cb->longfmt) {
				(void) printf("\n");
				continue;
			}
			(void) printf(" [");
			if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
				uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
				struct passwd *pwd = getpwuid(who);
				(void) printf("user %d ", (int)who);
				if (pwd != NULL)
					(void) printf("(%s) ", pwd->pw_name);
			}
			if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
				(void) printf("on %s",
				    fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
			}
			if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
				(void) printf(":%s",
				    fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
			}
			(void) printf("]");
			(void) printf("\n");
		}
		(void) printf("\n");
		nvlist_free(nvhis);

		return (ret);
	}

	/*
	 * zpool history <pool>
	 *
	 * Displays the history of commands that modified pools.
	 *
	 *	-l	long format (passed to the callback as 'longfmt')
	 *	-i	include internal records (passed as 'internal')
	 *
	 * Prints "no pools available" and returns 0 when no pool was named and
	 * the callback never ran; otherwise returns the for_each_pool() result.
	 */
	int
	zpool_do_history(int argc, char **argv)
	{
		hist_cbdata_t cbdata = { 0 };
		int opt;
		int status;

		cbdata.first = B_TRUE;

		/* Map the option letters onto the callback flags. */
		while ((opt = getopt(argc, argv, "li")) != -1) {
			if (opt == 'l') {
				cbdata.longfmt = B_TRUE;
			} else if (opt == 'i') {
				cbdata.internal = B_TRUE;
			} else if (opt == '?') {
				(void) fprintf(stderr,
				    gettext("invalid option '%c'\n"), optopt);
				usage(B_FALSE);
			}
		}
		argv += optind;
		argc -= optind;

		status = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one,
		    &cbdata);

		/* 'first' is still set only if the callback was never invoked. */
		if (argc == 0 && cbdata.first == B_TRUE) {
			(void) printf(gettext("no pools available\n"));
			return (0);
		}

		return (status);
	}

	/*
	 * for_each_pool() callback for "zpool get": print every property on
	 * the callback's property list for pool 'zhp'.
	 *
	 * 'data' points to a zprop_get_cbdata_t.  Feature and unsupported-
	 * feature entries are fetched with zpool_prop_get_feature(); all other
	 * entries go through zpool_get_prop().  Properties that cannot be read
	 * are skipped silently.  Always returns 0.
	 */
	static int
	get_callback(zpool_handle_t *zhp, void *data)
	{
		zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data;
		char value[MAXNAMELEN];
		zprop_source_t srctype;
		zprop_list_t *pl;

		for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {

			/*
			 * Skip the special fake placeholder. This will also skip
			 * over the name property when 'all' is specified.
			 */
			if (pl->pl_prop == ZPOOL_PROP_NAME &&
			    pl == cbp->cb_proplist)
				continue;

			if (pl->pl_prop == ZPROP_INVAL &&
			    (zpool_prop_feature(pl->pl_user_prop) ||
			    zpool_prop_unsupported(pl->pl_user_prop))) {
				srctype = ZPROP_SRC_LOCAL;

				if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
				    value, sizeof (value)) == 0) {
					zprop_print_one_property(zpool_get_name(zhp),
					    cbp, pl->pl_user_prop, value, srctype,
					    NULL, NULL);
				}
			} else {
				if (zpool_get_prop(zhp, pl->pl_prop, value,
				    sizeof (value), &srctype, cbp->cb_literal) != 0)
					continue;

				zprop_print_one_property(zpool_get_name(zhp), cbp,
				    zpool_prop_to_name(pl->pl_prop), value, srctype,
				    NULL, NULL);
			}
		}
		return (0);
	}

	/*
	 * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
	 *
	 *	-H	Scripted mode.  Don't display headers, and separate properties
	 *		by a single tab.
	 *	-o	List of columns to display.  Defaults to
	 *		"name,property,value,source".
	 *	-p	Display values in parsable (exact) format.
	 *
	 * Get properties of pools in the system. Output space statistics
	 * for each one as well as other attributes.
	 *
	 * Returns the result of for_each_pool().
	 */
	int
	zpool_do_get(int argc, char **argv)
	{
		zprop_get_cbdata_t cb = { 0 };
		zprop_list_t fake_name = { 0 };
		int ret;
		int c, i;
		char *value;

		cb.cb_first = B_TRUE;

		/*
		 * Set up default columns and sources.
		 */
		cb.cb_sources = ZPROP_SRC_ALL;
		cb.cb_columns[0] = GET_COL_NAME;
		cb.cb_columns[1] = GET_COL_PROPERTY;
		cb.cb_columns[2] = GET_COL_VALUE;
		cb.cb_columns[3] = GET_COL_SOURCE;
		cb.cb_type = ZFS_TYPE_POOL;

		/* check options */
		while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
			switch (c) {
			case 'p':
				cb.cb_literal = B_TRUE;
				break;
			case 'H':
				cb.cb_scripted = B_TRUE;
				break;
			case 'o':
				bzero(&cb.cb_columns, sizeof (cb.cb_columns));
				i = 0;
				while (*optarg != '\0') {
					static char *col_subopts[] =
					    { "name", "property", "value", "source",
					    "all", NULL };

					if (i == ZFS_GET_NCOLS) {
						(void) fprintf(stderr, gettext("too "
						    "many fields given to -o "
						    "option\n"));
						usage(B_FALSE);
					}

					switch (getsubopt(&optarg, col_subopts,
					    &value)) {
					case 0:
						cb.cb_columns[i++] = GET_COL_NAME;
						break;
					case 1:
						cb.cb_columns[i++] = GET_COL_PROPERTY;
						break;
					case 2:
						cb.cb_columns[i++] = GET_COL_VALUE;
						break;
					case 3:
						cb.cb_columns[i++] = GET_COL_SOURCE;
						break;
					case 4:
						if (i > 0) {
							(void) fprintf(stderr,
							    gettext("\"all\" conflicts "
							    "with specific fields "
							    "given to -o option\n"));
							usage(B_FALSE);
						}
						cb.cb_columns[0] = GET_COL_NAME;
						cb.cb_columns[1] = GET_COL_PROPERTY;
						cb.cb_columns[2] = GET_COL_VALUE;
						cb.cb_columns[3] = GET_COL_SOURCE;
						i = ZFS_GET_NCOLS;
						break;
					default:
						(void) fprintf(stderr,
						    gettext("invalid column name "
						    "'%s'\n"), suboptarg);
						usage(B_FALSE);
					}
				}
				break;
			case ':':
				/*
				 * The optstring starts with ':', so a missing
				 * option argument is reported as ':' rather
				 * than '?'; without this case it would be
				 * silently ignored.
				 */
				(void) fprintf(stderr, gettext("missing argument for "
				    "'%c' option\n"), optopt);
				usage(B_FALSE);
				break;
			case '?':
				(void) fprintf(stderr, gettext("invalid option '%c'\n"),
				    optopt);
				usage(B_FALSE);
			}
		}

		argc -= optind;
		argv += optind;

		if (argc < 1) {
			(void) fprintf(stderr, gettext("missing property "
			    "argument\n"));
			usage(B_FALSE);
		}

		if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
		    ZFS_TYPE_POOL) != 0)
			usage(B_FALSE);

		argc--;
		argv++;

		/*
		 * Put a stack-allocated "name" placeholder at the head of the
		 * list; get_callback() skips it.
		 */
		if (cb.cb_proplist != NULL) {
			fake_name.pl_prop = ZPOOL_PROP_NAME;
			fake_name.pl_width = strlen(gettext("NAME"));
			fake_name.pl_next = cb.cb_proplist;
			cb.cb_proplist = &fake_name;
		}

		ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
		    get_callback, &cb);

		/* The placeholder lives on the stack and must not be freed. */
		if (cb.cb_proplist == &fake_name)
			zprop_free_list(fake_name.pl_next);
		else
			zprop_free_list(cb.cb_proplist);

		return (ret);
	}

	/*
	 * Arguments handed from zpool_do_set() to set_callback().
	 */
	typedef struct set_cbdata {
	/* property name: the text before '=' in the property=value argument */
	char *cb_propname;
	/* property value: the text after '=' in the same argument */
	char *cb_value;
	/* set to B_TRUE once zpool_set_prop() succeeds for any pool */
	boolean_t cb_any_successful;
	} set_cbdata_t;

	/*
	 * for_each_pool() callback for "zpool set": apply the property held in
	 * the set_cbdata_t that 'data' points to on pool 'zhp'.
	 *
	 * Records a success in cb_any_successful and returns the result of
	 * zpool_set_prop().
	 */
	int
	set_callback(zpool_handle_t *zhp, void *data)
	{
		int error;
		set_cbdata_t *cb = (set_cbdata_t *)data;

		error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value);

		if (!error)
			cb->cb_any_successful = B_TRUE;

		return (error);
	}

	/*
	 * zpool set <property=value> <pool>
	 *
	 * Takes no options and exactly one pool.  argv[1] is split in place at
	 * its first '=' into a property name and a value, which are then
	 * applied to the pool through set_callback().  Returns the result of
	 * for_each_pool().
	 */
	int
	zpool_do_set(int argc, char **argv)
	{
		set_cbdata_t cb = { 0 };
		char *eq;

		/* Anything that looks like an option is rejected outright. */
		if (argc > 1 && argv[1][0] == '-') {
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    argv[1][1]);
			usage(B_FALSE);
		}

		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing property=value "
			    "argument\n"));
			usage(B_FALSE);
		}

		if (argc < 3) {
			(void) fprintf(stderr, gettext("missing pool name\n"));
			usage(B_FALSE);
		}

		if (argc > 3) {
			(void) fprintf(stderr, gettext("too many pool names\n"));
			usage(B_FALSE);
		}

		cb.cb_propname = argv[1];
		eq = strchr(cb.cb_propname, '=');
		cb.cb_value = eq;
		if (eq == NULL) {
			(void) fprintf(stderr, gettext("missing value in "
			    "property=value argument\n"));
			usage(B_FALSE);
		}

		/* Terminate the name at '='; the value starts right after it. */
		*eq = '\0';
		cb.cb_value = eq + 1;

		return (for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
		    set_callback, &cb));
	}

	/*
	 * Look up 'command' by name in command_table.
	 *
	 * On a match stores the table index in *idx and returns 0; returns 1
	 * (leaving *idx untouched) when no entry matches.  Entries whose name
	 * is NULL are skipped.
	 */
	static int
	find_command_idx(char *command, int *idx)
	{
		int i;

		for (i = 0; i < NCOMMAND; i++) {
			if (command_table[i].name == NULL)
				continue;

			if (strcmp(command, command_table[i].name) == 0) {
				*idx = i;
				return (0);
			}
		}
		return (1);
	}

	/*
	 * Entry point: initialize libzfs, dispatch argv[1] to the matching
	 * command_table entry and record the command line in the pool history
	 * when the command succeeded and logging is still enabled.
	 *
	 * A first argument containing '=' is routed to the "set" command, and
	 * "freeze <pool>" is handled inline as a debugging aid.
	 */
	int
	main(int argc, char **argv)
	{
		char *cmdname;
		int idx;
		int status = 0;

		(void) setlocale(LC_ALL, "");
		(void) textdomain(TEXT_DOMAIN);

		g_zfs = libzfs_init();
		if (g_zfs == NULL) {
			(void) fprintf(stderr, gettext("internal error: failed to "
			    "initialize ZFS library\n"));
			return (1);
		}

		libzfs_print_on_error(g_zfs, B_TRUE);

		/* The subcommands print their own getopt diagnostics. */
		opterr = 0;

		/*
		 * Make sure the user has specified some command.
		 */
		if (argc < 2) {
			(void) fprintf(stderr, gettext("missing command\n"));
			usage(B_FALSE);
		}

		cmdname = argv[1];

		/*
		 * Special case '-?'
		 */
		if (strcmp(cmdname, "-?") == 0)
			usage(B_TRUE);

		zfs_save_arguments(argc, argv, history_str, sizeof (history_str));

		/*
		 * Run the appropriate command.
		 */
		if (find_command_idx(cmdname, &idx) == 0) {
			current_command = &command_table[idx];
			status = command_table[idx].func(argc - 1, argv + 1);
		} else if (strchr(cmdname, '=') != NULL) {
			/* "zpool prop=value pool": "set" sees the full argv. */
			verify(find_command_idx("set", &idx) == 0);
			current_command = &command_table[idx];
			status = command_table[idx].func(argc, argv);
		} else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
			/*
			 * 'freeze' is a vile debugging abomination, so we treat
			 * it as such.
			 */
			zfs_cmd_t zc = { 0 };

			(void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
			return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc));
		} else {
			(void) fprintf(stderr, gettext("unrecognized "
			    "command '%s'\n"), cmdname);
			usage(B_FALSE);
		}

		if (status == 0 && log_history)
			(void) zpool_log_history(g_zfs, history_str);

		libzfs_fini(g_zfs);

		/*
		 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
		 * for the purposes of running ::findleaks.
		 */
		if (getenv("ZFS_ABORT") != NULL) {
			(void) printf("dumping core by request\n");
			abort();
		}

		return (status);
	}
	Index: stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 332525)
	@@ -1,6425 +1,6496 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Joyent, Inc.
	*/

	/*
	* The objective of this program is to provide a DMU/ZAP/SPA stress test
	* that runs entirely in userland, is easy to use, and easy to extend.
	*
	* The overall design of the ztest program is as follows:
	*
	* (1) For each major functional area (e.g. adding vdevs to a pool,
	* creating and destroying datasets, reading and writing objects, etc)
	* we have a simple routine to test that functionality. These
	* individual routines do not have to do anything "stressful".
	*
	* (2) We turn these simple functionality tests into a stress test by
	* running them all in parallel, with as many threads as desired,
	* and spread across as many datasets, objects, and vdevs as desired.
	*
	* (3) While all this is happening, we inject faults into the pool to
	* verify that self-healing data really works.
	*
	* (4) Every time we open a dataset, we change its checksum and compression
	* functions. Thus even individual objects vary from block to block
	* in which checksum they use and whether they're compressed.
	*
	* (5) To verify that we never lose on-disk consistency after a crash,
	* we run the entire test in a child of the main process.
	* At random times, the child self-immolates with a SIGKILL.
	* This is the software equivalent of pulling the power cord.
	* The parent then runs the test again, using the existing
	* storage pool, as many times as desired. If backwards compatibility
	* testing is enabled ztest will sometimes run the "older" version
	* of ztest after a SIGKILL.
	*
	* (6) To verify that we don't have future leaks or temporal incursions,
	* many of the functional tests record the transaction group number
	* as part of their data. When reading old data, they verify that
	* the transaction group number is less than the current, open txg.
	* If you add a new test, please do this if applicable.
	*
	* When run with no arguments, ztest runs for about five minutes and
	* produces no output if successful. To get a little bit of information,
	* specify -V. To get more information, specify -VV, and so on.
	*
	* To turn this into an overnight stress test, use -T to specify run time.
	*
	* You can ask for more vdevs [-v], datasets [-d], or threads [-t]
	* to increase the pool capacity, fanout, and overall stress level.
	*
	* Use the -k option to set the desired frequency of kills.
	*
	* When ztest invokes itself it passes all relevant information through a
	* temporary file which is mmap-ed in the child process. This allows shared
	* memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
	* stored at offset 0 of this file and contains information on the size and
	* number of shared structures in the file. The information stored in this file
	* must remain backwards compatible with older versions of ztest so that
	* ztest can invoke them during backwards compatibility testing (-B).
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/dmu.h>
	#include <sys/txg.h>
	#include <sys/dbuf.h>
	#include <sys/zap.h>
	#include <sys/dmu_objset.h>
	#include <sys/poll.h>
	#include <sys/stat.h>
	#include <sys/time.h>
	#include <sys/wait.h>
	#include <sys/mman.h>
	#include <sys/resource.h>
	#include <sys/zio.h>
	#include <sys/zil.h>
	#include <sys/zil_impl.h>
	#include <sys/vdev_impl.h>
	#include <sys/vdev_file.h>
	#include <sys/spa_impl.h>
	#include <sys/metaslab_impl.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_destroy.h>
	#include <sys/dsl_scan.h>
	#include <sys/zio_checksum.h>
	#include <sys/refcount.h>
	#include <sys/zfeature.h>
	#include <sys/dsl_userhold.h>
	#include <sys/abd.h>
	#include <stdio.h>
	#include <stdio_ext.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <signal.h>
	#include <umem.h>
	#include <dlfcn.h>
	#include <ctype.h>
	#include <math.h>
	#include <errno.h>
	#include <sys/fs/zfs.h>
	#include <libnvpair.h>
	#include <libcmdutils.h>

	/*
	 * Both descriptors start out as -1 (not open). ztest_fd_rand is the
	 * descriptor ztest_random() reads its random bytes from.
	 * NOTE(review): ztest_fd_data presumably refers to the shared data file
	 * described in the header comment -- confirm where it is opened.
	 */
	static int ztest_fd_data = -1;
	static int ztest_fd_rand = -1;

	/*
	 * Header of the shared file described in the comment at the top of this
	 * file: it is stored at offset 0 and records the sizes and counts of the
	 * shared structures that follow it.
	 * NOTE(review): the meaning of each individual field is inferred from its
	 * name only -- confirm against the code that fills the header in.
	 * The layout must stay backwards compatible (see the -B discussion above).
	 */
	typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
	} ztest_shared_hdr_t;

	/* Pointer to the shared header. */
	static ztest_shared_hdr_t *ztest_shared_hdr;

	/*
	 * Run-time options. process_options() fills a copy of this structure from
	 * ztest_opts_defaults and the command line; the option letters below are
	 * the ones handled there and listed by usage().
	 */
	typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; /* -p pool name */
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; /* -f directory for vdev files */
	char zo_alt_ztest[MAXNAMELEN]; /* alternate ztest binary, built from -B */
	char zo_alt_libpath[MAXNAMELEN]; /* alternate lib dir, built from -B */
	uint64_t zo_vdevs; /* -v */
	uint64_t zo_vdevtime; /* derived: zo_time * NANOSEC / zo_vdevs */
	size_t zo_vdev_size; /* -s, clamped to at least SPA_MINDEVSIZE */
	int zo_ashift; /* -a, 0 means pick a random ashift */
	int zo_mirrors; /* -m */
	int zo_raidz; /* -r, at least 1 */
	int zo_raidz_parity; /* -R, limited to 1..3 and to zo_raidz - 1 */
	int zo_datasets; /* -d, at least 1 */
	int zo_threads; /* -t, at least 1 */
	uint64_t zo_passtime; /* -P, seconds per pass */
	uint64_t zo_killrate; /* -k, kill percentage */
	int zo_verbose; /* incremented once per -V */
	int zo_init; /* -i init count; -E sets it to 0 */
	uint64_t zo_time; /* -T, total run time in seconds */
	uint64_t zo_maxloops; /* -F, max loops in spa_freeze() */
	uint64_t zo_metaslab_gang_bang; /* -g gang block threshold */
	} ztest_shared_opts_t;

	static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
	.zo_dir = { '/', 't', 'm', 'p', '\0' },
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = 5,
	.zo_ashift = SPA_MINBLOCKSHIFT,
	.zo_mirrors = 2,
	.zo_raidz = 4,
	.zo_raidz_parity = 1,
	.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
	.zo_datasets = 7,
	.zo_threads = 23,
	.zo_passtime = 60, /* 60 seconds */
	.zo_killrate = 70, /* 70% kill rate */
	.zo_verbose = 0,
	.zo_init = 1,
	.zo_time = 300, /* 5 minutes */
	.zo_maxloops = 50, /* max loops during spa_freeze() */
	.zo_metaslab_gang_bang = 32 << 10
	};

	/* Variables declared extern here; their definitions are outside this file. */
	extern uint64_t metaslab_gang_bang;
	extern uint64_t metaslab_df_alloc_threshold;
	extern uint64_t zfs_deadman_synctime_ms;
	extern int metaslab_preload_limit;
	extern boolean_t zfs_compressed_arc_enabled;
	extern boolean_t zfs_abd_scatter_enabled;

	/*
	 * ztest_opts is the working copy that process_options() fills in.
	 * NOTE(review): ztest_shared_opts presumably points into the shared file
	 * described in the header comment -- confirm where it is assigned.
	 */
	static ztest_shared_opts_t *ztest_shared_opts;
	static ztest_shared_opts_t ztest_opts;

	/*
	 * Per-dataset shared state. ztest_zd_init() resets zd_seq to 0 when it is
	 * handed one of these.
	 */
	typedef struct ztest_shared_ds {
	uint64_t zd_seq;
	} ztest_shared_ds_t;

	/* Array of per-dataset shared state, indexed by dataset number. */
	static ztest_shared_ds_t *ztest_shared_ds;
	/* Address of the shared state for dataset index d. */
	#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

	/* Magic value stored in bt_magic by ztest_bt_generate() and checked on verify. */
	#define BT_MAGIC 0x123456789abcdefULL
	/*
	 * max(zs_mirrors, 1) * (raidz parity + 1) - 1.
	 * Note: expands to an expression that uses a variable named "zs", which
	 * must be in scope at the point of use.
	 */
	#define MAXFAULTS() \
	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)

	/*
	 * Kinds of I/O a test can issue. ZTEST_IO_TYPES is last, so its value is
	 * the number of kinds listed before it.
	 */
	enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
	};

	/*
	 * Self-describing tag; filled in by ztest_bt_generate() and checked by
	 * ztest_bt_verify().
	 */
	typedef struct ztest_block_tag {
	uint64_t bt_magic; /* always BT_MAGIC */
	uint64_t bt_objset; /* dmu_objset_id() of the owning objset */
	uint64_t bt_object; /* object number */
	uint64_t bt_offset; /* offset within the object */
	uint64_t bt_gen; /* generation; verify allows <= expected */
	uint64_t bt_txg; /* txg; verify allows <= expected */
	uint64_t bt_crtxg; /* creation txg; verify requires equality */
	} ztest_block_tag_t;

	/*
	 * Three-word record of (index, txg, data).
	 * NOTE(review): its users are outside this part of the file; the field
	 * meanings are inferred from the names only.
	 */
	typedef struct bufwad {
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
	} bufwad_t;

	/*
	* XXX -- fix zfs range locks to be generic so we can use them here.
	*
	* Lock modes for the ztest_rll_* helpers. ztest_rll_lock() treats
	* RL_READER as shared and every other value as exclusive.
	*/
	typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
	} rl_type_t;

	/* One reader/writer lock slot used by the object and range lock tables. */
	typedef struct rll {
	void *rll_writer; /* curthread of the exclusive holder, else NULL */
	int rll_readers; /* number of shared holders */
	mutex_t rll_lock; /* protects rll_writer and rll_readers */
	cond_t rll_cv; /* waited on until the slot can be taken */
	} rll_t;

	/*
	 * Handle returned by ztest_range_lock(): remembers what was locked and
	 * which slot holds the lock so ztest_range_unlock() can release it.
	 */
	typedef struct rl {
	uint64_t rl_object;
	uint64_t rl_offset;
	uint64_t rl_size;
	rll_t *rl_lock; /* slot in zd_range_lock[] that was taken */
	} rl_t;

	/*
	 * Sizes of the per-dataset lock tables. The lookup code indexes them with
	 * "value & (N - 1)", so both must remain powers of two.
	 */
	#define ZTEST_RANGE_LOCKS 64
	#define ZTEST_OBJECT_LOCKS 64

	/*
	* Object descriptor. Used as a template for object lookup/create/remove.
	*
	* NOTE(review): the "cr" variants (od_crtype, od_crblocksize, od_crgen)
	* presumably hold the values to use at creation time, as opposed to the
	* values found on an existing object -- confirm against the lookup and
	* create code, which is outside this part of the file.
	*/
	typedef struct ztest_od {
	uint64_t od_dir;
	uint64_t od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t od_blocksize;
	uint64_t od_crblocksize;
	uint64_t od_gen;
	uint64_t od_crgen;
	char od_name[ZFS_MAX_DATASET_NAME_LEN];
	} ztest_od_t;

	/*
	* Per-dataset state.
	*/
	typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t *zd_os;
	rwlock_t zd_zilog_lock;
	zilog_t *zd_zilog;
	ztest_od_t zd_od; / debugging aid */
	char zd_name[ZFS_MAX_DATASET_NAME_LEN];
	mutex_t zd_dirobj_lock;
	rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
	} ztest_ds_t;

	/*
	 * Per-iteration state.
	 */
	/* Signature shared by every test function. */
	typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

	/*
	 * One entry of the ztest_info[] table. zi_interval is a pointer so that
	 * entries can refer to the zopt_* variables or to ztest_opts.zo_vdevtime.
	 */
	typedef struct ztest_info {
		ztest_func_t *zi_func;	/* test function */
		uint64_t zi_iters;	/* iterations per execution */
		uint64_t *zi_interval;	/* execute every <interval> seconds */
	} ztest_info_t;

	/* Per-test-function call statistics. */
	typedef struct ztest_shared_callstate {
	uint64_t zc_count; /* per-pass count */
	uint64_t zc_time; /* per-pass time */
	uint64_t zc_next; /* next time to call this function */
	} ztest_shared_callstate_t;

	/* Array of call state, one element per index c. */
	static ztest_shared_callstate_t *ztest_shared_callstate;
	/* Address of the call state for index c. */
	#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

	/*
	* Note: these aren't static because we want dladdr() to work.
	*
	* Each declaration below uses the ztest_func_t function type, i.e.
	* void f(ztest_ds_t *zd, uint64_t id).
	*/
	ztest_func_t ztest_dmu_read_write;
	ztest_func_t ztest_dmu_write_parallel;
	ztest_func_t ztest_dmu_object_alloc_free;
	ztest_func_t ztest_dmu_commit_callbacks;
	ztest_func_t ztest_zap;
	ztest_func_t ztest_zap_parallel;
	ztest_func_t ztest_zil_commit;
	ztest_func_t ztest_zil_remount;
	ztest_func_t ztest_dmu_read_write_zcopy;
	ztest_func_t ztest_dmu_objset_create_destroy;
	ztest_func_t ztest_dmu_prealloc;
	ztest_func_t ztest_fzap;
	ztest_func_t ztest_dmu_snapshot_create_destroy;
	ztest_func_t ztest_dsl_prop_get_set;
	ztest_func_t ztest_spa_prop_get_set;
	ztest_func_t ztest_spa_create_destroy;
	ztest_func_t ztest_fault_inject;
	ztest_func_t ztest_ddt_repair;
	ztest_func_t ztest_dmu_snapshot_hold;
	ztest_func_t ztest_spa_rename;
	ztest_func_t ztest_scrub;
	ztest_func_t ztest_dsl_dataset_promote_busy;
	ztest_func_t ztest_vdev_attach_detach;
	ztest_func_t ztest_vdev_LUN_growth;
	ztest_func_t ztest_vdev_add_remove;
	ztest_func_t ztest_vdev_aux_add_remove;
	ztest_func_t ztest_split_pool;
	ztest_func_t ztest_reguid;
	ztest_func_t ztest_spa_upgrade;
	+ztest_func_t ztest_device_removal;
	+ztest_func_t ztest_remap_blocks;

	/*
	 * Scheduling intervals in nanoseconds. They are variables rather than
	 * constants because ztest_info[] stores their addresses.
	 */
	uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
	uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
	uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
	uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
	uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */

	/*
	 * Table of test functions. Each entry is
	 * { function, iterations per execution, pointer to interval }.
	 * The two vdev add/remove tests use ztest_opts.zo_vdevtime, which
	 * process_options() derives from the run time and the vdev count.
	 */
	ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write, 1, &zopt_always },
	{ ztest_dmu_write_parallel, 10, &zopt_always },
	{ ztest_dmu_object_alloc_free, 1, &zopt_always },
	{ ztest_dmu_commit_callbacks, 1, &zopt_always },
	{ ztest_zap, 30, &zopt_always },
	{ ztest_zap_parallel, 100, &zopt_always },
	{ ztest_split_pool, 1, &zopt_always },
	{ ztest_zil_commit, 1, &zopt_incessant },
	{ ztest_zil_remount, 1, &zopt_sometimes },
	{ ztest_dmu_read_write_zcopy, 1, &zopt_often },
	{ ztest_dmu_objset_create_destroy, 1, &zopt_often },
	{ ztest_dsl_prop_get_set, 1, &zopt_often },
	{ ztest_spa_prop_get_set, 1, &zopt_sometimes },
	/* ztest_dmu_prealloc is compiled out of the table. */
	#if 0
	{ ztest_dmu_prealloc, 1, &zopt_sometimes },
	#endif
	{ ztest_fzap, 1, &zopt_sometimes },
	{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
	{ ztest_spa_create_destroy, 1, &zopt_sometimes },
	{ ztest_fault_inject, 1, &zopt_sometimes },
	{ ztest_ddt_repair, 1, &zopt_sometimes },
	{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
	{ ztest_reguid, 1, &zopt_rarely },
	{ ztest_spa_rename, 1, &zopt_rarely },
	{ ztest_scrub, 1, &zopt_rarely },
	{ ztest_spa_upgrade, 1, &zopt_rarely },
	{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
	{ ztest_vdev_attach_detach, 1, &zopt_sometimes },
	{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
	{ ztest_vdev_add_remove, 1,
	&ztest_opts.zo_vdevtime },
	{ ztest_vdev_aux_add_remove, 1,
	&ztest_opts.zo_vdevtime },
	+ { ztest_device_removal, 1, &zopt_sometimes },
	+ { ztest_remap_blocks, 1, &zopt_sometimes }
	};

	/* Number of entries in ztest_info[]. */
	#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))

	/*
	* The following struct is used to hold a list of uncalled commit callbacks.
	* The callbacks are ordered by txg number.
	*/
	typedef struct ztest_cb_list {
	mutex_t zcl_callbacks_lock; /* NOTE(review): presumably guards zcl_callbacks */
	list_t zcl_callbacks;
	} ztest_cb_list_t;

	/*
	* Stuff we need to share writably between parent and child.
	*
	* Fields whose use is visible in this part of the file are annotated;
	* the others are used elsewhere.
	*/
	typedef struct ztest_shared {
	boolean_t zs_do_init;
	hrtime_t zs_proc_start;
	hrtime_t zs_proc_stop;
	hrtime_t zs_thread_start;
	hrtime_t zs_thread_stop;
	hrtime_t zs_thread_kill;
	uint64_t zs_enospc_count; /* bumped by ztest_record_enospc() */
	uint64_t zs_vdev_next_leaf; /* next leaf number for make_vdev_file() */
	uint64_t zs_vdev_aux; /* aux device number for make_vdev_file() */
	uint64_t zs_alloc; /* normal-class alloc, saved by ztest_kill() */
	uint64_t zs_space; /* normal-class space, saved by ztest_kill() */
	uint64_t zs_splits;
	uint64_t zs_mirrors; /* read by MAXFAULTS() */
	uint64_t zs_metaslab_sz;
	uint64_t zs_metaslab_df_alloc_threshold;
	uint64_t zs_guid;
	} ztest_shared_t;

	/* All-ones id value. NOTE(review): presumably marks tests run in parallel. */
	#define ID_PARALLEL -1ULL

	/*
	 * Path templates used by make_vdev_file():
	 * dev: directory, pool name, leaf number (with a trailing 'a');
	 * aux: directory, pool name, aux name, aux number.
	 */
	static char ztest_dev_template[] = "%s/%s.%llua";
	static char ztest_aux_template[] = "%s/%s.%s.%llu";
	ztest_shared_t *ztest_shared;

	static spa_t *ztest_spa = NULL;
	static ztest_ds_t *ztest_ds;

	static mutex_t ztest_vdev_lock;

	/*
	* The ztest_name_lock protects the pool and dataset namespace used by
	* the individual tests. To modify the namespace, consumers must grab
	* this lock as writer. Grabbing the lock as reader will ensure that the
	* namespace does not change while the lock is held.
	*/
	static rwlock_t ztest_name_lock;

	/* When true, fatal() calls abort() instead of exit(3). */
	static boolean_t ztest_dump_core = B_TRUE;
	static boolean_t ztest_exiting;

	/* Global commit callback list */
	static ztest_cb_list_t zcl;

	/* Well-known object numbers; ZTEST_OBJECTS is the count of those listed. */
	enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
	};

	/* Defined further down; prints the option summary and exits. */
	static void usage(boolean_t) __NORETURN;

	/*
	 * These libumem hooks provide a reasonable set of defaults for the allocator's
	 * debugging facilities.
	 */
	/*
	 * Returns the default $UMEM_DEBUG setting. Declared with (void) so that it
	 * is a real prototype, matching _umem_logging_init().
	 */
	const char *
	_umem_debug_init(void)
	{
		return ("default,verbose");	/* $UMEM_DEBUG setting */
	}

	/* libumem hook: returns the default $UMEM_LOGGING setting. */
	const char *
	_umem_logging_init(void)
	{
		static const char logging_setting[] = "fail,contents";

		return (logging_setting);
	}

	#define FATAL_MSG_SZ 1024

	/* Points at the last fatal message so it can be found in a core dump. */
	char *fatal_msg;

	/*
	 * Print "ztest: <formatted message>" to stderr and terminate.
	 *
	 * do_perror - when nonzero, append ": <strerror(errno)>" using the errno
	 *             value that was current on entry.
	 * message   - printf-style format, followed by its arguments.
	 *
	 * Never returns: calls abort() when ztest_dump_core is set, otherwise
	 * exit(3). All formatting is bounded by FATAL_MSG_SZ, so an over-long
	 * message is truncated instead of overrunning the stack buffer.
	 */
	static void
	fatal(int do_perror, char *message, ...)
	{
		va_list args;
		int save_errno = errno;
		char buf[FATAL_MSG_SZ];
		size_t len;

		(void) fflush(stdout);

		(void) snprintf(buf, sizeof (buf), "ztest: ");
		len = strlen(buf);

		va_start(args, message);
		/* LINTED */
		(void) vsnprintf(buf + len, sizeof (buf) - len, message, args);
		va_end(args);

		if (do_perror) {
			len = strlen(buf);
			(void) snprintf(buf + len, sizeof (buf) - len,
			    ": %s", strerror(save_errno));
		}
		(void) fprintf(stderr, "%s\n", buf);
		fatal_msg = buf;	/* to ease debugging */
		if (ztest_dump_core)
			abort();
		exit(3);
	}

	/*
	 * Convert a size suffix into a shift count.
	 *
	 * buf - the text following a number: empty, or one of B K M G T P E Z
	 *       (case-insensitive), optionally followed by a single 'B'.
	 *
	 * Returns 0 for an empty suffix, otherwise 10 times the position of the
	 * letter in "BKMGTPEZ" (B -> 0, K -> 10, M -> 20, ...). Any other input
	 * prints an error and exits through usage().
	 */
	static int
	str2shift(const char *buf)
	{
		const char *ends = "BKMGTPEZ";
		size_t nends = strlen(ends);
		size_t i;

		if (buf[0] == '\0')
			return (0);

		/* <ctype.h> functions require an unsigned char value. */
		for (i = 0; i < nends; i++) {
			if (toupper((unsigned char)buf[0]) == ends[i])
				break;
		}
		if (i == nends) {
			(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
			    buf);
			usage(B_FALSE);
		}
		if (buf[1] == '\0' ||
		    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0')) {
			return ((int)(10 * i));
		}
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
		usage(B_FALSE);
		/* NOTREACHED */
	}

	/*
	 * Parse a number with an optional size suffix (see str2shift()).
	 *
	 * buf - text such as "100", "0x20", "64k" or "1.5G".
	 *
	 * Integers are parsed with strtoull() (base auto-detected) and shifted by
	 * the suffix; values containing a '.' are parsed with strtod() and scaled.
	 * Returns the resulting value. Unparsable or out-of-range input prints an
	 * error and exits through usage().
	 */
	static uint64_t
	nicenumtoull(const char *buf)
	{
		char *end;
		uint64_t val;

		val = strtoull(buf, &end, 0);
		if (end == buf) {
			(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
			usage(B_FALSE);
		} else if (end[0] == '.') {
			double fval = strtod(buf, &end);
			fval *= pow(2, str2shift(end));
			if (fval < 0) {
				/* Converting a negative double to uint64_t is undefined. */
				(void) fprintf(stderr,
				    "ztest: bad numeric value: %s\n", buf);
				usage(B_FALSE);
			}
			/*
			 * (double)UINT64_MAX rounds up to 2^64, which is itself not
			 * representable in a uint64_t, so it must be rejected too.
			 */
			if (fval >= (double)UINT64_MAX) {
				(void) fprintf(stderr, "ztest: value too large: %s\n",
				    buf);
				usage(B_FALSE);
			}
			val = (uint64_t)fval;
		} else {
			int shift = str2shift(end);
			/* Reject shifts that would drop set bits. */
			if (shift >= 64 || (val << shift) >> shift != val) {
				(void) fprintf(stderr, "ztest: value too large: %s\n",
				    buf);
				usage(B_FALSE);
			}
			val <<= shift;
		}
		return (val);
	}

	/*
	 * Print the option summary, including the built-in defaults, and exit.
	 *
	 * requested - B_TRUE when help was asked for: print to stdout and exit 0.
	 *             Otherwise print to stderr and exit 1.
	 */
	static void
	usage(boolean_t requested)
	{
		const ztest_shared_opts_t *defaults = &ztest_opts_defaults;
		char vdev_size_str[NN_NUMBUF_SZ];
		char gang_bang_str[NN_NUMBUF_SZ];
		FILE *out;
		int status;

		if (requested) {
			out = stdout;
			status = 0;
		} else {
			out = stderr;
			status = 1;
		}

		/* Human-readable forms of the two byte-sized defaults. */
		nicenum(defaults->zo_vdev_size, vdev_size_str,
		    sizeof (vdev_size_str));
		nicenum(defaults->zo_metaslab_gang_bang, gang_bang_str,
		    sizeof (gang_bang_str));

		(void) fprintf(out, "Usage: %s\n"
		    "\t[-v vdevs (default: %llu)]\n"
		    "\t[-s size_of_each_vdev (default: %s)]\n"
		    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
		    "\t[-m mirror_copies (default: %d)]\n"
		    "\t[-r raidz_disks (default: %d)]\n"
		    "\t[-R raidz_parity (default: %d)]\n"
		    "\t[-d datasets (default: %d)]\n"
		    "\t[-t threads (default: %d)]\n"
		    "\t[-g gang_block_threshold (default: %s)]\n"
		    "\t[-i init_count (default: %d)] initialize pool i times\n"
		    "\t[-k kill_percentage (default: %llu%%)]\n"
		    "\t[-p pool_name (default: %s)]\n"
		    "\t[-f dir (default: %s)] file directory for vdev files\n"
		    "\t[-V] verbose (use multiple times for ever more blather)\n"
		    "\t[-E] use existing pool instead of creating new one\n"
		    "\t[-T time (default: %llu sec)] total run time\n"
		    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
		    "\t[-P passtime (default: %llu sec)] time per pass\n"
		    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
		    "\t[-o variable=value] ... set global variable to an unsigned\n"
		    "\t 32-bit integer value\n"
		    "\t[-h] (print help)\n"
		    "",
		    defaults->zo_pool,
		    (u_longlong_t)defaults->zo_vdevs,		/* -v */
		    vdev_size_str,				/* -s */
		    defaults->zo_ashift,			/* -a */
		    defaults->zo_mirrors,			/* -m */
		    defaults->zo_raidz,				/* -r */
		    defaults->zo_raidz_parity,			/* -R */
		    defaults->zo_datasets,			/* -d */
		    defaults->zo_threads,			/* -t */
		    gang_bang_str,				/* -g */
		    defaults->zo_init,				/* -i */
		    (u_longlong_t)defaults->zo_killrate,	/* -k */
		    defaults->zo_pool,				/* -p */
		    defaults->zo_dir,				/* -f */
		    (u_longlong_t)defaults->zo_time,		/* -T */
		    (u_longlong_t)defaults->zo_maxloops,	/* -F */
		    (u_longlong_t)defaults->zo_passtime);	/* -P */
		exit(status);
	}

	/*
	 * Parse the command line into ztest_opts.
	 *
	 * Starts from ztest_opts_defaults, applies every option, then derives
	 * zo_raidz_parity (capped at zo_raidz - 1) and zo_vdevtime. When -B names
	 * an alternate directory, the alternate ztest binary and library paths are
	 * built from it and checked for accessibility.
	 *
	 * Invalid input ends the program through usage() or fatal().
	 */
	static void
	process_options(int argc, char **argv)
	{
		char *path;
		ztest_shared_opts_t *zo = &ztest_opts;

		int opt;
		uint64_t value;
		char altdir[MAXNAMELEN] = { 0 };

		bcopy(&ztest_opts_defaults, zo, sizeof (*zo));

		while ((opt = getopt(argc, argv,
		    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:o:")) != EOF) {
			value = 0;
			/* Options with a numeric argument are converted up front. */
			switch (opt) {
			case 'v':
			case 's':
			case 'a':
			case 'm':
			case 'r':
			case 'R':
			case 'd':
			case 't':
			case 'g':
			case 'i':
			case 'k':
			case 'T':
			case 'P':
			case 'F':
				value = nicenumtoull(optarg);
			}
			switch (opt) {
			case 'v':
				zo->zo_vdevs = value;
				break;
			case 's':
				zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
				break;
			case 'a':
				zo->zo_ashift = value;
				break;
			case 'm':
				zo->zo_mirrors = value;
				break;
			case 'r':
				zo->zo_raidz = MAX(1, value);
				break;
			case 'R':
				zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
				break;
			case 'd':
				zo->zo_datasets = MAX(1, value);
				break;
			case 't':
				zo->zo_threads = MAX(1, value);
				break;
			case 'g':
				zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
				    value);
				break;
			case 'i':
				zo->zo_init = value;
				break;
			case 'k':
				zo->zo_killrate = value;
				break;
			case 'p':
				(void) strlcpy(zo->zo_pool, optarg,
				    sizeof (zo->zo_pool));
				break;
			case 'f':
				path = realpath(optarg, NULL);
				if (path == NULL) {
					(void) fprintf(stderr, "error: %s: %s\n",
					    optarg, strerror(errno));
					usage(B_FALSE);
				} else {
					(void) strlcpy(zo->zo_dir, path,
					    sizeof (zo->zo_dir));
					/* realpath() allocated the result for us. */
					free(path);
				}
				break;
			case 'V':
				zo->zo_verbose++;
				break;
			case 'E':
				zo->zo_init = 0;
				break;
			case 'T':
				zo->zo_time = value;
				break;
			case 'P':
				zo->zo_passtime = MAX(1, value);
				break;
			case 'F':
				zo->zo_maxloops = MAX(1, value);
				break;
			case 'B':
				(void) strlcpy(altdir, optarg, sizeof (altdir));
				break;
			case 'o':
				if (set_global_var(optarg) != 0)
					usage(B_FALSE);
				break;
			case 'h':
				usage(B_TRUE);
				break;
			case '?':
			default:
				usage(B_FALSE);
				break;
			}
		}

		zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);

		/* With no vdevs, use a very large interval instead of dividing by 0. */
		zo->zo_vdevtime =
		    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
		    UINT64_MAX >> 2);

		if (strlen(altdir) > 0) {
			char *cmd;
			char *realaltdir;
			char *bin;
			char *ztest;
			char *isa;
			int isalen;

			cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
			realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

			VERIFY(NULL != realpath(getexecname(), cmd));
			if (0 != access(altdir, F_OK)) {
				ztest_dump_core = B_FALSE;
				fatal(B_TRUE, "invalid alternate ztest path: %s",
				    altdir);
			}
			VERIFY(NULL != realpath(altdir, realaltdir));

			/*
			 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
			 * We want to extract <isa> to determine if we should use
			 * 32 or 64 bit binaries.
			 */
			bin = strstr(cmd, "/usr/bin/");
			ztest = (bin != NULL) ? strstr(bin, "/ztest") : NULL;
			if (bin == NULL || ztest == NULL) {
				/* Not installed where expected; fail cleanly. */
				ztest_dump_core = B_FALSE;
				fatal(B_FALSE, "unexpected ztest location: %s", cmd);
			}
			isa = bin + 9;
			/*
			 * With no <isa> component ("/usr/bin/ztest") the difference
			 * is negative; use an empty <isa> rather than handing a
			 * negative precision to "%.*s".
			 */
			isalen = (ztest > isa) ? (int)(ztest - isa) : 0;
			(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
			    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
			(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
			    "%s/usr/lib/%.*s", realaltdir, isalen, isa);

			if (0 != access(zo->zo_alt_ztest, X_OK)) {
				ztest_dump_core = B_FALSE;
				fatal(B_TRUE, "invalid alternate ztest: %s",
				    zo->zo_alt_ztest);
			} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
				ztest_dump_core = B_FALSE;
				fatal(B_TRUE, "invalid alternate lib directory %s",
				    zo->zo_alt_libpath);
			}

			umem_free(cmd, MAXPATHLEN);
			umem_free(realaltdir, MAXPATHLEN);
		}
	}

	/*
	 * Record the pool's normal-class alloc/space figures in zs, write the
	 * config out under spa_namespace_lock, print the debug messages, and
	 * then send SIGKILL to this process.
	 */
	static void
	ztest_kill(ztest_shared_t *zs)
	{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	* Before we kill off ztest, make sure that the config is updated.
	- * See comment above spa_config_sync().
	+ * See comment above spa_write_cachefile().
	*/
	mutex_enter(&spa_namespace_lock);
	- spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
	+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
	mutex_exit(&spa_namespace_lock);

	zfs_dbgmsg_print(FTAG);
	(void) kill(getpid(), SIGKILL);
	}

	/*
	 * Return a random value in [0, range), read from ztest_fd_rand.
	 * A range of 0 yields 0 without reading anything. A short read is fatal.
	 */
	static uint64_t
	ztest_random(uint64_t range)
	{
		uint64_t raw;

		ASSERT3S(ztest_fd_rand, >=, 0);

		if (range != 0) {
			ssize_t nread = read(ztest_fd_rand, &raw, sizeof (raw));

			if (nread != sizeof (raw))
				fatal(1, "short read from /dev/urandom");

			return (raw % range);
		}

		return (0);
	}

	/*
	 * Count one out-of-space event in the shared state. The tag argument is
	 * accepted but not used.
	 */
	/* ARGSUSED */
	static void
	ztest_record_enospc(const char *s)
	{
		ztest_shared->zs_enospc_count += 1;
	}

	/*
	 * Return the ashift to use: the configured zo_ashift, or, when that is 0,
	 * SPA_MINBLOCKSHIFT plus a random value in [0, 5).
	 */
	static uint64_t
	ztest_get_ashift(void)
	{
		uint64_t ashift = ztest_opts.zo_ashift;

		if (ashift == 0)
			ashift = SPA_MINBLOCKSHIFT + ztest_random(5);

		return (ashift);
	}

	static nvlist_t *
	make_vdev_file(char path, char aux, char *pool, size_t size, uint64_t ashift)
	{
	char pathbuf[MAXPATHLEN];
	uint64_t vdev;
	nvlist_t *file;

	if (ashift == 0)
	ashift = ztest_get_ashift();

	if (path == NULL) {
	path = pathbuf;

	if (aux != NULL) {
	vdev = ztest_shared->zs_vdev_aux;
	(void) snprintf(path, sizeof (pathbuf),
	ztest_aux_template, ztest_opts.zo_dir,
	pool == NULL ? ztest_opts.zo_pool : pool,
	aux, vdev);
	} else {
	vdev = ztest_shared->zs_vdev_next_leaf++;
	(void) snprintf(path, sizeof (pathbuf),
	ztest_dev_template, ztest_opts.zo_dir,
	pool == NULL ? ztest_opts.zo_pool : pool, vdev);
	}
	}

	if (size != 0) {
	int fd = open(path, O_RDWR \| O_CREAT \| O_TRUNC, 0666);
	if (fd == -1)
	fatal(1, "can't open %s", path);
	if (ftruncate(fd, size) != 0)
	fatal(1, "can't ftruncate %s", path);
	(void) close(fd);
	}

	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);

	return (file);
	}

	/*
	 * Build a raidz vdev with r file children and zo_raidz_parity parity.
	 * With r < 2 a plain file vdev is returned instead. The remaining
	 * arguments are passed through to make_vdev_file().
	 *
	 * Returns a newly allocated nvlist owned by the caller.
	 */
	static nvlist_t *
	make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
	    uint64_t ashift, int r)
	{
		nvlist_t *raidz, **child;
		int c;

		if (r < 2)
			return (make_vdev_file(path, aux, pool, size, ashift));
		child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

		for (c = 0; c < r; c++)
			child[c] = make_vdev_file(path, aux, pool, size, ashift);

		VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_RAIDZ) == 0);
		VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
		    ztest_opts.zo_raidz_parity) == 0);
		VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
		    child, r) == 0);

		/* The children were added to the parent above; release ours. */
		for (c = 0; c < r; c++)
			nvlist_free(child[c]);

		umem_free(child, r * sizeof (nvlist_t *));

		return (raidz);
	}

	/*
	 * Build a mirror vdev with m children, each produced by
	 * make_vdev_raidz(). With m < 1 the single raidz/file vdev is returned
	 * directly, without a mirror above it.
	 *
	 * Returns a newly allocated nvlist owned by the caller.
	 */
	static nvlist_t *
	make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
	    uint64_t ashift, int r, int m)
	{
		nvlist_t *mirror, **child;
		int c;

		if (m < 1)
			return (make_vdev_raidz(path, aux, pool, size, ashift, r));

		child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

		for (c = 0; c < m; c++)
			child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);

		VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_MIRROR) == 0);
		VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
		    child, m) == 0);

		/* The children were added to the parent above; release ours. */
		for (c = 0; c < m; c++)
			nvlist_free(child[c]);

		umem_free(child, m * sizeof (nvlist_t *));

		return (mirror);
	}

	/*
	 * Build a root vdev with t top-level children.
	 *
	 * Each child comes from make_vdev_mirror() and is tagged with
	 * ZPOOL_CONFIG_IS_LOG = log. The child array is stored under the name
	 * given by aux when aux is non-NULL, otherwise under
	 * ZPOOL_CONFIG_CHILDREN. t must be positive.
	 *
	 * Returns a newly allocated nvlist owned by the caller.
	 */
	static nvlist_t *
	make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
	    int log, int r, int m, int t)
	{
		nvlist_t *root, **child;
		int c;

		ASSERT(t > 0);

		child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

		for (c = 0; c < t; c++) {
			child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
			    r, m);
			VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
			    log) == 0);
		}

		VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
		VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
		    child, t) == 0);

		/* The children were added to the parent above; release ours. */
		for (c = 0; c < t; c++)
			nvlist_free(child[c]);

		umem_free(child, t * sizeof (nvlist_t *));

		return (root);
	}

	/*
	 * Pick a random spa version in [initial_version, SPA_VERSION_FEATURES].
	 * Versions up to SPA_VERSION_BEFORE_FEATURES are chosen at random from
	 * that point upwards; anything past it maps to SPA_VERSION_FEATURES.
	 */
	static uint64_t
	ztest_random_spa_version(uint64_t initial_version)
	{
		uint64_t picked;

		if (initial_version <= SPA_VERSION_BEFORE_FEATURES) {
			/* The random step never goes past BEFORE_FEATURES. */
			picked = initial_version + ztest_random(
			    SPA_VERSION_BEFORE_FEATURES - initial_version + 1);
		} else {
			picked = SPA_VERSION_FEATURES;
		}

		ASSERT(SPA_VERSION_IS_SUPPORTED(picked));
		return (picked);
	}

	/*
	 * Return a random power-of-two block size.
	 *
	 * The largest shift considered is SPA_OLD_MAXBLOCKSHIFT, or 20 (1MB
	 * blocks) when the pool reports SPA_MAXBLOCKSIZE as its maximum. The
	 * random shift is drawn from [0, max - spa_max_ashift] and applied on
	 * top of SPA_MINBLOCKSHIFT.
	 */
	static int
	ztest_random_blocksize(void)
	{
		int maxbs = (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) ?
		    20 : SPA_OLD_MAXBLOCKSHIFT;
		uint64_t block_shift =
		    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);

		return (1 << (SPA_MINBLOCKSHIFT + block_shift));
	}

	/*
	 * Return a random indirect block shift in
	 * [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
	 */
	static int
	ztest_random_ibshift(void)
	{
		uint64_t extra =
		    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);

		return (DN_MIN_INDBLKSHIFT + extra);
	}

	/*
	 * Return the index of a randomly chosen top-level vdev of spa.
	 *
	 * Candidates are re-drawn while the vdev is unusable (a hole in the old
	 * revision, not concrete in the new one), is a log device and log_ok is
	 * false, or has no metaslab group or metaslab class.
	 * The caller must hold a config lock as reader (asserted).
	 */
	static uint64_t
	ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
	{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	do {
	top = ztest_random(rvd->vdev_children);
	tvd = rvd->vdev_child[top];
	- } while (tvd->vdev_ishole \|\| (tvd->vdev_islog && !log_ok) \|\|
	+ } while (!vdev_is_concrete(tvd) \|\| (tvd->vdev_islog && !log_ok) \|\|
	tvd->vdev_mg == NULL \|\| tvd->vdev_mg->mg_class == NULL);

	return (top);
	}

	/*
	 * Return a random value for the given dataset property. For the checksum
	 * property, ZIO_CHECKSUM_OFF is never returned; another value is drawn.
	 */
	static uint64_t
	ztest_random_dsl_prop(zfs_prop_t prop)
	{
		for (;;) {
			uint64_t value =
			    zfs_prop_random_value(prop, ztest_random(-1ULL));

			if (prop != ZFS_PROP_CHECKSUM || value != ZIO_CHECKSUM_OFF)
				return (value);
		}
	}

	/*
	 * Set an integer dataset property and read it back.
	 *
	 * osname  - dataset name.
	 * prop    - property to set.
	 * value   - value to set.
	 * inherit - when true the property is set with ZPROP_SRC_NONE, otherwise
	 *           with ZPROP_SRC_LOCAL.
	 *
	 * Returns ENOSPC (after recording it) when the set fails for lack of
	 * space; any other failure trips an assertion. Returns 0 on success. At
	 * verbosity 6 and above the resulting value and setpoint are printed.
	 */
	static int
	ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
	    boolean_t inherit)
	{
		const char *propname = zfs_prop_to_name(prop);
		char setpoint[MAXPATHLEN];
		uint64_t curval;
		int error;

		if (inherit) {
			error = dsl_prop_set_int(osname, propname,
			    ZPROP_SRC_NONE, value);
		} else {
			error = dsl_prop_set_int(osname, propname,
			    ZPROP_SRC_LOCAL, value);
		}

		if (error == ENOSPC) {
			ztest_record_enospc(FTAG);
			return (error);
		}
		ASSERT0(error);

		VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

		if (ztest_opts.zo_verbose >= 6) {
			const char *valname;

			VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
		}

		return (error);
	}

	/*
	 * Set an integer pool property on ztest_spa.
	 *
	 * Returns ENOSPC (after recording it) when the pool is out of space; any
	 * other failure trips an assertion. Returns 0 on success.
	 */
	static int
	ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
	{
		spa_t *pool = ztest_spa;
		nvlist_t *proplist = NULL;
		int error;

		/* Single-entry property list: { <prop name> = value }. */
		VERIFY(nvlist_alloc(&proplist, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_uint64(proplist, zpool_prop_to_name(prop),
		    value) == 0);

		error = spa_prop_set(pool, proplist);
		nvlist_free(proplist);

		if (error != ENOSPC) {
			ASSERT0(error);
			return (error);
		}

		ztest_record_enospc(FTAG);
		return (error);
	}

	/*
	 * Initialize a lock slot: create its mutex and condition variable and
	 * mark it as having no writer and no readers.
	 */
	static void
	ztest_rll_init(rll_t *rll)
	{
		VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
		VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);

		rll->rll_readers = 0;
		rll->rll_writer = NULL;
	}

	/*
	 * Tear down a lock slot. The slot must be idle: no writer and no readers
	 * (asserted).
	 */
	static void
	ztest_rll_destroy(rll_t *rll)
	{
		/* Nobody may still hold the slot. */
		ASSERT(rll->rll_writer == NULL);
		ASSERT(rll->rll_readers == 0);

		VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
		VERIFY(cond_destroy(&rll->rll_cv) == 0);
	}

	/*
	 * Acquire a lock slot.
	 *
	 * RL_READER waits until there is no writer and then joins the readers.
	 * Every other type waits until there is neither a writer nor any reader
	 * and then records the calling thread as the writer.
	 */
	static void
	ztest_rll_lock(rll_t *rll, rl_type_t type)
	{
		VERIFY(mutex_lock(&rll->rll_lock) == 0);

		if (type == RL_READER) {
			while (rll->rll_writer != NULL)
				(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
			rll->rll_readers++;
		} else {
			while (rll->rll_writer != NULL || rll->rll_readers)
				(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
			rll->rll_writer = curthread;
		}

		VERIFY(mutex_unlock(&rll->rll_lock) == 0);
	}

	/*
	 * Release a lock slot. If a writer is recorded the writer is cleared;
	 * otherwise one reader is dropped. Waiters are woken once the slot has
	 * neither a writer nor readers.
	 */
	static void
	ztest_rll_unlock(rll_t *rll)
	{
		VERIFY(mutex_lock(&rll->rll_lock) == 0);

		if (!rll->rll_writer) {
			/* Shared holder leaving. */
			ASSERT(rll->rll_readers != 0);
			ASSERT(rll->rll_writer == NULL);
			rll->rll_readers--;
		} else {
			/* Exclusive holder leaving. */
			ASSERT(rll->rll_readers == 0);
			rll->rll_writer = NULL;
		}

		if (rll->rll_writer == NULL && rll->rll_readers == 0)
			VERIFY(cond_broadcast(&rll->rll_cv) == 0);

		VERIFY(mutex_unlock(&rll->rll_lock) == 0);
	}

	/*
	 * Lock the slot that the object number maps to in zd's object lock
	 * table. Different objects can map to the same slot.
	 */
	static void
	ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
	{
		uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);

		ztest_rll_lock(&zd->zd_object_lock[slot], type);
	}

	/* Release the slot that the object number maps to in zd's lock table. */
	static void
	ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
	{
		uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);

		ztest_rll_unlock(&zd->zd_object_lock[slot]);
	}

	/*
	 * Lock a range of an object.
	 *
	 * The object number and offset are hashed to one of the slots in zd's
	 * range lock table, so unrelated ranges can share a slot. The returned
	 * handle records what was locked and must be passed to
	 * ztest_range_unlock().
	 */
	static rl_t *
	ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
	    uint64_t size, rl_type_t type)
	{
		uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
		uint64_t slot = hash & (ZTEST_RANGE_LOCKS - 1);
		rl_t *handle = umem_alloc(sizeof (*handle), UMEM_NOFAIL);

		handle->rl_lock = &zd->zd_range_lock[slot];
		handle->rl_object = object;
		handle->rl_offset = offset;
		handle->rl_size = size;

		ztest_rll_lock(handle->rl_lock, type);

		return (handle);
	}

	/*
	 * Release a range lock obtained from ztest_range_lock() and free its
	 * handle. The handle must not be used afterwards.
	 */
	static void
	ztest_range_unlock(rl_t *rl)
	{
		ztest_rll_unlock(rl->rl_lock);
		umem_free(rl, sizeof (rl_t));
	}

	/*
	 * Initialize per-dataset state.
	 *
	 * zd  - state to initialize.
	 * szd - shared per-dataset state, or NULL; its zd_seq is reset to 0.
	 * os  - objset backing the dataset; its ZIL and name are recorded.
	 *
	 * Also creates the zilog rwlock, the directory-object mutex and every
	 * object/range lock slot.
	 */
	static void
	ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
	{
		zd->zd_os = os;
		zd->zd_zilog = dmu_objset_zil(os);
		zd->zd_shared = szd;
		dmu_objset_name(os, zd->zd_name);

		if (zd->zd_shared != NULL)
			zd->zd_shared->zd_seq = 0;

		VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
		VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);

		for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
			ztest_rll_init(&zd->zd_object_lock[l]);

		for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
			ztest_rll_init(&zd->zd_range_lock[l]);
	}

	/*
	 * Tear down per-dataset state set up by ztest_zd_init(): destroy the
	 * directory-object mutex, the zilog rwlock and every object/range lock
	 * slot. All lock slots must be idle.
	 */
	static void
	ztest_zd_fini(ztest_ds_t *zd)
	{
		VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
		/* Counterpart of the rwlock_init() in ztest_zd_init(). */
		VERIFY(rwlock_destroy(&zd->zd_zilog_lock) == 0);

		for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
			ztest_rll_destroy(&zd->zd_object_lock[l]);

		for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
			ztest_rll_destroy(&zd->zd_range_lock[l]);
	}

	/*
	 * Randomly chosen assignment mode: TXG_NOWAIT when ztest_random(10)
	 * returns 0, TXG_WAIT otherwise. Draws a new random value on every use.
	 */
	#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

	/*
	 * Assign tx to a transaction group.
	 *
	 * tx      - transaction to assign.
	 * txg_how - mode passed to dmu_tx_assign().
	 * tag     - caller tag handed to ztest_record_enospc() on ENOSPC.
	 *
	 * Returns the (nonzero) txg on success. On failure the transaction is
	 * aborted and 0 is returned: ERESTART (only expected with TXG_NOWAIT)
	 * first waits on the tx, and the only other expected error, ENOSPC, is
	 * recorded. After a 0 return the caller must not use tx again.
	 */
	static uint64_t
	ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
	{
		uint64_t txg;
		int error;

		/*
		 * Attempt to assign tx to some transaction group.
		 */
		error = dmu_tx_assign(tx, txg_how);
		if (error) {
			if (error == ERESTART) {
				ASSERT(txg_how == TXG_NOWAIT);
				dmu_tx_wait(tx);
			} else {
				ASSERT3U(error, ==, ENOSPC);
				ztest_record_enospc(tag);
			}
			dmu_tx_abort(tx);
			return (0);
		}
		txg = dmu_tx_get_txg(tx);
		ASSERT(txg != 0);
		return (txg);
	}

	/*
	 * Fill a buffer with a repeated 64-bit value.
	 *
	 * buf   - start of the buffer; accessed as an array of uint64_t.
	 * size  - length in bytes; expected to be a multiple of 8.
	 * value - word stored in every 8-byte slot.
	 */
	static void
	ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
	{
		uint64_t *ip = buf;
		uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

		while (ip < ip_end)
			*ip++ = value;
	}

	/*
	 * Check whether every 64-bit word of a buffer equals value.
	 *
	 * buf   - start of the buffer; accessed as an array of uint64_t.
	 * size  - length in bytes; expected to be a multiple of 8.
	 * value - word expected in every 8-byte slot.
	 *
	 * The differences are OR-ed together so the whole buffer is always
	 * scanned. Returns true when no word differs (including size 0).
	 */
	static boolean_t
	ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
	{
		uint64_t *ip = buf;
		uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
		uint64_t diff = 0;

		while (ip < ip_end)
			diff |= (value - *ip++);

		return (diff == 0);
	}

	/*
	 * Fill in a block tag: BT_MAGIC, the id of objset os, and the given
	 * object, offset, generation, txg and creation txg.
	 */
	static void
	ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
	    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
	{
		bt->bt_magic = BT_MAGIC;
		bt->bt_objset = dmu_objset_id(os);
		bt->bt_object = object;
		bt->bt_offset = offset;
		bt->bt_gen = gen;
		bt->bt_txg = txg;
		bt->bt_crtxg = crtxg;
	}

	/*
	 * Assert that a block tag is consistent with the expected values.
	 *
	 * Magic, objset id, object, offset and creation txg must match exactly.
	 * The tag's generation and txg may be at most the given gen and txg.
	 */
	static void
	ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
	    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
	{
		ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
		ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
		ASSERT3U(bt->bt_object, ==, object);
		ASSERT3U(bt->bt_offset, ==, offset);
		ASSERT3U(bt->bt_gen, <=, gen);
		ASSERT3U(bt->bt_txg, <=, txg);
		ASSERT3U(bt->bt_crtxg, ==, crtxg);
	}

	/*
	 * Return a pointer to the block tag stored in the last
	 * sizeof (ztest_block_tag_t) bytes of the bonus area of db.
	 * The bonus size must fit in the buffer and must be large enough to hold
	 * a tag (both asserted).
	 */
	static ztest_block_tag_t *
	ztest_bt_bonus(dmu_buf_t *db)
	{
		dmu_object_info_t doi;
		ztest_block_tag_t *bt;

		dmu_object_info_from_db(db, &doi);
		ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
		ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
		bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

		return (bt);
	}

	/*
	* ZIL logging ops
	*/

	/*
	 * ztest-specific names for existing log record fields; for example
	 * ztest_replay_create() reads lr->lrz_type, which is lr->lr_mode.
	 */
	#define lrz_type lr_mode
	#define lrz_blocksize lr_uid
	#define lrz_ibshift lr_gid
	#define lrz_bonustype lr_rdev
	#define lrz_bonuslen lr_crtime[1]

	/*
	 * Log a create operation in the dataset's ZIL.
	 *
	 * lr is followed in memory by the NUL-terminated object name. Nothing is
	 * logged while the ZIL is being replayed. Otherwise everything after the
	 * common record header (record body plus name) is copied into a new
	 * TX_CREATE itx, which is assigned to tx.
	 */
	static void
	ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
	{
		char *name = (void *)(lr + 1);		/* name follows lr */
		size_t namesize = strlen(name) + 1;
		itx_t *itx;

		if (zil_replaying(zd->zd_zilog, tx))
			return;

		itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
		    sizeof (*lr) + namesize - sizeof (lr_t));

		zil_itx_assign(zd->zd_zilog, itx, tx);
	}

	/*
	 * Log a remove operation in the dataset's ZIL.
	 *
	 * lr is followed in memory by the NUL-terminated object name; object is
	 * stored in the itx as itx_oid. Nothing is logged while the ZIL is being
	 * replayed.
	 */
	static void
	ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
	{
		char *name = (void *)(lr + 1);		/* name follows lr */
		size_t namesize = strlen(name) + 1;
		itx_t *itx;

		if (zil_replaying(zd->zd_zilog, tx))
			return;

		itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
		    sizeof (*lr) + namesize - sizeof (lr_t));

		itx->itx_oid = object;
		zil_itx_assign(zd->zd_zilog, itx, tx);
	}

	/*
	 * Log a write operation in the dataset's ZIL.
	 *
	 * The write state is chosen at random, except that writes longer than
	 * ZIL_MAX_LOG_DATA always use WR_INDIRECT. For WR_COPIED the data is read
	 * back from the object into the space right behind the record; if that
	 * read fails the itx is rebuilt without data and marked WR_NEED_COPY.
	 * One in eight records is marked synchronous. Nothing is logged while
	 * the ZIL is being replayed.
	 */
	static void
	ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
	{
		itx_t *itx;
		itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

		if (zil_replaying(zd->zd_zilog, tx))
			return;

		if (lr->lr_length > ZIL_MAX_LOG_DATA)
			write_state = WR_INDIRECT;

		itx = zil_itx_create(TX_WRITE,
		    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

		if (write_state == WR_COPIED &&
		    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
		    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			write_state = WR_NEED_COPY;
		}
		itx->itx_private = zd;
		itx->itx_wr_state = write_state;
		itx->itx_sync = (ztest_random(8) == 0);

		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
		    sizeof (*lr) - sizeof (lr_t));

		zil_itx_assign(zd->zd_zilog, itx, tx);
	}

	/*
	 * Log a truncate operation in the dataset's ZIL as a non-synchronous
	 * TX_TRUNCATE itx. Nothing is logged while the ZIL is being replayed.
	 */
	static void
	ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
	{
		itx_t *itx;

		if (zil_replaying(zd->zd_zilog, tx))
			return;

		itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
		    sizeof (*lr) - sizeof (lr_t));

		itx->itx_sync = B_FALSE;
		zil_itx_assign(zd->zd_zilog, itx, tx);
	}

	/*
	 * Log a setattr operation in the dataset's ZIL as a non-synchronous
	 * TX_SETATTR itx. Nothing is logged while the ZIL is being replayed.
	 */
	static void
	ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
	{
		itx_t *itx;

		if (zil_replaying(zd->zd_zilog, tx))
			return;

		itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
		bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
		    sizeof (*lr) - sizeof (lr_t));

		itx->itx_sync = B_FALSE;
		zil_itx_assign(zd->zd_zilog, itx, tx);
	}

	/*
	* ZIL replay ops
	*/
	/*
	 * Create (lr_foid == 0) or claim (lr_foid != 0) the object described
	 * by the lr_create_t in arg2, tag its bonus buffer, and add it to the
	 * directory ZAP under the name that follows the record.
	 *
	 * arg1 is the ztest_ds_t, arg2 the lr_create_t; 'byteswap' requests
	 * a byteswap of the record before use.
	 *
	 * Returns 0 on success, ENOSPC when ztest_tx_assign() yields txg 0,
	 * or the error from the claim call (asserted to be EEXIST).
	 */
	static int
	ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
	{
		ztest_ds_t *zd = arg1;
		lr_create_t *lr = arg2;
		char *name = (void *)(lr + 1);		/* name follows lr */
		objset_t *os = zd->zd_os;
		ztest_block_tag_t *bbt;
		dmu_buf_t *db;
		dmu_tx_t *tx;
		uint64_t txg;
		int error = 0;

		if (byteswap)
			byteswap_uint64_array(lr, sizeof (*lr));

		ASSERT(lr->lr_doid == ZTEST_DIROBJ);
		ASSERT(name[0] != '\0');

		tx = dmu_tx_create(os);

		dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

		if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		}

		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0)
			return (ENOSPC);

		/* A preset object id is expected exactly when replaying. */
		ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);

		if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
			if (lr->lr_foid == 0) {
				lr->lr_foid = zap_create(os,
				    lr->lrz_type, lr->lrz_bonustype,
				    lr->lrz_bonuslen, tx);
			} else {
				error = zap_create_claim(os, lr->lr_foid,
				    lr->lrz_type, lr->lrz_bonustype,
				    lr->lrz_bonuslen, tx);
			}
		} else {
			if (lr->lr_foid == 0) {
				lr->lr_foid = dmu_object_alloc(os,
				    lr->lrz_type, 0, lr->lrz_bonustype,
				    lr->lrz_bonuslen, tx);
			} else {
				error = dmu_object_claim(os, lr->lr_foid,
				    lr->lrz_type, 0, lr->lrz_bonustype,
				    lr->lrz_bonuslen, tx);
			}
		}

		if (error) {
			ASSERT3U(error, ==, EEXIST);
			ASSERT(zd->zd_zilog->zl_replay);
			dmu_tx_commit(tx);
			return (error);
		}

		ASSERT(lr->lr_foid != 0);

		if (lr->lrz_type != DMU_OT_ZAP_OTHER)
			VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
			    lr->lrz_blocksize, lr->lrz_ibshift, tx));

		/* Stamp the new object's bonus buffer with a block tag. */
		VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
		bbt = ztest_bt_bonus(db);
		dmu_buf_will_dirty(db, tx);
		ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
		dmu_buf_rele(db, FTAG);

		VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
		    &lr->lr_foid, tx));

		(void) ztest_log_create(zd, tx, lr);

		dmu_tx_commit(tx);

		return (0);
	}

	/*
	 * Remove the directory entry named after the lr_remove_t in arg2 and
	 * destroy the object it refers to (zap_destroy() for
	 * DMU_OT_ZAP_OTHER objects, dmu_object_free() otherwise).
	 *
	 * arg1 is the ztest_ds_t, arg2 the lr_remove_t; 'byteswap' requests
	 * a byteswap of the record before use.
	 *
	 * Returns 0 on success or ENOSPC when ztest_tx_assign() yields txg 0.
	 */
	static int
	ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
	{
		ztest_ds_t *zd = arg1;
		lr_remove_t *lr = arg2;
		char *name = (void *)(lr + 1);		/* name follows lr */
		objset_t *os = zd->zd_os;
		dmu_object_info_t doi;
		dmu_tx_t *tx;
		uint64_t object, txg;

		if (byteswap)
			byteswap_uint64_array(lr, sizeof (*lr));

		ASSERT(lr->lr_doid == ZTEST_DIROBJ);
		ASSERT(name[0] != '\0');

		VERIFY3U(0, ==,
		    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
		ASSERT(object != 0);

		ztest_object_lock(zd, object, RL_WRITER);

		/* The object type decides how the object is destroyed below. */
		VERIFY3U(0, ==, dmu_object_info(os, object, &doi));

		tx = dmu_tx_create(os);

		dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0) {
			ztest_object_unlock(zd, object);
			return (ENOSPC);
		}

		if (doi.doi_type == DMU_OT_ZAP_OTHER) {
			VERIFY3U(0, ==, zap_destroy(os, object, tx));
		} else {
			VERIFY3U(0, ==, dmu_object_free(os, object, tx));
		}

		VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));

		(void) ztest_log_remove(zd, tx, lr, object);

		dmu_tx_commit(tx);

		ztest_object_unlock(zd, object);

		return (0);
	}

	/*
	 * Apply the write described by the lr_write_t in arg2; the payload
	 * follows the record.  If the payload starts with a block tag
	 * (BT_MAGIC), the tag is verified and regenerated with the bonus
	 * buffer's gen/txg before the data is written.  The data is written
	 * either with dmu_write() or, occasionally for whole aligned blocks,
	 * through a loaned ARC buffer.
	 *
	 * arg1 is the ztest_ds_t; 'byteswap' requests a byteswap of the
	 * record before use.
	 *
	 * Returns 0 on success or ENOSPC when ztest_tx_assign() yields txg 0.
	 */
	static int
	ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
	{
		ztest_ds_t *zd = arg1;
		lr_write_t *lr = arg2;
		objset_t *os = zd->zd_os;
		void *data = lr + 1;			/* data follows lr */
		uint64_t offset, length;
		ztest_block_tag_t *bt = data;
		ztest_block_tag_t *bbt;
		uint64_t gen, txg, lrtxg, crtxg;
		dmu_object_info_t doi;
		dmu_tx_t *tx;
		dmu_buf_t *db;
		arc_buf_t *abuf = NULL;
		rl_t *rl;

		if (byteswap)
			byteswap_uint64_array(lr, sizeof (*lr));

		offset = lr->lr_offset;
		length = lr->lr_length;

		/* If it's a dmu_sync() block, write the whole block */
		if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
			uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
			if (length < blocksize) {
				offset -= offset % blocksize;
				length = blocksize;
			}
		}

		if (bt->bt_magic == BSWAP_64(BT_MAGIC))
			byteswap_uint64_array(bt, sizeof (*bt));

		/* A payload without the magic is not treated as a block tag. */
		if (bt->bt_magic != BT_MAGIC)
			bt = NULL;

		ztest_object_lock(zd, lr->lr_foid, RL_READER);
		rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

		VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

		dmu_object_info_from_db(db, &doi);

		bbt = ztest_bt_bonus(db);
		ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
		gen = bbt->bt_gen;
		crtxg = bbt->bt_crtxg;
		lrtxg = lr->lr_common.lrc_txg;

		tx = dmu_tx_create(os);

		dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

		/* Occasionally use a loaned ARC buffer for whole aligned blocks. */
		if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
		    P2PHASE(offset, length) == 0)
			abuf = dmu_request_arcbuf(db, length);

		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			dmu_buf_rele(db, FTAG);
			ztest_range_unlock(rl);
			ztest_object_unlock(zd, lr->lr_foid);
			return (ENOSPC);
		}

		if (bt != NULL) {
			/*
			 * Usually, verify the old data before writing new data --
			 * but not always, because we also want to verify correct
			 * behavior when the data was not recently read into cache.
			 */
			ASSERT(offset % doi.doi_data_block_size == 0);
			if (ztest_random(4) != 0) {
				int prefetch = ztest_random(2) ?
				    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
				ztest_block_tag_t rbt;

				VERIFY(dmu_read(os, lr->lr_foid, offset,
				    sizeof (rbt), &rbt, prefetch) == 0);
				if (rbt.bt_magic == BT_MAGIC) {
					ztest_bt_verify(&rbt, os, lr->lr_foid,
					    offset, gen, txg, crtxg);
				}
			}

			/*
			 * Writes can appear to be newer than the bonus buffer because
			 * the ztest_get_data() callback does a dmu_read() of the
			 * open-context data, which may be different than the data
			 * as it was when the write was generated.
			 */
			if (zd->zd_zilog->zl_replay) {
				ztest_bt_verify(bt, os, lr->lr_foid, offset,
				    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
				    bt->bt_crtxg);
			}

			/*
			 * Set the bt's gen/txg to the bonus buffer's gen/txg
			 * so that all of the usual ASSERTs will work.
			 */
			ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
		}

		if (abuf == NULL) {
			dmu_write(os, lr->lr_foid, offset, length, data, tx);
		} else {
			bcopy(data, abuf->b_data, length);
			dmu_assign_arcbuf(db, offset, abuf, tx);
		}

		(void) ztest_log_write(zd, tx, lr);

		dmu_buf_rele(db, FTAG);

		dmu_tx_commit(tx);

		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);

		return (0);
	}

	/*
	 * Free the byte range [lr_offset, lr_offset + lr_length) of object
	 * lr_foid as described by the lr_truncate_t in arg2, then log the
	 * truncate.
	 *
	 * arg1 is the ztest_ds_t; 'byteswap' requests a byteswap of the
	 * record before use.
	 *
	 * Returns 0 on success or ENOSPC when ztest_tx_assign() yields txg 0.
	 */
	static int
	ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
	{
		ztest_ds_t *zd = arg1;
		lr_truncate_t *lr = arg2;
		objset_t *os = zd->zd_os;
		dmu_tx_t *tx;
		uint64_t txg;
		rl_t *rl;

		if (byteswap)
			byteswap_uint64_array(lr, sizeof (*lr));

		ztest_object_lock(zd, lr->lr_foid, RL_READER);
		rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
		    RL_WRITER);

		tx = dmu_tx_create(os);

		dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0) {
			/* Drop the locks in reverse order before bailing out. */
			ztest_range_unlock(rl);
			ztest_object_unlock(zd, lr->lr_foid);
			return (ENOSPC);
		}

		VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
		    lr->lr_length, tx) == 0);

		(void) ztest_log_truncate(zd, tx, lr);

		dmu_tx_commit(tx);

		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);

		return (0);
	}

	/*
	 * Resize the bonus buffer of object lr_foid and regenerate its block
	 * tag, as described by the lr_setattr_t in arg2.  When not replaying,
	 * a random size (a multiple of the tag size) is picked and lr_mode is
	 * set to the current generation plus one; when replaying, the values
	 * already in the record are used.
	 *
	 * arg1 is the ztest_ds_t; 'byteswap' requests a byteswap of the
	 * record before use.
	 *
	 * Returns 0 on success or ENOSPC when ztest_tx_assign() yields txg 0.
	 */
	static int
	ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
	{
		ztest_ds_t *zd = arg1;
		lr_setattr_t *lr = arg2;
		objset_t *os = zd->zd_os;
		dmu_tx_t *tx;
		dmu_buf_t *db;
		ztest_block_tag_t *bbt;
		uint64_t txg, lrtxg, crtxg;

		if (byteswap)
			byteswap_uint64_array(lr, sizeof (*lr));

		ztest_object_lock(zd, lr->lr_foid, RL_WRITER);

		VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, lr->lr_foid);

		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0) {
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, lr->lr_foid);
			return (ENOSPC);
		}

		bbt = ztest_bt_bonus(db);
		ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
		crtxg = bbt->bt_crtxg;
		lrtxg = lr->lr_common.lrc_txg;

		if (zd->zd_zilog->zl_replay) {
			ASSERT(lr->lr_size != 0);
			ASSERT(lr->lr_mode != 0);
			ASSERT(lrtxg != 0);
		} else {
			/*
			 * Randomly change the size and increment the generation.
			 */
			lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
			    sizeof (*bbt);
			lr->lr_mode = bbt->bt_gen + 1;
			ASSERT(lrtxg == 0);
		}

		/*
		 * Verify that the current bonus buffer is not newer than our txg.
		 */
		ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
		    MAX(txg, lrtxg), crtxg);

		dmu_buf_will_dirty(db, tx);

		ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
		ASSERT3U(lr->lr_size, <=, db->db_size);
		VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
		/* Re-fetch the tag pointer after the bonus size changed. */
		bbt = ztest_bt_bonus(db);

		ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);

		dmu_buf_rele(db, FTAG);

		(void) ztest_log_setattr(zd, tx, lr);

		dmu_tx_commit(tx);

		ztest_object_unlock(zd, lr->lr_foid);

		return (0);
	}

	/*
	 * Replay dispatch table handed to zil_replay().  Entries are positional;
	 * the trailing comment on each line names the transaction type the slot
	 * is meant for.  Types ztest does not generate are left NULL.
	 */
	zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
	NULL, /* 0 no such transaction type */
	ztest_replay_create, /* TX_CREATE */
	NULL, /* TX_MKDIR */
	NULL, /* TX_MKXATTR */
	NULL, /* TX_SYMLINK */
	ztest_replay_remove, /* TX_REMOVE */
	NULL, /* TX_RMDIR */
	NULL, /* TX_LINK */
	NULL, /* TX_RENAME */
	ztest_replay_write, /* TX_WRITE */
	ztest_replay_truncate, /* TX_TRUNCATE */
	ztest_replay_setattr, /* TX_SETATTR */
	NULL, /* TX_ACL */
	NULL, /* TX_CREATE_ACL */
	NULL, /* TX_CREATE_ATTR */
	NULL, /* TX_CREATE_ACL_ATTR */
	NULL, /* TX_MKDIR_ACL */
	NULL, /* TX_MKDIR_ATTR */
	NULL, /* TX_MKDIR_ACL_ATTR */
	NULL, /* TX_WRITE2 */
	};

	/*
	* ZIL get_data callbacks
	*/

	/*
	 * Completion handler for a ztest_get_data() request: drop the dbuf
	 * hold (if one was taken), release the range and object locks, hand
	 * the block pointer to the lwb on success, and free the zgd.
	 */
	static void
	ztest_get_done(zgd_t *zgd, int error)
	{
		ztest_ds_t *ds = zgd->zgd_private;
		/* Remember the object id before the range lock goes away. */
		uint64_t obj = zgd->zgd_rl->rl_object;

		if (zgd->zgd_db != NULL) {
			dmu_buf_rele(zgd->zgd_db, zgd);
		}

		ztest_range_unlock(zgd->zgd_rl);
		ztest_object_unlock(ds, obj);

		if (error == 0 && zgd->zgd_bp != NULL) {
			zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
		}

		umem_free(zgd, sizeof (*zgd));
	}

	/*
	 * ZIL get_data callback.  For an immediate write ('buf' != NULL) the
	 * data is read into 'buf' with dmu_read().  Otherwise the enclosing
	 * block is held and dmu_sync() is started, with ztest_get_done() as
	 * its completion callback.
	 *
	 * 'arg' is the ztest_ds_t, 'lr' the write record being committed.
	 *
	 * Returns 0 on success, the dmu_bonus_hold() error, ENOENT when the
	 * bonus tag's creation txg is 0 or newer than the record's txg, or
	 * the error from the read/hold/sync path.
	 */
	static int
	ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
	    zio_t *zio)
	{
		ztest_ds_t *zd = arg;
		objset_t *os = zd->zd_os;
		uint64_t object = lr->lr_foid;
		uint64_t offset = lr->lr_offset;
		uint64_t size = lr->lr_length;
		uint64_t txg = lr->lr_common.lrc_txg;
		uint64_t crtxg;
		dmu_object_info_t doi;
		dmu_buf_t *db;
		zgd_t *zgd;
		int error;

		ASSERT3P(lwb, !=, NULL);
		ASSERT3P(zio, !=, NULL);
		ASSERT3U(size, !=, 0);

		ztest_object_lock(zd, object, RL_READER);
		error = dmu_bonus_hold(os, object, FTAG, &db);
		if (error) {
			ztest_object_unlock(zd, object);
			return (error);
		}

		crtxg = ztest_bt_bonus(db)->bt_crtxg;

		if (crtxg == 0 || crtxg > txg) {
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, object);
			return (ENOENT);
		}

		dmu_object_info_from_db(db, &doi);
		dmu_buf_rele(db, FTAG);
		db = NULL;

		zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
		zgd->zgd_lwb = lwb;
		zgd->zgd_private = zd;

		if (buf != NULL) {	/* immediate write */
			zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
			    RL_READER);

			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
			ASSERT(error == 0);
		} else {
			/* Widen the request to the whole containing block. */
			size = doi.doi_data_block_size;
			if (ISP2(size)) {
				offset = P2ALIGN(offset, size);
			} else {
				ASSERT(offset < size);
				offset = 0;
			}

			zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
			    RL_READER);

			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

			if (error == 0) {
				blkptr_t *bp = &lr->lr_blkptr;

				zgd->zgd_db = db;
				zgd->zgd_bp = bp;

				ASSERT(db->db_offset == offset);
				ASSERT(db->db_size == size);

				error = dmu_sync(zio, lr->lr_common.lrc_txg,
				    ztest_get_done, zgd);

				/* On success dmu_sync() was given the callback. */
				if (error == 0)
					return (0);
			}
		}

		ztest_get_done(zgd, error);

		return (error);
	}

	/*
	 * Allocate a zeroed log record of 'lrsize' bytes.  When 'name' is
	 * non-NULL, room for it (including the terminating NUL) is added and
	 * the string is copied directly after the record.
	 */
	static void *
	ztest_lr_alloc(size_t lrsize, char *name)
	{
		size_t tail = 0;
		char *rec;

		if (name != NULL)
			tail = strlen(name) + 1;

		rec = umem_zalloc(lrsize + tail, UMEM_NOFAIL);

		if (name != NULL)
			bcopy(name, rec + lrsize, tail);

		return (rec);
	}

	/*
	 * Free a log record obtained from ztest_lr_alloc().  'lrsize' and
	 * 'name' must match the allocation so the same total size is passed
	 * to umem_free().
	 */
	void
	ztest_lr_free(void *lr, size_t lrsize, char *name)
	{
		size_t namesize = name ? strlen(name) + 1 : 0;

		umem_free(lr, lrsize + namesize);
	}

	/*
	 * Look up 'count' object descriptors in their directory ZAP.  For each
	 * one found, od_object, od_type, od_blocksize and od_gen are filled in
	 * from the object's info and bonus block tag; for each one missing,
	 * od_object is left 0.  The caller must hold zd_dirobj_lock.
	 *
	 * Returns the number of objects not found.
	 */
	static int
	ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
	{
		int missing = 0;
		int error;

		ASSERT(_mutex_held(&zd->zd_dirobj_lock));

		for (int i = 0; i < count; i++, od++) {
			od->od_object = 0;
			error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
			    sizeof (uint64_t), 1, &od->od_object);
			if (error) {
				ASSERT(error == ENOENT);
				ASSERT(od->od_object == 0);
				missing++;
			} else {
				dmu_buf_t *db;
				ztest_block_tag_t *bbt;
				dmu_object_info_t doi;

				ASSERT(od->od_object != 0);
				ASSERT(missing == 0);	/* there should be no gaps */

				ztest_object_lock(zd, od->od_object, RL_READER);
				VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
				    od->od_object, FTAG, &db));
				dmu_object_info_from_db(db, &doi);
				bbt = ztest_bt_bonus(db);
				ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
				od->od_type = doi.doi_type;
				od->od_blocksize = doi.doi_data_block_size;
				od->od_gen = bbt->bt_gen;
				dmu_buf_rele(db, FTAG);
				ztest_object_unlock(zd, od->od_object);
			}
		}

		return (missing);
	}

	/*
	 * Create 'count' objects from the descriptor array 'od' by building an
	 * lr_create_t for each and running it through ztest_replay_create().
	 * After the first failure no further creates are attempted: every
	 * remaining descriptor gets od_object = 0 and is counted as missing.
	 * The caller must hold zd_dirobj_lock.
	 *
	 * Returns the number of objects that were not created.
	 */
	static int
	ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
	{
		int missing = 0;

		ASSERT(_mutex_held(&zd->zd_dirobj_lock));

		for (int i = 0; i < count; i++, od++) {
			if (missing) {
				od->od_object = 0;
				missing++;
				continue;
			}

			lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

			lr->lr_doid = od->od_dir;
			lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
			lr->lrz_type = od->od_crtype;
			lr->lrz_blocksize = od->od_crblocksize;
			lr->lrz_ibshift = ztest_random_ibshift();
			lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
			lr->lrz_bonuslen = dmu_bonus_max();
			lr->lr_gen = od->od_crgen;
			lr->lr_crtime[0] = time(NULL);

			if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
				ASSERT(missing == 0);
				od->od_object = 0;
				missing++;
			} else {
				/* Record what was actually created. */
				od->od_object = lr->lr_foid;
				od->od_type = od->od_crtype;
				od->od_blocksize = od->od_crblocksize;
				od->od_gen = od->od_crgen;
				ASSERT(od->od_object != 0);
			}

			ztest_lr_free(lr, sizeof (*lr), od->od_name);
		}

		return (missing);
	}

	/*
	 * Remove the objects named by the descriptor array 'od', walking it
	 * from the last element to the first.  Descriptors whose od_object is
	 * 0 are skipped.  After the first failed removal no further removals
	 * are attempted and every remaining descriptor is counted as missing.
	 * The caller must hold zd_dirobj_lock.
	 *
	 * Returns the number of descriptors counted as missing.
	 */
	static int
	ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
	{
		int missing = 0;
		int error;

		ASSERT(_mutex_held(&zd->zd_dirobj_lock));

		od += count - 1;

		for (int i = count - 1; i >= 0; i--, od--) {
			if (missing) {
				missing++;
				continue;
			}

			/*
			 * No object was found.
			 */
			if (od->od_object == 0)
				continue;

			lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

			lr->lr_doid = od->od_dir;

			if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
				ASSERT3U(error, ==, ENOSPC);
				missing++;
			} else {
				od->od_object = 0;
			}
			ztest_lr_free(lr, sizeof (*lr), od->od_name);
		}

		return (missing);
	}

	/*
	 * Write 'size' bytes from 'data' to 'object' at 'offset' by building
	 * an lr_write_t (payload appended) and passing it to
	 * ztest_replay_write().  Returns that function's result.
	 */
	static int
	ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
	    void *data)
	{
		size_t recsize = sizeof (lr_write_t) + size;
		lr_write_t *rec = ztest_lr_alloc(recsize, NULL);
		int rc;

		rec->lr_foid = object;
		rec->lr_offset = offset;
		rec->lr_length = size;
		rec->lr_blkoff = 0;
		BP_ZERO(&rec->lr_blkptr);

		/* The payload lives directly behind the record header. */
		bcopy(data, rec + 1, size);

		rc = ztest_replay_write(zd, rec, B_FALSE);
		ztest_lr_free(rec, recsize, NULL);

		return (rc);
	}

	/*
	 * Truncate 'size' bytes of 'object' starting at 'offset' by building
	 * an lr_truncate_t and passing it to ztest_replay_truncate().
	 * Returns that function's result.
	 */
	static int
	ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
	{
		lr_truncate_t *rec = ztest_lr_alloc(sizeof (*rec), NULL);
		int rc;

		rec->lr_foid = object;
		rec->lr_offset = offset;
		rec->lr_length = size;

		rc = ztest_replay_truncate(zd, rec, B_FALSE);
		ztest_lr_free(rec, sizeof (*rec), NULL);

		return (rc);
	}

	/*
	 * Run a setattr on 'object' by building an lr_setattr_t with zeroed
	 * size and mode and passing it to ztest_replay_setattr().  Returns
	 * that function's result.
	 */
	static int
	ztest_setattr(ztest_ds_t *zd, uint64_t object)
	{
		lr_setattr_t *rec = ztest_lr_alloc(sizeof (*rec), NULL);
		int rc;

		rec->lr_foid = object;
		rec->lr_size = 0;
		rec->lr_mode = 0;

		rc = ztest_replay_setattr(zd, rec, B_FALSE);
		ztest_lr_free(rec, sizeof (*rec), NULL);

		return (rc);
	}

	/*
	 * Preallocate [offset, offset + size) of 'object'.  If no txg can be
	 * assigned, the range is freed with dmu_free_long_range() instead.
	 * The object lock (reader) and range lock (writer) are held for the
	 * duration.
	 */
	static void
	ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
	{
		objset_t *objset = zd->zd_os;
		dmu_tx_t *tx;
		uint64_t txg;
		rl_t *range;

		txg_wait_synced(dmu_objset_pool(objset), 0);

		ztest_object_lock(zd, object, RL_READER);
		range = ztest_range_lock(zd, object, offset, size, RL_WRITER);

		tx = dmu_tx_create(objset);
		dmu_tx_hold_write(tx, object, offset, size);
		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);

		if (txg == 0) {
			(void) dmu_free_long_range(objset, object, offset, size);
		} else {
			dmu_prealloc(objset, object, offset, size, tx);
			dmu_tx_commit(tx);
			txg_wait_synced(dmu_objset_pool(objset), txg);
		}

		ztest_range_unlock(range);
		ztest_object_unlock(zd, object);
	}

	/*
	 * Perform one randomly chosen I/O operation on 'object' at 'offset':
	 * write a block tag, a byte pattern or zeroes, truncate, setattr, or
	 * rewrite the block after changing the dataset's checksum and
	 * compression properties.  zd_zilog_lock is held as reader for the
	 * duration of the operation.
	 */
	static void
	ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
	{
		int err;
		ztest_block_tag_t wbt;
		dmu_object_info_t doi;
		enum ztest_io_type io_type;
		uint64_t blocksize;
		void *data;

		VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
		blocksize = doi.doi_data_block_size;
		data = umem_alloc(blocksize, UMEM_NOFAIL);

		/*
		 * Pick an i/o type at random, biased toward writing block tags.
		 */
		io_type = ztest_random(ZTEST_IO_TYPES);
		if (ztest_random(2) == 0)
			io_type = ZTEST_IO_WRITE_TAG;

		(void) rw_rdlock(&zd->zd_zilog_lock);

		switch (io_type) {

		case ZTEST_IO_WRITE_TAG:
			ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
			(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
			break;

		case ZTEST_IO_WRITE_PATTERN:
			(void) memset(data, 'a' + (object + offset) % 5, blocksize);
			if (ztest_random(2) == 0) {
				/*
				 * Induce fletcher2 collisions to ensure that
				 * zio_ddt_collision() detects and resolves them
				 * when using fletcher2-verify for deduplication.
				 */
				((uint64_t *)data)[0] ^= 1ULL << 63;
				((uint64_t *)data)[4] ^= 1ULL << 63;
			}
			(void) ztest_write(zd, object, offset, blocksize, data);
			break;

		case ZTEST_IO_WRITE_ZEROES:
			bzero(data, blocksize);
			(void) ztest_write(zd, object, offset, blocksize, data);
			break;

		case ZTEST_IO_TRUNCATE:
			(void) ztest_truncate(zd, object, offset, blocksize);
			break;

		case ZTEST_IO_SETATTR:
			(void) ztest_setattr(zd, object);
			break;

		case ZTEST_IO_REWRITE:
			/* Change properties, then rewrite the block's current data. */
			(void) rw_rdlock(&ztest_name_lock);
			err = ztest_dsl_prop_set_uint64(zd->zd_name,
			    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
			    B_FALSE);
			VERIFY(err == 0 || err == ENOSPC);
			err = ztest_dsl_prop_set_uint64(zd->zd_name,
			    ZFS_PROP_COMPRESSION,
			    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
			    B_FALSE);
			VERIFY(err == 0 || err == ENOSPC);
			(void) rw_unlock(&ztest_name_lock);

			VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
			    DMU_READ_NO_PREFETCH));

			(void) ztest_write(zd, object, offset, blocksize, data);
			break;
		}

		(void) rw_unlock(&zd->zd_zilog_lock);

		umem_free(data, blocksize);
	}

	/*
	 * Initialize an object description template.
	 *
	 * The creation parameters (type, block size, generation) are recorded
	 * in the od_cr* fields; a 'blocksize' of 0 selects a random block
	 * size.  The current-state fields are reset, and the object name is
	 * built as "tag(id)[index]".
	 */
	static void
	ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
	    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
	{
		od->od_dir = ZTEST_DIROBJ;
		od->od_object = 0;

		od->od_crtype = type;
		od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
		od->od_crgen = gen;

		od->od_type = DMU_OT_NONE;
		od->od_blocksize = 0;
		od->od_gen = 0;

		/* Cast explicitly so the arguments match %lld / %llu everywhere. */
		(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
		    tag, (long long)(int64_t)id, (unsigned long long)index);
	}

	/*
	 * Lookup or create the objects for a test using the od template.
	 * If the objects do not all exist, or if 'remove' is specified,
	 * remove any existing objects and create new ones.  Otherwise,
	 * use the existing objects.
	 *
	 * 'size' is the size in bytes of the od array.  Returns 0 on success
	 * and -1 if removal or creation left any object missing.
	 */
	static int
	ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
	{
		int count = size / sizeof (*od);
		int rv = 0;

		VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
		/* Creation is only attempted when removal left nothing behind. */
		if ((ztest_lookup(zd, od, count) != 0 || remove) &&
		    (ztest_remove(zd, od, count) != 0 ||
		    ztest_create(zd, od, count) != 0))
			rv = -1;
		zd->zd_od = od;
		VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);

		return (rv);
	}

	/*
	 * Commit the ZIL for a randomly chosen object number and publish the
	 * resulting commit sequence number in zd->zd_shared.
	 */
	/* ARGSUSED */
	void
	ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
	{
		zilog_t *zl = zd->zd_zilog;

		(void) rw_rdlock(&zd->zd_zilog_lock);

		zil_commit(zl, ztest_random(ZTEST_OBJECTS));

		/*
		 * Remember the committed values in zd, which is in parent/child
		 * shared memory.  If we die, the next iteration of ztest_run()
		 * will verify that the log really does contain this record.
		 */
		mutex_enter(&zl->zl_lock);
		ASSERT(zd->zd_shared != NULL);
		/* The recorded sequence number must never move backwards. */
		ASSERT3U(zd->zd_shared->zd_seq, <=, zl->zl_commit_lr_seq);
		zd->zd_shared->zd_seq = zl->zl_commit_lr_seq;
		mutex_exit(&zl->zl_lock);

		(void) rw_unlock(&zd->zd_zilog_lock);
	}

	/*
	 * Simulate what happens to the ZIL across an unmount followed by a
	 * mount: close the log, reopen it, and replay it.  The dataset stays
	 * held throughout, in an attempt to expose implicit assumptions about
	 * ZIL management.
	 */
	/* ARGSUSED */
	void
	ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *objset = zd->zd_os;

		/*
		 * Take zd_dirobj_lock so that no other thread is updating the
		 * zil (i.e. adding in-memory log records), and zd_zilog_lock as
		 * writer to block any I/O.
		 */
		VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
		(void) rw_wrlock(&zd->zd_zilog_lock);

		/* Teardown half, as in zfsvfs_teardown(). */
		zil_close(zd->zd_zilog);

		/* Setup half, as in zfsvfs_setup(). */
		VERIFY(zil_open(objset, ztest_get_data) == zd->zd_zilog);
		zil_replay(objset, zd, ztest_replay_vector);

		(void) rw_unlock(&zd->zd_zilog_lock);
		VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
	}

	/*
	 * Negative tests for pool creation and destruction: creating a pool
	 * on a bad vdev spec must fail with ENOENT, creating a pool that
	 * already exists must fail with EEXIST, and destroying a pool that
	 * is open must fail with EBUSY.
	 */
	/* ARGSUSED */
	void
	ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
	{
		ztest_shared_opts_t *opts = &ztest_opts;
		nvlist_t *bogus;
		spa_t *spa;

		/*
		 * A single bogus file vdev.
		 */
		bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
		VERIFY3U(ENOENT, ==,
		    spa_create("ztest_bad_file", bogus, NULL, NULL));
		nvlist_free(bogus);

		/*
		 * A mirror built from bogus vdevs.
		 */
		bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
		VERIFY3U(ENOENT, ==,
		    spa_create("ztest_bad_mirror", bogus, NULL, NULL));
		nvlist_free(bogus);

		/*
		 * The pool under test already exists, so the contents of the
		 * nvroot should not matter: the create must fail with EEXIST.
		 */
		(void) rw_rdlock(&ztest_name_lock);
		bogus = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
		VERIFY3U(EEXIST, ==, spa_create(opts->zo_pool, bogus, NULL, NULL));
		nvlist_free(bogus);

		/* With the pool open, a destroy must be refused. */
		VERIFY3U(0, ==, spa_open(opts->zo_pool, &spa, FTAG));
		VERIFY3U(EBUSY, ==, spa_destroy(opts->zo_pool));
		spa_close(spa, FTAG);

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Create a scratch pool named "<pool>_upgrade" at a random
	 * pre-feature-flags SPA version, upgrade it to a random newer
	 * version, and verify that both the in-core version and the version
	 * stored in the pool config reflect the upgrade.  ztest_vdev_lock is
	 * held for the duration.
	 */
	/* ARGSUSED */
	void
	ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
	{
		spa_t *spa;
		uint64_t initial_version = SPA_VERSION_INITIAL;
		uint64_t version, newversion;
		nvlist_t *nvroot, *props;
		char *name;

		VERIFY0(mutex_lock(&ztest_vdev_lock));
		name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);

		/*
		 * Clean up from previous runs.
		 */
		(void) spa_destroy(name);

		nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
		    0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);

		/*
		 * If we're configuring a RAIDZ device then make sure that the
		 * initial version is capable of supporting that feature.
		 */
		switch (ztest_opts.zo_raidz_parity) {
		case 0:
		case 1:
			initial_version = SPA_VERSION_INITIAL;
			break;
		case 2:
			initial_version = SPA_VERSION_RAIDZ2;
			break;
		case 3:
			initial_version = SPA_VERSION_RAIDZ3;
			break;
		}

		/*
		 * Create a pool with a spa version that can be upgraded. Pick
		 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
		 */
		do {
			version = ztest_random_spa_version(initial_version);
		} while (version > SPA_VERSION_BEFORE_FEATURES);

		props = fnvlist_alloc();
		fnvlist_add_uint64(props,
		    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
		VERIFY0(spa_create(name, nvroot, props, NULL));
		fnvlist_free(nvroot);
		fnvlist_free(props);

		VERIFY0(spa_open(name, &spa, FTAG));
		VERIFY3U(spa_version(spa), ==, version);
		newversion = ztest_random_spa_version(version + 1);

		if (ztest_opts.zo_verbose >= 4) {
			(void) printf("upgrading spa version from %llu to %llu\n",
			    (u_longlong_t)version, (u_longlong_t)newversion);
		}

		spa_upgrade(spa, newversion);
		VERIFY3U(spa_version(spa), >, version);
		/* The config nvlist must agree with the in-core version. */
		VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
		    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
		spa_close(spa, FTAG);

		strfree(name);
		VERIFY0(mutex_unlock(&ztest_vdev_lock));
	}

	/*
	 * Depth-first search of the vdev tree rooted at 'vd' for a vdev whose
	 * vdev_path equals 'path'.  Returns the first match, or NULL if no
	 * vdev in the tree has that path.
	 */
	static vdev_t *
	vdev_lookup_by_path(vdev_t *vd, const char *path)
	{
		vdev_t *mvd;

		/* Some vdevs have no path; never hand NULL to strcmp(). */
		if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
			return (vd);

		for (int c = 0; c < vd->vdev_children; c++)
			if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
			    NULL)
				return (mvd);

		return (NULL);
	}

	/*
	 * Return the index of the first top-level vdev that is a hole, or the
	 * number of top-level vdevs when there is none.  The caller must hold
	 * SCL_VDEV as reader.
	 */
	int
	find_vdev_hole(spa_t *spa)
	{
		vdev_t *root = spa->spa_root_vdev;
		int idx;

		ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);

		for (idx = 0; idx < root->vdev_children; idx++) {
			if (root->vdev_child[idx]->vdev_ishole)
				return (idx);
		}

		return (idx);
	}

	/*
	 * Exercise top-level vdev addition, and slog removal: when the pool
	 * has slogs, one time in four the log vdev at the head of the log
	 * class rotor is removed; otherwise a new top-level vdev is added.
	 * ztest_vdev_lock is held for the duration.
	 */
	/* ARGSUSED */
	void
	ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
	{
		ztest_shared_t *zs = ztest_shared;
		spa_t *spa = ztest_spa;
		uint64_t leaves;
		uint64_t guid;
		nvlist_t *nvroot;
		boolean_t remove_slog;
		int is_log;
		int error;

		VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
		leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;

		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

		ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;

		/*
		 * If we have slogs then remove them 1/4 of the time.
		 */
		remove_slog = (spa_has_slogs(spa) && ztest_random(4) == 0);

		if (remove_slog) {
			/*
			 * Grab the guid from the head of the log class rotor.
			 */
			guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;

			spa_config_exit(spa, SCL_VDEV, FTAG);

			/*
			 * We have to grab the zs_name_lock as writer to
			 * prevent a race between removing a slog (dmu_objset_find)
			 * and destroying a dataset. Removing the slog will
			 * grab a reference on the dataset which may cause
			 * dmu_objset_destroy() to fail with EBUSY thus
			 * leaving the dataset in an inconsistent state.
			 */
			VERIFY(rw_wrlock(&ztest_name_lock) == 0);
			error = spa_vdev_remove(spa, guid, B_FALSE);
			VERIFY(rw_unlock(&ztest_name_lock) == 0);

			if (error != 0 && error != EEXIST)
				fatal(0, "spa_vdev_remove() = %d", error);
		} else {
			spa_config_exit(spa, SCL_VDEV, FTAG);

			/*
			 * Make 1/4 of the devices be log devices.
			 */
			is_log = (ztest_random(4) == 0);
			nvroot = make_vdev_root(NULL, NULL, NULL,
			    ztest_opts.zo_vdev_size, 0,
			    is_log, ztest_opts.zo_raidz,
			    zs->zs_mirrors, 1);

			error = spa_vdev_add(spa, nvroot);
			nvlist_free(nvroot);

			switch (error) {
			case 0:
				break;
			case ENOSPC:
				ztest_record_enospc("spa_vdev_add");
				break;
			default:
				fatal(0, "spa_vdev_add() = %d", error);
				break;
			}
		}

		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	}

	/*
	 * Verify that adding/removing aux devices (l2arc, hot spare) works as
	 * expected.
	 *
	 * The aux class (spares or l2cache) is chosen at random.  If the
	 * class is non-empty, one time in four a random member is removed;
	 * otherwise the first unused aux path is found and a device with that
	 * index is added.  ztest_vdev_lock is held for the duration.
	 */
	/* ARGSUSED */
	void
	ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
	{
		ztest_shared_t *zs = ztest_shared;
		spa_t *spa = ztest_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		spa_aux_vdev_t *sav;
		char *aux;
		uint64_t guid = 0;
		int error;

		if (ztest_random(2) == 0) {
			sav = &spa->spa_spares;
			aux = ZPOOL_CONFIG_SPARES;
		} else {
			sav = &spa->spa_l2cache;
			aux = ZPOOL_CONFIG_L2CACHE;
		}

		VERIFY(mutex_lock(&ztest_vdev_lock) == 0);

		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

		if (sav->sav_count != 0 && ztest_random(4) == 0) {
			/*
			 * Pick a random device to remove.
			 */
			guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
		} else {
			/*
			 * Find an unused device we can add.
			 */
			zs->zs_vdev_aux = 0;
			for (;;) {
				char path[MAXPATHLEN];
				int c;
				(void) snprintf(path, sizeof (path), ztest_aux_template,
				    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
				    zs->zs_vdev_aux);
				for (c = 0; c < sav->sav_count; c++)
					if (strcmp(sav->sav_vdevs[c]->vdev_path,
					    path) == 0)
						break;
				/* Unused means: not an aux device and not in the tree. */
				if (c == sav->sav_count &&
				    vdev_lookup_by_path(rvd, path) == NULL)
					break;
				zs->zs_vdev_aux++;
			}
		}

		spa_config_exit(spa, SCL_VDEV, FTAG);

		if (guid == 0) {
			/*
			 * Add a new device.
			 */
			nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
			    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
			error = spa_vdev_add(spa, nvroot);
			if (error != 0)
				fatal(0, "spa_vdev_add(%p) = %d", (void *)nvroot,
				    error);
			nvlist_free(nvroot);
		} else {
			/*
			 * Remove an existing device.  Sometimes, dirty its
			 * vdev state first to make sure we handle removal
			 * of devices that have pending state changes.
			 */
			if (ztest_random(2) == 0)
				(void) vdev_online(spa, guid, 0, NULL);

			error = spa_vdev_remove(spa, guid, B_FALSE);
			/* Cast so the argument really is what %llu expects. */
			if (error != 0 && error != EBUSY)
				fatal(0, "spa_vdev_remove(%llu) = %d",
				    (u_longlong_t)guid, error);
		}

		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	}

	/*
	 * Split a pool if it has mirror top-level vdevs.
	 *
	 * A split config is built from the current vdev tree: log and hole
	 * top-level vdevs become hole placeholders, and for every other
	 * top-level vdev its first child is duplicated.  The result is passed
	 * to spa_vdev_split_mirror() to create the pool "splitp".  On success
	 * the shared split/mirror counters are adjusted.  Nothing is done
	 * unless there are at least three mirrors and no raidz.
	 */
	/* ARGSUSED */
	void
	ztest_split_pool(ztest_ds_t *zd, uint64_t id)
	{
		ztest_shared_t *zs = ztest_shared;
		spa_t *spa = ztest_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		nvlist_t *tree, **child, *config, *split, **schild;
		uint_t c, children, schildren = 0, lastlogid = 0;
		int error = 0;

		VERIFY(mutex_lock(&ztest_vdev_lock) == 0);

		/* ensure we have a useable config; mirrors of raidz aren't supported */
		if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
			return;
		}

		/* clean up the old pool, if any */
		(void) spa_destroy("splitp");

		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

		/* generate a config from the existing config */
		mutex_enter(&spa->spa_props_lock);
		VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
		    &tree) == 0);
		mutex_exit(&spa->spa_props_lock);

		VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
		    &children) == 0);

		schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
		/* Fail loudly instead of dereferencing a NULL array below. */
		VERIFY(schild != NULL);
		for (c = 0; c < children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			nvlist_t **mchild;
			uint_t mchildren;

			if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
				VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
				    0) == 0);
				VERIFY(nvlist_add_string(schild[schildren],
				    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
				VERIFY(nvlist_add_uint64(schild[schildren],
				    ZPOOL_CONFIG_IS_HOLE, 1) == 0);
				/* Remember where the current run of placeholders starts. */
				if (lastlogid == 0)
					lastlogid = schildren;
				++schildren;
				continue;
			}
			lastlogid = 0;
			VERIFY(nvlist_lookup_nvlist_array(child[c],
			    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
			VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
		}

		/* OK, create a config that can be used to split */
		VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_ROOT) == 0);
		/* A trailing run of placeholders is left out of the child array. */
		VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
		    lastlogid != 0 ? lastlogid : schildren) == 0);

		VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);

		for (c = 0; c < schildren; c++)
			nvlist_free(schild[c]);
		free(schild);
		nvlist_free(split);

		spa_config_exit(spa, SCL_VDEV, FTAG);

		(void) rw_wrlock(&ztest_name_lock);
		error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
		(void) rw_unlock(&ztest_name_lock);

		nvlist_free(config);

		if (error == 0) {
			(void) printf("successful split - results:\n");
			mutex_enter(&spa_namespace_lock);
			show_pool_stats(spa);
			show_pool_stats(spa_lookup("splitp"));
			mutex_exit(&spa_namespace_lock);
			++zs->zs_splits;
			--zs->zs_mirrors;
		}
		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);

	}

	/*
	* Verify that we can attach and detach devices.
	*
	* Picks a random leaf under a random top-level vdev.  If that leaf has
	* siblings, half of the time it is detached; otherwise a new device is
	* attached to it (or replaces it, chosen at random) and the result of
	* spa_vdev_attach() is compared with the error predicted from the
	* configuration observed under the config lock.  ztest_vdev_lock is held
	* for the whole function and released on every return path.
	*
	* NOTE(review): this text appears to be a rendered diff ('-'/'+' line
	* markers) in which some '*' characters were lost: oldvd and newvd are
	* used with '->' below but are declared without a pointer declarator.
	* Confirm against the real source file.
	*/
	/* ARGSUSED */
	void
	ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
	{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	spa_aux_vdev_t *sav = &spa->spa_spares;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t oldvd, newvd, *pvd;
	nvlist_t *root;
	uint64_t leaves;
	uint64_t leaf, top;
	uint64_t ashift = ztest_get_ashift();
	uint64_t oldguid, pguid;
	uint64_t oldsize, newsize;
	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
	int replacing;
	int oldvd_has_siblings = B_FALSE;
	int newvd_is_spare = B_FALSE;
	int oldvd_is_log;
	int error, expected_error;

	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	/* Number of leaf devices per top-level vdev in the ztest layout. */
	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;

	- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	+ * If a vdev is in the process of being removed, its removal may
	+ * finish while we are in progress, leading to an unexpected error
	+ * value. Don't bother trying to attach while we are in the middle
	+ * of removal.
	+ */
	+ if (spa->spa_vdev_removal != NULL) {
	+ spa_config_exit(spa, SCL_ALL, FTAG);
	+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	+ return;
	+ }
	+
	+ /*
	* Decide whether to do an attach or a replace.
	*/
	replacing = ztest_random(2);

	/*
	* Pick a random top-level vdev.
	*/
	top = ztest_random_vdev_top(spa, B_TRUE);

	/*
	* Pick a random leaf within it.
	*/
	leaf = ztest_random(leaves);

	/*
	* Locate this vdev: descend through the mirror level (if any) and
	* then the raidz level (if any) using the leaf index.
	*/
	oldvd = rvd->vdev_child[top];
	if (zs->zs_mirrors >= 1) {
	ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
	ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
	oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
	}
	if (ztest_opts.zo_raidz > 1) {
	ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
	ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
	oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
	}

	/*
	* If we're already doing an attach or replace, oldvd may be a
	* mirror vdev -- in which case, pick a random child.
	*/
	while (oldvd->vdev_children != 0) {
	oldvd_has_siblings = B_TRUE;
	ASSERT(oldvd->vdev_children >= 2);
	oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
	}

	/*
	* Snapshot everything we need from oldvd while the config lock is
	* held; only these copies are used after the lock is dropped.
	*/
	oldguid = oldvd->vdev_guid;
	oldsize = vdev_get_min_asize(oldvd);
	oldvd_is_log = oldvd->vdev_top->vdev_islog;
	(void) strcpy(oldpath, oldvd->vdev_path);
	pvd = oldvd->vdev_parent;
	pguid = pvd->vdev_guid;

	/*
	* If oldvd has siblings, then half of the time, detach it.
	* ENODEV, EBUSY and ENOTSUP are tolerated; any other failure is fatal.
	*/
	if (oldvd_has_siblings && ztest_random(2) == 0) {
	- spa_config_exit(spa, SCL_VDEV, FTAG);
	+ spa_config_exit(spa, SCL_ALL, FTAG);
	error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
	if (error != 0 && error != ENODEV && error != EBUSY &&
	error != ENOTSUP)
	fatal(0, "detach (%s) returned %d", oldpath, error);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	return;
	}

	/*
	* For the new vdev, choose with equal probability between the two
	* standard paths (ending in either 'a' or 'b') or a random hot spare.
	*/
	if (sav->sav_count != 0 && ztest_random(3) == 0) {
	newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
	newvd_is_spare = B_TRUE;
	(void) strcpy(newpath, newvd->vdev_path);
	} else {
	(void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
	ztest_opts.zo_dir, ztest_opts.zo_pool,
	top * leaves + leaf);
	if (ztest_random(2) == 0)
	newpath[strlen(newpath) - 1] = 'b';
	newvd = vdev_lookup_by_path(rvd, newpath);
	}

	if (newvd) {
	+ /*
	+ * Reopen to ensure the vdev's asize field isn't stale.
	+ */
	+ vdev_reopen(newvd);
	newsize = vdev_get_min_asize(newvd);
	} else {
	/*
	* Make newsize a little bigger or smaller than oldsize.
	* If it's smaller, the attach should fail.
	* If it's larger, and we're doing a replace,
	* we should get dynamic LUN growth when we're done.
	*/
	newsize = 10 * oldsize / (9 + ztest_random(3));
	}

	/*
	* If pvd is not a mirror or root, the attach should fail with ENOTSUP,
	* unless it's a replace; in that case any non-replacing parent is OK.
	*
	* If newvd is already part of the pool, it should fail with EBUSY.
	*
	* If newvd is too small, it should fail with EOVERFLOW.
	*
	* If the requested ashift exceeds the top-level vdev's ashift, it
	* should fail with EDOM.
	*/
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	pvd->vdev_ops != &vdev_root_ops && (!replacing \|\|
	pvd->vdev_ops == &vdev_replacing_ops \|\|
	pvd->vdev_ops == &vdev_spare_ops))
	expected_error = ENOTSUP;
	else if (newvd_is_spare && (!replacing \|\| oldvd_is_log))
	expected_error = ENOTSUP;
	else if (newvd == oldvd)
	expected_error = replacing ? 0 : EBUSY;
	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
	expected_error = EBUSY;
	else if (newsize < oldsize)
	expected_error = EOVERFLOW;
	else if (ashift > oldvd->vdev_top->vdev_ashift)
	expected_error = EDOM;
	else
	expected_error = 0;

	- spa_config_exit(spa, SCL_VDEV, FTAG);
	+ spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	* Build the nvlist describing newpath.  A size is only passed when
	* newvd was not found, i.e. when a new device has to be described.
	*/
	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
	ashift, 0, 0, 0, 1);

	error = spa_vdev_attach(spa, oldguid, root, replacing);

	nvlist_free(root);

	/*
	* If our parent was the replacing vdev, but the replace completed,
	* then instead of failing with ENOTSUP we may either succeed,
	* fail with ENODEV, or fail with EOVERFLOW.
	*/
	if (expected_error == ENOTSUP &&
	(error == 0 \|\| error == ENODEV \|\| error == EOVERFLOW))
	expected_error = error;

	/*
	* If someone grew the LUN, the replacement may be too small.
	*/
	if (error == EOVERFLOW \|\| error == EBUSY)
	expected_error = error;

	/* XXX workaround 6690467 */
	if (error != expected_error && expected_error != EBUSY) {
	fatal(0, "attach (%s %llu, %s %llu, %d) "
	"returned %d, expected %d",
	oldpath, oldsize, newpath,
	newsize, replacing, error, expected_error);
	}

	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	}

	/*
	* Ask the pool to remove one randomly chosen top-level vdev.
	*
	* The vdev's guid is read under the SCL_VDEV config lock, the lock is
	* dropped, and spa_vdev_remove() is then called with that guid.  The
	* result of the removal is deliberately ignored.  ztest_vdev_lock is
	* held for the duration of the call.
	*/
	+/* ARGSUSED */
	+void
	+ztest_device_removal(ztest_ds_t *zd, uint64_t id)
	+{
	+ spa_t *spa = ztest_spa;
	+ vdev_t *vd;
	+ uint64_t guid;
	+
	+ VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	+
	+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+ vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
	+ guid = vd->vdev_guid;
	+ spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	/* Best effort: the return value is cast away on purpose. */
	+ (void) spa_vdev_remove(spa, guid, B_FALSE);
	+
	+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	+}
	+
	/*
	* Callback function which expands the physical size of the vdev.
	*
	* Intended for use with vdev_walk_tree(): arg is read as a pointer to the
	* new size in bytes, and the file at vd->vdev_path is opened and
	* ftruncate()d to that size.  Returns NULL after resizing so that the
	* walk continues with the next leaf, or vd if the file could not be
	* opened, which ends the walk.
	*
	* NOTE(review): vd and arg are used as pointers below but the parameter
	* list shows no '*'; this text looks like a rendering that dropped them.
	*/
	vdev_t *
	grow_vdev(vdev_t vd, void arg)
	{
	spa_t *spa = vd->vdev_spa;
	size_t *newsize = arg;
	size_t fsize;
	int fd;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
	return (vd);

	/* Remember the old size purely for the verbose message below. */
	fsize = lseek(fd, 0, SEEK_END);
	/* The ftruncate() result is intentionally not checked. */
	(void) ftruncate(fd, *newsize);

	if (ztest_opts.zo_verbose >= 6) {
	(void) printf("%s grew from %lu to %lu bytes\n",
	vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
	}
	(void) close(fd);
	return (NULL);
	}

	/*
	* Callback function which expands a given vdev by calling vdev_online().
	*
	* Intended for use with vdev_walk_tree().  The caller must hold SCL_STATE
	* as reader with the spa as tag; the lock is dropped around the
	* vdev_online() call and re-taken afterwards.  Returns NULL when the
	* vdev came back healthy and the config generation advanced by exactly
	* one, or vd when the expand has to be abandoned.
	*
	* NOTE(review): vd is used as a pointer below but the parameter list
	* shows no '*'; this text looks like a rendering that dropped them.
	*/
	/* ARGSUSED */
	vdev_t *
	online_vdev(vdev_t vd, void arg)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint64_t guid = vd->vdev_guid;
	/* The generation we expect to see once our own online has landed. */
	uint64_t generation = spa->spa_config_generation + 1;
	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
	int error;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/* Calling vdev_online will initialize the new metaslabs */
	spa_config_exit(spa, SCL_STATE, spa);
	error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/*
	* If vdev_online returned an error or the underlying vdev_open
	* failed then we abort the expand. The only way to know that
	* vdev_open fails is by checking the returned newstate.
	*/
	if (error \|\| newstate != VDEV_STATE_HEALTHY) {
	if (ztest_opts.zo_verbose >= 5) {
	(void) printf("Unable to expand vdev, state %llu, "
	"error %d\n", (u_longlong_t)newstate, error);
	}
	return (vd);
	}
	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);

	/*
	* Since we dropped the lock we need to ensure that we're
	* still talking to the original vdev. It's possible this
	* vdev may have been detached/replaced while we were
	* trying to online it.
	*/
	if (generation != spa->spa_config_generation) {
	if (ztest_opts.zo_verbose >= 5) {
	(void) printf("vdev configuration has changed, "
	"guid %llu, state %llu, expected gen %llu, "
	"got gen %llu\n",
	(u_longlong_t)guid,
	(u_longlong_t)tvd->vdev_state,
	(u_longlong_t)generation,
	(u_longlong_t)spa->spa_config_generation);
	}
	return (vd);
	}
	return (NULL);
	}

	/*
	* Traverse the vdev tree calling the supplied function.
	* We continue to walk the tree until we either have walked all
	* children or we receive a non-NULL return from the callback.
	* If a NULL callback is passed, then we just return back the first
	* leaf vdev we encounter.
	*
	* Returns the first non-NULL value produced for a leaf (depth first, in
	* child order), or NULL if every leaf yielded NULL.
	*
	* NOTE(review): vd, func and arg are used as pointers below but the
	* parameter list shows no '*'; this text looks like a rendering that
	* dropped them.
	*/
	vdev_t *
	vdev_walk_tree(vdev_t vd, vdev_t (func)(vdev_t , void ), void arg)
	{
	/* Leaf: either hand it back directly or let the callback decide. */
	if (vd->vdev_ops->vdev_op_leaf) {
	if (func == NULL)
	return (vd);
	else
	return (func(vd, arg));
	}

	/* Interior vdev: recurse, stopping at the first non-NULL result. */
	for (uint_t c = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];
	if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
	return (cvd);
	}
	return (NULL);
	}

	/*
	* Verify that dynamic LUN growth works as expected.
	*
	* Picks a random top-level vdev, grows the backing file of each of its
	* leaves by 1/8 of the first leaf's psize (grow_vdev), onlines the leaves
	* with vdev_online() (online_vdev), waits for the spa async thread to go
	* idle, and then requires that both the vdev's metaslab count and the
	* space of its metaslab class have increased.
	*
	* SCL_STATE is acquired with 'spa' as the tag, so every
	* spa_config_exit() in this function has to pass 'spa' as well.
	*
	* NOTE(review): vd and tvd are used as pointers below but are declared
	* without a pointer declarator; this text looks like a rendered diff
	* ('-'/'+' markers) that dropped some '*' characters.
	*/
	/* ARGSUSED */
	void
	ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
	{
	spa_t *spa = ztest_spa;
	vdev_t vd, tvd;
	metaslab_class_t *mc;
	metaslab_group_t *mg;
	size_t psize, newsize;
	uint64_t top;
	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;

	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	+ /*
	+ * If there is a vdev removal in progress, it could complete while
	+ * we are running, in which case we would not be able to verify
	+ * that the metaslab_class space increased (because it decreases
	+ * when the device removal completes).
	+ */
	+ if (spa->spa_vdev_removal != NULL) {
	+ spa_config_exit(spa, SCL_STATE, spa);
	+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	+ return;
	+ }
	+
	top = ztest_random_vdev_top(spa, B_TRUE);

	/* Record the "before" values that the final checks compare against. */
	tvd = spa->spa_root_vdev->vdev_child[top];
	mg = tvd->vdev_mg;
	mc = mg->mg_class;
	old_ms_count = tvd->vdev_ms_count;
	old_class_space = metaslab_class_get_space(mc);

	/*
	* Determine the size of the first leaf vdev associated with
	* our top-level device.
	*/
	vd = vdev_walk_tree(tvd, NULL, NULL);
	ASSERT3P(vd, !=, NULL);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	psize = vd->vdev_psize;

	/*
	* We only try to expand the vdev if it's healthy, less than 4x its
	* original size, and it has a valid psize.
	*/
	if (tvd->vdev_state != VDEV_STATE_HEALTHY \|\|
	psize == 0 \|\| psize >= 4 * ztest_opts.zo_vdev_size) {
	spa_config_exit(spa, SCL_STATE, spa);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	return;
	}
	ASSERT(psize > 0);
	newsize = psize + psize / 8;
	ASSERT3U(newsize, >, psize);

	if (ztest_opts.zo_verbose >= 6) {
	(void) printf("Expanding LUN %s from %lu to %lu\n",
	vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
	}

	/*
	* Growing the vdev is a two step process:
	* 1). expand the physical size (i.e. relabel)
	* 2). online the vdev to create the new metaslabs
	*
	* Either walk returning non-NULL means a callback gave up.
	*/
	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL \|\|
	vdev_walk_tree(tvd, online_vdev, NULL) != NULL \|\|
	tvd->vdev_state != VDEV_STATE_HEALTHY) {
	if (ztest_opts.zo_verbose >= 5) {
	(void) printf("Could not expand LUN because "
	"the vdev configuration changed.\n");
	}
	spa_config_exit(spa, SCL_STATE, spa);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	return;
	}

	spa_config_exit(spa, SCL_STATE, spa);

	/*
	* Expanding the LUN will update the config asynchronously,
	* thus we must wait for the async thread to complete any
	* pending tasks before proceeding.
	*/
	for (;;) {
	boolean_t done;
	mutex_enter(&spa->spa_async_lock);
	done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
	mutex_exit(&spa->spa_async_lock);
	if (done)
	break;
	txg_wait_synced(spa_get_dsl(spa), 0);
	(void) poll(NULL, 0, 100);
	}

	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/* Re-read the top-level vdev; the lock was dropped in between. */
	tvd = spa->spa_root_vdev->vdev_child[top];
	new_ms_count = tvd->vdev_ms_count;
	new_class_space = metaslab_class_get_space(mc);

	if (tvd->vdev_mg != mg \|\| mg->mg_class != mc) {
	if (ztest_opts.zo_verbose >= 5) {
	(void) printf("Could not verify LUN expansion due to "
	"intervening vdev offline or remove.\n");
	}
	spa_config_exit(spa, SCL_STATE, spa);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	return;
	}

	/*
	* Make sure we were able to grow the vdev.
	* NOTE(review): the message prints the old count first, then the new.
	*/
	- if (new_ms_count <= old_ms_count)
	- fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
	+ if (new_ms_count <= old_ms_count) {
	+ fatal(0, "LUN expansion failed: ms_count %llu < %llu\n",
	old_ms_count, new_ms_count);
	+ }

	/*
	* Make sure we were able to grow the pool.
	* NOTE(review): the message prints the old space first, then the new.
	*/
	- if (new_class_space <= old_class_space)
	- fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
	+ if (new_class_space <= old_class_space) {
	+ fatal(0, "LUN expansion failed: class_space %llu < %llu\n",
	old_class_space, new_class_space);
	+ }

	if (ztest_opts.zo_verbose >= 5) {
	char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];

	nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
	nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
	(void) printf("%s grew from %s to %s\n",
	spa->spa_name, oldnumbuf, newnumbuf);
	}

	spa_config_exit(spa, SCL_STATE, spa);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	}

	/*
	* Verify that dmu_objset_{create,destroy,open,close} work as expected.
	*
	* This is the creation callback that ztest_dataset_create() hands to
	* dmu_objset_create(); it claims the ZTEST_DIROBJ object as a ZAP in the
	* new objset, within the supplied tx.
	*
	* NOTE(review): the parameter list shows no '*' on os, arg, cr and tx;
	* this text looks like a rendering that dropped them.
	*/
	/* ARGSUSED */
	static void
	ztest_objset_create_cb(objset_t os, void arg, cred_t cr, dmu_tx_t tx)
	{
	/*
	* Create the objects common to all ztest datasets.
	*/
	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
	DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
	}

	/*
	 * Create the dataset dsname and, when the random draw taken up front
	 * is 80 or higher, switch it to sync=always.
	 *
	 * Returns the dmu_objset_create() error if creation fails, 0 if the
	 * dataset was created and left alone, and otherwise the result of
	 * setting the sync property.
	 */
	static int
	ztest_dataset_create(char *dsname)
	{
		/* Drawn before the create so the random sequence is unchanged. */
		uint64_t roll = ztest_random(100);
		int rc;

		rc = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
		    ztest_objset_create_cb, NULL);
		if (rc != 0)
			return (rc);

		if (roll < 80)
			return (0);

		if (ztest_opts.zo_verbose >= 6) {
			(void) printf("Setting dataset %s to sync always\n",
			    dsname);
		}

		return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
		    ZFS_SYNC_ALWAYS, B_FALSE));
	}

	/*
	* dmu_objset_find() callback (see ztest_dmu_objset_create_destroy())
	* that sanity-checks one dataset or snapshot and then destroys it.
	* Names containing '@' are destroyed as snapshots, everything else as a
	* head dataset.  Always returns 0; failures trip VERIFY/ASSERT instead.
	*
	* NOTE(review): the parameter list shows no '*' on name and arg; this
	* text looks like a rendering that dropped them.
	*/
	/* ARGSUSED */
	static int
	ztest_objset_destroy_cb(const char name, void arg)
	{
	objset_t *os;
	dmu_object_info_t doi;
	int error;

	/*
	* Verify that the dataset contains a directory object.
	*/
	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
	if (error != ENOENT) {
	/* We could have crashed in the middle of destroying it */
	ASSERT0(error);
	ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
	ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
	}
	dmu_objset_disown(os, FTAG);

	/*
	* Destroy the dataset.
	*/
	if (strchr(name, '@') != NULL) {
	VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
	} else {
	VERIFY0(dsl_destroy_head(name));
	}
	return (0);
	}

	/*
	 * Take the snapshot osname@<id>.
	 *
	 * Returns B_FALSE only when the pool reported ENOSPC (which is also
	 * recorded); success and EEXIST both yield B_TRUE.  Any other error
	 * is reported through fatal().
	 */
	static boolean_t
	ztest_snapshot_create(char *osname, uint64_t id)
	{
		char shortname[ZFS_MAX_DATASET_NAME_LEN];
		int rc;

		(void) snprintf(shortname, sizeof (shortname), "%llu",
		    (u_longlong_t)id);

		rc = dmu_objset_snapshot_one(osname, shortname);
		switch (rc) {
		case 0:
		case EEXIST:
			break;
		case ENOSPC:
			ztest_record_enospc(FTAG);
			return (B_FALSE);
		default:
			fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
			    shortname, rc);
			break;
		}

		return (B_TRUE);
	}

	/*
	 * Destroy the snapshot osname@<id>.  A missing snapshot (ENOENT) is
	 * not an error; any other failure is reported through fatal().
	 * Always returns B_TRUE.
	 */
	static boolean_t
	ztest_snapshot_destroy(char *osname, uint64_t id)
	{
		char fullname[ZFS_MAX_DATASET_NAME_LEN];
		int rc;

		(void) snprintf(fullname, sizeof (fullname), "%s@%llu", osname,
		    (u_longlong_t)id);

		rc = dsl_destroy_snapshot(fullname, B_FALSE);
		switch (rc) {
		case 0:
		case ENOENT:
			break;
		default:
			fatal(0, "ztest_snapshot_destroy(%s) = %d", fullname, rc);
			break;
		}

		return (B_TRUE);
	}

	/*
	* Exercise objset creation and destruction on the scratch dataset
	* "<pool>/temp_<id>": optionally replay and then destroy any leftover
	* instance, create it afresh, do some object I/O and snapshots in it,
	* and check the EEXIST / hold / EBUSY behaviour of an owned objset.
	* ztest_name_lock is held as reader throughout.
	*
	* NOTE(review): os and os2 are used as pointers below but are declared
	* without a pointer declarator; this text looks like a rendering that
	* dropped some '*' characters.
	*/
	/* ARGSUSED */
	void
	ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
	{
	ztest_ds_t zdtmp;
	int iters;
	int error;
	objset_t os, os2;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	zilog_t *zilog;

	(void) rw_rdlock(&ztest_name_lock);

	(void) snprintf(name, sizeof (name), "%s/temp_%llu",
	ztest_opts.zo_pool, (u_longlong_t)id);

	/*
	* If this dataset exists from a previous run, process its replay log
	* half of the time. If we don't replay it, then dmu_objset_destroy()
	* (invoked from ztest_objset_destroy_cb()) should just throw it away.
	*/
	if (ztest_random(2) == 0 &&
	dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
	ztest_zd_init(&zdtmp, NULL, os);
	zil_replay(os, &zdtmp, ztest_replay_vector);
	ztest_zd_fini(&zdtmp);
	dmu_objset_disown(os, FTAG);
	}

	/*
	* There may be an old instance of the dataset we're about to
	* create lying around from a previous run. If so, destroy it
	* and all of its snapshots.
	*/
	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
	DS_FIND_CHILDREN \| DS_FIND_SNAPSHOTS);

	/*
	* Verify that the destroyed dataset is no longer in the namespace.
	*/
	VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
	FTAG, &os));

	/*
	* Verify that we can create a new dataset.
	* Running out of space is recorded and ends the test quietly.
	*/
	error = ztest_dataset_create(name);
	if (error) {
	if (error == ENOSPC) {
	ztest_record_enospc(FTAG);
	(void) rw_unlock(&ztest_name_lock);
	return;
	}
	fatal(0, "dmu_objset_create(%s) = %d", name, error);
	}

	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));

	ztest_zd_init(&zdtmp, NULL, os);

	/*
	* Open the intent log for it.
	*/
	zilog = zil_open(os, ztest_get_data);

	/*
	* Put some objects in there, do a little I/O to them,
	* and randomly take a couple of snapshots along the way.
	*/
	iters = ztest_random(5);
	for (int i = 0; i < iters; i++) {
	ztest_dmu_object_alloc_free(&zdtmp, id);
	if (ztest_random(iters) == 0)
	(void) ztest_snapshot_create(name, i);
	}

	/*
	* Verify that we cannot create an existing dataset.
	*/
	VERIFY3U(EEXIST, ==,
	dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));

	/*
	* Verify that we can hold an objset that is also owned.
	*/
	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
	dmu_objset_rele(os2, FTAG);

	/*
	* Verify that we cannot own an objset that is already owned.
	*/
	VERIFY3U(EBUSY, ==,
	dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));

	zil_close(zilog);
	dmu_objset_disown(os, FTAG);
	ztest_zd_fini(&zdtmp);

	(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
	 *
	 * Under ztest_name_lock (reader), drop the snapshot <dataset>@<id> if
	 * it exists and then take it again.  Both helper results are ignored.
	 */
	void
	ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
	{
		char *dsname;

		(void) rw_rdlock(&ztest_name_lock);

		dsname = zd->zd_name;
		(void) ztest_snapshot_destroy(dsname, id);
		(void) ztest_snapshot_create(dsname, id);

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Cleanup non-standard snapshots and clones.
	 *
	 * Destroys, in dependency order, the clones and snapshots that
	 * ztest_dsl_dataset_promote_busy() creates under osname for this id:
	 * clone2, snap3, snap2, clone1 and finally snap1.  A missing entry
	 * (ENOENT) is fine; any other failure is reported through fatal().
	 */
	void
	ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
	{
		char snap1name[ZFS_MAX_DATASET_NAME_LEN];
		char clone1name[ZFS_MAX_DATASET_NAME_LEN];
		char snap2name[ZFS_MAX_DATASET_NAME_LEN];
		char clone2name[ZFS_MAX_DATASET_NAME_LEN];
		char snap3name[ZFS_MAX_DATASET_NAME_LEN];
		int error;

		/*
		 * id is a uint64_t; cast it so the argument really matches %llu,
		 * as the other snprintf() callers in this file do.
		 */
		(void) snprintf(snap1name, sizeof (snap1name),
		    "%s@s1_%llu", osname, (u_longlong_t)id);
		(void) snprintf(clone1name, sizeof (clone1name),
		    "%s/c1_%llu", osname, (u_longlong_t)id);
		(void) snprintf(snap2name, sizeof (snap2name),
		    "%s@s2_%llu", clone1name, (u_longlong_t)id);
		(void) snprintf(clone2name, sizeof (clone2name),
		    "%s/c2_%llu", osname, (u_longlong_t)id);
		(void) snprintf(snap3name, sizeof (snap3name),
		    "%s@s3_%llu", clone1name, (u_longlong_t)id);

		error = dsl_destroy_head(clone2name);
		if (error && error != ENOENT)
			fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
		error = dsl_destroy_snapshot(snap3name, B_FALSE);
		if (error && error != ENOENT)
			fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
		error = dsl_destroy_snapshot(snap2name, B_FALSE);
		if (error && error != ENOENT)
			fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
		error = dsl_destroy_head(clone1name);
		if (error && error != ENOENT)
			fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
		error = dsl_destroy_snapshot(snap1name, B_FALSE);
		if (error && error != ENOENT)
			fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
	}

	/*
	 * Verify dsl_dataset_promote handles EBUSY
	 *
	 * Builds the chain  osname@s1 -> clone1 -> {clone1@s2, clone1@s3} ->
	 * clone2 (cloned from clone1@s3), takes ownership of clone1@s2 and then
	 * tries to promote clone2, which must fail with EBUSY.  ENOSPC at any
	 * step is recorded and ends the test early.  Everything created here is
	 * removed again via ztest_dsl_dataset_cleanup(), both before and after.
	 * ztest_name_lock is held as reader throughout.
	 */
	void
	ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os;
		char snap1name[ZFS_MAX_DATASET_NAME_LEN];
		char clone1name[ZFS_MAX_DATASET_NAME_LEN];
		char snap2name[ZFS_MAX_DATASET_NAME_LEN];
		char clone2name[ZFS_MAX_DATASET_NAME_LEN];
		char snap3name[ZFS_MAX_DATASET_NAME_LEN];
		char *osname = zd->zd_name;
		int error;

		(void) rw_rdlock(&ztest_name_lock);

		ztest_dsl_dataset_cleanup(osname, id);

		/*
		 * id is a uint64_t; cast it so the argument really matches %llu,
		 * as the other snprintf() callers in this file do.
		 */
		(void) snprintf(snap1name, sizeof (snap1name),
		    "%s@s1_%llu", osname, (u_longlong_t)id);
		(void) snprintf(clone1name, sizeof (clone1name),
		    "%s/c1_%llu", osname, (u_longlong_t)id);
		(void) snprintf(snap2name, sizeof (snap2name),
		    "%s@s2_%llu", clone1name, (u_longlong_t)id);
		(void) snprintf(clone2name, sizeof (clone2name),
		    "%s/c2_%llu", osname, (u_longlong_t)id);
		(void) snprintf(snap3name, sizeof (snap3name),
		    "%s@s3_%llu", clone1name, (u_longlong_t)id);

		/* The snapshot API takes the short name, i.e. the part after '@'. */
		error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
		if (error && error != EEXIST) {
			if (error == ENOSPC) {
				ztest_record_enospc(FTAG);
				goto out;
			}
			fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap1name,
			    error);
		}

		error = dmu_objset_clone(clone1name, snap1name);
		if (error) {
			if (error == ENOSPC) {
				ztest_record_enospc(FTAG);
				goto out;
			}
			fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
		}

		error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
		if (error && error != EEXIST) {
			if (error == ENOSPC) {
				ztest_record_enospc(FTAG);
				goto out;
			}
			fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap2name,
			    error);
		}

		error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
		if (error && error != EEXIST) {
			if (error == ENOSPC) {
				ztest_record_enospc(FTAG);
				goto out;
			}
			fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap3name,
			    error);
		}

		error = dmu_objset_clone(clone2name, snap3name);
		if (error) {
			if (error == ENOSPC) {
				ztest_record_enospc(FTAG);
				goto out;
			}
			fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
		}

		/* Own snap2 so that the promote below has something to trip on. */
		error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
		if (error)
			fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
		error = dsl_dataset_promote(clone2name, NULL);
		if (error == ENOSPC) {
			dmu_objset_disown(os, FTAG);
			ztest_record_enospc(FTAG);
			goto out;
		}
		if (error != EBUSY)
			fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
			    error);
		dmu_objset_disown(os, FTAG);

	out:
		ztest_dsl_dataset_cleanup(osname, id);

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Verify that dmu_object_{alloc,free} work as expected.
	 *
	 * Describes a batch of four objects, has ztest_object_init() replace
	 * the previous batch with a new one, and then issues ztest_io() against
	 * randomly chosen members of the batch until a random draw ends the
	 * loop.
	 */
	void
	ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
	{
		ztest_od_t od[4];
		int nobjs = sizeof (od) / sizeof (od[0]);
		int slot = 0;

		while (slot < nobjs) {
			ztest_od_init(&od[slot], id, FTAG, slot,
			    DMU_OT_UINT64_OTHER, 0, 0);
			slot++;
		}

		/*
		 * Destroy the previous batch of objects, create a new batch,
		 * and do some I/O on the new objects.
		 */
		if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
			return;

		for (;;) {
			if (ztest_random(4 * nobjs) == 0)
				break;
			ztest_io(zd, od[ztest_random(nobjs)].od_object,
			    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
		}
	}

	/*
	* Verify that dmu_{read,write} work as expected.
	*
	* NOTE(review): several locals (packbuf, bigbuf, pack, bigH) are used as
	* pointers below but are declared without a pointer declarator, and the
	* casts read "(bufwad_t )" / "(char )"; this text looks like a rendering
	* that dropped some '*' characters.  Confirm against the real source.
	*/
	void
	ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
	{
	objset_t *os = zd->zd_os;
	ztest_od_t od[2];
	dmu_tx_t *tx;
	int i, freeit, error;
	uint64_t n, s, txg;
	bufwad_t packbuf, bigbuf, pack, bigH, *bigT;
	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
	uint64_t regions = 997;
	uint64_t stride = 123456789ULL;
	uint64_t width = 40;
	int free_percent = 5;

	/*
	* This test uses two objects, packobj and bigobj, that are always
	* updated together (i.e. in the same tx) so that their contents are
	* in sync and can be compared. Their contents relate to each other
	* in a simple way: packobj is a dense array of 'bufwad' structures,
	* while bigobj is a sparse array of the same bufwads. Specifically,
	* for any index n, there are three bufwads that should be identical:
	*
	* packobj, at offset n * sizeof (bufwad_t)
	* bigobj, at the head of the nth chunk
	* bigobj, at the tail of the nth chunk
	*
	* The chunk size is arbitrary. It doesn't have to be a power of two,
	* and it doesn't have any relation to the object blocksize.
	* The only requirement is that it can hold at least two bufwads.
	*
	* Normally, we write the bufwad to each of these locations.
	* However, free_percent of the time we instead write zeroes to
	* packobj and perform a dmu_free_range() on bigobj. By comparing
	* bigobj to packobj, we can verify that the DMU is correctly
	* tracking which parts of an object are allocated and free,
	* and that the contents of the allocated blocks are correct.
	*/

	/*
	* Read the directory info. If it's the first time, set things up.
	*/
	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
	return;

	/*
	* The chunk size actually in effect is taken back from od_gen, which
	* was passed the randomly chosen chunksize above.
	*/
	bigobj = od[0].od_object;
	packobj = od[1].od_object;
	chunksize = od[0].od_gen;
	ASSERT(chunksize == od[1].od_gen);

	/*
	* Prefetch a random chunk of the big object.
	* Our aim here is to get some async reads in flight
	* for blocks that we may free below; the DMU should
	* handle this race correctly.
	*/
	n = ztest_random(regions) * stride + ztest_random(width);
	s = 1 + ztest_random(2 * width - 1);
	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
	ZIO_PRIORITY_SYNC_READ);

	/*
	* Pick a random index and compute the offsets into packobj and bigobj.
	*/
	n = ztest_random(regions) * stride + ztest_random(width);
	s = 1 + ztest_random(width - 1);

	packoff = n * sizeof (bufwad_t);
	packsize = s * sizeof (bufwad_t);

	bigoff = n * chunksize;
	bigsize = s * chunksize;

	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);

	/*
	* free_percent of the time, free a range of bigobj rather than
	* overwriting it.
	*/
	freeit = (ztest_random(100) < free_percent);

	/*
	* Read the current contents of our objects.
	*/
	error = dmu_read(os, packobj, packoff, packsize, packbuf,
	DMU_READ_PREFETCH);
	ASSERT0(error);
	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
	DMU_READ_PREFETCH);
	ASSERT0(error);

	/*
	* Get a tx for the mods to both packobj and bigobj.
	*/
	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, packobj, packoff, packsize);

	if (freeit)
	dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
	else
	dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);

	/* This accounts for setting the checksum/compression. */
	dmu_tx_hold_bonus(tx, bigobj);

	/* A txg of 0 means the tx could not be assigned; give up quietly. */
	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
	if (txg == 0) {
	umem_free(packbuf, packsize);
	umem_free(bigbuf, bigsize);
	return;
	}

	/* Redraw until the random checksum is below the legacy limit. */
	enum zio_checksum cksum;
	do {
	cksum = (enum zio_checksum)
	ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
	dmu_object_set_checksum(os, bigobj, cksum, tx);

	/* Likewise for the compression function. */
	enum zio_compress comp;
	do {
	comp = (enum zio_compress)
	ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
	dmu_object_set_compress(os, bigobj, comp, tx);

	/*
	* For each index from n to n + s, verify that the existing bufwad
	* in packobj matches the bufwads at the head and tail of the
	* corresponding chunk in bigobj. Then update all three bufwads
	* with the new values we want to write out.
	*/
	for (i = 0; i < s; i++) {
	/* LINTED */
	pack = (bufwad_t )((char )packbuf + i * sizeof (bufwad_t));
	/* LINTED */
	bigH = (bufwad_t )((char )bigbuf + i * chunksize);
	/* LINTED */
	bigT = (bufwad_t )((char )bigH + chunksize) - 1;

	ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
	ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);

	if (pack->bw_txg > txg)
	fatal(0, "future leak: got %llx, open txg is %llx",
	pack->bw_txg, txg);

	if (pack->bw_data != 0 && pack->bw_index != n + i)
	fatal(0, "wrong index: got %llx, wanted %llx+%llx",
	pack->bw_index, n, i);

	if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
	fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);

	if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
	fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);

	if (freeit) {
	bzero(pack, sizeof (bufwad_t));
	} else {
	pack->bw_index = n + i;
	pack->bw_txg = txg;
	pack->bw_data = 1 + ztest_random(-2ULL);
	}
	/*
	* NOTE(review): presumably struct copies through the pointers
	* (the '*' on both sides looks stripped); as printed these two
	* statements would only reassign the local variables.
	*/
	bigH = pack;
	bigT = pack;
	}

	/*
	* We've verified all the old bufwads, and made new ones.
	* Now write them out.
	*/
	dmu_write(os, packobj, packoff, packsize, packbuf, tx);

	if (freeit) {
	if (ztest_opts.zo_verbose >= 7) {
	(void) printf("freeing offset %llx size %llx"
	" txg %llx\n",
	(u_longlong_t)bigoff,
	(u_longlong_t)bigsize,
	(u_longlong_t)txg);
	}
	VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
	} else {
	if (ztest_opts.zo_verbose >= 7) {
	(void) printf("writing offset %llx size %llx"
	" txg %llx\n",
	(u_longlong_t)bigoff,
	(u_longlong_t)bigsize,
	(u_longlong_t)txg);
	}
	dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
	}

	dmu_tx_commit(tx);

	/*
	* Sanity check the stuff we just wrote: read both ranges back and
	* compare them with the buffers that were handed to the DMU.
	*/
	{
	void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
	void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);

	VERIFY(0 == dmu_read(os, packobj, packoff,
	packsize, packcheck, DMU_READ_PREFETCH));
	VERIFY(0 == dmu_read(os, bigobj, bigoff,
	bigsize, bigcheck, DMU_READ_PREFETCH));

	ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
	ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);

	umem_free(packcheck, packsize);
	umem_free(bigcheck, bigsize);
	}

	umem_free(packbuf, packsize);
	umem_free(bigbuf, bigsize);
	}

	/*
	* Check s consecutive bufwads, starting at index n, in packbuf against
	* the head and tail bufwads of the matching chunks in bigbuf, then stamp
	* each with a new index, txg and random data value.  Any inconsistency is
	* reported through fatal().
	*
	* s         number of bufwads / chunks to process
	* packbuf   dense buffer of s bufwads
	* bigbuf    buffer of s chunks of chunksize bytes each
	* bigsize   size of bigbuf in bytes (used only by the bounds ASSERTs)
	* n         index of the first bufwad
	* chunksize size of one bigbuf chunk in bytes
	* txg       open txg; no existing bufwad may carry a later one
	*
	* NOTE(review): packbuf and bigbuf are used as pointers below but the
	* parameter list shows no '*', and the casts read "(bufwad_t )" /
	* "(char )"; this text looks like a rendering that dropped some '*'.
	*/
	void
	compare_and_update_pbbufs(uint64_t s, bufwad_t packbuf, bufwad_t bigbuf,
	uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
	{
	uint64_t i;
	bufwad_t *pack;
	bufwad_t *bigH;
	bufwad_t *bigT;

	/*
	* For each index from n to n + s, verify that the existing bufwad
	* in packobj matches the bufwads at the head and tail of the
	* corresponding chunk in bigobj. Then update all three bufwads
	* with the new values we want to write out.
	*/
	for (i = 0; i < s; i++) {
	/* LINTED */
	pack = (bufwad_t )((char )packbuf + i * sizeof (bufwad_t));
	/* LINTED */
	bigH = (bufwad_t )((char )bigbuf + i * chunksize);
	/* LINTED */
	bigT = (bufwad_t )((char )bigH + chunksize) - 1;

	ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
	ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);

	if (pack->bw_txg > txg)
	fatal(0, "future leak: got %llx, open txg is %llx",
	pack->bw_txg, txg);

	if (pack->bw_data != 0 && pack->bw_index != n + i)
	fatal(0, "wrong index: got %llx, wanted %llx+%llx",
	pack->bw_index, n, i);

	if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
	fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);

	if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
	fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);

	pack->bw_index = n + i;
	pack->bw_txg = txg;
	pack->bw_data = 1 + ztest_random(-2ULL);

	/*
	* NOTE(review): presumably struct copies through the pointers
	* (the '*' on both sides looks stripped); as printed these two
	* statements would only reassign the local variables.
	*/
	bigH = pack;
	bigT = pack;
	}
	}

	/*
	 * Exercise the zero-copy write path: arcbufs obtained with
	 * dmu_request_arcbuf() are filled and handed to dmu_assign_arcbuf(),
	 * after which both objects are read back and compared with what was
	 * written.
	 *
	 * zd - dataset under test; both objects live in zd->zd_os.
	 * id - test identifier, forwarded to ztest_od_init().
	 *
	 * Returns early when ztest_object_init() fails, or when
	 * ztest_tx_assign() returns 0; in the latter case every buffer acquired
	 * up to that point is released first.
	 */
	void
	ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os = zd->zd_os;
		ztest_od_t od[2];
		dmu_tx_t *tx;
		uint64_t i;
		int error;
		uint64_t n, s, txg;
		/* Both are heap buffers (umem_zalloc), so they must be pointers. */
		bufwad_t *packbuf, *bigbuf;
		uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
		uint64_t blocksize = ztest_random_blocksize();
		uint64_t chunksize = blocksize;
		uint64_t regions = 997;
		uint64_t stride = 123456789ULL;
		uint64_t width = 9;
		dmu_buf_t *bonus_db;
		arc_buf_t **bigbuf_arcbufs;
		dmu_object_info_t doi;

		/*
		 * This test uses two objects, packobj and bigobj, that are always
		 * updated together (i.e. in the same tx) so that their contents are
		 * in sync and can be compared.  Their contents relate to each other
		 * in a simple way: packobj is a dense array of 'bufwad' structures,
		 * while bigobj is a sparse array of the same bufwads.  Specifically,
		 * for any index n, there are three bufwads that should be identical:
		 *
		 *	packobj, at offset n * sizeof (bufwad_t)
		 *	bigobj, at the head of the nth chunk
		 *	bigobj, at the tail of the nth chunk
		 *
		 * The chunk size is set equal to bigobj block size so that
		 * dmu_assign_arcbuf() can be tested for object updates.
		 */

		/*
		 * Read the directory info.  If it's the first time, set things up.
		 */
		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
		ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);

		if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
			return;

		bigobj = od[0].od_object;
		packobj = od[1].od_object;
		blocksize = od[0].od_blocksize;
		chunksize = blocksize;
		ASSERT(chunksize == od[1].od_gen);

		VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
		VERIFY(ISP2(doi.doi_data_block_size));
		VERIFY(chunksize == doi.doi_data_block_size);
		VERIFY(chunksize >= 2 * sizeof (bufwad_t));

		/*
		 * Pick a random index and compute the offsets into packobj and bigobj.
		 */
		n = ztest_random(regions) * stride + ztest_random(width);
		s = 1 + ztest_random(width - 1);

		packoff = n * sizeof (bufwad_t);
		packsize = s * sizeof (bufwad_t);

		bigoff = n * chunksize;
		bigsize = s * chunksize;

		packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
		bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);

		VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));

		/*
		 * Sized for the worst case (iteration 5), which uses two
		 * half-chunk arcbufs per chunk.
		 */
		bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);

		/*
		 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
		 * Iteration 1 test zcopy to already referenced dbufs.
		 * Iteration 2 test zcopy to dirty dbuf in the same txg.
		 * Iteration 3 test zcopy to dbuf dirty in previous txg.
		 * Iteration 4 test zcopy when dbuf is no longer dirty.
		 * Iteration 5 test zcopy when it can't be done.
		 * Iteration 6 one more zcopy write.
		 */
		for (i = 0; i < 7; i++) {
			uint64_t j;
			uint64_t off;

			/*
			 * In iteration 5 (i == 5) use arcbufs
			 * that don't match bigobj blksz to test
			 * dmu_assign_arcbuf() when it can't directly
			 * assign an arcbuf to a dbuf.
			 */
			for (j = 0; j < s; j++) {
				if (i != 5) {
					bigbuf_arcbufs[j] =
					    dmu_request_arcbuf(bonus_db, chunksize);
				} else {
					bigbuf_arcbufs[2 * j] =
					    dmu_request_arcbuf(bonus_db, chunksize / 2);
					bigbuf_arcbufs[2 * j + 1] =
					    dmu_request_arcbuf(bonus_db, chunksize / 2);
				}
			}

			/*
			 * Get a tx for the mods to both packobj and bigobj.
			 */
			tx = dmu_tx_create(os);

			dmu_tx_hold_write(tx, packobj, packoff, packsize);
			dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);

			txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
			if (txg == 0) {
				/*
				 * No tx: give back everything acquired so far,
				 * including the arcbufs requested above.
				 */
				umem_free(packbuf, packsize);
				umem_free(bigbuf, bigsize);
				for (j = 0; j < s; j++) {
					if (i != 5) {
						dmu_return_arcbuf(bigbuf_arcbufs[j]);
					} else {
						dmu_return_arcbuf(
						    bigbuf_arcbufs[2 * j]);
						dmu_return_arcbuf(
						    bigbuf_arcbufs[2 * j + 1]);
					}
				}
				umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
				dmu_buf_rele(bonus_db, FTAG);
				return;
			}

			/*
			 * 50% of the time don't read objects in the 1st iteration to
			 * test dmu_assign_arcbuf() for the case when there're no
			 * existing dbufs for the specified offsets.
			 */
			if (i != 0 || ztest_random(2) != 0) {
				error = dmu_read(os, packobj, packoff,
				    packsize, packbuf, DMU_READ_PREFETCH);
				ASSERT0(error);
				error = dmu_read(os, bigobj, bigoff, bigsize,
				    bigbuf, DMU_READ_PREFETCH);
				ASSERT0(error);
			}
			compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
			    n, chunksize, txg);

			/*
			 * We've verified all the old bufwads, and made new ones.
			 * Now write them out.
			 */
			dmu_write(os, packobj, packoff, packsize, packbuf, tx);
			if (ztest_opts.zo_verbose >= 7) {
				(void) printf("writing offset %llx size %llx"
				    " txg %llx\n",
				    (u_longlong_t)bigoff,
				    (u_longlong_t)bigsize,
				    (u_longlong_t)txg);
			}
			for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
				dmu_buf_t *dbt;
				if (i != 5) {
					bcopy((caddr_t)bigbuf + (off - bigoff),
					    bigbuf_arcbufs[j]->b_data, chunksize);
				} else {
					bcopy((caddr_t)bigbuf + (off - bigoff),
					    bigbuf_arcbufs[2 * j]->b_data,
					    chunksize / 2);
					bcopy((caddr_t)bigbuf + (off - bigoff) +
					    chunksize / 2,
					    bigbuf_arcbufs[2 * j + 1]->b_data,
					    chunksize / 2);
				}

				/*
				 * Iteration 1 keeps a hold on the dbuf across
				 * the assignment ("already referenced dbufs").
				 */
				if (i == 1) {
					VERIFY(dmu_buf_hold(os, bigobj, off,
					    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
				}
				if (i != 5) {
					dmu_assign_arcbuf(bonus_db, off,
					    bigbuf_arcbufs[j], tx);
				} else {
					dmu_assign_arcbuf(bonus_db, off,
					    bigbuf_arcbufs[2 * j], tx);
					dmu_assign_arcbuf(bonus_db,
					    off + chunksize / 2,
					    bigbuf_arcbufs[2 * j + 1], tx);
				}
				if (i == 1) {
					dmu_buf_rele(dbt, FTAG);
				}
			}
			dmu_tx_commit(tx);

			/*
			 * Sanity check the stuff we just wrote.
			 */
			{
				void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
				void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);

				VERIFY(0 == dmu_read(os, packobj, packoff,
				    packsize, packcheck, DMU_READ_PREFETCH));
				VERIFY(0 == dmu_read(os, bigobj, bigoff,
				    bigsize, bigcheck, DMU_READ_PREFETCH));

				ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
				ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);

				umem_free(packcheck, packsize);
				umem_free(bigcheck, bigsize);
			}
			/*
			 * Move to the txg state the next iteration is meant
			 * to test (see the iteration table above).
			 */
			if (i == 2) {
				txg_wait_open(dmu_objset_pool(os), 0);
			} else if (i == 3) {
				txg_wait_synced(dmu_objset_pool(os), 0);
			}
		}

		dmu_buf_rele(bonus_db, FTAG);
		umem_free(packbuf, packsize);
		umem_free(bigbuf, bigsize);
		umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
	}

	/*
	 * Issue a random number of ztest_io() calls against the shared
	 * ID_PARALLEL object at one randomly chosen large offset.
	 *
	 * zd - dataset under test.
	 * id - unused; the object descriptor is keyed by ID_PARALLEL instead.
	 */
	/* ARGSUSED */
	void
	ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
	{
		ztest_od_t od[1];
		uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
		    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
		uint64_t target;

		/*
		 * Have multiple threads write to large offsets in an object
		 * to verify that parallel writes to an object -- even to the
		 * same blocks within the object -- doesn't cause any trouble.
		 */
		ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

		if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) {
			return;
		}

		target = od[0].od_object;

		/* Keep going until the random draw comes up zero. */
		for (;;) {
			if (ztest_random(10) == 0) {
				break;
			}
			ztest_io(zd, target, offset);
		}
	}

	/*
	 * Truncate and preallocate a random range of an object, then write
	 * zero-filled blocks and issue ztest_io() calls at random block offsets
	 * inside that range.
	 *
	 * zd - dataset under test.
	 * id - test identifier, forwarded to ztest_od_init().
	 */
	void
	ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
	{
		ztest_od_t od[1];
		uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
		    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
		uint64_t count = ztest_random(20) + 1;
		uint64_t blocksize = ztest_random_blocksize();
		uint64_t target;
		void *zeroes;

		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);

		if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) {
			return;
		}

		target = od[0].od_object;

		if (ztest_truncate(zd, target, offset, count * blocksize) != 0) {
			return;
		}

		ztest_prealloc(zd, target, offset, count * blocksize);

		zeroes = umem_zalloc(blocksize, UMEM_NOFAIL);

		for (;;) {
			uint64_t randoff;

			if (ztest_random(count) == 0) {
				break;
			}

			randoff = offset + (ztest_random(count) * blocksize);

			/* Stop at the first failed write. */
			if (ztest_write(zd, target, randoff, blocksize,
			    zeroes) != 0) {
				break;
			}

			while (ztest_random(4) != 0) {
				ztest_io(zd, target, randoff);
			}
		}

		umem_free(zeroes, blocksize);
	}

	/*
	 * Verify that zap_{create,destroy,add,remove,update} work as expected.
	 */
	#define ZTEST_ZAP_MIN_INTS 1
	#define ZTEST_ZAP_MAX_INTS 4
	#define ZTEST_ZAP_MAX_PROPS 1000

	/*
	 * Three phases, each in its own tx:
	 *   1. add, look up and remove two names that are a known hash collision;
	 *   2. validate (if present) and then update a random txg_N/prop_N pair;
	 *   3. remove a random txg_N/prop_N pair if it exists.
	 *
	 * zd - dataset under test.
	 * id - test identifier, forwarded to ztest_od_init().
	 *
	 * Returns silently when the object cannot be initialized or when
	 * ztest_tx_assign() returns 0.
	 */
	void
	ztest_zap(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os = zd->zd_os;
		ztest_od_t od[1];
		uint64_t object;
		uint64_t txg, last_txg;
		uint64_t value[ZTEST_ZAP_MAX_INTS];
		uint64_t zl_ints, zl_intsize, prop;
		int i, ints;
		dmu_tx_t *tx;
		char propname[100], txgname[100];
		int error;
		const char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };

		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);

		if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
			return;

		object = od[0].od_object;

		/*
		 * Generate a known hash collision, and verify that
		 * we can lookup and remove both entries.
		 */
		tx = dmu_tx_create(os);
		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
		if (txg == 0)
			return;
		for (i = 0; i < 2; i++) {
			value[i] = i;
			VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
			    1, &value[i], tx));
		}
		for (i = 0; i < 2; i++) {
			/* A second add of the same name must be rejected. */
			VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
			    sizeof (uint64_t), 1, &value[i], tx));
			VERIFY3U(0, ==,
			    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
			ASSERT3U(zl_ints, ==, 1);
		}
		for (i = 0; i < 2; i++) {
			VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
		}
		dmu_tx_commit(tx);

		/*
		 * Generate a bunch of random entries.
		 */
		ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);

		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
		(void) snprintf(propname, sizeof (propname), "prop_%llu",
		    (u_longlong_t)prop);
		(void) snprintf(txgname, sizeof (txgname), "txg_%llu",
		    (u_longlong_t)prop);
		bzero(value, sizeof (value));
		last_txg = 0;

		/*
		 * If these zap entries already exist, validate their contents.
		 */
		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
		if (error == 0) {
			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
			ASSERT3U(zl_ints, ==, 1);

			VERIFY(zap_lookup(os, object, txgname, zl_intsize,
			    zl_ints, &last_txg) == 0);

			VERIFY(zap_length(os, object, propname, &zl_intsize,
			    &zl_ints) == 0);

			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
			ASSERT3U(zl_ints, ==, ints);

			VERIFY(zap_lookup(os, object, propname, zl_intsize,
			    zl_ints, value) == 0);

			for (i = 0; i < ints; i++) {
				ASSERT3U(value[i], ==, last_txg + object + i);
			}
		} else {
			ASSERT3U(error, ==, ENOENT);
		}

		/*
		 * Atomically update two entries in our zap object.
		 * The first is named txg_%llu, and contains the txg
		 * in which the property was last updated.  The second
		 * is named prop_%llu, and the nth element of its value
		 * should be txg + object + n.
		 */
		tx = dmu_tx_create(os);
		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
		if (txg == 0)
			return;

		/* %llu needs unsigned long long; uint64_t may be narrower-ranked. */
		if (last_txg > txg)
			fatal(0, "zap future leak: old %llu new %llu",
			    (u_longlong_t)last_txg, (u_longlong_t)txg);

		for (i = 0; i < ints; i++)
			value[i] = txg + object + i;

		VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
		    1, &txg, tx));
		VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
		    ints, value, tx));

		dmu_tx_commit(tx);

		/*
		 * Remove a random pair of entries.
		 */
		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
		(void) snprintf(propname, sizeof (propname), "prop_%llu",
		    (u_longlong_t)prop);
		(void) snprintf(txgname, sizeof (txgname), "txg_%llu",
		    (u_longlong_t)prop);

		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);

		if (error == ENOENT)
			return;

		ASSERT0(error);

		tx = dmu_tx_create(os);
		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
		if (txg == 0)
			return;
		VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
		VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
		dmu_tx_commit(tx);
	}

	/*
	 * Testcase to test the upgrading of a microzap to fatzap.
	 *
	 * zd - dataset under test.
	 * id - test identifier; also embedded in every entry name.
	 *
	 * Each entry is added in its own tx.  The function returns as soon as
	 * ztest_tx_assign() returns 0.
	 */
	void
	ztest_fzap(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os = zd->zd_os;
		ztest_od_t od[1];
		uint64_t object, txg;

		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);

		if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
			return;

		object = od[0].od_object;

		/*
		 * Add entries to this ZAP and make sure it spills over
		 * and gets upgraded to a fatzap. Also, since we are adding
		 * 2050 entries we should see ptrtbl growth and leaf-block split.
		 */
		for (int i = 0; i < 2050; i++) {
			char name[ZFS_MAX_DATASET_NAME_LEN];
			uint64_t value = i;
			dmu_tx_t *tx;
			int error;

			/* %llu needs unsigned long long, so cast the uint64_t's. */
			(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
			    (u_longlong_t)id, (u_longlong_t)value);

			tx = dmu_tx_create(os);
			dmu_tx_hold_zap(tx, object, B_TRUE, name);
			txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
			if (txg == 0)
				return;
			error = zap_add(os, object, name, sizeof (uint64_t), 1,
			    &value, tx);
			/* The entry may already be there; EEXIST is accepted. */
			ASSERT(error == 0 || error == EEXIST);
			dmu_tx_commit(tx);
		}
	}

	/*
	 * Perform one randomly chosen ZAP operation (length, lookup, add,
	 * update or remove) on a randomly named entry of the shared
	 * ID_PARALLEL ZAP object.
	 *
	 * zd - dataset under test.
	 * id - unused; the object descriptor is keyed by ID_PARALLEL instead.
	 *
	 * The three modifying operations run inside a tx; the function returns
	 * without doing anything if ztest_tx_assign() returns 0.
	 */
	/* ARGSUSED */
	void
	ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os = zd->zd_os;
		ztest_od_t od[1];
		uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
		dmu_tx_t *tx;
		int i, namelen, error;
		int micro = ztest_random(2);
		char name[20], string_value[20];
		void *data;

		ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);

		if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
			return;

		object = od[0].od_object;

		/*
		 * Generate a random name of the form 'xxx.....' where each
		 * x is a random printable character and the dots are dots.
		 * There are 94 such characters, and the name length goes from
		 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
		 */
		namelen = ztest_random(sizeof (name) - 5) + 5 + 1;

		for (i = 0; i < 3; i++)
			name[i] = '!' + ztest_random('~' - '!' + 1);
		for (; i < namelen - 1; i++)
			name[i] = '.';
		name[i] = '\0';

		/*
		 * The value shape is derived from the name length (and from
		 * 'micro'), so every thread agrees on it for a given name:
		 * either a single uint64_t (the txg) or the name itself as a
		 * byte string.
		 */
		if ((namelen & 1) || micro) {
			wsize = sizeof (txg);
			wc = 1;
			data = &txg;
		} else {
			wsize = 1;
			wc = namelen;
			data = string_value;
		}

		count = -1ULL;
		VERIFY0(zap_count(os, object, &count));
		ASSERT(count != -1ULL);

		/*
		 * Select an operation: length, lookup, add, update, remove.
		 */
		i = ztest_random(5);

		if (i >= 2) {
			tx = dmu_tx_create(os);
			dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
			txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
			if (txg == 0)
				return;
			bcopy(name, string_value, namelen);
		} else {
			tx = NULL;
			txg = 0;
			bzero(string_value, namelen);
		}

		switch (i) {

		case 0:
			error = zap_length(os, object, name, &zl_wsize, &zl_wc);
			if (error == 0) {
				ASSERT3U(wsize, ==, zl_wsize);
				ASSERT3U(wc, ==, zl_wc);
			} else {
				ASSERT3U(error, ==, ENOENT);
			}
			break;

		case 1:
			error = zap_lookup(os, object, name, wsize, wc, data);
			if (error == 0) {
				if (data == string_value &&
				    bcmp(name, data, namelen) != 0)
					fatal(0, "name '%s' != val '%s' len %d",
					    name, data, namelen);
			} else {
				ASSERT3U(error, ==, ENOENT);
			}
			break;

		case 2:
			error = zap_add(os, object, name, wsize, wc, data, tx);
			ASSERT(error == 0 || error == EEXIST);
			break;

		case 3:
			VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
			break;

		case 4:
			error = zap_remove(os, object, name, tx);
			ASSERT(error == 0 || error == ENOENT);
			break;
		}

		if (tx != NULL)
			dmu_tx_commit(tx);
	}

	/*
	* Commit callback data.
	*
	* One of these is allocated by ztest_create_cb_data() for every callback
	* registered with dmu_tx_callback_register() and is handed back to
	* ztest_commit_callback() as its argument.
	*/
	typedef struct ztest_cb_data {
	list_node_t zcd_node; /* presumably the zcl.zcl_callbacks linkage -- confirm */
	uint64_t zcd_txg; /* txg of the tx; 0 when the tx was never assigned */
	int zcd_expected_err; /* error value the callback must be invoked with */
	boolean_t zcd_added; /* set once inserted into zcl.zcl_callbacks */
	boolean_t zcd_called; /* set by ztest_commit_callback() when it runs */
	spa_t *zcd_spa; /* pool, used to query the last synced txg */
	} ztest_cb_data_t;

	/*
	 * The commit callback itself.
	 *
	 * arg   - the ztest_cb_data_t registered together with this callback.
	 * error - status the callback is invoked with; it must match
	 *         zcd_expected_err.
	 *
	 * On ECANCELED the data is left for the caller of dmu_tx_abort() to
	 * inspect and free.  Otherwise the data is unlinked from
	 * zcl.zcl_callbacks (if it had been added) and freed here.
	 */
	static void
	ztest_commit_callback(void *arg, int error)
	{
		ztest_cb_data_t *data = arg;
		uint64_t last_synced;

		VERIFY(data != NULL);
		VERIFY3S(data->zcd_expected_err, ==, error);
		VERIFY(!data->zcd_called);

		/* A callback must never run before its txg has been synced. */
		last_synced = spa_last_synced_txg(data->zcd_spa);
		if (data->zcd_txg > last_synced) {
			fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
			    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
			    last_synced);
		}

		data->zcd_called = B_TRUE;

		if (error == ECANCELED) {
			ASSERT0(data->zcd_txg);
			ASSERT(!data->zcd_added);

			/*
			 * The private callback data should be destroyed here, but
			 * since we are going to check the zcd_called field after
			 * dmu_tx_abort(), we will destroy it there.
			 */
			return;
		}

		/* Only entries that made it onto the global list need unlinking. */
		if (data->zcd_added) {
			ASSERT3U(data->zcd_txg, !=, 0);

			(void) mutex_lock(&zcl.zcl_callbacks_lock);
			list_remove(&zcl.zcl_callbacks, data);
			(void) mutex_unlock(&zcl.zcl_callbacks_lock);
		}

		umem_free(data, sizeof (ztest_cb_data_t));
	}

	/*
	 * Allocate a zero-filled ztest_cb_data_t and record the txg and the
	 * pool of 'os' in it.  The caller owns the returned structure.
	 */
	static ztest_cb_data_t *
	ztest_create_cb_data(objset_t *os, uint64_t txg)
	{
		ztest_cb_data_t *zcd = umem_zalloc(sizeof (ztest_cb_data_t),
		    UMEM_NOFAIL);

		zcd->zcd_spa = dmu_objset_spa(os);
		zcd->zcd_txg = txg;

		return (zcd);
	}

	/*
	 * If a number of txgs equal to this threshold have been created after a commit
	 * callback has been registered but not called, then we assume there is an
	 * implementation bug.
	 */
	#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)

	/*
	 * Commit callback test.
	 *
	 * Registers three commit callbacks on one tx (before the assign, after
	 * the assign, and after the abort decision), writes the txg into the
	 * object, and links the callback data into zcl.zcl_callbacks so that
	 * ztest_commit_callback() can unlink it later.  About one time in a
	 * hundred, and whenever dmu_tx_assign() fails, the tx is aborted
	 * instead and both registered callbacks are checked to have run with
	 * ECANCELED.
	 *
	 * zd - dataset under test.
	 * id - test identifier, forwarded to ztest_od_init().
	 */
	void
	ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
	{
		objset_t *os = zd->zd_os;
		ztest_od_t od[1];
		dmu_tx_t *tx;
		ztest_cb_data_t *cb_data[3], *tmp_cb;
		uint64_t old_txg, txg;
		/* Must start at 0: it is tested before dmu_tx_assign() sets it. */
		int i, error = 0;

		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

		if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
			return;

		tx = dmu_tx_create(os);

		cb_data[0] = ztest_create_cb_data(os, 0);
		dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

		dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));

		/* Every once in a while, abort the transaction on purpose */
		if (ztest_random(100) == 0)
			error = -1;

		if (!error)
			error = dmu_tx_assign(tx, TXG_NOWAIT);

		txg = error ? 0 : dmu_tx_get_txg(tx);

		cb_data[0]->zcd_txg = txg;
		cb_data[1] = ztest_create_cb_data(os, txg);
		dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

		if (error) {
			/*
			 * It's not a strict requirement to call the registered
			 * callbacks from inside dmu_tx_abort(), but that's what
			 * it's supposed to happen in the current implementation
			 * so we will check for that.
			 */
			for (i = 0; i < 2; i++) {
				cb_data[i]->zcd_expected_err = ECANCELED;
				VERIFY(!cb_data[i]->zcd_called);
			}

			dmu_tx_abort(tx);

			/* The callback leaves ECANCELED data for us to free. */
			for (i = 0; i < 2; i++) {
				VERIFY(cb_data[i]->zcd_called);
				umem_free(cb_data[i], sizeof (ztest_cb_data_t));
			}

			return;
		}

		cb_data[2] = ztest_create_cb_data(os, txg);
		dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

		/*
		 * Read existing data to make sure there isn't a future leak.
		 */
		VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
		    &old_txg, DMU_READ_PREFETCH));

		if (old_txg > txg)
			fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
			    old_txg, txg);

		dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);

		(void) mutex_lock(&zcl.zcl_callbacks_lock);

		/*
		 * Since commit callbacks don't have any ordering requirement and since
		 * it is theoretically possible for a commit callback to be called
		 * after an arbitrary amount of time has elapsed since its txg has been
		 * synced, it is difficult to reliably determine whether a commit
		 * callback hasn't been called due to high load or due to a flawed
		 * implementation.
		 *
		 * In practice, we will assume that if after a certain number of txgs a
		 * commit callback hasn't been called, then most likely there's an
		 * implementation bug..
		 */
		tmp_cb = list_head(&zcl.zcl_callbacks);
		if (tmp_cb != NULL &&
		    (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
			fatal(0, "Commit callback threshold exceeded, oldest txg: %"
			    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
		}

		/*
		 * Let's find the place to insert our callbacks.
		 *
		 * Even though the list is ordered by txg, it is possible for the
		 * insertion point to not be the end because our txg may already be
		 * quiescing at this point and other callbacks in the open txg
		 * (from other objsets) may have sneaked in.
		 */
		tmp_cb = list_tail(&zcl.zcl_callbacks);
		while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
			tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);

		/* Add the 3 callbacks to the list */
		for (i = 0; i < 3; i++) {
			if (tmp_cb == NULL)
				list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
			else
				list_insert_after(&zcl.zcl_callbacks, tmp_cb,
				    cb_data[i]);

			cb_data[i]->zcd_added = B_TRUE;
			VERIFY(!cb_data[i]->zcd_called);

			/* Chain the next entry right behind this one. */
			tmp_cb = cb_data[i];
		}

		(void) mutex_unlock(&zcl.zcl_callbacks_lock);

		dmu_tx_commit(tx);
	}

	/*
	 * Set each of a fixed list of dataset properties (checksum,
	 * compression, copies, dedup) on zd->zd_name to a value chosen by
	 * ztest_random_dsl_prop(), holding ztest_name_lock as reader.
	 * Failures of the individual sets are ignored.
	 *
	 * zd - dataset under test.
	 * id - unused.
	 */
	/* ARGSUSED */
	void
	ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
	{
		zfs_prop_t proplist[] = {
			ZFS_PROP_CHECKSUM,
			ZFS_PROP_COMPRESSION,
			ZFS_PROP_COPIES,
			ZFS_PROP_DEDUP
		};
		const int nprops = sizeof (proplist) / sizeof (proplist[0]);
		int idx = 0;

		(void) rw_rdlock(&ztest_name_lock);

		while (idx < nprops) {
			zfs_prop_t which = proplist[idx];

			(void) ztest_dsl_prop_set_uint64(zd->zd_name, which,
			    ztest_random_dsl_prop(which), (int)ztest_random(2));
			idx++;
		}

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	* Call dmu_objset_remap_indirects() on this dataset's name while holding
	* ztest_name_lock as reader. ENOSPC is tolerated (treated as success);
	* any other non-zero result trips ASSERT0().
	*
	* zd - dataset under test; only zd->zd_name is used.
	* id - unused.
	*/
	/* ARGSUSED */
	void
	+ztest_remap_blocks(ztest_ds_t *zd, uint64_t id)
	+{
	+ (void) rw_rdlock(&ztest_name_lock);
	+
	+ int error = dmu_objset_remap_indirects(zd->zd_name);
	+ if (error == ENOSPC)
	+ error = 0;
	+ ASSERT0(error);
	+
	+ (void) rw_unlock(&ztest_name_lock);
	+}
	+
	+/* ARGSUSED */
	+void
	ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
	{
	/*
	* Set the pool's dedupditto property to ZIO_DEDUPDITTO_MIN plus a
	* random amount, then fetch all pool properties with spa_prop_get()
	* and, at verbosity 6 or higher, dump them. Everything runs with
	* ztest_name_lock held as reader. Neither zd nor id is used.
	*/
	nvlist_t *props = NULL;

	(void) rw_rdlock(&ztest_name_lock);

	/* The result of the set is deliberately ignored. */
	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
	ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));

	VERIFY0(spa_prop_get(ztest_spa, &props));

	if (ztest_opts.zo_verbose >= 6)
	dump_nvlist(props, 4);

	nvlist_free(props);

	(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Release a single user hold.
	 *
	 * snapname - name of the snapshot the hold is on.
	 * holdname - tag of the hold to release.
	 *
	 * Builds the nested nvlist { snapname -> { holdname } } and passes it
	 * to dsl_dataset_user_release().  Returns that function's result.
	 */
	static int
	user_release_one(const char *snapname, const char *holdname)
	{
		nvlist_t *snaps, *holds;
		int error;

		snaps = fnvlist_alloc();
		holds = fnvlist_alloc();
		fnvlist_add_boolean(holds, holdname);
		fnvlist_add_nvlist(snaps, snapname, holds);
		/* 'holds' is freed right after being added, so 'snaps' keeps its own copy. */
		fnvlist_free(holds);
		error = dsl_dataset_user_release(snaps, NULL);
		fnvlist_free(snaps);
		return (error);
	}

	/*
	 * Test snapshot hold/release and deferred destroy.
	 *
	 * zd - dataset under test.
	 * id - test identifier; used to build the snapshot, clone and tag names.
	 *
	 * Runs with ztest_name_lock held as reader.  ENOSPC from snapshot,
	 * clone or hold creation is recorded and ends the test quietly; any
	 * other unexpected result is fatal.
	 */
	void
	ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
	{
		int error;
		objset_t *os = zd->zd_os;
		objset_t *origin;
		char snapname[100];
		char fullname[100];
		char clonename[100];
		char tag[100];
		char osname[ZFS_MAX_DATASET_NAME_LEN];
		nvlist_t *holds;

		(void) rw_rdlock(&ztest_name_lock);

		dmu_objset_name(os, osname);

		/* %llu needs unsigned long long, so cast the uint64_t id. */
		(void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
		    (u_longlong_t)id);
		(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
		(void) snprintf(clonename, sizeof (clonename),
		    "%s/ch1_%llu", osname, (u_longlong_t)id);
		(void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);

		/*
		 * Clean up from any previous run.
		 */
		error = dsl_destroy_head(clonename);
		if (error != ENOENT)
			ASSERT0(error);
		error = user_release_one(fullname, tag);
		if (error != ESRCH && error != ENOENT)
			ASSERT0(error);
		error = dsl_destroy_snapshot(fullname, B_FALSE);
		if (error != ENOENT)
			ASSERT0(error);

		/*
		 * Create snapshot, clone it, mark snap for deferred destroy,
		 * destroy clone, verify snap was also destroyed.
		 */
		error = dmu_objset_snapshot_one(osname, snapname);
		if (error) {
			if (error == ENOSPC) {
				ztest_record_enospc("dmu_objset_snapshot");
				goto out;
			}
			fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
		}

		error = dmu_objset_clone(clonename, fullname);
		if (error) {
			if (error == ENOSPC) {
				ztest_record_enospc("dmu_objset_clone");
				goto out;
			}
			fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
		}

		error = dsl_destroy_snapshot(fullname, B_TRUE);
		if (error) {
			fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
			    fullname, error);
		}

		error = dsl_destroy_head(clonename);
		if (error)
			fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);

		/* The snapshot must be gone now, so the hold has to fail. */
		error = dmu_objset_hold(fullname, FTAG, &origin);
		if (error != ENOENT)
			fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);

		/*
		 * Create snapshot, add temporary hold, verify that we can't
		 * destroy a held snapshot, mark for deferred destroy,
		 * release hold, verify snapshot was destroyed.
		 */
		error = dmu_objset_snapshot_one(osname, snapname);
		if (error) {
			if (error == ENOSPC) {
				ztest_record_enospc("dmu_objset_snapshot");
				goto out;
			}
			fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
		}

		holds = fnvlist_alloc();
		fnvlist_add_string(holds, fullname, tag);
		error = dsl_dataset_user_hold(holds, 0, NULL);
		fnvlist_free(holds);

		if (error == ENOSPC) {
			ztest_record_enospc("dsl_dataset_user_hold");
			goto out;
		} else if (error) {
			fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
			    fullname, tag, error);
		}

		error = dsl_destroy_snapshot(fullname, B_FALSE);
		if (error != EBUSY) {
			fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
			    fullname, error);
		}

		error = dsl_destroy_snapshot(fullname, B_TRUE);
		if (error) {
			fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
			    fullname, error);
		}

		error = user_release_one(fullname, tag);
		if (error)
			fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);

		VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);

	out:
		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	* Inject random faults into the on-disk data.
	*
	* Two stages:
	* 1. Under ztest_name_lock (reader) and SCL_STATE, pick a victim: either
	* leaf 0 of a random top-level vdev (which may be marked unreadable,
	* unwriteable, or have its fd closed) or a random l2cache device.
	* Then, with the locks dropped, possibly offline/online that vdev.
	* 2. Open the file behind the randomly chosen leaf and overwrite 64-bit
	* words at computed offsets with the pattern in 'bad'.
	*
	* Neither zd nor id is used.
	*/
	/* ARGSUSED */
	void
	ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
	{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	int fd;
	uint64_t offset;
	uint64_t leaves;
	uint64_t bad = 0x1990c0ffeedecadeULL;
	uint64_t top, leaf;
	char path0[MAXPATHLEN];
	char pathrand[MAXPATHLEN];
	size_t fsize;
	int bshift = SPA_MAXBLOCKSHIFT + 2;
	int iters = 1000;
	int maxfaults;
	int mirror_save;
	vdev_t *vd0 = NULL;
	uint64_t guid0 = 0;
	boolean_t islog = B_FALSE;

	/*
	* Snapshot the layout-dependent values under ztest_vdev_lock.
	* mirror_save is compared again before every write below.
	*/
	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	maxfaults = MAXFAULTS();
	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
	mirror_save = zs->zs_mirrors;
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);

	ASSERT(leaves >= 1);

	/*
	* Grab the name lock as reader. There are some operations
	* which don't like to have their vdevs changed while
	* they are in progress (i.e. spa_change_guid). Those
	* operations will have grabbed the name lock as writer.
	*/
	(void) rw_rdlock(&ztest_name_lock);

	/*
	* We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
	*/
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	if (ztest_random(2) == 0) {
	/*
	* Inject errors on a normal data device or slog device.
	*/
	top = ztest_random_vdev_top(spa, B_TRUE);
	leaf = ztest_random(leaves) + zs->zs_splits;

	/*
	* Generate paths to the first leaf in this top-level vdev,
	* and to the random leaf we selected. We'll induce transient
	* write failures and random online/offline activity on leaf 0,
	* and we'll write random garbage to the randomly chosen leaf.
	*/
	(void) snprintf(path0, sizeof (path0), ztest_dev_template,
	ztest_opts.zo_dir, ztest_opts.zo_pool,
	top * leaves + zs->zs_splits);
	(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
	ztest_opts.zo_dir, ztest_opts.zo_pool,
	top * leaves + leaf);

	vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
	if (vd0 != NULL && vd0->vdev_top->vdev_islog)
	islog = B_TRUE;

	/*
	* If the top-level vdev needs to be resilvered
	* then we only allow faults on the device that is
	* resilvering.
	*/
	if (vd0 != NULL && maxfaults != 1 &&
	(!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) \|\|
	vd0->vdev_resilver_txg != 0)) {
	/*
	* Make vd0 explicitly claim to be unreadable,
	* or unwriteable, or reach behind its back
	* and close the underlying fd. We can do this if
	* maxfaults == 0 because we'll fail and reexecute,
	* and we can do it if maxfaults >= 2 because we'll
	* have enough redundancy. If maxfaults == 1, the
	* combination of this with injection of random data
	* corruption below exceeds the pool's fault tolerance.
	*/
	vdev_file_t *vf = vd0->vdev_tsd;
	+
	+ zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
	+ (long long)vd0->vdev_id, (int)maxfaults);

	if (vf != NULL && ztest_random(3) == 0) {
	(void) close(vf->vf_vnode->v_fd);
	vf->vf_vnode->v_fd = -1;
	} else if (ztest_random(2) == 0) {
	vd0->vdev_cant_read = B_TRUE;
	} else {
	vd0->vdev_cant_write = B_TRUE;
	}
	/* guid0 stays 0 unless a fault was actually injected here. */
	guid0 = vd0->vdev_guid;
	}
	} else {
	/*
	* Inject errors on an l2cache device.
	*/
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	if (sav->sav_count == 0) {
	spa_config_exit(spa, SCL_STATE, FTAG);
	(void) rw_unlock(&ztest_name_lock);
	return;
	}
	vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
	guid0 = vd0->vdev_guid;
	(void) strcpy(path0, vd0->vdev_path);
	(void) strcpy(pathrand, vd0->vdev_path);

	leaf = 0;
	leaves = 1;
	maxfaults = INT_MAX; /* no limit on cache devices */
	}

	spa_config_exit(spa, SCL_STATE, FTAG);
	(void) rw_unlock(&ztest_name_lock);

	/*
	* If we can tolerate two or more faults, or we're dealing
	* with a slog, randomly online/offline vd0.
	*/
	if ((maxfaults >= 2 \|\| islog) && guid0 != 0) {
	if (ztest_random(10) < 6) {
	int flags = (ztest_random(2) == 0 ?
	ZFS_OFFLINE_TEMPORARY : 0);

	/*
	* We have to grab the zs_name_lock as writer to
	* prevent a race between offlining a slog and
	* destroying a dataset. Offlining the slog will
	* grab a reference on the dataset which may cause
	* dmu_objset_destroy() to fail with EBUSY thus
	* leaving the dataset in an inconsistent state.
	*/
	if (islog)
	(void) rw_wrlock(&ztest_name_lock);

	VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);

	if (islog)
	(void) rw_unlock(&ztest_name_lock);
	} else {
	/*
	* Ideally we would like to be able to randomly
	* call vdev_[on|off]line without holding locks
	* to force unpredictable failures but the side
	* effects of vdev_[on|off]line prevent us from
	* doing so. We grab the ztest_vdev_lock here to
	* prevent a race between injection testing and
	* aux_vdev removal.
	*/
	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	(void) vdev_online(spa, guid0, 0, NULL);
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	}
	}

	if (maxfaults == 0)
	return;

	/*
	* We have at least single-fault tolerance, so inject data corruption.
	*/
	fd = open(pathrand, O_RDWR);

	if (fd == -1) /* we hit a gap in the device namespace */
	return;

	/* NOTE(review): the result of lseek() is not checked for failure. */
	fsize = lseek(fd, 0, SEEK_END);

	while (--iters != 0) {
	/*
	* The offset must be chosen carefully to ensure that
	* we do not inject a given logical block with errors
	* on two different leaf devices, because ZFS can not
	* tolerate that (if maxfaults==1).
	*
	* We divide each leaf into chunks of size
	* (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk
	* there is a series of ranges to which we can inject errors.
	* Each range can accept errors on only a single leaf vdev.
	* The error injection ranges are separated by ranges
	* which we will not inject errors on any device (DMZs).
	* Each DMZ must be large enough such that a single block
	* can not straddle it, so that a single block can not be
	* a target in two different injection ranges (on different
	* leaf vdevs).
	*
	* For example, with 3 leaves, each chunk looks like:
	* 0 to 32M: injection range for leaf 0
	* 32M to 64M: DMZ - no injection allowed
	* 64M to 96M: injection range for leaf 1
	* 96M to 128M: DMZ - no injection allowed
	* 128M to 160M: injection range for leaf 2
	* 160M to 192M: DMZ - no injection allowed
	*/
	offset = ztest_random(fsize / (leaves << bshift)) *
	(leaves << bshift) + (leaf << bshift) +
	(ztest_random(1ULL << (bshift - 1)) & -8ULL);

	/*
	* Only allow damage to the labels at one end of the vdev.
	*
	* If all labels are damaged, the device will be totally
	* inaccessible, which will result in loss of data,
	* because we also damage (parts of) the other side of
	* the mirror/raidz.
	*
	* Additionally, we will always have both an even and an
	* odd label, so that we can handle crashes in the
	* middle of vdev_config_sync().
	*/
	if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
	continue;

	/*
	* The two end labels are stored at the "end" of the disk, but
	* the end of the disk (vdev_psize) is aligned to
	* sizeof (vdev_label_t).
	*/
	uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
	if ((leaf & 1) == 1 &&
	offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
	continue;

	/*
	* Stop injecting as soon as the mirror count differs from the
	* value sampled at entry; the write itself is done with
	* ztest_vdev_lock held.
	*/
	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
	if (mirror_save != zs->zs_mirrors) {
	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
	(void) close(fd);
	return;
	}

	if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
	fatal(1, "can't inject bad word at 0x%llx in %s",
	offset, pathrand);

	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);

	if (ztest_opts.zo_verbose >= 7)
	(void) printf("injected bad word into %s,"
	" offset 0x%llx\n", pathrand, (u_longlong_t)offset);
	}

	(void) close(fd);
	}

	/*
	 * Verify that DDT repair works as expected.
	 *
	 * Writes 2 * ZIO_DEDUPDITTO_MIN identical blocks with dedup enabled and
	 * copies=1, waits for the txg to sync, then deliberately overwrites
	 * the on-disk block behind offset 0 with the inverted pattern.
	 *
	 * zd - dataset under test.
	 * id - test identifier, forwarded to ztest_od_init().
	 *
	 * Returns early, releasing ztest_name_lock if held, when the object
	 * cannot be initialized, a property cannot be set, or no tx can be
	 * assigned.
	 */
	void
	ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
	{
		ztest_shared_t *zs = ztest_shared;
		spa_t *spa = ztest_spa;
		objset_t *os = zd->zd_os;
		ztest_od_t od[1];
		uint64_t object, blocksize, txg, pattern, psize;
		enum zio_checksum checksum = spa_dedup_checksum(spa);
		dmu_buf_t *db;
		dmu_tx_t *tx;
		abd_t *abd;
		blkptr_t blk;
		int copies = 2 * ZIO_DEDUPDITTO_MIN;

		blocksize = ztest_random_blocksize();
		blocksize = MIN(blocksize, 2048);	/* because we write so many */

		ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);

		if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
			return;

		/*
		 * Take the name lock as writer to prevent anyone else from changing
		 * the pool and dataset properties we need to maintain during this test.
		 */
		(void) rw_wrlock(&ztest_name_lock);

		if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
		    B_FALSE) != 0 ||
		    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
		    B_FALSE) != 0) {
			(void) rw_unlock(&ztest_name_lock);
			return;
		}

		dmu_objset_stats_t dds;
		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		dmu_objset_fast_stat(os, &dds);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

		object = od[0].od_object;
		blocksize = od[0].od_blocksize;
		pattern = zs->zs_guid ^ dds.dds_guid;

		ASSERT(object != 0);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, object, 0, copies * blocksize);
		txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
		if (txg == 0) {
			(void) rw_unlock(&ztest_name_lock);
			return;
		}

		/*
		 * Write all the copies of our block.
		 */
		for (int i = 0; i < copies; i++) {
			uint64_t offset = i * blocksize;
			int error = dmu_buf_hold(os, object, offset, FTAG, &db,
			    DMU_READ_NO_PREFETCH);
			if (error != 0) {
				fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
				    os, (long long)object, (long long) offset, error);
			}
			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == blocksize);
			/* Each block holds the pattern from a prior run, or zeroes. */
			ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
			    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
			dmu_buf_will_fill(db, tx);
			ztest_pattern_set(db->db_data, db->db_size, pattern);
			dmu_buf_rele(db, FTAG);
		}

		dmu_tx_commit(tx);
		txg_wait_synced(spa_get_dsl(spa), txg);

		/*
		 * Find out what block we got.
		 */
		VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
		    DMU_READ_NO_PREFETCH));
		/* Copy the block pointer by value; 'db' is released right after. */
		blk = *((dmu_buf_impl_t *)db)->db_blkptr;
		dmu_buf_rele(db, FTAG);

		/*
		 * Damage the block.  Dedup-ditto will save us when we read it later.
		 */
		psize = BP_GET_PSIZE(&blk);
		abd = abd_alloc_linear(psize, B_TRUE);
		ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);

		(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
		    abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));

		abd_free(abd);

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Scrub the pool.
	 *
	 * Requests a scrub, waits 100ms, and requests a second one to force a
	 * restart.  The results of both requests are ignored.
	 */
	/* ARGSUSED */
	void
	ztest_scrub(ztest_ds_t *zd, uint64_t id)
	{
		spa_t *pool = ztest_spa;
		int pass;

		for (pass = 0; pass < 2; pass++) {
			/* wait a moment before the second request */
			if (pass == 1)
				(void) poll(NULL, 0, 100);
			(void) spa_scan(pool, POOL_SCAN_SCRUB);
		}
	}

	/*
	 * Change the guid for the pool.
	 *
	 * If spa_change_guid() succeeds, verify that the pool guid differs from
	 * the one sampled beforehand while the load guid is unchanged.  A failed
	 * change is silently tolerated.
	 */
	/* ARGSUSED */
	void
	ztest_reguid(ztest_ds_t *zd, uint64_t id)
	{
		spa_t *spa = ztest_spa;
		uint64_t orig = spa_guid(spa);
		uint64_t load = spa_load_guid(spa);
		int error;

		/* The guid change itself runs under the name lock as writer. */
		(void) rw_wrlock(&ztest_name_lock);
		error = spa_change_guid(spa);
		(void) rw_unlock(&ztest_name_lock);

		if (error == 0) {
			if (ztest_opts.zo_verbose >= 4) {
				(void) printf("Changed guid old %llu -> %llu\n",
				    (u_longlong_t)orig,
				    (u_longlong_t)spa_guid(spa));
			}

			VERIFY3U(orig, !=, spa_guid(spa));
			VERIFY3U(load, ==, spa_load_guid(spa));
		}
	}

	/*
	 * Rename the pool to a different name and then rename it back.
	 *
	 * The temporary name is "<pool>_tmp".  After each rename the pool is
	 * reopened to check that it is reachable only under the expected name
	 * and is still the same spa_t.  The name lock is held as writer for the
	 * whole sequence.
	 */
	/* ARGSUSED */
	void
	ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
	{
		char *oldname, *newname;
		size_t newname_size;
		spa_t *spa;

		(void) rw_wrlock(&ztest_name_lock);

		oldname = ztest_opts.zo_pool;
		/* One size drives alloc, format and free so they cannot diverge. */
		newname_size = strlen(oldname) + sizeof ("_tmp");
		newname = umem_alloc(newname_size, UMEM_NOFAIL);
		(void) snprintf(newname, newname_size, "%s_tmp", oldname);

		/*
		 * Do the rename
		 */
		VERIFY3U(0, ==, spa_rename(oldname, newname));

		/*
		 * Try to open it under the old name, which shouldn't exist
		 */
		VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));

		/*
		 * Open it under the new name and make sure it's still the same spa_t.
		 */
		VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));

		ASSERT(spa == ztest_spa);
		spa_close(spa, FTAG);

		/*
		 * Rename it back to the original
		 */
		VERIFY3U(0, ==, spa_rename(newname, oldname));

		/*
		 * Make sure it can still be opened
		 */
		VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));

		ASSERT(spa == ztest_spa);
		spa_close(spa, FTAG);

		umem_free(newname, newname_size);

		(void) rw_unlock(&ztest_name_lock);
	}

	/*
	 * Verify pool integrity by running zdb.
	 *
	 * Builds a "zdb -bcc[s][v] -G -d -U <cachefile> <pool>" command line,
	 * runs it through popen(), echoes its output at verbosity >= 3, and
	 * calls fatal() if zdb exits non-zero or dies from a signal.
	 *
	 * pool - name of the pool to check
	 */
	static void
	ztest_run_zdb(char *pool)
	{
		int status;
		char zdb[MAXPATHLEN + MAXNAMELEN + 20];
		char zbuf[1024];
		char *bin;
		char *ztest;
		char *isa;
		int isalen;
		int len;
		size_t room;
		FILE *fp;

		strlcpy(zdb, "/usr/bin/ztest", sizeof (zdb));

		/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
		bin = strstr(zdb, "/usr/bin/");
		ztest = strstr(bin, "/ztest");
		isa = bin + 8;
		isalen = ztest - isa;
		/* Copy the ISA part: the buffer it points into is overwritten next. */
		isa = strdup(isa);
		VERIFY(isa != NULL);

		/* Bound the write to what is left of zdb[] and reject truncation. */
		room = sizeof (zdb) - (size_t)(bin - zdb);
		len = snprintf(bin, room,
		    "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s %s",
		    isalen,
		    isa,
		    ztest_opts.zo_verbose >= 3 ? "s" : "",
		    ztest_opts.zo_verbose >= 4 ? "v" : "",
		    spa_config_path,
		    pool);
		free(isa);
		if (len < 0 || (size_t)len >= room)
			fatal(0, "zdb command line too long");

		if (ztest_opts.zo_verbose >= 5)
			(void) printf("Executing %s\n", strstr(zdb, "zdb "));

		fp = popen(zdb, "r");
		/* VERIFY rather than assert: the check must survive NDEBUG builds. */
		VERIFY(fp != NULL);

		/* Drain zdb's output even when it is not being echoed. */
		while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
			if (ztest_opts.zo_verbose >= 3)
				(void) printf("%s", zbuf);

		status = pclose(fp);

		if (status == 0)
			return;

		ztest_dump_core = 0;
		if (WIFEXITED(status))
			fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
		else
			fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
	}

	/*
	 * Walk every pool in the namespace under spa_namespace_lock.
	 *
	 * At verbosity >= 6 the header and each pool name are printed; at lower
	 * verbosity the walk still happens but prints nothing.
	 *
	 * header - line printed before the list of pools
	 */
	static void
	ztest_walk_pool_directory(char *header)
	{
		spa_t *spa;

		if (ztest_opts.zo_verbose >= 6) {
			(void) printf("%s\n", header);
		}

		mutex_enter(&spa_namespace_lock);
		for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
			if (ztest_opts.zo_verbose >= 6) {
				(void) printf("\t%s\n", spa_name(spa));
			}
		}
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Export the pool named oldname and import it again as newname.
	 *
	 * Along the way this checks that a trial import of the exported config
	 * works, that importing the same config a second time (under either
	 * name) fails with EEXIST, that the old name no longer opens, and that
	 * the pool opened under the new name has the same guid as before.
	 *
	 * oldname - current pool name
	 * newname - name to import under; any existing pool with this name is
	 *           destroyed first
	 */
	static void
	ztest_spa_import_export(char *oldname, char *newname)
	{
		nvlist_t *config, *newconfig;
		uint64_t pool_guid;
		spa_t *spa;
		int error;

		if (ztest_opts.zo_verbose >= 4) {
			(void) printf("import/export: old = %s, new = %s\n",
			    oldname, newname);
		}

		/*
		 * Clean up from previous runs.
		 */
		(void) spa_destroy(newname);

		/*
		 * Get the pool's configuration and guid.
		 */
		VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));

		/*
		 * Kick off a scrub to tickle scrub/export races.
		 */
		if (ztest_random(2) == 0)
			(void) spa_scan(spa, POOL_SCAN_SCRUB);

		pool_guid = spa_guid(spa);
		spa_close(spa, FTAG);

		ztest_walk_pool_directory("pools before export");

		/*
		 * Export it.
		 */
		VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));

		ztest_walk_pool_directory("pools after export");

		/*
		 * Try to import it.
		 */
		newconfig = spa_tryimport(config);
		ASSERT(newconfig != NULL);
		nvlist_free(newconfig);

		/*
		 * Import it under the new name.
		 */
		error = spa_import(newname, config, NULL, 0);
		if (error != 0) {
			dump_nvlist(config, 0);
			fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
			    oldname, newname, error);
		}

		ztest_walk_pool_directory("pools after import");

		/*
		 * Try to import it again -- should fail with EEXIST.
		 */
		VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));

		/*
		 * Try to import it under a different name -- should fail with EEXIST.
		 */
		VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));

		/*
		 * Verify that the pool is no longer visible under the old name.
		 */
		VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));

		/*
		 * Verify that we can open and close the pool using the new name.
		 */
		VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
		ASSERT(pool_guid == spa_guid(spa));
		spa_close(spa, FTAG);

		nvlist_free(config);
	}

	/*
	 * Clear vdev errors on the pool and resume any suspended I/O.
	 *
	 * The clear/resume sequence runs unconditionally; the suspended check
	 * only decides whether a message is printed (verbosity >= 6).
	 *
	 * spa - pool to resume
	 */
	static void
	ztest_resume(spa_t *spa)
	{
		if (spa_suspended(spa)) {
			if (ztest_opts.zo_verbose >= 6)
				(void) printf("resuming from suspended state\n");
		}

		spa_vdev_state_enter(spa, SCL_NONE);
		vdev_clear(spa, NULL);
		(void) spa_vdev_state_exit(spa, NULL, 0);

		(void) zio_resume(spa);
	}

	/*
	 * Thread body: until ztest_exiting is set, resume the pool whenever it
	 * is found suspended, sleeping 100ms per iteration.  Each iteration
	 * also has a 1-in-10 chance of re-rolling zfs_compressed_arc_enabled
	 * and, independently, zfs_abd_scatter_enabled.
	 *
	 * arg - the spa_t to watch
	 *
	 * Always returns NULL.
	 */
	static void *
	ztest_resume_thread(void *arg)
	{
		spa_t *pool = arg;

		for (;;) {
			if (ztest_exiting)
				break;

			if (spa_suspended(pool)) {
				ztest_resume(pool);
			}
			(void) poll(NULL, 0, 100);

			/* Periodically change the zfs_compressed_arc_enabled setting. */
			if (ztest_random(10) == 0) {
				zfs_compressed_arc_enabled = ztest_random(2);
			}

			/* Periodically change the zfs_abd_scatter_enabled setting. */
			if (ztest_random(10) == 0) {
				zfs_abd_scatter_enabled = ztest_random(2);
			}
		}

		return (NULL);
	}

	static void *
	ztest_deadman_thread(void *arg)
	{
	ztest_shared_t *zs = arg;
	spa_t *spa = ztest_spa;
	hrtime_t delta, total = 0;

	for (;;) {
	delta = zs->zs_thread_stop - zs->zs_thread_start +
	MSEC2NSEC(zfs_deadman_synctime_ms);

	(void) poll(NULL, 0, (int)NSEC2MSEC(delta));

	/*
	* If the pool is suspended then fail immediately. Otherwise,
	* check to see if the pool is making any progress. If
	* vdev_deadman() discovers that there hasn't been any recent
	* I/Os then it will end up aborting the tests.
	*/
	if (spa_suspended(spa) \|\| spa->spa_root_vdev == NULL) {
	fatal(0, "aborting test after %llu seconds because "
	"pool has transitioned to a suspended state.",
	zfs_deadman_synctime_ms / 1000);
	return (NULL);
	}
	vdev_deadman(spa->spa_root_vdev);

	total += zfs_deadman_synctime_ms/1000;
	(void) printf("ztest has been running for %lld seconds\n",
	total);
	}
	}

	/*
	 * Run one test function zi_iters times and account for it.
	 *
	 * The elapsed time and one call are added atomically to the shared
	 * call state for this test.  At verbosity >= 4 the elapsed time and the
	 * function's symbol name are printed.
	 *
	 * test - index of the test, used to find its shared call state
	 * zi   - descriptor holding the function and its iteration count
	 * id   - thread id; also selects the dataset (id % zo_datasets)
	 */
	static void
	ztest_execute(int test, ztest_info_t *zi, uint64_t id)
	{
		ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
		ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
		hrtime_t functime = gethrtime();

		for (int i = 0; i < zi->zi_iters; i++)
			zi->zi_func(zd, id);

		functime = gethrtime() - functime;

		atomic_add_64(&zc->zc_count, 1);
		atomic_add_64(&zc->zc_time, functime);

		if (ztest_opts.zo_verbose >= 4) {
			Dl_info dli;
			const char *fname = "(unknown)";

			/*
			 * dladdr() returns 0 on failure and may leave dli_sname
			 * NULL when no symbol matches, so never print it blindly.
			 */
			if (dladdr((void *)zi->zi_func, &dli) != 0 &&
			    dli.dli_sname != NULL)
				fname = dli.dli_sname;
			(void) printf("%6.2f sec in %s\n",
			    (double)functime / NANOSEC, fname);
		}
	}

	/*
	 * Worker thread body: until zs_thread_stop is reached, repeatedly pick
	 * a random test and run it if its next-call time has arrived and this
	 * thread wins the compare-and-swap that advances that time.
	 *
	 * arg - the thread id, passed as an integer cast to a pointer
	 *
	 * Always returns NULL.
	 */
	static void *
	ztest_thread(void *arg)
	{
		uint64_t id = (uintptr_t)arg;
		ztest_shared_t *zs = ztest_shared;

		for (;;) {
			hrtime_t now = gethrtime();
			ztest_shared_callstate_t *zc;
			ztest_info_t *zi;
			uint64_t call_next, new_next;
			int func;

			if (now >= zs->zs_thread_stop)
				break;

			/* See if it's time to force a crash. */
			if (now > zs->zs_thread_kill)
				ztest_kill(zs);

			/* If we're getting ENOSPC with some regularity, stop. */
			if (zs->zs_enospc_count > 10)
				break;

			/* Pick a random function to execute. */
			func = ztest_random(ZTEST_FUNCS);
			zi = &ztest_info[func];
			zc = ZTEST_GET_SHARED_CALLSTATE(func);
			call_next = zc->zc_next;

			/* Not due yet: try another function. */
			if (now < call_next)
				continue;

			/*
			 * Only the thread whose swap succeeds runs the test; the
			 * others see a changed zc_next and move on.
			 */
			new_next = call_next +
			    ztest_random(2 * zi->zi_interval[0] + 1);
			if (atomic_cas_64(&zc->zc_next, call_next, new_next) ==
			    call_next) {
				ztest_execute(func, zi, id);
			}
		}

		return (NULL);
	}

	/*
	 * Format the name of test dataset number d as "<pool>/ds_<d>".
	 *
	 * dsname - output buffer; must hold ZFS_MAX_DATASET_NAME_LEN bytes
	 * pool   - pool name
	 * d      - dataset index
	 */
	static void
	ztest_dataset_name(char *dsname, char *pool, int d)
	{
		(void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
	}

	/*
	 * Destroy test dataset number d, including its snapshots and children.
	 *
	 * d - dataset index
	 */
	static void
	ztest_dataset_destroy(int d)
	{
		char name[ZFS_MAX_DATASET_NAME_LEN];

		ztest_dataset_name(name, ztest_opts.zo_pool, d);

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("Destroying %s to free up space\n", name);

		/*
		 * Cleanup any non-standard clones and snapshots.  In general,
		 * ztest thread t operates on dataset (t % zopt_datasets),
		 * so there may be more than one thing to clean up.
		 */
		for (int t = d; t < ztest_opts.zo_threads;
		    t += ztest_opts.zo_datasets) {
			ztest_dsl_dataset_cleanup(name, t);
		}

		(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
	}

	/*
	 * Check the dataset for leaked objects.
	 *
	 * ZTEST_DIROBJ is the object directory for the entire dataset, so the
	 * number of objects in use should equal the number of ZTEST_DIROBJ
	 * entries plus one for ZTEST_DIROBJ itself.  Anything else is an
	 * object leak.
	 *
	 * Note that we can only check this in ztest_dataset_open(), when the
	 * open-context and syncing-context values agree: zap_count() returns
	 * the open-context value, while dmu_objset_space() returns the rootbp
	 * fill count.
	 *
	 * zd - dataset to check
	 */
	static void
	ztest_dataset_dirobj_verify(ztest_ds_t *zd)
	{
		uint64_t dirobjs;
		uint64_t usedobjs;
		uint64_t unused_a, unused_b, unused_c;

		VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));

		/* Only the used-object count matters; the rest is discarded. */
		dmu_objset_space(zd->zd_os, &unused_a, &unused_b, &usedobjs,
		    &unused_c);

		ASSERT3U(dirobjs + 1, ==, usedobjs);
	}

	static int
	ztest_dataset_open(int d)
	{
	ztest_ds_t *zd = &ztest_ds[d];
	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
	objset_t *os;
	zilog_t *zilog;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	int error;

	ztest_dataset_name(name, ztest_opts.zo_pool, d);

	(void) rw_rdlock(&ztest_name_lock);

	error = ztest_dataset_create(name);
	if (error == ENOSPC) {
	(void) rw_unlock(&ztest_name_lock);
	ztest_record_enospc(FTAG);
	return (error);
	}
	ASSERT(error == 0 \|\| error == EEXIST);

	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
	(void) rw_unlock(&ztest_name_lock);

	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);

	zilog = zd->zd_zilog;

	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
	zilog->zl_header->zh_claim_lr_seq < committed_seq)
	fatal(0, "missing log records: claimed %llu < committed %llu",
	zilog->zl_header->zh_claim_lr_seq, committed_seq);

	ztest_dataset_dirobj_verify(zd);

	zil_replay(os, zd, ztest_replay_vector);

	ztest_dataset_dirobj_verify(zd);

	if (ztest_opts.zo_verbose >= 6)
	(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
	zd->zd_name,
	(u_longlong_t)zilog->zl_parse_blk_count,
	(u_longlong_t)zilog->zl_parse_lr_count,
	(u_longlong_t)zilog->zl_replaying_seq);

	zilog = zil_open(os, ztest_get_data);

	if (zilog->zl_replaying_seq != 0 &&
	zilog->zl_replaying_seq < committed_seq)
	fatal(0, "missing log records: replayed %llu < committed %llu",
	zilog->zl_replaying_seq, committed_seq);

	return (0);
	}

	/*
	 * Close test dataset number d: close its intent log, give up ownership
	 * of the objset, and tear down the per-dataset state.
	 *
	 * d - dataset index
	 */
	static void
	ztest_dataset_close(int d)
	{
		ztest_ds_t *ds = ztest_ds + d;

		zil_close(ds->zd_zilog);
		dmu_objset_disown(ds->zd_os, ds);
		ztest_zd_fini(ds);
	}

	/*
	 * Kick off threads to run tests on all datasets in parallel.
	 *
	 * Sets up the stop/kill deadlines for this pass, opens the pool, starts
	 * the resume and deadman helper threads, runs zo_threads worker threads
	 * to completion, records space usage in zs, and finally closes the pool
	 * and (half of the time) exports and re-imports it under another name.
	 *
	 * zs - shared parent/child state for this pass
	 */
	static void
	ztest_run(ztest_shared_t *zs)
	{
		thread_t *tid;
		spa_t *spa;
		objset_t *os;
		thread_t resume_tid;
		int error;

		ztest_exiting = B_FALSE;

		/*
		 * Initialize parent/child shared state.
		 */
		VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
		VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);

		zs->zs_thread_start = gethrtime();
		zs->zs_thread_stop =
		    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
		zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
		/* By default no kill; with zo_killrate% odds, kill part-way through. */
		zs->zs_thread_kill = zs->zs_thread_stop;
		if (ztest_random(100) < ztest_opts.zo_killrate) {
			zs->zs_thread_kill -=
			    ztest_random(ztest_opts.zo_passtime * NANOSEC);
		}

		(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);

		list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
		    offsetof(ztest_cb_data_t, zcd_node));

		/*
		 * Open our pool.
		 */
		kernel_init(FREAD | FWRITE);
		VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
		spa->spa_debug = B_TRUE;
		metaslab_preload_limit = ztest_random(20) + 1;
		ztest_spa = spa;

		/* Record the root dataset's guid in the shared state. */
		dmu_objset_stats_t dds;
		VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
		    DMU_OST_ANY, B_TRUE, FTAG, &os));
		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		dmu_objset_fast_stat(os, &dds);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		zs->zs_guid = dds.dds_guid;
		dmu_objset_disown(os, FTAG);

		spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;

		/*
		 * We don't expect the pool to suspend unless maxfaults == 0,
		 * in which case ztest_fault_inject() temporarily takes away
		 * the only valid replica.
		 */
		if (MAXFAULTS() == 0)
			spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
		else
			spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

		/*
		 * Create a thread to periodically resume suspended I/O.
		 */
		VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
		    &resume_tid) == 0);

		/*
		 * Create a deadman thread to abort() if we hang.
		 */
		VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
		    NULL) == 0);

		/*
		 * Verify that we can safely inquire about any object,
		 * whether it's allocated or not.  To make it interesting,
		 * we probe a 5-wide window around each power of two.
		 * This hits all edge cases, including zero and the max.
		 */
		for (int t = 0; t < 64; t++) {
			for (int d = -5; d <= 5; d++) {
				error = dmu_object_info(spa->spa_meta_objset,
				    (1ULL << t) + d, NULL);
				ASSERT(error == 0 || error == ENOENT ||
				    error == EINVAL);
			}
		}

		/*
		 * If we got any ENOSPC errors on the previous run, destroy something.
		 */
		if (zs->zs_enospc_count != 0) {
			int d = ztest_random(ztest_opts.zo_datasets);
			ztest_dataset_destroy(d);
		}
		zs->zs_enospc_count = 0;

		tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
		    UMEM_NOFAIL);

		if (ztest_opts.zo_verbose >= 4)
			(void) printf("starting main threads...\n");

		/*
		 * Kick off all the tests that run in parallel.
		 *
		 * NOTE(review): if a dataset fails to open, this returns with the
		 * threads started so far still running and the pool still open.
		 */
		for (int t = 0; t < ztest_opts.zo_threads; t++) {
			if (t < ztest_opts.zo_datasets &&
			    ztest_dataset_open(t) != 0)
				return;
			VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
			    THR_BOUND, &tid[t]) == 0);
		}

		/*
		 * Wait for all of the tests to complete.  We go in reverse order
		 * so we don't close datasets while threads are still using them.
		 */
		for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
			VERIFY(thr_join(tid[t], NULL, NULL) == 0);
			if (t < ztest_opts.zo_datasets)
				ztest_dataset_close(t);
		}

		txg_wait_synced(spa_get_dsl(spa), 0);

		zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
		zfs_dbgmsg_print(FTAG);

		umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));

		/* Kill the resume thread */
		ztest_exiting = B_TRUE;
		VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
		ztest_resume(spa);

		/*
		 * Right before closing the pool, kick off a bunch of async I/O;
		 * spa_close() should wait for it to complete.
		 */
		for (uint64_t object = 1; object < 50; object++) {
			dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
			    ZIO_PRIORITY_SYNC_READ);
		}

		spa_close(spa, FTAG);

		/*
		 * Verify that we can loop over all pools.
		 */
		mutex_enter(&spa_namespace_lock);
		for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
			if (ztest_opts.zo_verbose > 3)
				(void) printf("spa_next: found %s\n", spa_name(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * Verify that we can export the pool and reimport it under a
		 * different name.
		 */
		if (ztest_random(2) == 0) {
			char name[ZFS_MAX_DATASET_NAME_LEN];
			(void) snprintf(name, sizeof (name), "%s_import",
			    ztest_opts.zo_pool);
			ztest_spa_import_export(ztest_opts.zo_pool, name);
			ztest_spa_import_export(name, ztest_opts.zo_pool);
		}

		kernel_fini();

		list_destroy(&zcl.zcl_callbacks);

		(void) _mutex_destroy(&zcl.zcl_callbacks_lock);

		(void) rwlock_destroy(&ztest_name_lock);
		(void) _mutex_destroy(&ztest_vdev_lock);
	}

	/*
	 * Exercise spa_freeze() and intent-log replay.
	 *
	 * Opens the pool and dataset 0, freezes the pool, generates log records
	 * with ztest_io(), commits them, then closes and reopens everything so
	 * that the log is replayed.
	 */
	static void
	ztest_freeze(void)
	{
		ztest_ds_t *zd = &ztest_ds[0];
		spa_t *spa;
		int numloops = 0;

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("testing spa_freeze()...\n");

		kernel_init(FREAD | FWRITE);
		VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
		VERIFY3U(0, ==, ztest_dataset_open(0));
		spa->spa_debug = B_TRUE;
		ztest_spa = spa;

		/*
		 * Force the first log block to be transactionally allocated.
		 * We have to do this before we freeze the pool -- otherwise
		 * the log chain won't be anchored.
		 */
		while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
			ztest_dmu_object_alloc_free(zd, 0);
			zil_commit(zd->zd_zilog, 0);
		}

		txg_wait_synced(spa_get_dsl(spa), 0);

		/*
		 * Freeze the pool.  This stops spa_sync() from doing anything,
		 * so that the only way to record changes from now on is the ZIL.
		 */
		spa_freeze(spa);

		/*
		 * Because it is hard to predict how much space a write will actually
		 * require beforehand, we leave ourselves some fudge space to write
		 * over capacity.
		 */
		uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;

		/*
		 * Run tests that generate log records but don't alter the pool config
		 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
		 * We do a txg_wait_synced() after each iteration to force the txg
		 * to increase well beyond the last synced value in the uberblock.
		 * The ZIL should be OK with that.
		 *
		 * Run a random number of times less than zo_maxloops and ensure we do
		 * not run out of space on the pool.
		 */
		while (ztest_random(10) != 0 &&
		    numloops++ < ztest_opts.zo_maxloops &&
		    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
			ztest_od_t od;
			ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
			VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
			ztest_io(zd, od.od_object,
			    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
			txg_wait_synced(spa_get_dsl(spa), 0);
		}

		/*
		 * Commit all of the changes we just generated.
		 */
		zil_commit(zd->zd_zilog, 0);
		txg_wait_synced(spa_get_dsl(spa), 0);

		/*
		 * Close our dataset and close the pool.
		 */
		ztest_dataset_close(0);
		spa_close(spa, FTAG);
		kernel_fini();

		/*
		 * Open and close the pool and dataset to induce log replay.
		 */
		kernel_init(FREAD | FWRITE);
		VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
		ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
		VERIFY3U(0, ==, ztest_dataset_open(0));
		ztest_dataset_close(0);

		spa->spa_debug = B_TRUE;
		ztest_spa = spa;
		txg_wait_synced(spa_get_dsl(spa), 0);
		/* ztest_reguid() does not use its dataset argument. */
		ztest_reguid(NULL, 0);

		spa_close(spa, FTAG);
		kernel_fini();
	}

	/*
	 * Format a duration given in nanoseconds as days/hours/minutes/seconds,
	 * omitting leading units that are zero (for example "1h01m01s", "0s").
	 *
	 * t       - duration in nanoseconds
	 * timebuf - output buffer; the caller must make it large enough, since
	 *           the write is not bounded here
	 */
	void
	print_time(hrtime_t t, char *timebuf)
	{
		hrtime_t total_s = t / NANOSEC;
		hrtime_t total_m = total_s / 60;
		hrtime_t total_h = total_m / 60;
		hrtime_t d = total_h / 24;
		hrtime_t h = total_h - d * 24;
		hrtime_t m = total_m - total_h * 60;
		hrtime_t s = total_s - total_m * 60;

		timebuf[0] = '\0';

		if (d) {
			(void) sprintf(timebuf,
			    "%llud%02lluh%02llum%02llus", d, h, m, s);
			return;
		}
		if (h) {
			(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
			return;
		}
		if (m) {
			(void) sprintf(timebuf, "%llum%02llus", m, s);
			return;
		}
		(void) sprintf(timebuf, "%llus", s);
	}

	/*
	 * Build a property nvlist that, with even odds, is either empty or
	 * contains "autoreplace" = 1.
	 *
	 * Returns the newly allocated nvlist; the caller frees it.
	 */
	static nvlist_t *
	make_random_props()
	{
		nvlist_t *props;

		VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);

		if (ztest_random(2) != 0) {
			VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
		}

		return (props);
	}

	/*
	 * Create a storage pool with the given name and initial vdev size.
	 * Then test spa_freeze() functionality.
	 *
	 * Any existing pool of the same name is destroyed first.  The pool is
	 * created with every feature in spa_feature_table listed in its
	 * properties, and zdb is run on it both before and after the freeze
	 * test.
	 *
	 * zs - shared state; zs_splits, zs_mirrors and zs_metaslab_sz are set
	 */
	static void
	ztest_init(ztest_shared_t *zs)
	{
		spa_t *spa;
		nvlist_t *nvroot, *props;

		VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
		VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);

		kernel_init(FREAD | FWRITE);

		/*
		 * Create the storage pool.
		 */
		(void) spa_destroy(ztest_opts.zo_pool);
		ztest_shared->zs_vdev_next_leaf = 0;
		zs->zs_splits = 0;
		zs->zs_mirrors = ztest_opts.zo_mirrors;
		nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
		    0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
		props = make_random_props();
		for (int i = 0; i < SPA_FEATURES; i++) {
			char buf[1024];
			(void) snprintf(buf, sizeof (buf), "feature@%s",
			    spa_feature_table[i].fi_uname);
			VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
		}
		VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
		nvlist_free(nvroot);
		nvlist_free(props);

		/* Remember the metaslab size of the first top-level vdev. */
		VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
		zs->zs_metaslab_sz =
		    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;

		spa_close(spa, FTAG);

		kernel_fini();

		ztest_run_zdb(ztest_opts.zo_pool);

		ztest_freeze();

		ztest_run_zdb(ztest_opts.zo_pool);

		(void) rwlock_destroy(&ztest_name_lock);
		(void) _mutex_destroy(&ztest_vdev_lock);
	}

	static void
	setup_data_fd(void)
	{
	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";

	ztest_fd_data = mkstemp(ztest_name_data);
	ASSERT3S(ztest_fd_data, >=, 0);
	(void) unlink(ztest_name_data);
	}


	/*
	 * Compute the total size of the shared data region described by hdr:
	 * header, options and shared state, plus one stats record per
	 * zh_stats_count and one dataset record per zh_ds_count.
	 *
	 * hdr - header giving the size (and count) of each section
	 *
	 * Returns the total size in bytes.
	 */
	static int
	shared_data_size(ztest_shared_hdr_t *hdr)
	{
		int total;

		total = hdr->zh_hdr_size + hdr->zh_opts_size + hdr->zh_size +
		    hdr->zh_stats_size * hdr->zh_stats_count +
		    hdr->zh_ds_size * hdr->zh_ds_count;

		return (total);
	}

	/*
	 * Initialise the header at the start of the shared data file and grow
	 * the file to the full size the header describes.
	 *
	 * The header is mapped only for the duration of this function.
	 */
	static void
	setup_hdr(void)
	{
		int size;
		ztest_shared_hdr_t *hdr;

		/* Map the header structure itself, not the pointer to it. */
		hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
		    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
		/* Checked in all builds: hdr is dereferenced just below. */
		VERIFY(hdr != MAP_FAILED);

		/* The file must cover the header before the header is written. */
		VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));

		hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
		hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
		hdr->zh_size = sizeof (ztest_shared_t);
		hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
		hdr->zh_stats_count = ZTEST_FUNCS;
		hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
		hdr->zh_ds_count = ztest_opts.zo_datasets;

		size = shared_data_size(hdr);
		VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));

		(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
	}

	/*
	 * Map the whole shared data file and point the ztest_shared_* globals
	 * at their sections.
	 *
	 * The header is first mapped read-only to learn the total size; it is
	 * then unmapped and the full region is mapped read/write.  Sections
	 * follow the header in this order: options, shared state, per-function
	 * call state, per-dataset state.
	 */
	static void
	setup_data(void)
	{
		int size, offset;
		ztest_shared_hdr_t *hdr;
		uint8_t *buf;

		/* Map the header structure itself, not the pointer to it. */
		hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
		    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
		/* Checked in all builds: hdr is dereferenced just below. */
		VERIFY(hdr != MAP_FAILED);

		size = shared_data_size(hdr);

		(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
		hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
		    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
		VERIFY(hdr != MAP_FAILED);
		buf = (uint8_t *)hdr;

		offset = hdr->zh_hdr_size;
		ztest_shared_opts = (void *)&buf[offset];
		offset += hdr->zh_opts_size;
		ztest_shared = (void *)&buf[offset];
		offset += hdr->zh_size;
		ztest_shared_callstate = (void *)&buf[offset];
		offset += hdr->zh_stats_size * hdr->zh_stats_count;
		ztest_shared_ds = (void *)&buf[offset];
	}

	/*
	 * Fork and exec a child ztest process, then wait for it.
	 *
	 * The child receives the shared-data descriptor number through the
	 * ZTEST_FD_DATA environment variable.
	 *
	 * cmd        - program to exec; NULL means re-exec the current
	 *              executable as reported by getexecname()
	 * libpath    - if non-NULL, exported to the child as LD_LIBRARY_PATH
	 * ignorekill - if true, a child killed by SIGKILL is not an error
	 * statusp    - if non-NULL, receives the raw wait status
	 *
	 * Returns B_FALSE if the child exited with status 0 and B_TRUE if it
	 * was killed by SIGKILL with ignorekill set.  Any other outcome
	 * terminates this process: exit(2) for a non-zero exit code, exit(3)
	 * for a signal, exit(4) for anything else.
	 */
	static boolean_t
	exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
	{
		pid_t pid;
		int status;
		char *cmdbuf = NULL;

		pid = fork();

		/* Runs in both parent and child so each has a usable cmd. */
		if (cmd == NULL) {
			cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
			(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
			cmd = cmdbuf;
		}

		if (pid == -1)
			fatal(1, "fork failed");

		if (pid == 0) {	/* child */
			char *emptyargv[2] = { cmd, NULL };
			char fd_data_str[12];

			struct rlimit rl = { 1024, 1024 };
			(void) setrlimit(RLIMIT_NOFILE, &rl);

			(void) close(ztest_fd_rand);
			/* 11 characters is the most a formatted int can need. */
			VERIFY3U(11, >=,
			    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
			VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));

			(void) enable_extended_FILE_stdio(-1, -1);
			if (libpath != NULL)
				VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
	#ifdef illumos
			(void) execv(cmd, emptyargv);
	#else
			(void) execvp(cmd, emptyargv);
	#endif
			/* Only reached if the exec itself failed. */
			ztest_dump_core = B_FALSE;
			fatal(B_TRUE, "exec failed: %s", cmd);
		}

		if (cmdbuf != NULL) {
			umem_free(cmdbuf, MAXPATHLEN);
			cmd = NULL;
		}

		while (waitpid(pid, &status, 0) != pid)
			continue;
		if (statusp != NULL)
			*statusp = status;

		if (WIFEXITED(status)) {
			if (WEXITSTATUS(status) != 0) {
				(void) fprintf(stderr, "child exited with code %d\n",
				    WEXITSTATUS(status));
				exit(2);
			}
			return (B_FALSE);
		} else if (WIFSIGNALED(status)) {
			if (!ignorekill || WTERMSIG(status) != SIGKILL) {
				(void) fprintf(stderr, "child died with signal %d\n",
				    WTERMSIG(status));
				exit(3);
			}
			return (B_TRUE);
		} else {
			(void) fprintf(stderr, "something strange happened to child\n");
			exit(4);
			/* NOTREACHED */
		}
	}

	/*
	 * Child-side entry point for pool initialisation: remove any existing
	 * zpool.cache, then run ztest_init() zo_init times, zeroing the shared
	 * state before each pass.
	 */
	static void
	ztest_run_init(void)
	{
		ztest_shared_t *zs = ztest_shared;
		int pass = 1;

		ASSERT(ztest_opts.zo_init != 0);

		/* Blow away any existing copy of zpool.cache */
		(void) remove(spa_config_path);

		/* Create and initialize our storage pool. */
		while (pass <= ztest_opts.zo_init) {
			bzero(zs, sizeof (ztest_shared_t));

			/* The pass number is only worth printing if there are several. */
			if (ztest_opts.zo_verbose >= 3) {
				if (ztest_opts.zo_init != 1)
					(void) printf("ztest_init(), pass %d\n", pass);
			}

			ztest_init(zs);
			pass++;
		}
	}

	int
	main(int argc, char **argv)
	{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	spa_t *spa;
	char *cmd;
	boolean_t hasalt;
	char *fd_data_str = getenv("ZTEST_FD_DATA");

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;

	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
	ASSERT3S(ztest_fd_rand, >=, 0);

	if (!fd_data_str) {
	process_options(argc, argv);

	setup_data_fd();
	setup_hdr();
	setup_data();
	bcopy(&ztest_opts, ztest_shared_opts,
	sizeof (*ztest_shared_opts));
	} else {
	ztest_fd_data = atoi(fd_data_str);
	setup_data();
	bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	/* Override location of zpool.cache */
	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
	metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
	metaslab_df_alloc_threshold =
	zs->zs_metaslab_df_alloc_threshold;

	if (zs->zs_do_init)
	ztest_run_init();
	else
	ztest_run(zs);
	exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
	(void) printf("%llu vdevs, %d datasets, %d threads,"
	" %llu seconds...\n",
	(u_longlong_t)ztest_opts.zo_vdevs,
	ztest_opts.zo_datasets,
	ztest_opts.zo_threads,
	(u_longlong_t)ztest_opts.zo_time);
	}

	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);

	zs->zs_do_init = B_TRUE;
	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
	if (ztest_opts.zo_verbose >= 1) {
	(void) printf("Executing older ztest for "
	"initialization: %s\n", ztest_opts.zo_alt_ztest);
	}
	VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
	ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
	VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

	for (int f = 0; f < ZTEST_FUNCS; f++) {
	zi = &ztest_info[f];
	zc = ZTEST_GET_SHARED_CALLSTATE(f);
	if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
	zc->zc_next = UINT64_MAX;
	else
	zc->zc_next = zs->zs_proc_start +
	ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	* Run the tests in a loop. These tests include fault injection
	* to verify that self-healing data works, and forced crashes
	* to verify that we never lose on-disk consistency.
	*/
	while (gethrtime() < zs->zs_proc_stop) {
	int status;
	boolean_t killed;

	/*
	* Initialize the workload counters for each function.
	*/
	for (int f = 0; f < ZTEST_FUNCS; f++) {
	zc = ZTEST_GET_SHARED_CALLSTATE(f);
	zc->zc_count = 0;
	zc->zc_time = 0;
	}

	/* Set the allocation switch size */
	zs->zs_metaslab_df_alloc_threshold =
	ztest_random(zs->zs_metaslab_sz / 4) + 1;

	if (!hasalt \|\| ztest_random(2) == 0) {
	if (hasalt && ztest_opts.zo_verbose >= 1) {
	(void) printf("Executing newer ztest: %s\n",
	cmd);
	}
	newer++;
	killed = exec_child(cmd, NULL, B_TRUE, &status);
	} else {
	if (hasalt && ztest_opts.zo_verbose >= 1) {
	(void) printf("Executing older ztest: %s\n",
	ztest_opts.zo_alt_ztest);
	}
	older++;
	killed = exec_child(ztest_opts.zo_alt_ztest,
	ztest_opts.zo_alt_libpath, B_TRUE, &status);
	}

	if (killed)
	kills++;
	iters++;

	if (ztest_opts.zo_verbose >= 1) {
	hrtime_t now = gethrtime();

	now = MIN(now, zs->zs_proc_stop);
	print_time(zs->zs_proc_stop - now, timebuf);
	nicenum(zs->zs_space, numbuf, sizeof (numbuf));

	(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
	"%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
	iters,
	WIFEXITED(status) ? "Complete" : "SIGKILL",
	(u_longlong_t)zs->zs_enospc_count,
	100.0 * zs->zs_alloc / zs->zs_space,
	numbuf,
	100.0 * (now - zs->zs_proc_start) /
	(ztest_opts.zo_time * NANOSEC), timebuf);
	}

	if (ztest_opts.zo_verbose >= 2) {
	(void) printf("\nWorkload summary:\n\n");
	(void) printf("%7s %9s %s\n",
	"Calls", "Time", "Function");
	(void) printf("%7s %9s %s\n",
	"-----", "----", "--------");
	for (int f = 0; f < ZTEST_FUNCS; f++) {
	Dl_info dli;

	zi = &ztest_info[f];
	zc = ZTEST_GET_SHARED_CALLSTATE(f);
	print_time(zc->zc_time, timebuf);
	(void) dladdr((void *)zi->zi_func, &dli);
	(void) printf("%7llu %9s %s\n",
	(u_longlong_t)zc->zc_count, timebuf,
	dli.dli_sname);
	}
	(void) printf("\n");
	}

	/*
	* It's possible that we killed a child during a rename test,
	* in which case we'll have a 'ztest_tmp' pool lying around
	* instead of 'ztest'. Do a blind rename in case this happened.
	*/
	kernel_init(FREAD);
	if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
	spa_close(spa, FTAG);
	} else {
	char tmpname[ZFS_MAX_DATASET_NAME_LEN];
	kernel_fini();
	kernel_init(FREAD \| FWRITE);
	(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
	ztest_opts.zo_pool);
	(void) spa_rename(tmpname, ztest_opts.zo_pool);
	}
	kernel_fini();

	ztest_run_zdb(ztest_opts.zo_pool);
	}

	if (ztest_opts.zo_verbose >= 1) {
	if (hasalt) {
	(void) printf("%d runs of older ztest: %s\n", older,
	ztest_opts.zo_alt_ztest);
	(void) printf("%d runs of newer ztest: %s\n", newer,
	cmd);
	}
	(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
	kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXNAMELEN);

	return (0);
	}
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 332525)
	@@ -1,833 +1,837 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
	- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
	* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Nexenta Systems, Inc.
	* Copyright (c) 2017 Datto Inc.
	*/

	#ifndef _LIBZFS_H
	#define _LIBZFS_H

	#include <assert.h>
	#include <libnvpair.h>
	#include <sys/mnttab.h>
	#include <sys/param.h>
	#include <sys/types.h>
	#include <sys/varargs.h>
	#include <sys/fs/zfs.h>
	#include <sys/avl.h>
	#include <sys/zfs_ioctl.h>
	#include <libzfs_core.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Miscellaneous ZFS constants
	*/
	#define ZFS_MAXPROPLEN MAXPATHLEN
	#define ZPOOL_MAXPROPLEN MAXPATHLEN

	/*
	 * libzfs errors
	 *
	 * EZFS_SUCCESS is 0.  All other codes are numbered consecutively
	 * starting at EZFS_NOMEM (2000); EZFS_UNKNOWN is the final enumerator.
	 */
	typedef enum zfs_error {
	EZFS_SUCCESS = 0, /* no error -- success */
	EZFS_NOMEM = 2000, /* out of memory */
	EZFS_BADPROP, /* invalid property value */
	EZFS_PROPREADONLY, /* cannot set readonly property */
	EZFS_PROPTYPE, /* property does not apply to dataset type */
	EZFS_PROPNONINHERIT, /* property is not inheritable */
	EZFS_PROPSPACE, /* bad quota or reservation */
	EZFS_BADTYPE, /* dataset is not of appropriate type */
	EZFS_BUSY, /* pool or dataset is busy */
	EZFS_EXISTS, /* pool or dataset already exists */
	EZFS_NOENT, /* no such pool or dataset */
	EZFS_BADSTREAM, /* bad backup stream */
	EZFS_DSREADONLY, /* dataset is readonly */
	EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */
	EZFS_INVALIDNAME, /* invalid dataset name */
	EZFS_BADRESTORE, /* unable to restore to destination */
	EZFS_BADBACKUP, /* backup failed */
	EZFS_BADTARGET, /* bad attach/detach/replace target */
	EZFS_NODEVICE, /* no such device in pool */
	EZFS_BADDEV, /* invalid device to add */
	EZFS_NOREPLICAS, /* no valid replicas */
	EZFS_RESILVERING, /* currently resilvering */
	EZFS_BADVERSION, /* unsupported version */
	EZFS_POOLUNAVAIL, /* pool is currently unavailable */
	EZFS_DEVOVERFLOW, /* too many devices in one vdev */
	EZFS_BADPATH, /* must be an absolute path */
	EZFS_CROSSTARGET, /* rename or clone across pool or dataset */
	EZFS_ZONED, /* used improperly in local zone */
	EZFS_MOUNTFAILED, /* failed to mount dataset */
	EZFS_UMOUNTFAILED, /* failed to unmount dataset */
	EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */
	EZFS_SHARENFSFAILED, /* share(1M) failed */
	EZFS_PERM, /* permission denied */
	EZFS_NOSPC, /* out of space */
	EZFS_FAULT, /* bad address */
	EZFS_IO, /* I/O error */
	EZFS_INTR, /* signal received */
	EZFS_ISSPARE, /* device is a hot spare */
	EZFS_INVALCONFIG, /* invalid vdev configuration */
	EZFS_RECURSIVE, /* recursive dependency */
	EZFS_NOHISTORY, /* no history object */
	EZFS_POOLPROPS, /* couldn't retrieve pool props */
	EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */
	EZFS_POOL_INVALARG, /* invalid argument for this pool operation */
	EZFS_NAMETOOLONG, /* dataset name is too long */
	EZFS_OPENFAILED, /* open of device failed */
	EZFS_NOCAP, /* couldn't get capacity */
	EZFS_LABELFAILED, /* write of label failed */
	EZFS_BADWHO, /* invalid permission who */
	EZFS_BADPERM, /* invalid permission */
	EZFS_BADPERMSET, /* invalid permission set name */
	EZFS_NODELEGATION, /* delegated administration is disabled */
	EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */
	EZFS_SHARESMBFAILED, /* failed to share over smb */
	EZFS_BADCACHE, /* bad cache file */
	EZFS_ISL2CACHE, /* device is for the level 2 ARC */
	EZFS_VDEVNOTSUP, /* unsupported vdev type */
	EZFS_NOTSUP, /* ops not supported on this dataset */
	EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */
	EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */
	EZFS_REFTAG_RELE, /* snapshot release: tag not found */
	EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */
	EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */
	EZFS_PIPEFAILED, /* pipe create failed */
	EZFS_THREADCREATEFAILED, /* thread create failed */
	EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
	EZFS_SCRUBBING, /* currently scrubbing */
	EZFS_NO_SCRUB, /* no active scrub */
	EZFS_DIFF, /* general failure of zfs diff */
	EZFS_DIFFDATA, /* bad zfs diff data */
	EZFS_POOLREADONLY, /* pool is in read-only mode */
	EZFS_SCRUB_PAUSED, /* scrub currently paused */
	+ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
	EZFS_UNKNOWN
	} zfs_error_t;

	/*
	 * UEFI boot support parameters. When creating a whole-disk boot pool,
	 * zpool create should be able to create an EFI System partition for the
	 * UEFI boot program. When booting via BIOS, the EFI System partition is
	 * not used even if it exists.
	 */
	typedef enum zpool_boot_label {
	ZPOOL_NO_BOOT_LABEL = 0,
	ZPOOL_CREATE_BOOT_LABEL,
	ZPOOL_COPY_BOOT_LABEL
	} zpool_boot_label_t;

	/*
	* The following data structures are all part
	* of the zfs_allow_t data structure which is
	* used for printing 'allow' permissions.
	* It is a linked list of zfs_allow_t's which
	* then contain avl tree's for user/group/sets/...
	* and each one of the entries in those trees have
	* avl tree's for the permissions they belong to and
	* whether they are local,descendent or local+descendent
	* permissions. The AVL trees are used primarily for
	* sorting purposes, but also so that we can quickly find
	* a given user and or permission.
	*/
	/*
	 * One entry of a permission AVL tree: the tree linkage plus a
	 * MAXPATHLEN-sized name buffer (presumably the permission name --
	 * confirm against the code that fills z_pname).
	 */
	typedef struct zfs_perm_node {
	avl_node_t z_node;
	char z_pname[MAXPATHLEN];
	} zfs_perm_node_t;

	/*
	 * One keyed entry (e.g. a user name) of an 'allow' AVL tree.  Each entry
	 * carries three further AVL trees, one per permission scope.
	 */
	typedef struct zfs_allow_node {
	avl_node_t z_node;
	char z_key[MAXPATHLEN]; /* name, such as joe */
	avl_tree_t z_localdescend; /* local+descendent perms */
	avl_tree_t z_local; /* local permissions */
	avl_tree_t z_descend; /* descendent permissions */
	} zfs_allow_node_t;

	/*
	 * Element of the singly linked list (chained through z_next) used for
	 * printing 'allow' permissions.  Each element holds one AVL tree per
	 * category: sets, create-time permissions (presumably what z_crperms
	 * abbreviates -- confirm), users, groups and everyone.
	 */
	typedef struct zfs_allow {
	struct zfs_allow *z_next;
	char z_setpoint[MAXPATHLEN];
	avl_tree_t z_sets;
	avl_tree_t z_crperms;
	avl_tree_t z_user;
	avl_tree_t z_group;
	avl_tree_t z_everyone;
	} zfs_allow_t;

	/*
	 * Basic handle types
	 *
	 * Only the struct tags are declared here; this header does not define
	 * the structures, so users of the header handle them by pointer only.
	 */
	typedef struct zfs_handle zfs_handle_t;
	typedef struct zpool_handle zpool_handle_t;
	typedef struct libzfs_handle libzfs_handle_t;

	/*
	* Library initialization
	*/
	extern libzfs_handle_t *libzfs_init(void);
	extern void libzfs_fini(libzfs_handle_t *);

	extern libzfs_handle_t zpool_get_handle(zpool_handle_t );
	extern libzfs_handle_t zfs_get_handle(zfs_handle_t );

	extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);

	extern void zfs_save_arguments(int argc, char *, char , int);
	extern int zpool_log_history(libzfs_handle_t , const char );

	extern int libzfs_errno(libzfs_handle_t *);
	extern const char libzfs_error_action(libzfs_handle_t );
	extern const char libzfs_error_description(libzfs_handle_t );
	extern int zfs_standard_error(libzfs_handle_t , int, const char );
	extern void libzfs_mnttab_init(libzfs_handle_t *);
	extern void libzfs_mnttab_fini(libzfs_handle_t *);
	extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
	extern int libzfs_mnttab_find(libzfs_handle_t , const char ,
	struct mnttab *);
	extern void libzfs_mnttab_add(libzfs_handle_t , const char ,
	const char , const char );
	extern void libzfs_mnttab_remove(libzfs_handle_t , const char );

	/*
	 * Basic handle functions
	 */
	extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
	extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
	extern void zpool_close(zpool_handle_t *);
	extern const char *zpool_get_name(zpool_handle_t *);
	extern int zpool_get_state(zpool_handle_t *);
	extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
	extern const char *zpool_pool_state_to_name(pool_state_t);
	extern void zpool_free_handles(libzfs_handle_t *);
	extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *);

	/*
	* Iterate over all active pools in the system.
	*/
	typedef int (zpool_iter_f)(zpool_handle_t , void *);
	extern int zpool_iter(libzfs_handle_t , zpool_iter_f, void );
	extern boolean_t zpool_skip_pool(const char *);

	/*
	* Functions to create and destroy pools
	*/
	extern int zpool_create(libzfs_handle_t , const char , nvlist_t *,
	nvlist_t , nvlist_t );
	extern int zpool_destroy(zpool_handle_t , const char );
	extern int zpool_add(zpool_handle_t , nvlist_t );

	/*
	 * Option flags taken by value by zpool_vdev_split().
	 *
	 * NOTE(review): the members are plain-int bit-fields of width 1, whose
	 * signedness is implementation-defined; test them for non-zero rather
	 * than comparing against 1.
	 */
	typedef struct splitflags {
	/* do not split, but return the config that would be split off */
	int dryrun : 1;

	/* after splitting, import the pool */
	int import : 1;
	} splitflags_t;

	/*
	* Functions to manipulate pool and vdev state
	*/
	extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
	extern int zpool_clear(zpool_handle_t , const char , nvlist_t *);
	extern int zpool_reguid(zpool_handle_t *);
	extern int zpool_reopen(zpool_handle_t *);

	extern int zpool_vdev_online(zpool_handle_t , const char , int,
	vdev_state_t *);
	extern int zpool_vdev_offline(zpool_handle_t , const char , boolean_t);
	extern int zpool_vdev_attach(zpool_handle_t , const char ,
	const char , nvlist_t , int);
	extern int zpool_vdev_detach(zpool_handle_t , const char );
	extern int zpool_vdev_remove(zpool_handle_t , const char );
	+extern int zpool_vdev_remove_cancel(zpool_handle_t *);
	+extern int zpool_vdev_indirect_size(zpool_handle_t , const char , uint64_t *);
	extern int zpool_vdev_split(zpool_handle_t , char , nvlist_t *, nvlist_t ,
	splitflags_t);

	extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
	extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
	extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);

	extern nvlist_t zpool_find_vdev(zpool_handle_t , const char , boolean_t ,
	boolean_t , boolean_t );
	extern nvlist_t zpool_find_vdev_by_physpath(zpool_handle_t , const char *,
	boolean_t , boolean_t , boolean_t *);
	extern int zpool_label_disk(libzfs_handle_t , zpool_handle_t , const char *,
	zpool_boot_label_t, uint64_t, int *);

	/*
	* Functions to manage pool properties
	*/
	extern int zpool_set_prop(zpool_handle_t , const char , const char *);
	extern int zpool_get_prop(zpool_handle_t , zpool_prop_t, char ,
	size_t proplen, zprop_source_t *, boolean_t);
	extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
	zprop_source_t *);

	extern const char *zpool_prop_to_name(zpool_prop_t);
	extern const char *zpool_prop_values(zpool_prop_t);

	/*
	 * Pool health statistics.
	 *
	 * The enumerators are grouped by whether they have a corresponding
	 * message ID; ZPOOL_STATUS_OK, the healthy state, is the last value.
	 * This is the type returned by zpool_get_status() and
	 * zpool_import_status().
	 */
	typedef enum {
	/*
	* The following correspond to faults as defined in the (fault.fs.zfs.*)
	* event namespace. Each is associated with a corresponding message ID.
	*/
	ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */
	ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */
	ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */
	ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */
	ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */
	ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */
	ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */
	ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */
	ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */
	ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */
	ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */
	ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */
	ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
	ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */

	/*
	* If the pool has unsupported features but can still be opened in
	* read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
	* pool has unsupported features but cannot be opened at all, its
	* status is ZPOOL_STATUS_UNSUP_FEAT_READ.
	*/
	ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */
	ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */

	/*
	* These faults have no corresponding message ID. At the time we are
	* checking the status, the original reason for the FMA fault (I/O or
	* checksum errors) has been lost.
	*/
	ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
	ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */

	/*
	* The following are not faults per se, but still an error possibly
	* requiring administrative attention. There is no corresponding
	* message ID.
	*/
	ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */
	ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */
	ZPOOL_STATUS_RESILVERING, /* device being resilvered */
	ZPOOL_STATUS_OFFLINE_DEV, /* device offline */
	ZPOOL_STATUS_REMOVED_DEV, /* removed device */
	ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */

	/*
	* Finally, the following indicates a healthy pool.
	*/
	ZPOOL_STATUS_OK
	} zpool_status_t;

	extern zpool_status_t zpool_get_status(zpool_handle_t , char *);
	extern zpool_status_t zpool_import_status(nvlist_t , char *);
	extern void zpool_dump_ddt(const ddt_stat_t dds, const ddt_histogram_t ddh);

	/*
	 * Statistics and configuration functions.
	 */
	extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
	extern nvlist_t *zpool_get_features(zpool_handle_t *);
	extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
	extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
	extern boolean_t zpool_is_bootable(zpool_handle_t *);

	/*
	* Import and export functions
	*/
	extern int zpool_export(zpool_handle_t , boolean_t, const char );
	extern int zpool_export_force(zpool_handle_t , const char );
	extern int zpool_import(libzfs_handle_t , nvlist_t , const char *,
	char *altroot);
	extern int zpool_import_props(libzfs_handle_t , nvlist_t , const char *,
	nvlist_t *, int);
	extern void zpool_print_unsup_feat(nvlist_t *config);

	/*
	* Search for pools to import
	*/

	/*
	 * Search criteria handed to zpool_search_import().
	 *
	 * NOTE(review): the three flags are plain-int bit-fields of width 1,
	 * whose signedness is implementation-defined; test them for non-zero.
	 */
	typedef struct importargs {
		char **path; /* a list of paths to search */
		int paths; /* number of paths to search */
		char *poolname; /* name of a pool to find */
		uint64_t guid; /* guid of a pool to find */
		char *cachefile; /* cachefile to use for import */
		int can_be_active : 1; /* can the pool be active? */
		int unique : 1; /* does 'poolname' already exist? */
		int exists : 1; /* set on return if pool already exists */
	} importargs_t;

	extern nvlist_t zpool_search_import(libzfs_handle_t , importargs_t *);

	/* legacy pool search routines */
	extern nvlist_t zpool_find_import(libzfs_handle_t , int, char **);
	extern nvlist_t zpool_find_import_cached(libzfs_handle_t , const char *,
	char *, uint64_t);

	/*
	* Miscellaneous pool functions
	*/
	struct zfs_cmd;

	extern const char *zfs_history_event_names[];

	extern char zpool_vdev_name(libzfs_handle_t , zpool_handle_t , nvlist_t ,
	boolean_t verbose);
	extern int zpool_upgrade(zpool_handle_t *, uint64_t);
	extern int zpool_get_history(zpool_handle_t , nvlist_t *);
	extern int zpool_history_unpack(char , uint64_t, uint64_t ,
	nvlist_t **, uint_t );
	extern void zpool_obj_to_path(zpool_handle_t , uint64_t, uint64_t, char ,
	size_t len);
	extern int zfs_ioctl(libzfs_handle_t , int request, struct zfs_cmd );
	extern int zpool_get_physpath(zpool_handle_t , char , size_t);
	extern void zpool_explain_recover(libzfs_handle_t , const char , int,
	nvlist_t *);

	/*
	* Basic handle manipulations. These functions do not create or destroy the
	* underlying datasets, only the references to them.
	*/
	extern zfs_handle_t zfs_open(libzfs_handle_t , const char *, int);
	extern zfs_handle_t zfs_handle_dup(zfs_handle_t );
	extern void zfs_close(zfs_handle_t *);
	extern zfs_type_t zfs_get_type(const zfs_handle_t *);
	extern const char zfs_get_name(const zfs_handle_t );
	extern zpool_handle_t zfs_get_pool_handle(const zfs_handle_t );
	extern const char zfs_get_pool_name(const zfs_handle_t );

	/*
	* Property management functions. Some functions are shared with the kernel,
	* and are found in sys/fs/zfs.h.
	*/

	/*
	* zfs dataset property management
	*/
	extern const char *zfs_prop_default_string(zfs_prop_t);
	extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
	extern const char *zfs_prop_column_name(zfs_prop_t);
	extern boolean_t zfs_prop_align_right(zfs_prop_t);

	extern nvlist_t zfs_valid_proplist(libzfs_handle_t , zfs_type_t,
	nvlist_t , uint64_t, zfs_handle_t , zpool_handle_t , const char );

	extern const char *zfs_prop_to_name(zfs_prop_t);
	extern int zfs_prop_set(zfs_handle_t , const char , const char *);
	extern int zfs_prop_set_list(zfs_handle_t , nvlist_t );
	extern int zfs_prop_get(zfs_handle_t , zfs_prop_t, char , size_t,
	zprop_source_t , char , size_t, boolean_t);
	extern int zfs_prop_get_recvd(zfs_handle_t , const char , char *, size_t,
	boolean_t);
	extern int zfs_prop_get_numeric(zfs_handle_t , zfs_prop_t, uint64_t ,
	zprop_source_t , char , size_t);
	extern int zfs_prop_get_userquota_int(zfs_handle_t zhp, const char propname,
	uint64_t *propvalue);
	extern int zfs_prop_get_userquota(zfs_handle_t zhp, const char propname,
	char *propbuf, int proplen, boolean_t literal);
	extern int zfs_prop_get_written_int(zfs_handle_t zhp, const char propname,
	uint64_t *propvalue);
	extern int zfs_prop_get_written(zfs_handle_t zhp, const char propname,
	char *propbuf, int proplen, boolean_t literal);
	extern int zfs_prop_get_feature(zfs_handle_t zhp, const char propname,
	char *buf, size_t len);
	extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
	extern int zfs_prop_inherit(zfs_handle_t , const char , boolean_t);
	extern const char *zfs_prop_values(zfs_prop_t);
	extern int zfs_prop_is_string(zfs_prop_t prop);
	extern nvlist_t zfs_get_user_props(zfs_handle_t );
	extern nvlist_t zfs_get_recvd_props(zfs_handle_t );
	extern nvlist_t zfs_get_clones_nvl(zfs_handle_t );


	/*
	 * Node of a singly linked property list (chained through pl_next), as
	 * built by zprop_get_list() and released by zprop_free_list().
	 * NOTE(review): pl_prop presumably holds a zfs_prop_t/zpool_prop_t value
	 * and pl_user_prop the name of a user property -- confirm in the .c files.
	 */
	typedef struct zprop_list {
	int pl_prop;
	char *pl_user_prop;
	struct zprop_list *pl_next;
	boolean_t pl_all;
	size_t pl_width;
	size_t pl_recvd_width;
	boolean_t pl_fixed;
	} zprop_list_t;

	extern int zfs_expand_proplist(zfs_handle_t , zprop_list_t *, boolean_t,
	boolean_t);
	extern void zfs_prune_proplist(zfs_handle_t , uint8_t );

	#define ZFS_MOUNTPOINT_NONE "none"
	#define ZFS_MOUNTPOINT_LEGACY "legacy"

	#define ZFS_FEATURE_DISABLED "disabled"
	#define ZFS_FEATURE_ENABLED "enabled"
	#define ZFS_FEATURE_ACTIVE "active"

	#define ZFS_UNSUPPORTED_INACTIVE "inactive"
	#define ZFS_UNSUPPORTED_READONLY "readonly"

	/*
	* zpool property management
	*/
	extern int zpool_expand_proplist(zpool_handle_t , zprop_list_t *);
	extern int zpool_prop_get_feature(zpool_handle_t , const char , char *,
	size_t);
	extern const char *zpool_prop_default_string(zpool_prop_t);
	extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
	extern const char *zpool_prop_column_name(zpool_prop_t);
	extern boolean_t zpool_prop_align_right(zpool_prop_t);

	/*
	* Functions shared by zfs and zpool property management.
	*/
	extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all,
	boolean_t ordered, zfs_type_t type);
	extern int zprop_get_list(libzfs_handle_t , char , zprop_list_t **,
	zfs_type_t);
	extern void zprop_free_list(zprop_list_t *);

	/*
	 * ZFS_GET_NCOLS sizes the cb_columns array of zprop_get_cbdata_t; it
	 * equals the number of zfs_get_column_t values other than GET_COL_NONE.
	 */
	#define ZFS_GET_NCOLS 5

	/* Column identifiers; GET_COL_NONE is 0. */
	typedef enum {
	GET_COL_NONE,
	GET_COL_NAME,
	GET_COL_PROPERTY,
	GET_COL_VALUE,
	GET_COL_RECVD,
	GET_COL_SOURCE
	} zfs_get_column_t;

	/*
	 * Functions for printing zfs or zpool properties
	 *
	 * zprop_get_cbdata_t is the state passed to zprop_print_one_property().
	 * Note that cb_colwidths has one more slot than cb_columns.
	 */
	typedef struct zprop_get_cbdata {
	int cb_sources;
	zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
	int cb_colwidths[ZFS_GET_NCOLS + 1];
	boolean_t cb_scripted;
	boolean_t cb_literal;
	boolean_t cb_first;
	zprop_list_t *cb_proplist;
	zfs_type_t cb_type;
	} zprop_get_cbdata_t;

	/* Print one property using the column state held in the cbdata. */
	void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
	    const char *, const char *, zprop_source_t, const char *,
	    const char *);

	/*
	* Iterator functions.
	*/
	typedef int (zfs_iter_f)(zfs_handle_t , void *);
	extern int zfs_iter_root(libzfs_handle_t , zfs_iter_f, void );
	extern int zfs_iter_children(zfs_handle_t , zfs_iter_f, void );
	extern int zfs_iter_dependents(zfs_handle_t , boolean_t, zfs_iter_f, void );
	extern int zfs_iter_filesystems(zfs_handle_t , zfs_iter_f, void );
	extern int zfs_iter_snapshots(zfs_handle_t , boolean_t, zfs_iter_f, void );
	extern int zfs_iter_snapshots_sorted(zfs_handle_t , zfs_iter_f, void );
	extern int zfs_iter_snapspec(zfs_handle_t , const char , zfs_iter_f, void *);
	extern int zfs_iter_bookmarks(zfs_handle_t , zfs_iter_f, void );

	/*
	 * Growable array of dataset handles (cb_alloc slots, cb_used in use) as
	 * consumed by libzfs_add_handle(); cb_getone is a callback pointer with
	 * the same shape as zfs_iter_f.
	 */
	typedef struct get_all_cb {
		zfs_handle_t **cb_handles;
		size_t cb_alloc;
		size_t cb_used;
		boolean_t cb_verbose;
		int (*cb_getone)(zfs_handle_t *, void *);
	} get_all_cb_t;

	void libzfs_add_handle(get_all_cb_t , zfs_handle_t );
	int libzfs_dataset_cmp(const void , const void );

	/*
	* Functions to create and destroy datasets.
	*/
	extern int zfs_create(libzfs_handle_t , const char , zfs_type_t,
	nvlist_t *);
	extern int zfs_create_ancestors(libzfs_handle_t , const char );
	extern int zfs_destroy(zfs_handle_t *, boolean_t);
	extern int zfs_destroy_snaps(zfs_handle_t , char , boolean_t);
	extern int zfs_destroy_snaps_nvl(libzfs_handle_t , nvlist_t , boolean_t);
	extern int zfs_clone(zfs_handle_t , const char , nvlist_t *);
	extern int zfs_snapshot(libzfs_handle_t , const char , boolean_t, nvlist_t *);
	extern int zfs_snapshot_nvl(libzfs_handle_t hdl, nvlist_t snaps,
	nvlist_t *props);
	extern int zfs_rollback(zfs_handle_t , zfs_handle_t , boolean_t);

	/*
	 * Option flags taken by value by zfs_rename().
	 *
	 * NOTE(review): the members are plain-int bit-fields of width 1, whose
	 * signedness is implementation-defined; test them for non-zero.
	 */
	typedef struct renameflags {
	/* recursive rename */
	int recurse : 1;

	/* don't unmount file systems */
	int nounmount : 1;

	/* force unmount file systems */
	int forceunmount : 1;
	} renameflags_t;

	extern int zfs_rename(zfs_handle_t , const char , const char *,
	renameflags_t flags);

	/*
	 * Options for zfs_send() and zfs_send_resume(), which take a pointer to
	 * this structure.  Every member is an independent boolean_t switch.
	 */
	typedef struct sendflags {
	/* print informational messages (ie, -v was specified) */
	boolean_t verbose;

	/* recursive send (ie, -R) */
	boolean_t replicate;

	/* for incrementals, do all intermediate snapshots */
	boolean_t doall;

	/* if dataset is a clone, do incremental from its origin */
	boolean_t fromorigin;

	/* do deduplication */
	boolean_t dedup;

	/* send properties (ie, -p) */
	boolean_t props;

	/* do not send (no-op, ie. -n) */
	boolean_t dryrun;

	/* parsable verbose output (ie. -P) */
	boolean_t parsable;

	/*
	 * show progress (ie. -v)
	 * NOTE(review): 'verbose' above also cites -v; confirm in the zfs
	 * command source how the two are set.
	 */
	boolean_t progress;

	/* large blocks (>128K) are permitted */
	boolean_t largeblock;

	/* WRITE_EMBEDDED records of type DATA are permitted */
	boolean_t embed_data;

	/* compressed WRITE records are permitted */
	boolean_t compress;
	} sendflags_t;

	typedef boolean_t (snapfilter_cb_t)(zfs_handle_t , void );

	extern int zfs_send(zfs_handle_t , const char , const char *,
	sendflags_t , int, snapfilter_cb_t, void , nvlist_t **);
	extern int zfs_send_one(zfs_handle_t , const char , int, enum lzc_send_flags);
	extern int zfs_send_resume(libzfs_handle_t , sendflags_t , int outfd,
	const char *);
	extern nvlist_t zfs_send_resume_token_to_nvlist(libzfs_handle_t hdl,
	const char *token);

	extern int zfs_promote(zfs_handle_t *);
	extern int zfs_hold(zfs_handle_t , const char , const char *,
	boolean_t, int);
	extern int zfs_hold_nvl(zfs_handle_t , int, nvlist_t );
	extern int zfs_release(zfs_handle_t , const char , const char *, boolean_t);
	extern int zfs_get_holds(zfs_handle_t , nvlist_t *);
	extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);

	typedef int (zfs_userspace_cb_t)(void arg, const char *domain,
	uid_t rid, uint64_t space);

	extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
	zfs_userspace_cb_t, void *);

	extern int zfs_get_fsacl(zfs_handle_t , nvlist_t *);
	extern int zfs_set_fsacl(zfs_handle_t , boolean_t, nvlist_t );

	/*
	 * Options for zfs_receive(), which takes a pointer to this structure.
	 * Every member is an independent boolean_t switch; 'byteswap' and
	 * 'nomount' are documented below as internal/private.
	 */
	typedef struct recvflags {
	/* print informational messages (ie, -v was specified) */
	boolean_t verbose;

	/* the destination is a prefix, not the exact fs (ie, -d) */
	boolean_t isprefix;

	/*
	* Only the tail of the sent snapshot path is appended to the
	* destination to determine the received snapshot name (ie, -e).
	*/
	boolean_t istail;

	/* do not actually do the recv, just check if it would work (ie, -n) */
	boolean_t dryrun;

	/* rollback/destroy filesystems as necessary (eg, -F) */
	boolean_t force;

	/* set "canmount=off" on all modified filesystems */
	boolean_t canmountoff;

	/*
	* Mark the file systems as "resumable" and do not destroy them if the
	* receive is interrupted
	*/
	boolean_t resumable;

	/* byteswap flag is used internally; callers need not specify */
	boolean_t byteswap;

	/* do not mount file systems as they are extracted (private) */
	boolean_t nomount;
	} recvflags_t;

	extern int zfs_receive(libzfs_handle_t , const char , nvlist_t *,
	recvflags_t , int, avl_tree_t );

	/*
	 * Bit flags with distinct power-of-two values, so they can be OR-ed
	 * together.  Presumably the final int argument of zfs_show_diffs() --
	 * confirm against its definition.
	 */
	typedef enum diff_flags {
	ZFS_DIFF_PARSEABLE = 0x1,
	ZFS_DIFF_TIMESTAMP = 0x2,
	ZFS_DIFF_CLASSIFY = 0x4
	} diff_flags_t;

	extern int zfs_show_diffs(zfs_handle_t , int, const char , const char *,
	int);

	/*
	* Miscellaneous functions.
	*/
	extern const char *zfs_type_to_name(zfs_type_t);
	extern void zfs_refresh_properties(zfs_handle_t *);
	extern int zfs_name_valid(const char *, zfs_type_t);
	extern zfs_handle_t zfs_path_to_zhandle(libzfs_handle_t , char *, zfs_type_t);
	extern boolean_t zfs_dataset_exists(libzfs_handle_t , const char ,
	zfs_type_t);
	extern int zfs_spa_version(zfs_handle_t , int );
	extern boolean_t zfs_bookmark_exists(const char *path);

	/*
	* Mount support functions.
	*/
	extern boolean_t is_mounted(libzfs_handle_t , const char special, char **);
	extern boolean_t zfs_is_mounted(zfs_handle_t , char *);
	extern int zfs_mount(zfs_handle_t , const char , int);
	extern int zfs_unmount(zfs_handle_t , const char , int);
	extern int zfs_unmountall(zfs_handle_t *, int);

	/*
	* Share support functions.
	*/
	extern boolean_t zfs_is_shared(zfs_handle_t *);
	extern int zfs_share(zfs_handle_t *);
	extern int zfs_unshare(zfs_handle_t *);

	/*
	* Protocol-specific share support functions.
	*/
	extern boolean_t zfs_is_shared_nfs(zfs_handle_t , char *);
	extern boolean_t zfs_is_shared_smb(zfs_handle_t , char *);
	extern int zfs_share_nfs(zfs_handle_t *);
	extern int zfs_share_smb(zfs_handle_t *);
	extern int zfs_shareall(zfs_handle_t *);
	extern int zfs_unshare_nfs(zfs_handle_t , const char );
	extern int zfs_unshare_smb(zfs_handle_t , const char );
	extern int zfs_unshareall_nfs(zfs_handle_t *);
	extern int zfs_unshareall_smb(zfs_handle_t *);
	extern int zfs_unshareall_bypath(zfs_handle_t , const char );
	extern int zfs_unshareall(zfs_handle_t *);
	extern int zfs_deleg_share_nfs(libzfs_handle_t , char , char , char ,
	void , void , int, zfs_share_op_t);

	/*
	* FreeBSD-specific jail support function.
	*/
	extern int zfs_jail(zfs_handle_t *, int, int);

	/*
	 * When dealing with nvlists, verify() is extremely useful
	 *
	 * Unlike a bare assert(), the expression is still evaluated when NDEBUG
	 * is defined (its value is discarded), so side effects inside verify()
	 * are kept in non-debug builds.  An existing definition of verify takes
	 * precedence over this one.
	 */
	#ifndef verify
	#ifdef NDEBUG
	#define verify(EX) ((void)(EX))
	#else
	#define verify(EX) assert(EX)
	#endif
	#endif

	/*
	* Utility function to convert a number to a human-readable form.
	*/
	extern void zfs_nicenum(uint64_t, char *, size_t);
	extern int zfs_nicestrtonum(libzfs_handle_t , const char , uint64_t *);

	/*
	* Given a device or file, determine if it is part of a pool.
	*/
	extern int zpool_in_use(libzfs_handle_t , int, pool_state_t , char **,
	boolean_t *);

	/*
	* Label manipulation.
	*/
	extern int zpool_read_label(int, nvlist_t **);
	extern int zpool_read_all_labels(int, nvlist_t **);
	extern int zpool_clear_label(int);

	/* is this zvol valid for use as a dump device? */
	extern int zvol_check_dump_config(char *);

	/*
	* Management interfaces for SMB ACL files
	*/

	int zfs_smb_acl_add(libzfs_handle_t , char , char , char );
	int zfs_smb_acl_remove(libzfs_handle_t , char , char , char );
	int zfs_smb_acl_purge(libzfs_handle_t , char , char *);
	int zfs_smb_acl_rename(libzfs_handle_t , char , char , char , char *);

	/*
	 * Enable and disable datasets within a pool by mounting/unmounting and
	 * sharing/unsharing them.
	 */
	extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
	extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);

	/*
	* Mappings between vdev and FRU.
	*/
	extern void libzfs_fru_refresh(libzfs_handle_t *);
	extern const char libzfs_fru_lookup(libzfs_handle_t , const char *);
	extern const char libzfs_fru_devpath(libzfs_handle_t , const char *);
	extern boolean_t libzfs_fru_compare(libzfs_handle_t , const char ,
	const char *);
	extern boolean_t libzfs_fru_notself(libzfs_handle_t , const char );
	extern int zpool_fru_set(zpool_handle_t , uint64_t, const char );

	/* zmount() is only declared when not building for illumos. */
	#ifndef illumos
	extern int zmount(const char *, const char *, int, char *, char *, int, char *,
	    int);
	#endif
	+extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _LIBZFS_H */
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c (revision 332525)
	@@ -1,5095 +1,5113 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
	* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
	* Copyright (c) 2013 Martin Matuska. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
	* Copyright 2017 Nexenta Systems, Inc.
	* Copyright 2017 RackTop Systems.
	*/

	#include <ctype.h>
	#include <errno.h>
	#include <libintl.h>
	#include <math.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <strings.h>
	#include <unistd.h>
	#include <stddef.h>
	#include <zone.h>
	#include <fcntl.h>
	#include <sys/mntent.h>
	#include <sys/mount.h>
	#include <priv.h>
	#include <pwd.h>
	#include <grp.h>
	#include <stddef.h>
	#include <idmap.h>

	#include <sys/dnode.h>
	#include <sys/spa.h>
	#include <sys/zap.h>
	#include <sys/misc.h>
	#include <libzfs.h>

	#include "zfs_namecheck.h"
	#include "zfs_prop.h"
	#include "libzfs_impl.h"
	#include "zfs_deleg.h"

	/*
	 * Forward declaration.  typep, domain (a buffer of domainlen bytes) and
	 * ridp are pointer parameters through which the decoder can return
	 * results -- confirm details against the definition later in the file.
	 */
	static int userquota_propname_decode(const char *propname, boolean_t zoned,
	    zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);

	/*
	 * Translate one zfs_type_t value (a single type, never an OR-ed mask) into
	 * its translated, human readable name.  An unrecognised value trips an
	 * assertion; when assertions are compiled out the result is NULL.
	 */
	const char *
	zfs_type_to_name(zfs_type_t type)
	{
		const char *label = NULL;

		switch (type) {
		case ZFS_TYPE_FILESYSTEM:
			label = dgettext(TEXT_DOMAIN, "filesystem");
			break;
		case ZFS_TYPE_SNAPSHOT:
			label = dgettext(TEXT_DOMAIN, "snapshot");
			break;
		case ZFS_TYPE_VOLUME:
			label = dgettext(TEXT_DOMAIN, "volume");
			break;
		case ZFS_TYPE_POOL:
			label = dgettext(TEXT_DOMAIN, "pool");
			break;
		case ZFS_TYPE_BOOKMARK:
			label = dgettext(TEXT_DOMAIN, "bookmark");
			break;
		default:
			assert(!"unhandled zfs_type_t");
			break;
		}

		return (label);
	}

	/*
	 * Validate a ZFS path.  This is used even before trying to open the dataset,
	 * to provide a more meaningful error message.  We call zfs_error_aux() to
	 * explain exactly why the name was not valid.
	 *
	 *	hdl		library handle that receives the explanation; may be
	 *			NULL, in which case no explanation is recorded
	 *	path		candidate dataset, snapshot or bookmark name
	 *	type		mask of zfs_type_t values the name may refer to
	 *	modifying	when true, a '%' anywhere in the name is rejected
	 *
	 * Note the return convention: 0 means the name is NOT valid, and a non-zero
	 * value (-1) means it is valid, so callers treat the result as a boolean.
	 */
	int
	zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
	    boolean_t modifying)
	{
		namecheck_err_t why;
		char what;

		if (entity_namecheck(path, &why, &what) != 0) {
			if (hdl != NULL) {
				switch (why) {
				case NAME_ERR_TOOLONG:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "name is too long"));
					break;

				case NAME_ERR_LEADING_SLASH:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "leading slash in name"));
					break;

				case NAME_ERR_EMPTY_COMPONENT:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "empty component in name"));
					break;

				case NAME_ERR_TRAILING_SLASH:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "trailing slash in name"));
					break;

				case NAME_ERR_INVALCHAR:
					zfs_error_aux(hdl,
					    dgettext(TEXT_DOMAIN, "invalid character "
					    "'%c' in name"), what);
					break;

				case NAME_ERR_MULTIPLE_DELIMITERS:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "multiple '@' and/or '#' delimiters in "
					    "name"));
					break;

				case NAME_ERR_NOLETTER:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "pool doesn't begin with a letter"));
					break;

				case NAME_ERR_RESERVED:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "name is reserved"));
					break;

				case NAME_ERR_DISKLIKE:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "reserved disk name"));
					break;

				default:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "(%d) not defined"), why);
					break;
				}
			}

			return (0);
		}

		/* A '@' is only acceptable when snapshots are among the types. */
		if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) {
			if (hdl != NULL)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "snapshot delimiter '@' is not expected here"));
			return (0);
		}

		/* When only a snapshot is acceptable, the '@' is mandatory. */
		if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) {
			if (hdl != NULL)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "missing '@' delimiter in snapshot name"));
			return (0);
		}

		/* A '#' is only acceptable when bookmarks are among the types. */
		if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) {
			if (hdl != NULL)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "bookmark delimiter '#' is not expected here"));
			return (0);
		}

		/* When only a bookmark is acceptable, the '#' is mandatory. */
		if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) {
			if (hdl != NULL)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "missing '#' delimiter in bookmark name"));
			return (0);
		}

		if (modifying && strchr(path, '%') != NULL) {
			if (hdl != NULL)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid character %c in name"), '%');
			return (0);
		}

		return (-1);
	}

	/*
	 * Check whether 'name' is acceptable for an object of the given type.
	 * Pool names go to zpool_name_valid(); everything else goes to
	 * zfs_validate_name().  Both are called with a NULL library handle.
	 */
	int
	zfs_name_valid(const char *name, zfs_type_t type)
	{
		int valid;

		if (type != ZFS_TYPE_POOL)
			valid = zfs_validate_name(NULL, name, type, B_FALSE);
		else
			valid = zpool_name_valid(NULL, B_FALSE, name);

		return (valid);
	}

	/*
	 * This function takes the raw DSL properties, and filters out the user-defined
	 * properties into a separate nvlist.
	 *
	 * Returns the newly allocated nvlist holding only the user properties, or
	 * NULL (after calling no_memory()) if allocation or insertion fails.  The
	 * input list 'props' is not modified.
	 */
	static nvlist_t *
	process_user_props(zfs_handle_t *zhp, nvlist_t *props)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		nvpair_t *elem;
		nvlist_t *propval;
		nvlist_t *nvl;

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
			(void) no_memory(hdl);
			return (NULL);
		}

		elem = NULL;
		while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
			/* Only pairs whose name is a user property are kept. */
			if (!zfs_prop_user(nvpair_name(elem)))
				continue;

			verify(nvpair_value_nvlist(elem, &propval) == 0);
			if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) {
				nvlist_free(nvl);
				(void) no_memory(hdl);
				return (NULL);
			}
		}

		return (nvl);
	}

	/*
	 * Open the pool called 'pool_name' and, on success, push the new handle
	 * onto the front of the library handle's list of cached pool handles.
	 * Returns the new handle, or NULL if zpool_open_canfail() fails.
	 */
	static zpool_handle_t *
	zpool_add_handle(zfs_handle_t *zhp, const char *pool_name)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		zpool_handle_t *zph;

		if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) {
			if (hdl->libzfs_pool_handles != NULL)
				zph->zpool_next = hdl->libzfs_pool_handles;
			hdl->libzfs_pool_handles = zph;
		}
		return (zph);
	}

	/*
	 * Search the library handle's list of cached pool handles for a pool
	 * whose name matches the first 'len' bytes of 'pool_name'.  Returns the
	 * matching handle, or NULL if the list holds no match.
	 */
	static zpool_handle_t *
	zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		zpool_handle_t *zph = hdl->libzfs_pool_handles;

		while ((zph != NULL) &&
		    (strncmp(pool_name, zpool_get_name(zph), len) != 0))
			zph = zph->zpool_next;
		return (zph);
	}

	/*
	 * Return a handle for the pool that holds the given dataset.  The pool
	 * name is the part of the dataset name before the first '/', '@' or '#'.
	 * A cached handle is reused when one exists; otherwise a new handle is
	 * opened and added to the cache.
	 */
	static zpool_handle_t *
	zpool_handle(zfs_handle_t *zhp)
	{
		/* Size of the pool-name prefix, including the terminating NUL. */
		int buflen = strcspn(zhp->zfs_name, "/@#") + 1;
		char *prefix = zfs_alloc(zhp->zfs_hdl, buflen);
		zpool_handle_t *found;

		(void) strlcpy(prefix, zhp->zfs_name, buflen);

		found = zpool_find_handle(zhp, prefix, buflen);
		if (found == NULL)
			found = zpool_add_handle(zhp, prefix);

		free(prefix);
		return (found);
	}

	/*
	 * Close every pool handle cached on the library handle and leave the
	 * cache list empty.
	 */
	void
	zpool_free_handles(libzfs_handle_t *hdl)
	{
		zpool_handle_t *next, *zph = hdl->libzfs_pool_handles;

		while (zph != NULL) {
			/* Read the link first; zph is not used after zpool_close(). */
			next = zph->zpool_next;
			zpool_close(zph);
			zph = next;
		}
		hdl->libzfs_pool_handles = NULL;
	}

	/*
	 * Utility function to gather stats (objset and zpl) for the given object.
	 *
	 * Copies the dataset name into 'zc' and issues ZFS_IOC_OBJSET_STATS.  When
	 * the ioctl fails with ENOMEM the destination buffer is expanded and the
	 * call is retried.  Returns 0 on success and -1 on any other failure, or
	 * if expanding the buffer fails.
	 */
	static int
	get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;

		(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));

		while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
			if (errno == ENOMEM) {
				if (zcmd_expand_dst_nvlist(hdl, zc) != 0) {
					return (-1);
				}
			} else {
				return (-1);
			}
		}
		return (0);
	}

	/*
	 * Fetch the received properties of the dataset behind 'zhp' and store them
	 * in zhp->zfs_recvd_props, replacing any list that was cached there.
	 * Returns 0 on success and -1 on failure.
	 */
	static int
	get_recvd_props_ioctl(zfs_handle_t *zhp)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		zfs_cmd_t zc = { 0 };
		nvlist_t *fresh;
		int rc;

		if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
			return (-1);

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		/* Retry with an expanded buffer for as long as ENOMEM is reported. */
		while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
			if (errno != ENOMEM) {
				zcmd_free_nvlists(&zc);
				return (-1);
			}
			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0)
				return (-1);
		}

		rc = zcmd_read_dst_nvlist(hdl, &zc, &fresh);
		zcmd_free_nvlists(&zc);
		if (rc != 0)
			return (-1);

		nvlist_free(zhp->zfs_recvd_props);
		zhp->zfs_recvd_props = fresh;

		return (0);
	}

	/*
	 * Install the results of a completed stats ioctl into the handle: copy the
	 * objset stats, unpack the property nvlist from 'zc', and derive the
	 * user-property list from it.  Property lists previously held by the
	 * handle are freed.  Returns 0 on success and -1 on failure; on failure
	 * the handle's existing property lists are left in place.
	 */
	static int
	put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
	{
		nvlist_t *allprops, *userprops;

		zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */

		if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) {
			return (-1);
		}

		/*
		 * XXX Why do we store the user props separately, in addition to
		 * storing them in zfs_props?
		 */
		if ((userprops = process_user_props(zhp, allprops)) == NULL) {
			nvlist_free(allprops);
			return (-1);
		}

		nvlist_free(zhp->zfs_props);
		nvlist_free(zhp->zfs_user_props);

		zhp->zfs_props = allprops;
		zhp->zfs_user_props = userprops;

		return (0);
	}

	/*
	 * Query the stats of the dataset behind 'zhp' and store them in the
	 * handle.  Returns 0 on success and -1 if any step fails.
	 */
	static int
	get_stats(zfs_handle_t *zhp)
	{
		zfs_cmd_t zc = { 0 };
		int status;

		if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
			return (-1);

		/* The handle is only updated when the ioctl itself succeeded. */
		if (get_stats_ioctl(zhp, &zc) != 0 || put_stats_zhdl(zhp, &zc) != 0)
			status = -1;
		else
			status = 0;

		zcmd_free_nvlists(&zc);
		return (status);
	}

	/*
	 * Re-read the dataset's stats and properties into the handle.  A failure
	 * of get_stats() is deliberately ignored.
	 */
	void
	zfs_refresh_properties(zfs_handle_t *zhp)
	{
		(void) get_stats(zhp);
	}

	/*
	 * Makes a handle from the given dataset name.  Used by zfs_open() and
	 * zfs_iter_* to create child handles on the fly.
	 *
	 * 'zc' must hold the result of a successful stats ioctl for the dataset
	 * named in 'zhp'.  Fills in the property lists, the head type, the type
	 * and the pool handle.  Returns 0 on success and -1 on failure.  On
	 * failure the handle owns no property lists, so the caller may simply
	 * free() it.
	 */
	static int
	make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
	{
		if (put_stats_zhdl(zhp, zc) != 0)
			return (-1);

		/*
		 * We've managed to open the dataset and gather statistics.  Determine
		 * the high-level type.
		 */
		if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
			zhp->zfs_head_type = ZFS_TYPE_VOLUME;
		else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
			zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
		else
			abort();

		if (zhp->zfs_dmustats.dds_is_snapshot)
			zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
		else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
			zhp->zfs_type = ZFS_TYPE_VOLUME;
		else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
			zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
		else
			abort();	/* we should never see any other types */

		if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
			/*
			 * The callers release a failed handle with a bare free(),
			 * so drop the lists installed by put_stats_zhdl() here to
			 * avoid leaking them.
			 */
			nvlist_free(zhp->zfs_props);
			nvlist_free(zhp->zfs_user_props);
			zhp->zfs_props = NULL;
			zhp->zfs_user_props = NULL;
			return (-1);
		}

		return (0);
	}

	/*
	 * Allocate a handle for the dataset called 'path', query its stats, and
	 * fill the handle in.  Returns the new handle, or NULL if allocation, the
	 * stats ioctl, or handle construction fails.
	 */
	zfs_handle_t *
	make_dataset_handle(libzfs_handle_t *hdl, const char *path)
	{
		zfs_cmd_t zc = { 0 };

		zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);

		if (zhp == NULL)
			return (NULL);

		zhp->zfs_hdl = hdl;
		(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
		if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
			free(zhp);
			return (NULL);
		}
		if (get_stats_ioctl(zhp, &zc) == -1) {
			zcmd_free_nvlists(&zc);
			free(zhp);
			return (NULL);
		}
		if (make_dataset_handle_common(zhp, &zc) == -1) {
			free(zhp);
			zhp = NULL;
		}
		/* The ioctl buffers are released on success and failure alike. */
		zcmd_free_nvlists(&zc);
		return (zhp);
	}

	/*
	 * Build a dataset handle from a zfs_cmd_t that already holds the result
	 * of a stats ioctl; the dataset name is taken from zc->zc_name.  Returns
	 * the new handle, or NULL on failure.  'zc' is not freed here.
	 */
	zfs_handle_t *
	make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
	{
		zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);

		if (zhp == NULL)
			return (NULL);

		zhp->zfs_hdl = hdl;
		(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
		if (make_dataset_handle_common(zhp, zc) == -1) {
			free(zhp);
			return (NULL);
		}
		return (zhp);
	}

	/*
	 * Build a minimal snapshot handle named by zc->zc_name.  No stats or
	 * properties are loaded: the library handle and head type are taken from
	 * the parent handle 'pzhp' and the type is set to ZFS_TYPE_SNAPSHOT.
	 * Returns the new handle, or NULL if allocation fails.
	 *
	 * NOTE(review): the result of zpool_handle() is stored without a NULL
	 * check, so the returned handle may have a NULL zpool_hdl.
	 */
	zfs_handle_t *
	make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
	{
		zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);

		if (zhp == NULL)
			return (NULL);

		zhp->zfs_hdl = pzhp->zfs_hdl;
		(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
		zhp->zfs_head_type = pzhp->zfs_type;
		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
		zhp->zpool_hdl = zpool_handle(zhp);
		return (zhp);
	}

	/*
	 * Create an independent copy of a dataset handle.  Scalar fields and the
	 * pool/library handle pointers are copied as-is; the three property
	 * nvlists and the mount options string are duplicated.  Returns the copy,
	 * or NULL if allocation or an nvlist duplication fails.
	 */
	zfs_handle_t *
	zfs_handle_dup(zfs_handle_t *zhp_orig)
	{
		zfs_handle_t *copy = calloc(sizeof (zfs_handle_t), 1);

		if (copy == NULL)
			return (NULL);

		copy->zfs_hdl = zhp_orig->zfs_hdl;
		copy->zpool_hdl = zhp_orig->zpool_hdl;
		(void) strlcpy(copy->zfs_name, zhp_orig->zfs_name,
		    sizeof (copy->zfs_name));
		copy->zfs_type = zhp_orig->zfs_type;
		copy->zfs_head_type = zhp_orig->zfs_head_type;
		copy->zfs_dmustats = zhp_orig->zfs_dmustats;

		/* Each property list is duplicated only if the original has one. */
		if (zhp_orig->zfs_props != NULL &&
		    nvlist_dup(zhp_orig->zfs_props, &copy->zfs_props, 0) != 0)
			goto nomem;

		if (zhp_orig->zfs_user_props != NULL &&
		    nvlist_dup(zhp_orig->zfs_user_props,
		    &copy->zfs_user_props, 0) != 0)
			goto nomem;

		if (zhp_orig->zfs_recvd_props != NULL &&
		    nvlist_dup(zhp_orig->zfs_recvd_props,
		    &copy->zfs_recvd_props, 0) != 0)
			goto nomem;

		copy->zfs_mntcheck = zhp_orig->zfs_mntcheck;
		if (zhp_orig->zfs_mntopts != NULL) {
			copy->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
			    zhp_orig->zfs_mntopts);
		}
		copy->zfs_props_table = zhp_orig->zfs_props_table;
		return (copy);

	nomem:
		/* zfs_close() releases whatever was duplicated so far. */
		(void) no_memory(copy->zfs_hdl);
		zfs_close(copy);
		return (NULL);
	}

	/*
	 * Report whether the bookmark named by 'path' ("<dataset>#<bookmark>")
	 * exists.  Returns B_FALSE if 'path' has no '#' or if the bookmark list of
	 * the dataset cannot be retrieved.
	 */
	boolean_t
	zfs_bookmark_exists(const char *path)
	{
		/*
		 * Start from NULL so that the nvlist_free() on the error path is
		 * well defined even if lzc_get_bookmarks() fails without storing
		 * anything through its output parameter.
		 */
		nvlist_t *bmarks = NULL;
		nvlist_t *props;
		char fsname[ZFS_MAX_DATASET_NAME_LEN];
		char *bmark_name;
		char *pound;
		int err;
		boolean_t rv;

		(void) strlcpy(fsname, path, sizeof (fsname));
		pound = strchr(fsname, '#');
		if (pound == NULL)
			return (B_FALSE);

		/* Split the local copy into dataset name and bookmark name. */
		*pound = '\0';
		bmark_name = pound + 1;
		props = fnvlist_alloc();
		err = lzc_get_bookmarks(fsname, props, &bmarks);
		nvlist_free(props);
		if (err != 0) {
			nvlist_free(bmarks);
			return (B_FALSE);
		}

		rv = nvlist_exists(bmarks, bmark_name);
		nvlist_free(bmarks);
		return (rv);
	}

	/*
	 * Build a handle for the bookmark called 'path'.  The library handle and
	 * head type come from 'parent', and the property list is a private copy of
	 * 'bmark_props'.  Returns the new handle, or NULL if allocation, the
	 * property copy, or the pool handle lookup fails.
	 */
	zfs_handle_t *
	make_bookmark_handle(zfs_handle_t *parent, const char *path,
	    nvlist_t *bmark_props)
	{
		zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);

		if (zhp == NULL)
			return (NULL);

		/* Fill in the name. */
		zhp->zfs_hdl = parent->zfs_hdl;
		(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));

		/* Set the property lists. */
		if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) {
			free(zhp);
			return (NULL);
		}

		/* Set the types. */
		zhp->zfs_head_type = parent->zfs_head_type;
		zhp->zfs_type = ZFS_TYPE_BOOKMARK;

		if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
			nvlist_free(zhp->zfs_props);
			free(zhp);
			return (NULL);
		}

		return (zhp);
	}

	/*
	 * State shared between zfs_open() and zfs_open_bookmarks_cb() while a
	 * dataset's bookmarks are searched for one particular name.
	 */
	struct zfs_open_bookmarks_cb_data {
	const char *path;	/* full name of the bookmark being searched for */
	zfs_handle_t *zhp;	/* set by the callback to the matching handle */
	};

	/*
	 * Bookmark iteration callback used by zfs_open().  'data' points to a
	 * struct zfs_open_bookmarks_cb_data.  When the name of 'zhp' equals the
	 * wanted path, the handle is saved in the callback data and EEXIST is
	 * returned.  Otherwise the handle is closed and 0 is returned.
	 */
	static int
	zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data)
	{
		struct zfs_open_bookmarks_cb_data *dp = data;

		/*
		 * Is it the one we are looking for?
		 */
		if (strcmp(dp->path, zfs_get_name(zhp)) == 0) {
			/*
			 * We found it.  Save it and let the caller know we are done.
			 */
			dp->zhp = zhp;
			return (EEXIST);
		}

		/*
		 * Not found.  Close the handle and ask for another one.
		 */
		zfs_close(zhp);
		return (0);
	}

	/*
	 * Opens the given snapshot, bookmark, filesystem, or volume.  The 'types'
	 * argument is a mask of acceptable types.  On failure an error is reported
	 * through zfs_error() or zfs_standard_error() and NULL is returned.
	 *
	 *	hdl	library handle
	 *	path	name of the object to open; a '#' marks a bookmark
	 *	types	mask of zfs_type_t values the caller will accept
	 */
	zfs_handle_t *
	zfs_open(libzfs_handle_t *hdl, const char *path, int types)
	{
		zfs_handle_t *zhp;
		char errbuf[1024];
		char *bookp;

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);

		/*
		 * Validate the name before we even try to open it.
		 */
		if (!zfs_validate_name(hdl, path, types, B_FALSE)) {
			(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
			return (NULL);
		}

		/*
		 * Bookmarks needs to be handled separately.
		 */
		bookp = strchr(path, '#');
		if (bookp == NULL) {
			/*
			 * Try to get stats for the dataset, which will tell us if it
			 * exists.
			 */
			errno = 0;
			if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
				(void) zfs_standard_error(hdl, errno, errbuf);
				return (NULL);
			}
		} else {
			char dsname[ZFS_MAX_DATASET_NAME_LEN];
			zfs_handle_t *pzhp;
			struct zfs_open_bookmarks_cb_data cb_data = {path, NULL};

			/*
			 * We need to cut out '#' and everything after '#'
			 * to get the parent dataset name only.
			 */
			assert(bookp - path < sizeof (dsname));
			(void) strncpy(dsname, path, bookp - path);
			dsname[bookp - path] = '\0';

			/*
			 * Create handle for the parent dataset.
			 */
			errno = 0;
			if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) {
				(void) zfs_standard_error(hdl, errno, errbuf);
				return (NULL);
			}

			/*
			 * Iterate bookmarks to find the right one.  A clean
			 * iteration that found nothing means the bookmark does not
			 * exist; any other miss is reported using errno.
			 */
			errno = 0;
			if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb,
			    &cb_data) == 0) && (cb_data.zhp == NULL)) {
				(void) zfs_error(hdl, EZFS_NOENT, errbuf);
				zfs_close(pzhp);
				return (NULL);
			}
			if (cb_data.zhp == NULL) {
				(void) zfs_standard_error(hdl, errno, errbuf);
				zfs_close(pzhp);
				return (NULL);
			}
			zhp = cb_data.zhp;

			/*
			 * Cleanup.
			 */
			zfs_close(pzhp);
		}

		/*
		 * Both branches above either returned or left zhp non-NULL, so no
		 * NULL check is needed before the type test.
		 */
		if (!(types & zhp->zfs_type)) {
			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
			zfs_close(zhp);
			return (NULL);
		}

		return (zhp);
	}

	/*
	 * Dispose of a dataset handle: free the mount options string, the three
	 * property nvlists, and finally the handle itself.  The pool handle it
	 * refers to is not closed here.
	 */
	void
	zfs_close(zfs_handle_t *zhp)
	{
		/* free(NULL) is a no-op, so an unset zfs_mntopts needs no guard. */
		free(zhp->zfs_mntopts);

		nvlist_free(zhp->zfs_recvd_props);
		nvlist_free(zhp->zfs_user_props);
		nvlist_free(zhp->zfs_props);

		free(zhp);
	}

	/*
	 * One entry of the per-handle mount table cache.  Entries live in the
	 * libzfs_mnttab_cache AVL tree and are ordered by mtn_mt.mnt_special.
	 */
	typedef struct mnttab_node {
	struct mnttab mtn_mt;	/* the cached mount table entry */
	avl_node_t mtn_node;	/* AVL linkage (offset given to avl_create()) */
	} mnttab_node_t;

	/*
	 * AVL comparison function for the mount table cache.  Both arguments
	 * point to mnttab_node_t; entries are ordered by their mnt_special
	 * string.  The strcmp() result is normalised to exactly -1, 0 or 1.
	 */
	static int
	libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
	{
		const mnttab_node_t *mtn1 = arg1;
		const mnttab_node_t *mtn2 = arg2;
		int rv;

		rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);

		if (rv == 0)
			return (0);
		return (rv > 0 ? 1 : -1);
	}

	/*
	 * Set up the (empty) mount table cache tree of a library handle.  The
	 * tree holds mnttab_node_t entries compared with
	 * libzfs_mnttab_cache_compare().
	 */
	void
	libzfs_mnttab_init(libzfs_handle_t *hdl)
	{
		/* The cache must not already contain entries. */
		assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);

		avl_create(&hdl->libzfs_mnttab_cache,
		    libzfs_mnttab_cache_compare,
		    sizeof (mnttab_node_t),
		    offsetof(mnttab_node_t, mtn_node));
	}

	/*
	 * Populate the mount table cache: rewind the mount table stream, read it
	 * to the end, and add a private copy of every entry whose filesystem type
	 * is MNTTYPE_ZFS.
	 */
	void
	libzfs_mnttab_update(libzfs_handle_t *hdl)
	{
		struct mnttab ent;

		rewind(hdl->libzfs_mnttab);
		while (getmntent(hdl->libzfs_mnttab, &ent) == 0) {
			mnttab_node_t *node;

			/* Entries of other filesystem types are not cached. */
			if (strcmp(ent.mnt_fstype, MNTTYPE_ZFS) == 0) {
				node = zfs_alloc(hdl, sizeof (mnttab_node_t));
				node->mtn_mt.mnt_special =
				    zfs_strdup(hdl, ent.mnt_special);
				node->mtn_mt.mnt_mountp =
				    zfs_strdup(hdl, ent.mnt_mountp);
				node->mtn_mt.mnt_fstype =
				    zfs_strdup(hdl, ent.mnt_fstype);
				node->mtn_mt.mnt_mntopts =
				    zfs_strdup(hdl, ent.mnt_mntopts);
				avl_add(&hdl->libzfs_mnttab_cache, node);
			}
		}
	}

	/*
	 * Empty and destroy the mount table cache, freeing each cached entry
	 * together with the four strings it owns.
	 */
	void
	libzfs_mnttab_fini(libzfs_handle_t *hdl)
	{
		void *cookie = NULL;
		mnttab_node_t *node;

		for (;;) {
			node = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie);
			if (node == NULL)
				break;

			free(node->mtn_mt.mnt_special);
			free(node->mtn_mt.mnt_mountp);
			free(node->mtn_mt.mnt_fstype);
			free(node->mtn_mt.mnt_mntopts);
			free(node);
		}

		avl_destroy(&hdl->libzfs_mnttab_cache);
	}

	/*
	 * Record in the library handle whether the mount table cache is to be
	 * used; libzfs_mnttab_find() consults this flag.
	 */
	void
	libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable)
	{
		hdl->libzfs_mnttab_enable = enable;
	}

	/*
	 * Look up the mount table entry whose special device is 'fsname'.
	 *
	 * With the cache disabled, any cached entries are discarded and the mount
	 * table stream is searched directly for a MNTTYPE_ZFS entry.  With the
	 * cache enabled, the cache is populated on first use and then searched.
	 *
	 * Returns 0 and fills in '*entry' on success, or ENOENT if no entry
	 * matches.  In the cached case the strings in '*entry' are the cache's
	 * own pointers (structure copy), not fresh allocations.
	 */
	int
	libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
	    struct mnttab *entry)
	{
		mnttab_node_t find;
		mnttab_node_t *mtn;

		if (!hdl->libzfs_mnttab_enable) {
			struct mnttab srch = { 0 };

			if (avl_numnodes(&hdl->libzfs_mnttab_cache))
				libzfs_mnttab_fini(hdl);
			rewind(hdl->libzfs_mnttab);
			srch.mnt_special = (char *)fsname;
			srch.mnt_fstype = MNTTYPE_ZFS;
			if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0)
				return (0);
			else
				return (ENOENT);
		}

		if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
			libzfs_mnttab_update(hdl);

		/* Only mnt_special is set: it is all the comparator reads. */
		find.mtn_mt.mnt_special = (char *)fsname;
		mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
		if (mtn) {
			*entry = mtn->mtn_mt;
			return (0);
		}
		return (ENOENT);
	}

	/*
	 * Add an entry to the mount table cache, copying the three strings and
	 * using MNTTYPE_ZFS as the filesystem type.  Nothing is done when the
	 * cache currently holds no entries.
	 */
	void
	libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
	    const char *mountp, const char *mntopts)
	{
		mnttab_node_t *mtn;

		if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
			return;
		mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
		mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
		mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
		mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
		mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
		avl_add(&hdl->libzfs_mnttab_cache, mtn);
	}

	/*
	 * Remove the cached mount table entry whose special device is 'fsname',
	 * if there is one, and free the entry along with its strings.
	 */
	void
	libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
	{
		mnttab_node_t find;
		mnttab_node_t *ret;

		/* Only mnt_special is set: it is all the comparator reads. */
		find.mtn_mt.mnt_special = (char *)fsname;
		if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
		    != NULL) {
			avl_remove(&hdl->libzfs_mnttab_cache, ret);
			free(ret->mtn_mt.mnt_special);
			free(ret->mtn_mt.mnt_mountp);
			free(ret->mtn_mt.mnt_fstype);
			free(ret->mtn_mt.mnt_mntopts);
			free(ret);
		}
	}

	/*
	 * Store the value of the ZPOOL_PROP_VERSION property of the dataset's pool
	 * in '*spa_version'.  Returns 0 on success, or -1 (leaving '*spa_version'
	 * untouched) if the handle has no pool handle.
	 */
	int
	zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
	{
		zpool_handle_t *zpool_handle = zhp->zpool_hdl;

		if (zpool_handle == NULL)
			return (-1);

		*spa_version = zpool_get_prop_int(zpool_handle,
		    ZPOOL_PROP_VERSION, NULL);
		return (0);
	}

	/*
	 * The choice of reservation property depends on the SPA version.
	 *
	 * Stores ZFS_PROP_REFRESERVATION in '*resv_prop' when the pool version is
	 * at least SPA_VERSION_REFRESERVATION, and ZFS_PROP_RESERVATION otherwise.
	 * Returns 0 on success, or -1 if the pool version cannot be determined.
	 */
	static int
	zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
	{
		int spa_version;

		if (zfs_spa_version(zhp, &spa_version) < 0)
			return (-1);

		if (spa_version >= SPA_VERSION_REFRESERVATION)
			*resv_prop = ZFS_PROP_REFRESERVATION;
		else
			*resv_prop = ZFS_PROP_RESERVATION;

		return (0);
	}

	/*
	 * Given an nvlist of properties to set, validates that they are correct, and
	 * parses any numeric properties (index, boolean, etc) if they are specified as
	 * strings.
	 *
	 *	hdl		library handle used for error reporting
	 *	type		type of the dataset the properties are meant for
	 *	nvl		the properties to validate; not modified
	 *	zoned		non-zero if the dataset's 'zoned' property is set
	 *	zhp		handle of the existing dataset, or NULL
	 *	zpool_hdl	pool handle used to look up the maximum block size,
	 *			or NULL
	 *	errbuf		message prefix passed to zfs_error()
	 *
	 * Returns a newly allocated nvlist holding the validated properties, or
	 * NULL after an error has been reported on 'hdl'.
	 */
	nvlist_t *
	zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
	    uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
	    const char *errbuf)
	{
		nvpair_t *elem;
		uint64_t intval;
		char *strval;
		zfs_prop_t prop;
		nvlist_t *ret;
		int chosen_normal = -1;
		int chosen_utf = -1;

		if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
			(void) no_memory(hdl);
			return (NULL);
		}

		/*
		 * Make sure this property is valid and applies to this type.
		 */

		elem = NULL;
		while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
			const char *propname = nvpair_name(elem);

			prop = zfs_name_to_prop(propname);
			if (prop == ZPROP_INVAL && zfs_prop_user(propname)) {
				/*
				 * This is a user property: make sure it's a
				 * string, and that it's less than ZAP_MAXNAMELEN.
				 */
				if (nvpair_type(elem) != DATA_TYPE_STRING) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' must be a string"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property name '%s' is too long"),
					    propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				(void) nvpair_value_string(elem, &strval);
				if (nvlist_add_string(ret, propname, strval) != 0) {
					(void) no_memory(hdl);
					goto error;
				}
				continue;
			}

			/*
			 * Currently, only user properties can be modified on
			 * snapshots.
			 */
			if (type == ZFS_TYPE_SNAPSHOT) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "this property can not be modified for snapshots"));
				(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
				goto error;
			}

			if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) {
				zfs_userquota_prop_t uqtype;
				char newpropname[128];
				char domain[128];
				uint64_t rid;
				uint64_t valary[3];

				if (userquota_propname_decode(propname, zoned,
				    &uqtype, domain, sizeof (domain), &rid) != 0) {
					zfs_error_aux(hdl,
					    dgettext(TEXT_DOMAIN,
					    "'%s' has an invalid user/group name"),
					    propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				if (uqtype != ZFS_PROP_USERQUOTA &&
				    uqtype != ZFS_PROP_GROUPQUOTA) {
					zfs_error_aux(hdl,
					    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
					    propname);
					(void) zfs_error(hdl, EZFS_PROPREADONLY,
					    errbuf);
					goto error;
				}

				if (nvpair_type(elem) == DATA_TYPE_STRING) {
					(void) nvpair_value_string(elem, &strval);
					if (strcmp(strval, "none") == 0) {
						intval = 0;
					} else if (zfs_nicestrtonum(hdl,
					    strval, &intval) != 0) {
						(void) zfs_error(hdl,
						    EZFS_BADPROP, errbuf);
						goto error;
					}
				} else if (nvpair_type(elem) ==
				    DATA_TYPE_UINT64) {
					(void) nvpair_value_uint64(elem, &intval);
					if (intval == 0) {
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "use 'none' to disable "
						    "userquota/groupquota"));
						/*
						 * Report the failure like every
						 * other rejection in this function
						 * does; without this call only the
						 * auxiliary text was recorded.
						 */
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}
				} else {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' must be a number"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				/*
				 * Encode the prop name as
				 * userquota@<hex-rid>-domain, to make it easy
				 * for the kernel to decode.
				 */
				(void) snprintf(newpropname, sizeof (newpropname),
				    "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
				    (longlong_t)rid, domain);
				valary[0] = uqtype;
				valary[1] = rid;
				valary[2] = intval;
				if (nvlist_add_uint64_array(ret, newpropname,
				    valary, 3) != 0) {
					(void) no_memory(hdl);
					goto error;
				}
				continue;
			} else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' is readonly"),
				    propname);
				(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
				goto error;
			}

			if (prop == ZPROP_INVAL) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid property '%s'"), propname);
				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
				goto error;
			}

			if (!zfs_prop_valid_for_type(prop, type)) {
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "'%s' does not "
				    "apply to datasets of this type"), propname);
				(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
				goto error;
			}

			/*
			 * A set-once property is only writable while there is no
			 * existing dataset handle.
			 */
			if (zfs_prop_readonly(prop) &&
			    (!zfs_prop_setonce(prop) || zhp != NULL)) {
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
				    propname);
				(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
				goto error;
			}

			if (zprop_parse_value(hdl, elem, prop, type, ret,
			    &strval, &intval, errbuf) != 0)
				goto error;

			/*
			 * Perform some additional checks for specific properties.
			 */
			switch (prop) {
			case ZFS_PROP_VERSION:
			{
				int version;

				if (zhp == NULL)
					break;
				version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
				if (intval < version) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "Can not downgrade; already at version %u"),
					    version);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
				break;
			}

			case ZFS_PROP_VOLBLOCKSIZE:
			case ZFS_PROP_RECORDSIZE:
			{
				int maxbs = SPA_MAXBLOCKSIZE;
				if (zpool_hdl != NULL) {
					maxbs = zpool_get_prop_int(zpool_hdl,
					    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
				}
				/*
				 * Volumes are limited to a volblocksize of 128KB,
				 * because they typically service workloads with
				 * small random writes, which incur a large performance
				 * penalty with large blocks.
				 */
				if (prop == ZFS_PROP_VOLBLOCKSIZE)
					maxbs = SPA_OLD_MAXBLOCKSIZE;
				/*
				 * The value must be a power of two between
				 * SPA_MINBLOCKSIZE and maxbs.
				 */
				if (intval < SPA_MINBLOCKSIZE ||
				    intval > maxbs || !ISP2(intval)) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' must be power of 2 from 512B "
					    "to %uKB"), propname, maxbs >> 10);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
				break;
			}
			case ZFS_PROP_MLSLABEL:
			{
	#ifdef illumos
				/*
				 * Verify the mlslabel string and convert to
				 * internal hex label string.
				 */

				m_label_t *new_sl;
				char *hex = NULL;	/* internal label string */

				/* Default value is already OK. */
				if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
					break;

				/* Verify the label can be converted to binary form */
				if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
				    (str_to_label(strval, &new_sl, MAC_LABEL,
				    L_NO_CORRECTION, NULL) == -1)) {
					goto badlabel;
				}

				/* Now translate to hex internal label string */
				if (label_to_str(new_sl, &hex, M_INTERNAL,
				    DEF_NAMES) != 0) {
					if (hex)
						free(hex);
					goto badlabel;
				}
				m_label_free(new_sl);

				/* If string is already in internal form, we're done. */
				if (strcmp(strval, hex) == 0) {
					free(hex);
					break;
				}

				/* Replace the label string with the internal form. */
				(void) nvlist_remove(ret, zfs_prop_to_name(prop),
				    DATA_TYPE_STRING);
				verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
				    hex) == 0);
				free(hex);

				break;

	badlabel:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid mlslabel '%s'"), strval);
				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
				m_label_free(new_sl);	/* OK if null */
	#else	/* !illumos */
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "mlslabel is not supported on FreeBSD"));
				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
	#endif	/* illumos */
				goto error;

			}

			case ZFS_PROP_MOUNTPOINT:
			{
				namecheck_err_t why;

				if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
				    strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
					break;

				if (mountpoint_namecheck(strval, &why)) {
					switch (why) {
					case NAME_ERR_LEADING_SLASH:
						zfs_error_aux(hdl,
						    dgettext(TEXT_DOMAIN,
						    "'%s' must be an absolute path, "
						    "'none', or 'legacy'"), propname);
						break;
					case NAME_ERR_TOOLONG:
						zfs_error_aux(hdl,
						    dgettext(TEXT_DOMAIN,
						    "component of '%s' is too long"),
						    propname);
						break;

					default:
						zfs_error_aux(hdl,
						    dgettext(TEXT_DOMAIN,
						    "(%d) not defined"),
						    why);
						break;
					}
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
			}

			/*FALLTHRU*/

			case ZFS_PROP_SHARESMB:
			case ZFS_PROP_SHARENFS:
				/*
				 * For the mountpoint and sharenfs or sharesmb
				 * properties, check if it can be set in a
				 * global/non-global zone based on
				 * the zoned property value:
				 *
				 *		global zone	    non-global zone
				 * --------------------------------------------------
				 * zoned=on	mountpoint (no)	    mountpoint (yes)
				 *		sharenfs (no)	    sharenfs (no)
				 *		sharesmb (no)	    sharesmb (no)
				 *
				 * zoned=off	mountpoint (yes)	N/A
				 *		sharenfs (yes)
				 *		sharesmb (yes)
				 */
				if (zoned) {
					if (getzoneid() == GLOBAL_ZONEID) {
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' cannot be set on "
						    "dataset in a non-global zone"),
						    propname);
						(void) zfs_error(hdl, EZFS_ZONED,
						    errbuf);
						goto error;
					} else if (prop == ZFS_PROP_SHARENFS ||
					    prop == ZFS_PROP_SHARESMB) {
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' cannot be set in "
						    "a non-global zone"), propname);
						(void) zfs_error(hdl, EZFS_ZONED,
						    errbuf);
						goto error;
					}
				} else if (getzoneid() != GLOBAL_ZONEID) {
					/*
					 * If zoned property is 'off', this must be in
					 * a global zone.  If not, something is wrong.
					 */
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' cannot be set while dataset "
					    "'zoned' property is set"), propname);
					(void) zfs_error(hdl, EZFS_ZONED, errbuf);
					goto error;
				}

				/*
				 * At this point, it is legitimate to set the
				 * property.  Now we want to make sure that the
				 * property value is valid if it is sharenfs.
				 */
				if ((prop == ZFS_PROP_SHARENFS ||
				    prop == ZFS_PROP_SHARESMB) &&
				    strcmp(strval, "on") != 0 &&
				    strcmp(strval, "off") != 0) {
					zfs_share_proto_t proto;

					if (prop == ZFS_PROP_SHARESMB)
						proto = PROTO_SMB;
					else
						proto = PROTO_NFS;

					/*
					 * Must be a valid sharing protocol
					 * option string so init the libshare
					 * in order to enable the parser and
					 * then parse the options.  We use the
					 * control API since we don't care about
					 * the current configuration and don't
					 * want the overhead of loading it
					 * until we actually do something.
					 */

					if (zfs_init_libshare(hdl,
					    SA_INIT_CONTROL_API) != SA_OK) {
						/*
						 * An error occurred so we can't do
						 * anything
						 */
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' cannot be set: problem "
						    "in share initialization"),
						    propname);
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}

					if (zfs_parse_options(strval, proto) != SA_OK) {
						/*
						 * There was an error in parsing so
						 * deal with it by issuing an error
						 * message and leaving after
						 * uninitializing the libshare
						 * interface.
						 */
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' cannot be set to invalid "
						    "options"), propname);
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						zfs_uninit_libshare(hdl);
						goto error;
					}
					zfs_uninit_libshare(hdl);
				}

				break;

			case ZFS_PROP_UTF8ONLY:
				chosen_utf = (int)intval;
				break;

			case ZFS_PROP_NORMALIZE:
				chosen_normal = (int)intval;
				break;

			default:
				break;
			}

			/*
			 * For changes to existing volumes, we have some additional
			 * checks to enforce.
			 */
			if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
				uint64_t volsize = zfs_prop_get_int(zhp,
				    ZFS_PROP_VOLSIZE);
				uint64_t blocksize = zfs_prop_get_int(zhp,
				    ZFS_PROP_VOLBLOCKSIZE);
				char buf[64];

				switch (prop) {
				case ZFS_PROP_RESERVATION:
				case ZFS_PROP_REFRESERVATION:
					if (intval > volsize) {
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' is greater than current "
						    "volume size"), propname);
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}
					break;

				case ZFS_PROP_VOLSIZE:
					if (intval % blocksize != 0) {
						zfs_nicenum(blocksize, buf,
						    sizeof (buf));
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' must be a multiple of "
						    "volume block size (%s)"),
						    propname, buf);
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}

					if (intval == 0) {
						zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
						    "'%s' cannot be zero"),
						    propname);
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}
					break;

				default:
					break;
				}
			}
		}

		/*
		 * If normalization was chosen, but no UTF8 choice was made,
		 * enforce rejection of non-UTF8 names.
		 *
		 * If normalization was chosen, but rejecting non-UTF8 names
		 * was explicitly not chosen, it is an error.
		 */
		if (chosen_normal > 0 && chosen_utf < 0) {
			if (nvlist_add_uint64(ret,
			    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) {
				(void) no_memory(hdl);
				goto error;
			}
		} else if (chosen_normal > 0 && chosen_utf == 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "'%s' must be set 'on' if normalization chosen"),
			    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
			goto error;
		}
		return (ret);

	error:
		nvlist_free(ret);
		return (NULL);
	}

	/*
	 * When the volume's current reservation equals the value computed from
	 * its current volsize, and 'nvl' does not already set the reservation
	 * property, add a reservation computed from the new volsize in 'nvl'.
	 *
	 * Returns 1 if a reservation was added to 'nvl', 0 if nothing needed to
	 * be added, and -1 on failure (the reservation property could not be
	 * determined, 'nvl' holds no volsize, or the addition failed).
	 */
	int
	zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
	{
		uint64_t old_volsize;
		uint64_t new_volsize;
		uint64_t old_reservation;
		uint64_t new_reservation;
		zfs_prop_t resv_prop;
		nvlist_t *props;

		/*
		 * If this is an existing volume, and someone is setting the volsize,
		 * make sure that it matches the reservation, or add it if necessary.
		 */
		old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
			return (-1);
		old_reservation = zfs_prop_get_int(zhp, resv_prop);

		/* The reservation computation is given the volume block size. */
		props = fnvlist_alloc();
		fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
		    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));

		if ((zvol_volsize_to_reservation(old_volsize, props) !=
		    old_reservation) || nvlist_exists(nvl,
		    zfs_prop_to_name(resv_prop))) {
			fnvlist_free(props);
			return (0);
		}
		if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
		    &new_volsize) != 0) {
			fnvlist_free(props);
			return (-1);
		}
		new_reservation = zvol_volsize_to_reservation(new_volsize, props);
		fnvlist_free(props);

		if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
		    new_reservation) != 0) {
			(void) no_memory(zhp->zfs_hdl);
			return (-1);
		}
		return (1);
	}

	/*
	 * Report a failure to set property 'prop' on handle 'hdl'.
	 *
	 * 'err' is the errno-style code of the failed operation and 'errbuf' is
	 * handed through unchanged to zfs_error()/zfs_standard_error().  A few
	 * (err, prop) combinations get a more specific auxiliary message; all
	 * others fall back to zfs_standard_error().
	 */
	void
	zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
	    char *errbuf)
	{
		switch (err) {

		case ENOSPC:
			/*
			 * For quotas and reservations, ENOSPC indicates
			 * something different; setting a quota or reservation
			 * doesn't use any disk space.
			 */
			switch (prop) {
			case ZFS_PROP_QUOTA:
			case ZFS_PROP_REFQUOTA:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "size is less than current used or "
				    "reserved space"));
				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
				break;

			case ZFS_PROP_RESERVATION:
			case ZFS_PROP_REFRESERVATION:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "size is greater than available space"));
				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
				break;

			default:
				(void) zfs_standard_error(hdl, err, errbuf);
				break;
			}
			break;

		case EBUSY:
			(void) zfs_standard_error(hdl, EBUSY, errbuf);
			break;

		case EROFS:
			(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
			break;

		case E2BIG:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "property value too long"));
			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
			break;

		case ENOTSUP:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "pool and or dataset must be upgraded to set this "
			    "property or value"));
			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
			break;

		case ERANGE:
		case EDOM:
			if (prop == ZFS_PROP_COMPRESSION ||
			    prop == ZFS_PROP_RECORDSIZE) {
				(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "property setting is not allowed on "
				    "bootable datasets"));
				(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
			} else if (prop == ZFS_PROP_CHECKSUM ||
			    prop == ZFS_PROP_DEDUP) {
				(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "property setting is not allowed on "
				    "root pools"));
				(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
			} else {
				(void) zfs_standard_error(hdl, err, errbuf);
			}
			break;

		case EINVAL:
			if (prop == ZPROP_INVAL) {
				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
			} else {
				(void) zfs_standard_error(hdl, err, errbuf);
			}
			break;

		case EOVERFLOW:
			/*
			 * This platform can't address a volume this big.
			 */
	#ifdef _ILP32
			if (prop == ZFS_PROP_VOLSIZE) {
				(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
				break;
			}
	#endif
			/* FALLTHROUGH */
		default:
			(void) zfs_standard_error(hdl, err, errbuf);
		}
	}

	/*
	 * Given a property name and value, set the property for the given dataset.
	 *
	 * The pair is wrapped in a one-element nvlist and handed to
	 * zfs_prop_set_list(), whose result is returned.  Returns -1 if the
	 * nvlist cannot be built.
	 */
	int
	zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
	{
		int ret = -1;
		char errbuf[1024];
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		nvlist_t *nvl = NULL;

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
		    zhp->zfs_name);

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
		    nvlist_add_string(nvl, propname, propval) != 0) {
			(void) no_memory(hdl);
			goto error;
		}

		ret = zfs_prop_set_list(zhp, nvl);

	error:
		/* nvl is still NULL here if nvlist_alloc() failed. */
		nvlist_free(nvl);
		return (ret);
	}



	/*
	 * Given an nvlist of property names and values, set the properties for the
	 * given dataset.
	 *
	 * The list is validated with zfs_valid_proplist(), a changelist is gathered
	 * and prefixed for each property that needs one, and the whole list is
	 * sent with a single ZFS_IOC_SET_PROP ioctl.  On success the changelists
	 * are postfixed and the handle's statistics are refreshed.
	 *
	 * Returns 0 on success and non-zero on failure.
	 */
	int
	zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
	{
		zfs_cmd_t zc = { 0 };
		int ret = -1;
		prop_changelist_t **cls = NULL;
		int cl_idx;
		char errbuf[1024];
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		nvlist_t *nvl;
		int nvl_len = 0;
		int added_resv = 0;

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
		    zhp->zfs_name);

		if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
		    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
		    errbuf)) == NULL)
			goto error;

		/*
		 * We have to check for any extra properties which need to be added
		 * before computing the length of the nvlist.
		 */
		for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
		    elem != NULL;
		    elem = nvlist_next_nvpair(nvl, elem)) {
			if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
			    (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
				goto error;
			}
		}
		/*
		 * Check how many properties we're setting and allocate an array to
		 * store changelist pointers for postfix().
		 */
		nvl_len = 0;
		for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
		    elem != NULL;
		    elem = nvlist_next_nvpair(nvl, elem))
			nvl_len++;
		if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
			goto error;

		cl_idx = 0;
		for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
		    elem != NULL;
		    elem = nvlist_next_nvpair(nvl, elem)) {

			zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));

			assert(cl_idx < nvl_len);
			/*
			 * We don't want to unmount & remount the dataset when changing
			 * its canmount property to 'on' or 'noauto'.  We only use
			 * the changelist logic to unmount when setting canmount=off.
			 */
			if (prop != ZFS_PROP_CANMOUNT ||
			    (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF &&
			    zfs_is_mounted(zhp, NULL))) {
				cls[cl_idx] = changelist_gather(zhp, prop, 0, 0);
				if (cls[cl_idx] == NULL)
					goto error;
			}

			if (prop == ZFS_PROP_MOUNTPOINT &&
			    changelist_haszonedchild(cls[cl_idx])) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "child dataset with inherited mountpoint is used "
				    "in a non-global zone"));
				ret = zfs_error(hdl, EZFS_ZONED, errbuf);
				goto error;
			}

			/* We don't support those properties on FreeBSD. */
			switch (prop) {
			case ZFS_PROP_DEVICES:
			case ZFS_PROP_ISCSIOPTIONS:
			case ZFS_PROP_XATTR:
			case ZFS_PROP_VSCAN:
			case ZFS_PROP_NBMAND:
			case ZFS_PROP_MLSLABEL:
				(void) snprintf(errbuf, sizeof (errbuf),
				    "property '%s' not supported on FreeBSD",
				    nvpair_name(elem));
				ret = zfs_error(hdl, EZFS_PERM, errbuf);
				goto error;
			default:
				break;
			}

			if (cls[cl_idx] != NULL &&
			    (ret = changelist_prefix(cls[cl_idx])) != 0)
				goto error;

			cl_idx++;
		}
		assert(cl_idx == nvl_len);

		/*
		 * Execute the corresponding ioctl() to set this list of properties.
		 */
		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 ||
		    (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0)
			goto error;

		ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);

		if (ret != 0) {
			/*
			 * Remember why the ioctl failed before calling anything
			 * else; the reporting and cleanup calls below are free to
			 * overwrite errno.
			 */
			int err = errno;

			/* Get the list of unset properties back and report them. */
			nvlist_t *errorprops = NULL;
			if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
				goto error;
			for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
			    elem != NULL;
			    elem = nvlist_next_nvpair(nvl, elem)) {
				zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
				zfs_setprop_error(hdl, prop, err, errbuf);
			}
			nvlist_free(errorprops);

			if (added_resv && err == ENOSPC) {
				/* clean up the volsize property we tried to set */
				uint64_t old_volsize = zfs_prop_get_int(zhp,
				    ZFS_PROP_VOLSIZE);
				nvlist_free(nvl);
				nvl = NULL;
				zcmd_free_nvlists(&zc);

				if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
					goto error;
				if (nvlist_add_uint64(nvl,
				    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
				    old_volsize) != 0)
					goto error;
				if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
					goto error;
				(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
			}
		} else {
			/* Remember the last postfix failure, but run them all. */
			for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
				if (cls[cl_idx] != NULL) {
					int clp_err = changelist_postfix(cls[cl_idx]);
					if (clp_err != 0)
						ret = clp_err;
				}
			}

			/*
			 * Refresh the statistics so the new property value
			 * is reflected.
			 */
			if (ret == 0)
				(void) get_stats(zhp);
		}

	error:
		nvlist_free(nvl);
		zcmd_free_nvlists(&zc);
		if (cls != NULL) {
			for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
				if (cls[cl_idx] != NULL)
					changelist_free(cls[cl_idx]);
			}
			free(cls);
		}
		return (ret);
	}

	/*
	 * Given a property, inherit the value from the parent dataset, or if received
	 * is TRUE, revert to the received value, if any.
	 *
	 * Returns 0 on success and non-zero on failure.
	 */
	int
	zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
	{
		zfs_cmd_t zc = { 0 };
		int ret;
		prop_changelist_t *cl;
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		char errbuf[1024];
		zfs_prop_t prop;

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot inherit %s for '%s'"), propname, zhp->zfs_name);

		zc.zc_cookie = received;
		if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
			/*
			 * For user properties, the amount of work we have to do is very
			 * small, so just do it here.
			 */
			if (!zfs_prop_user(propname)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid property"));
				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
			}

			(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
			(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));

			if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0)
				return (zfs_standard_error(hdl, errno, errbuf));

			return (0);
		}

		/*
		 * Verify that this property is inheritable.
		 */
		if (zfs_prop_readonly(prop))
			return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));

		if (!zfs_prop_inheritable(prop) && !received)
			return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));

		/*
		 * Check to see if the value applies to this type
		 */
		if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
			return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));

		/*
		 * Normalize the name, to get rid of shorthand abbreviations.
		 */
		propname = zfs_prop_to_name(prop);
		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
		(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));

		if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
		    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "dataset is used in a non-global zone"));
			return (zfs_error(hdl, EZFS_ZONED, errbuf));
		}

		/*
		 * Determine datasets which will be affected by this change, if any.
		 */
		if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
			return (-1);

		if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "child dataset with inherited mountpoint is used "
			    "in a non-global zone"));
			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
			goto error;
		}

		if ((ret = changelist_prefix(cl)) != 0)
			goto error;

		if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) {
			/*
			 * Leave through the common exit so the changelist gathered
			 * above is released; errno is consumed before anything
			 * else can overwrite it.
			 */
			ret = zfs_standard_error(hdl, errno, errbuf);
			goto error;
		}

		if ((ret = changelist_postfix(cl)) != 0)
			goto error;

		/*
		 * Refresh the statistics so the new property is reflected.
		 */
		(void) get_stats(zhp);

	error:
		changelist_free(cl);
		return (ret);
	}

	/*
	 * True DSL properties are stored in an nvlist.  The following two functions
	 * extract them appropriately.
	 *
	 * getprop_uint64() returns the numeric value of 'prop'.  '*source' is set
	 * to the property's ZPROP_SOURCE string when one is stored, stays NULL
	 * when the entry has no source string, and is set to "" when the property
	 * is absent and its default value is returned instead.
	 */
	static uint64_t
	getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
	{
		nvlist_t *nv;
		uint64_t value;

		*source = NULL;
		if (nvlist_lookup_nvlist(zhp->zfs_props,
		    zfs_prop_to_name(prop), &nv) == 0) {
			verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
			/* A missing source string is tolerated; *source stays NULL. */
			(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
		} else {
			verify(!zhp->zfs_props_table ||
			    zhp->zfs_props_table[prop] == B_TRUE);
			value = zfs_prop_default_numeric(prop);
			*source = "";
		}

		return (value);
	}

	/*
	 * String counterpart of getprop_uint64(): returns the string value of
	 * 'prop' from zhp->zfs_props, or the property's default string when it is
	 * not present.  '*source' receives the stored ZPROP_SOURCE string, NULL
	 * when none is stored, or "" when the default is returned.
	 */
	static const char *
	getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
	{
		nvlist_t *nv;
		const char *value;

		*source = NULL;
		if (nvlist_lookup_nvlist(zhp->zfs_props,
		    zfs_prop_to_name(prop), &nv) == 0) {
			value = fnvlist_lookup_string(nv, ZPROP_VALUE);
			/* A missing source string is tolerated; *source stays NULL. */
			(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
		} else {
			verify(!zhp->zfs_props_table ||
			    zhp->zfs_props_table[prop] == B_TRUE);
			value = zfs_prop_default_string(prop);
			*source = "";
		}

		return (value);
	}

	/*
	 * Tell whether the handle's active property list is currently its
	 * received-properties list.
	 */
	static boolean_t
	zfs_is_recvd_props_mode(zfs_handle_t *zhp)
	{
		nvlist_t *active = zhp->zfs_props;

		return (active == zhp->zfs_recvd_props);
	}

	/*
	 * Make the handle's received-properties list its active property list.
	 * The previously active list pointer is stashed in '*cookie' so that
	 * zfs_unset_recvd_props_mode() can restore it.
	 */
	static void
	zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
	{
		*cookie = (uint64_t)(uintptr_t)zhp->zfs_props;
		zhp->zfs_props = zhp->zfs_recvd_props;
	}

	/*
	 * Restore the active property list pointer that was saved in '*cookie'
	 * and clear the cookie.
	 */
	static void
	zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
	{
		zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie;
		*cookie = 0;
	}

	/*
	 * Internal function for getting a numeric property.  Both zfs_prop_get() and
	 * zfs_prop_get_int() are built using this interface.
	 *
	 * Certain properties can be overridden using 'mount -o'.  In this case, scan
	 * the contents of the /etc/mnttab entry, searching for the appropriate options.
	 * If they differ from the on-disk values, report the current values and mark
	 * the source "temporary".
	 *
	 * On success 0 is returned and the value is stored in '*val'.  '*source'
	 * is reset to NULL on entry and filled in only by the cases that look the
	 * property up in the property list.  'src' may be NULL.  A non-zero
	 * return means '*val' may not have been written.
	 */
	static int
	get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
	    char **source, uint64_t *val)
	{
		zfs_cmd_t zc = { 0 };
		nvlist_t *zplprops = NULL;
		struct mnttab mnt;
		char *mntopt_on = NULL;
		char *mntopt_off = NULL;
		boolean_t received = zfs_is_recvd_props_mode(zhp);

		*source = NULL;

		/* Pick the mount option pair that can override this property. */
		switch (prop) {
		case ZFS_PROP_ATIME:
			mntopt_on = MNTOPT_ATIME;
			mntopt_off = MNTOPT_NOATIME;
			break;

		case ZFS_PROP_DEVICES:
			mntopt_on = MNTOPT_DEVICES;
			mntopt_off = MNTOPT_NODEVICES;
			break;

		case ZFS_PROP_EXEC:
			mntopt_on = MNTOPT_EXEC;
			mntopt_off = MNTOPT_NOEXEC;
			break;

		case ZFS_PROP_READONLY:
			mntopt_on = MNTOPT_RO;
			mntopt_off = MNTOPT_RW;
			break;

		case ZFS_PROP_SETUID:
			mntopt_on = MNTOPT_SETUID;
			mntopt_off = MNTOPT_NOSETUID;
			break;

		case ZFS_PROP_XATTR:
			mntopt_on = MNTOPT_XATTR;
			mntopt_off = MNTOPT_NOXATTR;
			break;

		case ZFS_PROP_NBMAND:
			mntopt_on = MNTOPT_NBMAND;
			mntopt_off = MNTOPT_NONBMAND;
			break;

		default:
			break;
		}

		/*
		 * Because looking up the mount options is potentially expensive
		 * (iterating over all of /etc/mnttab), we defer its calculation until
		 * we're looking up a property which requires its presence.
		 */
		if (!zhp->zfs_mntcheck &&
		    (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
			libzfs_handle_t *hdl = zhp->zfs_hdl;
			struct mnttab entry;

			if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
				zhp->zfs_mntopts = zfs_strdup(hdl,
				    entry.mnt_mntopts);
				if (zhp->zfs_mntopts == NULL)
					return (-1);
			}

			zhp->zfs_mntcheck = B_TRUE;
		}

		if (zhp->zfs_mntopts == NULL)
			mnt.mnt_mntopts = "";
		else
			mnt.mnt_mntopts = zhp->zfs_mntopts;

		switch (prop) {
		case ZFS_PROP_ATIME:
		case ZFS_PROP_DEVICES:
		case ZFS_PROP_EXEC:
		case ZFS_PROP_READONLY:
		case ZFS_PROP_SETUID:
		case ZFS_PROP_XATTR:
		case ZFS_PROP_NBMAND:
			*val = getprop_uint64(zhp, prop, source);

			/* Mount options do not apply to received values. */
			if (received)
				break;

			if (hasmntopt(&mnt, mntopt_on) && !*val) {
				*val = B_TRUE;
				if (src)
					*src = ZPROP_SRC_TEMPORARY;
			} else if (hasmntopt(&mnt, mntopt_off) && *val) {
				*val = B_FALSE;
				if (src)
					*src = ZPROP_SRC_TEMPORARY;
			}
			break;

		case ZFS_PROP_CANMOUNT:
		case ZFS_PROP_VOLSIZE:
		case ZFS_PROP_QUOTA:
		case ZFS_PROP_REFQUOTA:
		case ZFS_PROP_RESERVATION:
		case ZFS_PROP_REFRESERVATION:
		case ZFS_PROP_FILESYSTEM_LIMIT:
		case ZFS_PROP_SNAPSHOT_LIMIT:
		case ZFS_PROP_FILESYSTEM_COUNT:
		case ZFS_PROP_SNAPSHOT_COUNT:
			*val = getprop_uint64(zhp, prop, source);

			if (*source == NULL) {
				/* not default, must be local */
				*source = zhp->zfs_name;
			}
			break;

		case ZFS_PROP_MOUNTED:
			*val = (zhp->zfs_mntopts != NULL);
			break;

		case ZFS_PROP_NUMCLONES:
			*val = zhp->zfs_dmustats.dds_num_clones;
			break;

		case ZFS_PROP_VERSION:
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
		case ZFS_PROP_CASE:
			/* These are fetched with a ZFS_IOC_OBJSET_ZPLPROPS ioctl. */
			if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) ||
			    zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
				return (-1);
			(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
			if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
				zcmd_free_nvlists(&zc);
				return (-1);
			}
			if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
			    nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
			    val) != 0) {
				/*
				 * zplprops was unpacked if only the lookup failed;
				 * release it (it is still NULL otherwise).
				 */
				nvlist_free(zplprops);
				zcmd_free_nvlists(&zc);
				return (-1);
			}
			nvlist_free(zplprops);
			zcmd_free_nvlists(&zc);
			break;

		case ZFS_PROP_INCONSISTENT:
			*val = zhp->zfs_dmustats.dds_inconsistent;
			break;

		default:
			switch (zfs_prop_get_type(prop)) {
			case PROP_TYPE_NUMBER:
			case PROP_TYPE_INDEX:
				*val = getprop_uint64(zhp, prop, source);
				/*
				 * If we tried to use a default value for a
				 * readonly property, it means that it was not
				 * present.  Note this only applies to "truly"
				 * readonly properties, not set-once properties
				 * like volblocksize.
				 */
				if (zfs_prop_readonly(prop) &&
				    !zfs_prop_setonce(prop) &&
				    *source != NULL && (*source)[0] == '\0') {
					*source = NULL;
					return (-1);
				}
				break;

			case PROP_TYPE_STRING:
			default:
				zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
				    "cannot get non-numeric property"));
				return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
				    dgettext(TEXT_DOMAIN, "internal error")));
			}
		}

		return (0);
	}

	/*
	 * Calculate the source type, given the raw source string.
	 *
	 * Nothing is done when the caller did not ask for source information
	 * ('statbuf' or 'srctype' is NULL) or when '*srctype' has already been
	 * marked ZPROP_SRC_TEMPORARY.  For an inherited value the name of the
	 * dataset it is inherited from is copied into 'statbuf'.
	 */
	static void
	get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
	    char *statbuf, size_t statlen)
	{
		/* srctype is checked too: it is dereferenced right after. */
		if (statbuf == NULL || srctype == NULL ||
		    *srctype == ZPROP_SRC_TEMPORARY)
			return;

		if (source == NULL) {
			*srctype = ZPROP_SRC_NONE;
		} else if (source[0] == '\0') {
			*srctype = ZPROP_SRC_DEFAULT;
		} else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
			*srctype = ZPROP_SRC_RECEIVED;
		} else {
			if (strcmp(source, zhp->zfs_name) == 0) {
				*srctype = ZPROP_SRC_LOCAL;
			} else {
				(void) strlcpy(statbuf, source, statlen);
				*srctype = ZPROP_SRC_INHERITED;
			}
		}
	}

	/*
	 * Fetch the received value of 'propname' into 'propbuf' (at most
	 * 'proplen' bytes).  The received-properties list is loaded on first use.
	 *
	 * Native properties are formatted by zfs_prop_get() while the handle is
	 * temporarily switched to its received-properties list; any other name
	 * is looked up in that list directly and its string value is copied.
	 *
	 * Returns 0 on success and -1 when the property has no received value or
	 * cannot be retrieved.
	 */
	int
	zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
	    size_t proplen, boolean_t literal)
	{
		zfs_prop_t prop;
		int err = 0;

		if (zhp->zfs_recvd_props == NULL &&
		    get_recvd_props_ioctl(zhp) != 0)
			return (-1);

		prop = zfs_name_to_prop(propname);

		if (prop != ZPROP_INVAL) {
			uint64_t cookie;
			if (!nvlist_exists(zhp->zfs_recvd_props, propname))
				return (-1);
			zfs_set_recvd_props_mode(zhp, &cookie);
			err = zfs_prop_get(zhp, prop, propbuf, proplen,
			    NULL, NULL, 0, literal);
			zfs_unset_recvd_props_mode(zhp, &cookie);
		} else {
			nvlist_t *propval;
			char *recvdval;
			if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
			    propname, &propval) != 0)
				return (-1);
			verify(nvlist_lookup_string(propval, ZPROP_VALUE,
			    &recvdval) == 0);
			(void) strlcpy(propbuf, recvdval, proplen);
		}

		return (err == 0 ? 0 : -1);
	}

	/*
	 * Write the names held in the nvlist returned by zfs_get_clones_nvl()
	 * into 'propbuf' as a comma-separated list, limited to 'proplen' bytes
	 * by strlcat().
	 *
	 * Returns 0 on success and -1 if the clone list is not available.
	 */
	static int
	get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
	{
		nvlist_t *value;
		nvpair_t *pair;

		value = zfs_get_clones_nvl(zhp);
		if (value == NULL)
			return (-1);

		propbuf[0] = '\0';
		for (pair = nvlist_next_nvpair(value, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(value, pair)) {
			/* Separator before every name except the first. */
			if (propbuf[0] != '\0')
				(void) strlcat(propbuf, ",", proplen);
			(void) strlcat(propbuf, nvpair_name(pair), proplen);
		}

		return (0);
	}

	/*
	 * State shared between zfs_get_clones_nvl() and get_clones_cb() while
	 * the datasets are searched for clones of one snapshot.
	 */
	struct get_clones_arg {
	/* matches still to find; get_clones_cb() decrements it on each match */
	uint64_t numclones;
	/* nvlist to which the name of every matching dataset is added */
	nvlist_t *value;
	/* name that each dataset's 'origin' property is compared against */
	const char *origin;
	/* scratch buffer that receives a dataset's 'origin' property */
	char buf[ZFS_MAX_DATASET_NAME_LEN];
	};

	/*
	 * Callback used while searching for clones: 'arg' is a
	 * struct get_clones_arg.  If the dataset's origin equals gca->origin its
	 * name is recorded in gca->value and gca->numclones is decremented.
	 * The dataset's children are then visited with this same callback.
	 * Once numclones has reached zero the dataset is closed without being
	 * examined.
	 *
	 * The handle 'zhp' is always closed before returning.  Always returns 0.
	 */
	int
	get_clones_cb(zfs_handle_t *zhp, void *arg)
	{
		struct get_clones_arg *gca = arg;

		if (gca->numclones == 0) {
			zfs_close(zhp);
			return (0);
		}

		if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf),
		    NULL, NULL, 0, B_TRUE) != 0)
			goto out;
		if (strcmp(gca->buf, gca->origin) == 0) {
			fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
			gca->numclones--;
		}

	out:
		(void) zfs_iter_children(zhp, get_clones_cb, gca);
		zfs_close(zhp);
		return (0);
	}

	/*
	 * Return the nvlist stored as the value of the 'clones' property of
	 * 'zhp'; the caller must not free it, since it is looked up from
	 * zhp->zfs_props.
	 *
	 * If the property is not present and 'zhp' is a snapshot, the list is
	 * built by walking the pool's datasets and is then cached in
	 * zhp->zfs_props.  Returns NULL if the list cannot be produced.
	 */
	nvlist_t *
	zfs_get_clones_nvl(zfs_handle_t *zhp)
	{
		nvlist_t *nv, *value;

		if (nvlist_lookup_nvlist(zhp->zfs_props,
		    zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) {
			struct get_clones_arg gca;

			/*
			 * if this is a snapshot, then the kernel wasn't able
			 * to get the clones.  Do it by slowly iterating.
			 */
			if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
				return (NULL);
			if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
				return (NULL);
			if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
				nvlist_free(nv);
				return (NULL);
			}

			gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
			gca.value = value;
			gca.origin = zhp->zfs_name;

			if (gca.numclones != 0) {
				zfs_handle_t *root;
				char pool[ZFS_MAX_DATASET_NAME_LEN];
				char *cp = pool;

				/* get the pool name */
				(void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
				(void) strsep(&cp, "/@");
				root = zfs_open(zhp->zfs_hdl, pool,
				    ZFS_TYPE_FILESYSTEM);

				/*
				 * get_clones_cb() dereferences and closes its
				 * handle, so it must not be given a failed open.
				 * In that case numclones stays non-zero and the
				 * check below reports the failure.
				 */
				if (root != NULL)
					(void) get_clones_cb(root, &gca);
			}

			if (gca.numclones != 0 ||
			    nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
			    nvlist_add_nvlist(zhp->zfs_props,
			    zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) {
				nvlist_free(nv);
				nvlist_free(value);
				return (NULL);
			}
			/*
			 * The local lists are released and the entry is looked up
			 * again so that 'nv' refers to what zfs_props now holds.
			 */
			nvlist_free(nv);
			nvlist_free(value);
			verify(0 == nvlist_lookup_nvlist(zhp->zfs_props,
			    zfs_prop_to_name(ZFS_PROP_CLONES), &nv));
		}

		verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0);

		return (value);
	}

	/*
	 * Accepts a property and value and checks that the value
	 * matches the one found by the channel program.  If they are
	 * not equal, print both of them.
	 *
	 * Does nothing unless libzfs_prop_debug is set on the library handle.
	 * 'intval' is compared for PROP_TYPE_NUMBER properties and 'strval' for
	 * all others.  All output goes to stderr.
	 */
	void
	zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval,
	    const char *strval)
	{
		if (!zhp->zfs_hdl->libzfs_prop_debug)
			return;
		int error;
		char *poolname = zhp->zpool_hdl->zpool_name;
		const char *program =
		    "args = ...\n"
		    "ds = args['dataset']\n"
		    "prop = args['property']\n"
		    "value, setpoint = zfs.get_prop(ds, prop)\n"
		    "return {value=value, setpoint=setpoint}\n";
		/* Starts out NULL so the common exit can always free it. */
		nvlist_t *outnvl = NULL;
		nvlist_t *retnvl;
		nvlist_t *argnvl = fnvlist_alloc();

		fnvlist_add_string(argnvl, "dataset", zhp->zfs_name);
		fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop));

		error = lzc_channel_program_nosync(poolname, program,
		    10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);

		if (error == 0) {
			retnvl = fnvlist_lookup_nvlist(outnvl, "return");
			if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) {
				int64_t ans;
				error = nvlist_lookup_int64(retnvl, "value", &ans);
				if (error != 0) {
					(void) fprintf(stderr, "zcp check error: %u\n",
					    error);
					/* Fall out through the cleanup below. */
					goto out;
				}
				if (ans != intval) {
					(void) fprintf(stderr,
					    "%s: zfs found %lld, but zcp found %lld\n",
					    zfs_prop_to_name(prop),
					    (longlong_t)intval, (longlong_t)ans);
				}
			} else {
				char *str_ans;
				error = nvlist_lookup_string(retnvl, "value", &str_ans);
				if (error != 0) {
					(void) fprintf(stderr, "zcp check error: %u\n",
					    error);
					/* Fall out through the cleanup below. */
					goto out;
				}
				if (strcmp(strval, str_ans) != 0) {
					(void) fprintf(stderr,
					    "%s: zfs found %s, but zcp found %s\n",
					    zfs_prop_to_name(prop),
					    strval, str_ans);
				}
			}
		} else {
			(void) fprintf(stderr,
			    "zcp check failed, channel program error: %u\n", error);
		}
	out:
		nvlist_free(argnvl);
		nvlist_free(outnvl);
	}

	/*
	 * Retrieve a property from the given object.  If 'literal' is specified, then
	 * numbers are left as exact values.  Otherwise, numbers are converted to a
	 * human-readable form.
	 *
	 * The formatted value is written to 'propbuf' (at most 'proplen' bytes).
	 * If 'src' is non-NULL it receives the source type, and for inherited
	 * values 'statbuf' receives the name of the dataset the value comes
	 * from; both may be NULL.
	 *
	 * Returns 0 on success, or -1 on error.
	 */
	int
	zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
	    zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
	{
		char *source = NULL;
		uint64_t val;
		const char *str;
		const char *strval;
		boolean_t received = zfs_is_recvd_props_mode(zhp);

		/*
		 * Check to see if this property applies to our object
		 */
		if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
			return (-1);

		if (received && zfs_prop_readonly(prop))
			return (-1);

		if (src)
			*src = ZPROP_SRC_NONE;

		switch (prop) {
		case ZFS_PROP_CREATION:
			/*
			 * 'creation' is a time_t stored in the statistics.  We convert
			 * this into a string unless 'literal' is specified.
			 */
			{
				val = getprop_uint64(zhp, prop, &source);
				time_t time = (time_t)val;
				struct tm t;

				/* Fall back to the raw number if formatting fails. */
				if (literal ||
				    localtime_r(&time, &t) == NULL ||
				    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
				    &t) == 0)
					(void) snprintf(propbuf, proplen, "%llu",
					    (u_longlong_t)val);
			}
			zcp_check(zhp, prop, val, NULL);
			break;

		case ZFS_PROP_MOUNTPOINT:
			/*
			 * Getting the precise mountpoint can be tricky.
			 *
			 *  - for 'none' or 'legacy', return those values.
			 *  - for inherited mountpoints, we want to take everything
			 *    after our ancestor and append it to the inherited value.
			 *
			 * If the pool has an alternate root, we want to prepend that
			 * root to any values we return.
			 */

			str = getprop_string(zhp, prop, &source);

			if (str[0] == '/') {
				char buf[MAXPATHLEN];
				char *root = buf;
				const char *relpath;

				/*
				 * If we inherit the mountpoint, even from a dataset
				 * with a received value, the source will be the path of
				 * the dataset we inherit from.  If source is
				 * ZPROP_SOURCE_VAL_RECVD, the received value is not
				 * inherited.
				 */
				if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
					relpath = "";
				} else {
					relpath = zhp->zfs_name + strlen(source);
					if (relpath[0] == '/')
						relpath++;
				}

				if ((zpool_get_prop(zhp->zpool_hdl,
				    ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
				    B_FALSE)) || (strcmp(root, "-") == 0))
					root[0] = '\0';
				/*
				 * Special case an alternate root of '/'.  This will
				 * avoid having multiple leading slashes in the
				 * mountpoint path.
				 */
				if (strcmp(root, "/") == 0)
					root++;

				/*
				 * If the mountpoint is '/' then skip over this
				 * if we are obtaining either an alternate root or
				 * an inherited mountpoint.
				 */
				if (str[1] == '\0' && (root[0] != '\0' ||
				    relpath[0] != '\0'))
					str++;

				if (relpath[0] == '\0')
					(void) snprintf(propbuf, proplen, "%s%s",
					    root, str);
				else
					(void) snprintf(propbuf, proplen, "%s%s%s%s",
					    root, str, relpath[0] == '@' ? "" : "/",
					    relpath);
			} else {
				/* 'legacy' or 'none' */
				(void) strlcpy(propbuf, str, proplen);
			}
			zcp_check(zhp, prop, 0, propbuf);
			break;

		case ZFS_PROP_ORIGIN:
			str = getprop_string(zhp, prop, &source);
			if (str == NULL)
				return (-1);
			(void) strlcpy(propbuf, str, proplen);
			zcp_check(zhp, prop, 0, str);
			break;

		case ZFS_PROP_CLONES:
			if (get_clones_string(zhp, propbuf, proplen) != 0)
				return (-1);
			break;

		case ZFS_PROP_QUOTA:
		case ZFS_PROP_REFQUOTA:
		case ZFS_PROP_RESERVATION:
		case ZFS_PROP_REFRESERVATION:

			if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
				return (-1);
			/*
			 * If quota or reservation is 0, we translate this into 'none'
			 * (unless literal is set), and indicate that it's the default
			 * value.  Otherwise, we print the number nicely and indicate
			 * that it's set locally.
			 */
			if (val == 0) {
				if (literal)
					(void) strlcpy(propbuf, "0", proplen);
				else
					(void) strlcpy(propbuf, "none", proplen);
			} else {
				if (literal)
					(void) snprintf(propbuf, proplen, "%llu",
					    (u_longlong_t)val);
				else
					zfs_nicenum(val, propbuf, proplen);
			}
			zcp_check(zhp, prop, val, NULL);
			break;

		case ZFS_PROP_FILESYSTEM_LIMIT:
		case ZFS_PROP_SNAPSHOT_LIMIT:
		case ZFS_PROP_FILESYSTEM_COUNT:
		case ZFS_PROP_SNAPSHOT_COUNT:

			if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
				return (-1);

			/*
			 * If limit is UINT64_MAX, we translate this into 'none' (unless
			 * literal is set), and indicate that it's the default value.
			 * Otherwise, we print the number nicely and indicate that it's
			 * set locally.
			 */
			if (literal) {
				(void) snprintf(propbuf, proplen, "%llu",
				    (u_longlong_t)val);
			} else if (val == UINT64_MAX) {
				(void) strlcpy(propbuf, "none", proplen);
			} else {
				zfs_nicenum(val, propbuf, proplen);
			}

			zcp_check(zhp, prop, val, NULL);
			break;

		case ZFS_PROP_REFRATIO:
		case ZFS_PROP_COMPRESSRATIO:
			if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
				return (-1);
			/* Printed as whole part, '.', two-digit remainder of 100, 'x'. */
			(void) snprintf(propbuf, proplen, "%llu.%02llux",
			    (u_longlong_t)(val / 100),
			    (u_longlong_t)(val % 100));
			zcp_check(zhp, prop, val, NULL);
			break;

		case ZFS_PROP_TYPE:
			switch (zhp->zfs_type) {
			case ZFS_TYPE_FILESYSTEM:
				str = "filesystem";
				break;
			case ZFS_TYPE_VOLUME:
				str = "volume";
				break;
			case ZFS_TYPE_SNAPSHOT:
				str = "snapshot";
				break;
			case ZFS_TYPE_BOOKMARK:
				str = "bookmark";
				break;
			default:
				abort();
			}
			(void) snprintf(propbuf, proplen, "%s", str);
			zcp_check(zhp, prop, 0, propbuf);
			break;

		case ZFS_PROP_MOUNTED:
			/*
			 * The 'mounted' property is a pseudo-property that describes
			 * whether the filesystem is currently mounted.  Even though
			 * it's a boolean value, the typical values of "on" and "off"
			 * don't make sense, so we translate to "yes" and "no".
			 */
			if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
			    src, &source, &val) != 0)
				return (-1);
			if (val)
				(void) strlcpy(propbuf, "yes", proplen);
			else
				(void) strlcpy(propbuf, "no", proplen);
			break;

		case ZFS_PROP_NAME:
			/*
			 * The 'name' property is a pseudo-property derived from the
			 * dataset name.  It is presented as a real property to simplify
			 * consumers.
			 */
			(void) strlcpy(propbuf, zhp->zfs_name, proplen);
			zcp_check(zhp, prop, 0, propbuf);
			break;

		case ZFS_PROP_MLSLABEL:
			{
	#ifdef illumos
				m_label_t *new_sl = NULL;
				char *ascii = NULL;	/* human readable label */

				(void) strlcpy(propbuf,
				    getprop_string(zhp, prop, &source), proplen);

				if (literal || (strcasecmp(propbuf,
				    ZFS_MLSLABEL_DEFAULT) == 0))
					break;

				/*
				 * Try to translate the internal hex string to
				 * human-readable output.  If there are any
				 * problems just use the hex string.
				 */

				if (str_to_label(propbuf, &new_sl, MAC_LABEL,
				    L_NO_CORRECTION, NULL) == -1) {
					m_label_free(new_sl);
					break;
				}

				if (label_to_str(new_sl, &ascii, M_LABEL,
				    DEF_NAMES) != 0) {
					if (ascii)
						free(ascii);
					m_label_free(new_sl);
					break;
				}
				m_label_free(new_sl);

				(void) strlcpy(propbuf, ascii, proplen);
				free(ascii);
	#else	/* !illumos */
				propbuf[0] = '\0';
	#endif	/* illumos */
			}
			break;

		case ZFS_PROP_GUID:
			/*
			 * GUIDs are stored as numbers, but they are identifiers.
			 * We don't want them to be pretty printed, because pretty
			 * printing mangles the ID into a truncated and useless value.
			 */
			if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
				return (-1);
			(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
			zcp_check(zhp, prop, val, NULL);
			break;

		default:
			switch (zfs_prop_get_type(prop)) {
			case PROP_TYPE_NUMBER:
				if (get_numeric_property(zhp, prop, src,
				    &source, &val) != 0) {
					return (-1);
				}

				if (literal) {
					(void) snprintf(propbuf, proplen, "%llu",
					    (u_longlong_t)val);
				} else {
					zfs_nicenum(val, propbuf, proplen);
				}
				zcp_check(zhp, prop, val, NULL);
				break;

			case PROP_TYPE_STRING:
				str = getprop_string(zhp, prop, &source);
				if (str == NULL)
					return (-1);

				(void) strlcpy(propbuf, str, proplen);
				zcp_check(zhp, prop, 0, str);
				break;

			case PROP_TYPE_INDEX:
				if (get_numeric_property(zhp, prop, src,
				    &source, &val) != 0)
					return (-1);
				if (zfs_prop_index_to_string(prop, val, &strval) != 0)
					return (-1);

				(void) strlcpy(propbuf, strval, proplen);
				zcp_check(zhp, prop, 0, strval);
				break;

			default:
				abort();
			}
		}

		get_source(zhp, src, source, statbuf, statlen);

		return (0);
	}

	/*
	 * Utility function to get the given numeric property.  Does no validation that
	 * the given property is the appropriate type; should only be used with
	 * hard-coded property types.
	 *
	 * The status of the lookup is deliberately ignored; if it fails before a
	 * value has been stored, 0 is returned.
	 */
	uint64_t
	zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
	{
		char *source;
		/* Defined result even when get_numeric_property() bails out early. */
		uint64_t val = 0;

		(void) get_numeric_property(zhp, prop, NULL, &source, &val);

		return (val);
	}

	/*
	 * Set a property from an integer: 'val' is formatted as an unsigned
	 * decimal string and passed to zfs_prop_set(), whose result is returned.
	 */
	int
	zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val)
	{
		char buf[64];

		/* The unsigned cast matches the %llu conversion. */
		(void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)val);
		return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf));
	}

	/*
	 * Similar to zfs_prop_get(), but returns the value as an integer.
	 *
	 * The value is stored in '*value'.  'src' and 'statbuf' receive source
	 * information as determined by get_source(); 'src' may be NULL.
	 * Returns 0 on success and non-zero on failure.
	 */
	int
	zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
	    zprop_source_t *src, char *statbuf, size_t statlen)
	{
		char *source;

		/*
		 * Check to see if this property applies to our object
		 */
		if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
			return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
			    dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
			    zfs_prop_to_name(prop)));
		}

		if (src)
			*src = ZPROP_SRC_NONE;

		if (get_numeric_property(zhp, prop, src, &source, value) != 0)
			return (-1);

		get_source(zhp, src, source, statbuf, statlen);

		return (0);
	}

	/*
	 * Maps a uid (isuser true) or gid (isuser false) to a SID domain string
	 * and RID using the idmap batch API.  The results are stored through
	 * 'domainp' and 'ridp'.  Returns 0 on success and EINVAL on any failure.
	 *
	 * Only implemented on illumos; on other platforms this code path is not
	 * expected to be reached and the function asserts.
	 */
	static int
	idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
	    char **domainp, idmap_rid_t *ridp)
	{
	#ifdef illumos
		idmap_get_handle_t *get_hdl = NULL;
		idmap_stat status;
		int err = EINVAL;

		if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
			goto out;

		if (isuser) {
			err = idmap_get_sidbyuid(get_hdl, id,
			    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
		} else {
			err = idmap_get_sidbygid(get_hdl, id,
			    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
		}
		/*
		 * The request is only queued above; it must also be executed by
		 * idmap_get_mappings() and report a successful per-request status.
		 */
		if (err == IDMAP_SUCCESS &&
		    idmap_get_mappings(get_hdl) == IDMAP_SUCCESS &&
		    status == IDMAP_SUCCESS)
			err = 0;
		else
			err = EINVAL;
	out:
		if (get_hdl)
			idmap_get_destroy(get_hdl);
		return (err);
	#else	/* !illumos */
		assert(!"invalid code path");
		return (EINVAL); /* silence compiler warning */
	#endif	/* illumos */
	}

	/*
	 * convert the propname into parameters needed by kernel
	 * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
	 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789
	 *
	 * On success returns 0 with the property type stored in *typep, the SID
	 * domain (or an empty string) in 'domain' and the uid/gid/RID in *ridp.
	 * Returns EINVAL for an unknown prefix or malformed number, ENOENT when a
	 * name cannot be resolved, and ENOMEM if the idmap handle cannot be
	 * created.  Note that *typep is not written when the prefix is unknown.
	 */
	static int
	userquota_propname_decode(const char *propname, boolean_t zoned,
	    zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
	{
		zfs_userquota_prop_t type;
		char *cp, *end;
		char *numericsid = NULL;
		boolean_t isuser;

		domain[0] = '\0';
		*ridp = 0;
		/* Figure out the property type ({user|group}{quota|space}) */
		for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
			if (strncmp(propname, zfs_userquota_prop_prefixes[type],
			    strlen(zfs_userquota_prop_prefixes[type])) == 0)
				break;
		}
		if (type == ZFS_NUM_USERQUOTA_PROPS)
			return (EINVAL);
		*typep = type;

		isuser = (type == ZFS_PROP_USERQUOTA ||
		    type == ZFS_PROP_USERUSED);

		/* Everything after the first '@' identifies the user or group. */
		cp = strchr(propname, '@') + 1;

		if (strchr(cp, '@')) {
	#ifdef illumos
			/*
			 * It's a SID name (eg "user@domain") that needs to be
			 * turned into S-1-domainID-RID.
			 */
			int flag = 0;
			idmap_stat stat, map_stat;
			uid_t pid;
			idmap_rid_t rid;
			idmap_get_handle_t *gh = NULL;

			stat = idmap_get_create(&gh);
			if (stat != IDMAP_SUCCESS) {
				idmap_get_destroy(gh);
				return (ENOMEM);
			}
			/* From here on every exit must release the idmap handle. */
			if (zoned && getzoneid() == GLOBAL_ZONEID) {
				idmap_get_destroy(gh);
				return (ENOENT);
			}
			if (isuser) {
				stat = idmap_getuidbywinname(cp, NULL, flag, &pid);
				if (stat < 0) {
					idmap_get_destroy(gh);
					return (ENOENT);
				}
				stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid,
				    &rid, &map_stat);
			} else {
				stat = idmap_getgidbywinname(cp, NULL, flag, &pid);
				if (stat < 0) {
					idmap_get_destroy(gh);
					return (ENOENT);
				}
				stat = idmap_get_sidbygid(gh, pid, flag, &numericsid,
				    &rid, &map_stat);
			}
			if (stat < 0) {
				idmap_get_destroy(gh);
				return (ENOENT);
			}
			stat = idmap_get_mappings(gh);
			idmap_get_destroy(gh);

			if (stat < 0) {
				return (ENOENT);
			}
			if (numericsid == NULL)
				return (ENOENT);
			cp = numericsid;
			*ridp = rid;
			/* will be further decoded below */
	#else	/* !illumos */
			return (ENOENT);
	#endif	/* illumos */
		}

		if (strncmp(cp, "S-1-", 4) == 0) {
			/* It's a numeric SID (eg "S-1-234-567-89") */
			(void) strlcpy(domain, cp, domainlen);
			errno = 0;
			if (*ridp == 0) {
				/* Split the trailing "-RID" off of the domain. */
				cp = strrchr(domain, '-');
				*cp = '\0';
				cp++;
				*ridp = strtoull(cp, &end, 10);
			} else {
				end = "";
			}
			if (numericsid) {
				free(numericsid);
				numericsid = NULL;
			}
			if (errno != 0 || *end != '\0')
				return (EINVAL);
		} else if (!isdigit((unsigned char)*cp)) {
			/*
			 * It's a user/group name (eg "user") that needs to be
			 * turned into a uid/gid
			 */
			if (zoned && getzoneid() == GLOBAL_ZONEID)
				return (ENOENT);
			if (isuser) {
				struct passwd *pw;
				pw = getpwnam(cp);
				if (pw == NULL)
					return (ENOENT);
				*ridp = pw->pw_uid;
			} else {
				struct group *gr;
				gr = getgrnam(cp);
				if (gr == NULL)
					return (ENOENT);
				*ridp = gr->gr_gid;
			}
		} else {
			/* It's a user/group ID (eg "12345"). */
			uid_t id = strtoul(cp, &end, 10);
			idmap_rid_t rid;
			char *mapdomain;

			if (*end != '\0')
				return (EINVAL);
			if (id > MAXUID) {
				/* It's an ephemeral ID. */
				if (idmap_id_to_numeric_domain_rid(id, isuser,
				    &mapdomain, &rid) != 0)
					return (ENOENT);
				(void) strlcpy(domain, mapdomain, domainlen);
				*ridp = rid;
			} else {
				*ridp = id;
			}
		}

		ASSERT3P(numericsid, ==, NULL);
		return (0);
	}

	/*
	 * Shared implementation for the userquota/userused style lookups: decodes
	 * 'propname' with userquota_propname_decode(), issues the
	 * ZFS_IOC_USERSPACE_ONE ioctl for this dataset and stores the resulting
	 * value in *propvalue.  The decoded property type is returned in *typep.
	 *
	 * Returns 0 on success, the decode error, or the ioctl() result.
	 */
	static int
	zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
	    uint64_t *propvalue, zfs_userquota_prop_t *typep)
	{
		int err;
		zfs_cmd_t zc = { 0 };

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		err = userquota_propname_decode(propname,
		    zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
		    typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid);
		if (err)
			return (err);

		/*
		 * Only read *typep once decoding has succeeded: the decoder can
		 * fail before storing a type, leaving the caller's variable
		 * uninitialized.
		 */
		zc.zc_objset_type = *typep;

		err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc);
		if (err)
			return (err);

		*propvalue = zc.zc_cookie;
		return (0);
	}

	/*
	 * Looks up the user/group quota or usage property named by 'propname' and
	 * stores its numeric value in *propvalue.  The decoded property type is
	 * not needed by this variant and is discarded.  Returns the result of
	 * zfs_prop_get_userquota_common().
	 */
	int
	zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
	    uint64_t *propvalue)
	{
		zfs_userquota_prop_t type;

		return (zfs_prop_get_userquota_common(zhp, propname, propvalue,
		    &type));
	}

	/*
	 * Looks up the user/group quota or usage property named by 'propname' and
	 * formats it into 'propbuf' (at most 'proplen' bytes):
	 *   - literal: the exact value as a decimal number;
	 *   - a quota property whose value is 0: the string "none";
	 *   - otherwise: the value as formatted by zfs_nicenum().
	 * Returns 0 on success or the error from the lookup.
	 */
	int
	zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
	    char *propbuf, int proplen, boolean_t literal)
	{
		int err;
		uint64_t propvalue;
		zfs_userquota_prop_t type;

		err = zfs_prop_get_userquota_common(zhp, propname, &propvalue,
		    &type);

		if (err)
			return (err);

		if (literal) {
			/* "%llu" expects an unsigned long long; cast accordingly. */
			(void) snprintf(propbuf, proplen, "%llu",
			    (u_longlong_t)propvalue);
		} else if (propvalue == 0 &&
		    (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) {
			(void) strlcpy(propbuf, "none", proplen);
		} else {
			zfs_nicenum(propvalue, propbuf, proplen);
		}
		return (0);
	}

	/*
	 * Looks up a "...@<snapshot>" style property via the ZFS_IOC_SPACE_WRITTEN
	 * ioctl and stores the value in *propvalue.
	 *
	 * The text after the first '@' in 'propname' names the snapshot.  If it
	 * contains another '@' it is used as a full snapshot name; otherwise it is
	 * a short name and is appended to this handle's filesystem name.
	 * 'propname' must contain an '@'.
	 *
	 * Returns 0 on success or the ioctl() result on failure.
	 */
	int
	zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
	    uint64_t *propvalue)
	{
		int err;
		zfs_cmd_t zc = { 0 };
		const char *snapname;

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		snapname = strchr(propname, '@') + 1;
		if (strchr(snapname, '@')) {
			(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
		} else {
			/* snapname is the short name, append it to zhp's fsname */
			char *cp;

			(void) strlcpy(zc.zc_value, zhp->zfs_name,
			    sizeof (zc.zc_value));
			/* Drop any snapshot component of the handle's own name. */
			cp = strchr(zc.zc_value, '@');
			if (cp != NULL)
				*cp = '\0';
			(void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value));
			(void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value));
		}

		err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc);
		if (err)
			return (err);

		*propvalue = zc.zc_cookie;
		return (0);
	}

	/*
	 * String flavour of zfs_prop_get_written_int(): formats the value into
	 * 'propbuf' (at most 'proplen' bytes), either as an exact decimal number
	 * when 'literal' is set or via zfs_nicenum() otherwise.
	 * Returns 0 on success or the error from the lookup.
	 */
	int
	zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
	    char *propbuf, int proplen, boolean_t literal)
	{
		int err;
		uint64_t propvalue;

		err = zfs_prop_get_written_int(zhp, propname, &propvalue);

		if (err)
			return (err);

		if (literal) {
			/* "%llu" expects an unsigned long long; cast accordingly. */
			(void) snprintf(propbuf, proplen, "%llu",
			    (u_longlong_t)propvalue);
		} else {
			zfs_nicenum(propvalue, propbuf, proplen);
		}
		return (0);
	}

	/*
	 * Accessor: yields the dataset name stored in the handle.
	 */
	const char *
	zfs_get_name(const zfs_handle_t *zhp)
	{
		const char *dataset_name = zhp->zfs_name;

		return (dataset_name);
	}

	/*
	 * Accessor: yields the name of the pool that the handle's dataset
	 * belongs to, as recorded in the associated pool handle.
	 */
	const char *
	zfs_get_pool_name(const zfs_handle_t *zhp)
	{
		const char *pool_name = zhp->zpool_hdl->zpool_name;

		return (pool_name);
	}

	/*
	 * Accessor: yields the dataset type recorded in the handle.
	 */
	zfs_type_t
	zfs_get_type(const zfs_handle_t *zhp)
	{
		zfs_type_t dataset_type = zhp->zfs_type;

		return (dataset_type);
	}

	/*
	 * Is one dataset name a child dataset of another?
	 *
	 * Needs to handle these cases:
	 * Dataset 1	"a/foo"		"a/foo"		"a/foo"		"a/foo"
	 * Dataset 2	"a/fo"		"a/foobar"	"a/bar/baz"	"a/foo/bar"
	 * Descendant?	No.		No.		No.		Yes.
	 *
	 * Returns B_TRUE only if ds2 starts with ds1 immediately followed by '/'.
	 */
	static boolean_t
	is_descendant(const char *ds1, const char *ds2)
	{
		size_t d1len = strlen(ds1);

		/* ds2 can't be a descendant if it's smaller */
		if (strlen(ds2) < d1len)
			return (B_FALSE);

		/* otherwise, compare strings and verify that there's a '/' char */
		return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0));
	}

	/*
	 * Given a complete name, return just the portion that refers to the parent.
	 * Will return -1 if there is no parent (path is just the name of the
	 * pool).
	 *
	 * 'path' is copied (truncating to 'buflen') into 'buf', which is then cut
	 * at its last '/'.  On the -1 return 'buf' still holds the copied path.
	 */
	static int
	parent_name(const char *path, char *buf, size_t buflen)
	{
		char *slashp;

		(void) strlcpy(buf, path, buflen);

		if ((slashp = strrchr(buf, '/')) == NULL)
			return (-1);
		*slashp = '\0';

		return (0);
	}

	/*
	 * If accept_ancestor is false, then check to make sure that the given path has
	 * a parent, and that it exists.  If accept_ancestor is true, then find the
	 * closest existing ancestor for the given path.  In prefixlen return the
	 * length of already existing prefix of the given path.  We also fetch the
	 * 'zoned' property, which is used to validate property settings when creating
	 * new datasets.
	 *
	 * 'zoned' and 'prefixlen' may each be NULL if the caller does not need
	 * them.  Returns 0 on success; on failure an error is recorded on 'hdl'
	 * and a non-zero value is returned.
	 */
	static int
	check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
	    boolean_t accept_ancestor, int *prefixlen)
	{
		zfs_cmd_t zc = { 0 };
		char parent[ZFS_MAX_DATASET_NAME_LEN];
		char *slash;
		zfs_handle_t *zhp;
		char errbuf[1024];
		uint64_t is_zoned;

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);

		/* get parent, and check to see if this is just a pool */
		if (parent_name(path, parent, sizeof (parent)) != 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "missing dataset name"));
			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
		}

		/* check to see if the pool exists */
		if ((slash = strchr(parent, '/')) == NULL)
			slash = parent + strlen(parent);
		/* Copy only the pool component; terminate it explicitly. */
		(void) strncpy(zc.zc_name, parent, slash - parent);
		zc.zc_name[slash - parent] = '\0';
		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
		    errno == ENOENT) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "no such pool '%s'"), zc.zc_name);
			return (zfs_error(hdl, EZFS_NOENT, errbuf));
		}

		/* check to see if the parent dataset exists */
		while ((zhp = make_dataset_handle(hdl, parent)) == NULL) {
			if (errno == ENOENT && accept_ancestor) {
				/*
				 * Go deeper to find an ancestor, give up on top level.
				 */
				if (parent_name(parent, parent, sizeof (parent)) != 0) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "no such pool '%s'"), zc.zc_name);
					return (zfs_error(hdl, EZFS_NOENT, errbuf));
				}
			} else if (errno == ENOENT) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "parent does not exist"));
				return (zfs_error(hdl, EZFS_NOENT, errbuf));
			} else
				return (zfs_standard_error(hdl, errno, errbuf));
		}

		is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
		if (zoned != NULL)
			*zoned = is_zoned;

		/* we are in a non-global zone, but parent is in the global zone */
		if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
			(void) zfs_standard_error(hdl, EPERM, errbuf);
			zfs_close(zhp);
			return (-1);
		}

		/* make sure parent is a filesystem */
		if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "parent is not a filesystem"));
			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
			zfs_close(zhp);
			return (-1);
		}

		zfs_close(zhp);
		if (prefixlen != NULL)
			*prefixlen = strlen(parent);
		return (0);
	}

	/*
	 * Finds whether the dataset of the given type(s) exists.
	 *
	 * Returns B_TRUE only if 'path' is a valid name for 'types', a handle can
	 * be created for it, and the dataset's type is one of those in 'types'.
	 */
	boolean_t
	zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
	{
		zfs_handle_t *zhp;

		if (!zfs_validate_name(hdl, path, types, B_FALSE))
			return (B_FALSE);

		/*
		 * Try to get stats for the dataset, which will tell us if it exists.
		 */
		if ((zhp = make_dataset_handle(hdl, path)) != NULL) {
			/* Remember the type before the handle is released. */
			int ds_type = zhp->zfs_type;

			zfs_close(zhp);
			if (types & ds_type)
				return (B_TRUE);
		}
		return (B_FALSE);
	}

	/*
	 * Given a path to 'target', create all the ancestors between
	 * the prefixlen portion of the path, and the target itself.
	 * Fail if the initial prefixlen-ancestor does not already exist.
	 *
	 * 'target' is modified in place while walking its components.  On success
	 * it is restored to its original contents; on failure it is left truncated
	 * at the ancestor that could not be processed (the name used in the error
	 * message).  Returns 0 on success and -1 on failure.
	 */
	int
	create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
	{
		zfs_handle_t *h;
		char *cp;
		const char *opname;

		/* make sure prefix exists */
		cp = target + prefixlen;
		if (*cp != '/') {
			assert(strchr(cp, '/') == NULL);
			h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
		} else {
			*cp = '\0';
			h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
			*cp = '/';
		}
		if (h == NULL)
			return (-1);
		zfs_close(h);

		/*
		 * Attempt to create, mount, and share any ancestor filesystems,
		 * up to the prefixlen-long one.
		 */
		for (cp = target + prefixlen + 1;
		    (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {

			*cp = '\0';

			h = make_dataset_handle(hdl, target);
			if (h) {
				/* it already exists, nothing to do here */
				zfs_close(h);
				continue;
			}

			if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
			    NULL) != 0) {
				opname = dgettext(TEXT_DOMAIN, "create");
				goto ancestorerr;
			}

			h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
			if (h == NULL) {
				opname = dgettext(TEXT_DOMAIN, "open");
				goto ancestorerr;
			}

			if (zfs_mount(h, NULL, 0) != 0) {
				opname = dgettext(TEXT_DOMAIN, "mount");
				/* Don't leak the handle opened just above. */
				zfs_close(h);
				goto ancestorerr;
			}

			if (zfs_share(h) != 0) {
				opname = dgettext(TEXT_DOMAIN, "share");
				/* Don't leak the handle opened just above. */
				zfs_close(h);
				goto ancestorerr;
			}

			zfs_close(h);
		}

		return (0);

	ancestorerr:
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "failed to %s ancestor '%s'"), opname, target);
		return (-1);
	}

	/*
	 * Creates non-existing ancestors of the given path.
	 *
	 * Finds the closest existing ancestor with check_parents() and then
	 * creates the missing ones with create_parents(), working on a private
	 * copy of 'path' because create_parents() modifies its argument.
	 * Returns 0 on success and -1 on any failure (including out of memory).
	 */
	int
	zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
	{
		int prefix;
		char *path_copy;
		int rc;

		if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
			return (-1);

		if ((path_copy = strdup(path)) == NULL)
			return (-1);

		rc = create_parents(hdl, path_copy, prefix);
		free(path_copy);

		return (rc != 0 ? -1 : 0);
	}

	/*
	 * Create a new filesystem or volume.
	 *
	 * 'path' is the full name of the new dataset, 'type' selects filesystem or
	 * volume, and 'props' (may be NULL) holds the properties to set.  'props'
	 * is validated with zfs_valid_proplist(); the caller's list is not freed
	 * here, only the validated list is.
	 *
	 * Returns 0 on success; on failure an error is recorded on 'hdl' and a
	 * non-zero value is returned.
	 */
	int
	zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
	    nvlist_t *props)
	{
		int ret;
		uint64_t size = 0;
		uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
		char errbuf[1024];
		uint64_t zoned;
		enum lzc_dataset_type ost;
		zpool_handle_t *zpool_handle;

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot create '%s'"), path);

		/* validate the path, taking care to note the extended error message */
		if (!zfs_validate_name(hdl, path, type, B_TRUE))
			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));

		/* validate parents exist */
		if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0)
			return (-1);

		/*
		 * The failure modes when creating a dataset of a different type over
		 * one that already exists is a little strange.  In particular, if you
		 * try to create a dataset on top of an existing dataset, the ioctl()
		 * will return ENOENT, not EEXIST.  To prevent this from happening, we
		 * first try to see if the dataset exists.
		 */
		if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "dataset already exists"));
			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
		}

		if (type == ZFS_TYPE_VOLUME)
			ost = LZC_DATSET_TYPE_ZVOL;
		else
			ost = LZC_DATSET_TYPE_ZFS;

		/* open zpool handle for prop validation */
		char pool_path[ZFS_MAX_DATASET_NAME_LEN];
		(void) strlcpy(pool_path, path, sizeof (pool_path));

		/* truncate pool_path at first slash */
		char *p = strchr(pool_path, '/');
		if (p != NULL)
			*p = '\0';

		if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL)
			return (-1);

		if (props && (props = zfs_valid_proplist(hdl, type, props,
		    zoned, NULL, zpool_handle, errbuf)) == NULL) {
			zpool_close(zpool_handle);
			return (-1);
		}
		zpool_close(zpool_handle);

		if (type == ZFS_TYPE_VOLUME) {
			/*
			 * If we are creating a volume, the size and block size must
			 * satisfy a few restraints.  First, the blocksize must be a
			 * valid block size between SPA_{MIN,MAX}BLOCKSIZE.  Second, the
			 * volsize must be a multiple of the block size, and cannot be
			 * zero.
			 */
			if (props == NULL || nvlist_lookup_uint64(props,
			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
				nvlist_free(props);
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "missing volume size"));
				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
			}

			if ((ret = nvlist_lookup_uint64(props,
			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
			    &blocksize)) != 0) {
				if (ret == ENOENT) {
					blocksize = zfs_prop_default_numeric(
					    ZFS_PROP_VOLBLOCKSIZE);
				} else {
					nvlist_free(props);
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "missing volume block size"));
					return (zfs_error(hdl, EZFS_BADPROP, errbuf));
				}
			}

			if (size == 0) {
				nvlist_free(props);
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "volume size cannot be zero"));
				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
			}

			if (size % blocksize != 0) {
				nvlist_free(props);
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "volume size must be a multiple of volume block "
				    "size"));
				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
			}
		}

		/* create the dataset */
		ret = lzc_create(path, ost, props);
		nvlist_free(props);

		/* check for failure */
		if (ret != 0) {
			char parent[ZFS_MAX_DATASET_NAME_LEN];
			(void) parent_name(path, parent, sizeof (parent));

			switch (errno) {
			case ENOENT:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "no such parent '%s'"), parent);
				return (zfs_error(hdl, EZFS_NOENT, errbuf));

			case EINVAL:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "parent '%s' is not a filesystem"), parent);
				return (zfs_error(hdl, EZFS_BADTYPE, errbuf));

			case ENOTSUP:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded to set this "
				    "property or value"));
				return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
			case ERANGE:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid property value(s) specified"));
				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	#ifdef _ILP32
			case EOVERFLOW:
				/*
				 * This platform can't address a volume this big.
				 */
				if (type == ZFS_TYPE_VOLUME)
					return (zfs_error(hdl, EZFS_VOLTOOBIG,
					    errbuf));
	#endif
				/* FALLTHROUGH */
			default:
				return (zfs_standard_error(hdl, errno, errbuf));
			}
		}

		return (0);
	}

	/*
	 * Destroys the given dataset.  The caller must make sure that the filesystem
	 * isn't mounted, and that there are no active dependents.  If the file system
	 * does not exist this function does nothing.
	 *
	 * Bookmarks are destroyed through lzc_destroy_bookmarks(); everything else
	 * goes through the ZFS_IOC_DESTROY ioctl, with 'defer' passed along as the
	 * deferred-destroy flag.  Returns 0 on success or the result of
	 * zfs_standard_error_fmt() on failure.
	 */
	int
	zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
	{
		zfs_cmd_t zc = { 0 };

		if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
			nvlist_t *nv = fnvlist_alloc();
			fnvlist_add_boolean(nv, zhp->zfs_name);
			int error = lzc_destroy_bookmarks(nv, NULL);
			fnvlist_free(nv);
			if (error != 0) {
				/*
				 * Report the code returned by the call itself; errno
				 * may have been disturbed by the cleanup above.
				 */
				return (zfs_standard_error_fmt(zhp->zfs_hdl, error,
				    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
				    zhp->zfs_name));
			}
			return (0);
		}

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		if (ZFS_IS_VOLUME(zhp)) {
			zc.zc_objset_type = DMU_OST_ZVOL;
		} else {
			zc.zc_objset_type = DMU_OST_ZFS;
		}

		zc.zc_defer_destroy = defer;
		/* A dataset that is already gone (ENOENT) is not an error. */
		if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 &&
		    errno != ENOENT) {
			return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
			    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
			    zhp->zfs_name));
		}

		remove_mountpoint(zhp);

		return (0);
	}

	/*
	 * State shared between zfs_destroy_snaps() and its zfs_check_snap_cb()
	 * callback while collecting the snapshots to destroy.
	 */
	struct destroydata {
	/* collects one boolean entry per existing "<dataset>@<snapname>" */
	nvlist_t *nvl;
	/* short snapshot name (the part after the '@') being looked for */
	const char *snapname;
	};

	/*
	 * Iteration callback for zfs_destroy_snaps(): if "<zhp's name>@<snapname>"
	 * exists it is added to the destroydata nvlist, then the callback recurses
	 * into the child filesystems of 'zhp'.  The callback owns 'zhp' and closes
	 * it before returning.  Returns the result of the child iteration.
	 */
	static int
	zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
	{
		struct destroydata *dd = arg;
		char name[ZFS_MAX_DATASET_NAME_LEN];
		int rv = 0;

		(void) snprintf(name, sizeof (name),
		    "%s@%s", zhp->zfs_name, dd->snapname);

		if (lzc_exists(name))
			verify(nvlist_add_boolean(dd->nvl, name) == 0);

		rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd);
		zfs_close(zhp);
		return (rv);
	}

	/*
	 * Destroys all snapshots with the given name in zhp & descendants.
	 *
	 * The matching snapshots are first collected with zfs_check_snap_cb(),
	 * which is given a duplicate of 'zhp' because the callback closes the
	 * handle it receives.  If none are found an ENOENT error is reported;
	 * otherwise the list is handed to zfs_destroy_snaps_nvl().
	 */
	int
	zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
	{
		int ret;
		struct destroydata dd = { 0 };

		dd.snapname = snapname;
		verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0);
		(void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);

		if (nvlist_empty(dd.nvl)) {
			ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
			    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
			    zhp->zfs_name, snapname);
		} else {
			ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
		}
		nvlist_free(dd.nvl);
		return (ret);
	}

	/*
	 * Destroys all the snapshots named in the nvlist.
	 *
	 * Calls lzc_destroy_snaps() and returns 0 if it succeeds.  On failure an
	 * error is reported for every snapshot in the returned error list, using
	 * that snapshot's own error code; if the error list is empty a single
	 * generic error is reported from the overall return code.  The value
	 * returned is that of the last error reported.
	 */
	int
	zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
	{
		int ret;
		nvlist_t *errlist = NULL;

		ret = lzc_destroy_snaps(snaps, defer, &errlist);

		if (ret == 0) {
			nvlist_free(errlist);
			return (0);
		}

		if (nvlist_empty(errlist)) {
			char errbuf[1024];
			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));

			ret = zfs_standard_error(hdl, ret, errbuf);
		}
		for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
			char errbuf[1024];
			int32_t snap_err = fnvpair_value_int32(pair);

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
			    nvpair_name(pair));

			switch (snap_err) {
			case EEXIST:
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "snapshot is cloned"));
				ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
				break;
			default:
				/*
				 * Use the per-snapshot error code from the list;
				 * errno is not tied to this particular snapshot.
				 */
				ret = zfs_standard_error(hdl, snap_err, errbuf);
				break;
			}
		}

		nvlist_free(errlist);
		return (ret);
	}

	/*
	 * Clones the given dataset.  The target must be of the same type as the source.
	 *
	 * 'zhp' must be a snapshot handle (asserted).  'target' is the name of the
	 * clone to create and 'props' (may be NULL) are properties to set on it;
	 * they are validated with zfs_valid_proplist() first.  Returns 0 on
	 * success; on failure an error is recorded on the library handle and a
	 * non-zero value is returned.
	 */
	int
	zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
	{
		char parent[ZFS_MAX_DATASET_NAME_LEN];
		int ret;
		char errbuf[1024];
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		uint64_t zoned;

		assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot create '%s'"), target);

		/* validate the target/clone name */
		if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));

		/* validate parents exist */
		if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0)
			return (-1);

		(void) parent_name(target, parent, sizeof (parent));

		/* do the clone */

		if (props) {
			zfs_type_t type;
			if (ZFS_IS_VOLUME(zhp)) {
				type = ZFS_TYPE_VOLUME;
			} else {
				type = ZFS_TYPE_FILESYSTEM;
			}
			if ((props = zfs_valid_proplist(hdl, type, props, zoned,
			    zhp, zhp->zpool_hdl, errbuf)) == NULL)
				return (-1);
		}

		ret = lzc_clone(target, zhp->zfs_name, props);
		nvlist_free(props);

		if (ret != 0) {
			switch (errno) {

			case ENOENT:
				/*
				 * The parent doesn't exist.  We should have caught this
				 * above, but there may a race condition that has since
				 * destroyed the parent.
				 *
				 * At this point, we don't know whether it's the source
				 * that doesn't exist anymore, or whether the target
				 * dataset doesn't exist.
				 */
				zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
				    "no such parent '%s'"), parent);
				return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));

			case EXDEV:
				zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
				    "source and target pools differ"));
				return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET,
				    errbuf));

			default:
				return (zfs_standard_error(zhp->zfs_hdl, errno,
				    errbuf));
			}
		}

		return (ret);
	}

	/*
	 * Makes the clone referred to by 'zhp' the parent of its origin.
	 *
	 * Snapshots and datasets without an origin are rejected with
	 * EZFS_BADTYPE.  If the promotion fails because of a snapshot name
	 * conflict (EEXIST) the conflicting name is included in the error;
	 * any other failure is reported as a standard error.
	 */
	int
	zfs_promote(zfs_handle_t *zhp)
	{
		libzfs_handle_t *lib = zhp->zfs_hdl;
		char conflict[ZFS_MAX_DATASET_NAME_LEN];
		char msg[1024];
		int rc;

		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
		    "cannot promote '%s'"), zhp->zfs_name);

		if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
			zfs_error_aux(lib, dgettext(TEXT_DOMAIN,
			    "snapshots can not be promoted"));
			return (zfs_error(lib, EZFS_BADTYPE, msg));
		}

		if (zhp->zfs_dmustats.dds_origin[0] == '\0') {
			zfs_error_aux(lib, dgettext(TEXT_DOMAIN,
			    "not a cloned filesystem"));
			return (zfs_error(lib, EZFS_BADTYPE, msg));
		}

		rc = lzc_promote(zhp->zfs_name, conflict, sizeof (conflict));
		if (rc == 0)
			return (rc);

		if (rc == EEXIST) {
			/* There is a conflicting snapshot name. */
			zfs_error_aux(lib, dgettext(TEXT_DOMAIN,
			    "conflicting snapshot '%s' from parent '%s'"),
			    conflict, zhp->zfs_dmustats.dds_origin);
			return (zfs_error(lib, EZFS_EXISTS, msg));
		}

		return (zfs_standard_error(lib, rc, msg));
	}

	/*
	 * State shared between zfs_snapshot() and its zfs_snapshot_cb() callback
	 * while building the list of snapshots to create.
	 */
	typedef struct snapdata {
	/* collects one boolean entry per "<dataset>@<sd_snapname>" to create */
	nvlist_t *sd_nvl;
	/* short snapshot name (the part after the '@') */
	const char *sd_snapname;
	} snapdata_t;

	/*
	 * Iteration callback for recursive snapshots: unless the dataset is marked
	 * inconsistent, adds "<dataset>@<sd_snapname>" to the snapdata nvlist and
	 * recurses into the child filesystems.  The callback owns 'zhp' and closes
	 * it before returning.  Returns the result of the child iteration (0 if
	 * the dataset was skipped).
	 */
	static int
	zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
	{
		snapdata_t *sd = arg;
		char name[ZFS_MAX_DATASET_NAME_LEN];
		int rv = 0;

		if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) {
			(void) snprintf(name, sizeof (name),
			    "%s@%s", zfs_get_name(zhp), sd->sd_snapname);

			fnvlist_add_boolean(sd->sd_nvl, name);

			rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
		}
		zfs_close(zhp);

		return (rv);
	}

	+/*
	+ * Calls lzc_remap() on the filesystem named 'fs'.  On failure the error is
	+ * reported through zfs_standard_error() on 'hdl'.  Returns the value
	+ * returned by lzc_remap() (0 on success).
	+ */
	+int
	+zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs)
	+{
	+	int err;
	+	char errbuf[1024];
	+
	+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	+	    "cannot remap filesystem '%s' "), fs);
	+
	+	err = lzc_remap(fs);
	+
	+	if (err != 0) {
	+		(void) zfs_standard_error(hdl, err, errbuf);
	+	}
	+
	+	return (err);
	+}
	+
	/*
	 * Creates snapshots.  The keys in the snaps nvlist are the snapshots to be
	 * created.
	 *
	 * Every name is validated first.  'props' (may be NULL) is validated with
	 * zfs_valid_proplist() against the pool of the first snapshot; the caller's
	 * list is not freed here, only the validated list is.  Returns 0 on
	 * success, -1 if 'snaps' is empty or the pool/properties cannot be set up,
	 * and otherwise the error from lzc_snapshot() after reporting it on 'hdl'.
	 */
	int
	zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
	{
		int ret;
		char errbuf[1024];
		nvpair_t *elem;
		nvlist_t *errors = NULL;

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot create snapshots "));

		elem = NULL;
		while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
			const char *snapname = nvpair_name(elem);

			/* validate the target name */
			if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
			    B_TRUE)) {
				(void) snprintf(errbuf, sizeof (errbuf),
				    dgettext(TEXT_DOMAIN,
				    "cannot create snapshot '%s'"), snapname);
				return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
			}
		}

		/*
		 * get pool handle for prop validation. assumes all snaps are in the
		 * same pool, as does lzc_snapshot (below).
		 */
		char pool[ZFS_MAX_DATASET_NAME_LEN];
		elem = nvlist_next_nvpair(snaps, NULL);
		/* An empty list has no first element to take the pool name from. */
		if (elem == NULL)
			return (-1);
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		pool[strcspn(pool, "/@")] = '\0';
		zpool_handle_t *zpool_hdl = zpool_open(hdl, pool);
		if (zpool_hdl == NULL)
			return (-1);

		if (props != NULL &&
		    (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
		    props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
			zpool_close(zpool_hdl);
			return (-1);
		}
		zpool_close(zpool_hdl);

		ret = lzc_snapshot(snaps, props, &errors);

		if (ret != 0) {
			boolean_t printed = B_FALSE;
			for (elem = nvlist_next_nvpair(errors, NULL);
			    elem != NULL;
			    elem = nvlist_next_nvpair(errors, elem)) {
				(void) snprintf(errbuf, sizeof (errbuf),
				    dgettext(TEXT_DOMAIN,
				    "cannot create snapshot '%s'"), nvpair_name(elem));
				(void) zfs_standard_error(hdl,
				    fnvpair_value_int32(elem), errbuf);
				printed = B_TRUE;
			}
			/* No per-snapshot errors: report the overall failure. */
			if (!printed) {
				switch (ret) {
				case EXDEV:
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "multiple snapshots of same "
					    "fs not allowed"));
					(void) zfs_error(hdl, EZFS_EXISTS, errbuf);

					break;
				default:
					(void) zfs_standard_error(hdl, ret, errbuf);
				}
			}
		}

		nvlist_free(props);
		nvlist_free(errors);
		return (ret);
	}

	/*
	 * Creates the snapshot named by 'path' ("<dataset>@<snapname>").  With
	 * 'recursive' set, a snapshot of the same name is also requested for the
	 * descendant filesystems collected by zfs_snapshot_cb().  'props' is
	 * passed through to zfs_snapshot_nvl(), whose result is returned.
	 */
	int
	zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
	    nvlist_t *props)
	{
		int ret;
		snapdata_t sd = { 0 };
		char fsname[ZFS_MAX_DATASET_NAME_LEN];
		char *cp;
		zfs_handle_t *zhp;
		char errbuf[1024];

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot snapshot %s"), path);

		if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));

		/* Split the validated name into dataset and snapshot parts. */
		(void) strlcpy(fsname, path, sizeof (fsname));
		cp = strchr(fsname, '@');
		*cp = '\0';
		sd.sd_snapname = cp + 1;

		if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM |
		    ZFS_TYPE_VOLUME)) == NULL) {
			return (-1);
		}

		verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0);
		if (recursive) {
			/* The callback closes the handle it is given. */
			(void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd);
		} else {
			fnvlist_add_boolean(sd.sd_nvl, path);
		}

		ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
		nvlist_free(sd.sd_nvl);
		zfs_close(zhp);
		return (ret);
	}

	/*
	* Destroy any more recent snapshots. We invoke this callback on any dependents
	* of the snapshot first. If the 'cb_dependent' member is non-zero, then this
	* is a dependent and we should just destroy it without checking the transaction
	* group.
	*/
	typedef struct rollback_data {
	const char cb_target; / the snapshot */
	uint64_t cb_create; /* creation time reference */
	boolean_t cb_error;
	boolean_t cb_force;
	} rollback_data_t;

	/*
	 * Iteration callback that destroys one dependent (clone) of a snapshot
	 * that is being removed by a rollback.  The clone is unmounted through a
	 * changelist (forcibly if cb_force is set) and then destroyed.  Failures
	 * are recorded in cb_error rather than returned, so the iteration always
	 * continues.  The callback owns 'zhp' and closes it before returning.
	 */
	static int
	rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
	{
		rollback_data_t *cbp = data;
		prop_changelist_t *clp;

		/* We must destroy this clone; first unmount it */
		clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
		    cbp->cb_force ? MS_FORCE: 0);
		if (clp == NULL || changelist_prefix(clp) != 0) {
			cbp->cb_error = B_TRUE;
			/* Don't leak the changelist when only the prefix failed. */
			if (clp != NULL)
				changelist_free(clp);
			zfs_close(zhp);
			return (0);
		}
		if (zfs_destroy(zhp, B_FALSE) != 0)
			cbp->cb_error = B_TRUE;
		else
			changelist_remove(clp, zhp->zfs_name);
		(void) changelist_postfix(clp);
		changelist_free(clp);

		zfs_close(zhp);
		return (0);
	}

	/*
	 * Iteration callback used by zfs_rollback(): if 'zhp' was created in a
	 * later transaction group than the rollback target (cb_create), destroys
	 * its dependents and then 'zhp' itself.  Failures are accumulated in
	 * cb_error; the callback always returns 0 and closes 'zhp'.
	 */
	static int
	rollback_destroy(zfs_handle_t *zhp, void *data)
	{
		rollback_data_t *cbp = data;

		if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
			cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
			    rollback_destroy_dependent, cbp);

			cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
		}

		zfs_close(zhp);
		return (0);
	}

	/*
	 * Given a dataset, rollback to a specific snapshot, discarding any
	 * data changes since then and making it the active dataset.
	 *
	 * Any snapshots and bookmarks more recent than the target are
	 * destroyed, along with their dependents (i.e. clones).
	 *
	 * 'zhp' must be a filesystem or volume (asserted) and 'snap' the snapshot
	 * to roll back to; 'force' requests forced unmounts of dependents.
	 * Returns 0 on success, -1 if the newer snapshots could not be destroyed
	 * or the reservation property could not be determined, and otherwise the
	 * error from lzc_rollback_to() or from resetting the reservation.
	 */
	int
	zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
	{
		rollback_data_t cb = { 0 };
		int err;
		boolean_t restore_resv = B_FALSE;
		uint64_t old_volsize = 0, new_volsize;
		zfs_prop_t resv_prop;

		assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
		    zhp->zfs_type == ZFS_TYPE_VOLUME);

		/*
		 * Destroy all recent snapshots and their dependents.
		 */
		cb.cb_force = force;
		cb.cb_target = snap->zfs_name;
		cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
		(void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb);
		(void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb);

		if (cb.cb_error)
			return (-1);

		/*
		 * Now that we have verified that the snapshot is the latest,
		 * rollback to the given snapshot.
		 */

		if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
			if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
				return (-1);
			old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
			restore_resv =
			    (old_volsize == zfs_prop_get_int(zhp, resv_prop));
		}

		/*
		 * Pass both the filesystem and the wanted snapshot names,
		 * we would get an error back if the snapshot is destroyed or
		 * a new snapshot is created before this request is processed.
		 */
		err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name);
		if (err != 0) {
			char errbuf[1024];

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
			    zhp->zfs_name);
			switch (err) {
			case EEXIST:
				zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
				    "there is a snapshot or bookmark more recent "
				    "than '%s'"), snap->zfs_name);
				(void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf);
				break;
			case ESRCH:
				zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
				    "'%s' is not found among snapshots of '%s'"),
				    snap->zfs_name, zhp->zfs_name);
				(void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf);
				break;
			case EINVAL:
				(void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf);
				break;
			default:
				(void) zfs_standard_error(zhp->zfs_hdl, err, errbuf);
			}
			return (err);
		}

		/*
		 * For volumes, if the pre-rollback volsize matched the pre-
		 * rollback reservation and the volsize has changed then set
		 * the reservation property to the post-rollback volsize.
		 * Make a new handle since the rollback closed the dataset.
		 */
		if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
		    (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
			if (restore_resv) {
				new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
				if (old_volsize != new_volsize)
					err = zfs_prop_set_int(zhp, resv_prop,
					    new_volsize);
			}
			/* Closes the fresh handle made above, not the caller's. */
			zfs_close(zhp);
		}
		return (err);
	}

	/*
	* Renames the given dataset to 'target'.
	*
	* If 'source' is non-NULL this is a recursive snapshot rename: "@source"
	* is appended to zhp->zfs_name and the handle's type is switched to
	* ZFS_TYPE_SNAPSHOT, so the caller's handle is modified.
	*
	* For snapshots, 'target' may be abbreviated (no '@' at all, or starting
	* with '@'); the full name is then rebuilt from the current dataset name.
	* A fully specified snapshot target must belong to the same dataset.
	* For other dataset types the target must have valid parents, stay in
	* the same pool and must not be a descendant of the current name.
	*
	* Returns 0 on success (including when the name is unchanged); otherwise
	* returns the result of the failing step (-1, or the value returned by
	* zfs_error(), zfs_ioctl() or the changelist_*() routines).
	*/
	int
	zfs_rename(zfs_handle_t zhp, const char source, const char *target,
	renameflags_t flags)
	{
	int ret = 0;
	zfs_cmd_t zc = { 0 };
	char *delim;
	prop_changelist_t *cl = NULL;
	zfs_handle_t *zhrp = NULL;
	char *parentname = NULL;
	char parent[ZFS_MAX_DATASET_NAME_LEN];
	char property[ZFS_MAXPROPLEN];
	libzfs_handle_t *hdl = zhp->zfs_hdl;
	char errbuf[1024];

	/* if we have the same exact name, just return success */
	if (strcmp(zhp->zfs_name, target) == 0)
	return (0);

	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	"cannot rename to '%s'"), target);

	if (source != NULL) {
	/*
	* This is recursive snapshots rename, put snapshot name
	* (that might not exist) into zfs_name.
	*/
	assert(flags.recurse);

	(void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name));
	(void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name));
	zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
	}

	/*
	* Make sure the target name is valid
	*/
	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
	if ((strchr(target, '@') == NULL) \|\|
	*target == '@') {
	/*
	* Snapshot target name is abbreviated,
	* reconstruct full dataset name
	*/
	(void) strlcpy(parent, zhp->zfs_name,
	sizeof (parent));
	delim = strchr(parent, '@');
	/*
	* Keep the '@' from the current name when the target has
	* none; drop it when the target supplies its own.
	*/
	if (strchr(target, '@') == NULL)
	*(++delim) = '\0';
	else
	*delim = '\0';
	(void) strlcat(parent, target, sizeof (parent));
	target = parent;
	} else {
	/*
	* Make sure we're renaming within the same dataset.
	*/
	delim = strchr(target, '@');
	if (strncmp(zhp->zfs_name, target, delim - target)
	!= 0 \|\| zhp->zfs_name[delim - target] != '@') {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"snapshots must be part of same "
	"dataset"));
	return (zfs_error(hdl, EZFS_CROSSTARGET,
	errbuf));
	}
	}
	if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
	return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
	} else {
	if (flags.recurse) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"recursive rename must be a snapshot"));
	return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
	}

	if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
	return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));

	/* validate parents */
	if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
	return (-1);

	/* make sure we're in the same pool */
	verify((delim = strchr(target, '/')) != NULL);
	if (strncmp(zhp->zfs_name, target, delim - target) != 0 \|\|
	zhp->zfs_name[delim - target] != '/') {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"datasets must be within same pool"));
	return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
	}

	/* new name cannot be a child of the current dataset name */
	if (is_descendant(zhp->zfs_name, target)) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"New dataset name cannot be a descendant of "
	"current dataset name"));
	return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
	}
	}

	/* From here on errors are reported against the current name. */
	(void) snprintf(errbuf, sizeof (errbuf),
	dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);

	if (getzoneid() == GLOBAL_ZONEID &&
	zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"dataset is used in a non-global zone"));
	return (zfs_error(hdl, EZFS_ZONED, errbuf));
	}

	/*
	* Avoid unmounting file systems with mountpoint property set to
	* 'legacy' or 'none' even if -u option is not given.
	*/
	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
	!flags.recurse && !flags.nounmount &&
	zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
	sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
	(strcmp(property, "legacy") == 0 \|\|
	strcmp(property, "none") == 0)) {
	flags.nounmount = B_TRUE;
	}
	if (flags.recurse) {

	/*
	* Recursive snapshot rename: open the dataset that owns the
	* snapshot (the name up to the '@'). The handle is only held
	* here and closed again at 'error:'.
	*/
	parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
	if (parentname == NULL) {
	ret = -1;
	goto error;
	}
	delim = strchr(parentname, '@');
	*delim = '\0';
	zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET);
	if (zhrp == NULL) {
	ret = -1;
	goto error;
	}
	} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
	/*
	* Non-snapshot rename: gather the changelist for the name
	* property and run its prefix step before issuing the ioctl.
	* Nothing has been allocated yet, so a gather failure can
	* return directly.
	*/
	if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
	flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0,
	flags.forceunmount ? MS_FORCE : 0)) == NULL) {
	return (-1);
	}

	if (changelist_haszonedchild(cl)) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"child dataset with inherited mountpoint is used "
	"in a non-global zone"));
	(void) zfs_error(hdl, EZFS_ZONED, errbuf);
	ret = -1;
	goto error;
	}

	if ((ret = changelist_prefix(cl)) != 0)
	goto error;
	}

	if (ZFS_IS_VOLUME(zhp))
	zc.zc_objset_type = DMU_OST_ZVOL;
	else
	zc.zc_objset_type = DMU_OST_ZFS;

	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));

	/*
	* The recurse flag is encoded in bit 0 of zc_cookie and the
	* nounmount flag in bit 1.
	*/
	zc.zc_cookie = flags.recurse ? 1 : 0;
	if (flags.nounmount)
	zc.zc_cookie \|= 2;

	if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
	/*
	* if it was recursive, the one that actually failed will
	* be in zc.zc_name
	*/
	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	"cannot rename '%s'"), zc.zc_name);

	if (flags.recurse && errno == EEXIST) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"a child dataset already has a snapshot "
	"with the new name"));
	(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
	} else {
	(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
	}

	/*
	* On failure, we still want to remount any filesystems that
	* were previously mounted, so we don't alter the system state.
	*/
	if (cl != NULL)
	(void) changelist_postfix(cl);
	} else {
	if (cl != NULL) {
	changelist_rename(cl, zfs_get_name(zhp), target);
	ret = changelist_postfix(cl);
	}
	}

	/* Common exit: release whatever was acquired above. */
	error:
	if (parentname != NULL) {
	free(parentname);
	}
	if (zhrp != NULL) {
	zfs_close(zhrp);
	}
	if (cl != NULL) {
	changelist_free(cl);
	}
	return (ret);
	}

	/*
	 * Return the user-property nvlist kept in the handle
	 * (zhp->zfs_user_props).  The stored pointer is returned as-is; no copy
	 * is made.
	 */
	nvlist_t *
	zfs_get_user_props(zfs_handle_t *zhp)
	{
		nvlist_t *user_props = zhp->zfs_user_props;

		return (user_props);
	}

	/*
	 * Return the received-property nvlist kept in the handle.  When the
	 * handle has none yet, get_recvd_props_ioctl() is called first; if that
	 * call fails NULL is returned.
	 */
	nvlist_t *
	zfs_get_recvd_props(zfs_handle_t *zhp)
	{
		if (zhp->zfs_recvd_props == NULL &&
		    get_recvd_props_ioctl(zhp) != 0) {
			return (NULL);
		}

		return (zhp->zfs_recvd_props);
	}

	/*
	* This function is used by 'zfs list' to determine the exact set of columns to
	* display, and their maximum widths. This does two main things:
	*
	* - If this is a list of all properties, then expand the list to include
	* all native properties, and set a flag so that for each dataset we look
	* for new unique user properties and add them to the list.
	*
	* - For non fixed-width properties, keep track of the maximum width seen
	* so that we can size the column appropriately. If the user has
	* requested received property values, we also need to compute the width
	* of the RECEIVED column.
	*
	* 'plp' is the property list to expand and update in place; 'received'
	* selects whether pl_recvd_width is maintained as well; 'literal' is
	* passed through to zfs_prop_get()/zfs_prop_get_recvd() and also makes
	* fixed-width entries get measured.
	*
	* Returns 0 on success, or -1 if zprop_expand_list() fails or a new list
	* entry cannot be allocated.
	*/
	int
	zfs_expand_proplist(zfs_handle_t zhp, zprop_list_t *plp, boolean_t received,
	boolean_t literal)
	{
	libzfs_handle_t *hdl = zhp->zfs_hdl;
	zprop_list_t *entry;
	zprop_list_t last, start;
	nvlist_t userprops, propval;
	nvpair_t *elem;
	char *strval;
	char buf[ZFS_MAXPROPLEN];

	if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0)
	return (-1);

	userprops = zfs_get_user_props(zhp);

	entry = *plp;
	if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
	/*
	* Go through and add any user properties as necessary. We
	* start by incrementing our list pointer to the first
	* non-native property.
	*/
	start = plp;
	while (*start != NULL) {
	if ((*start)->pl_prop == ZPROP_INVAL)
	break;
	start = &(*start)->pl_next;
	}

	elem = NULL;
	while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) {
	/*
	* See if we've already found this property in our list.
	*/
	for (last = start; *last != NULL;
	last = &(*last)->pl_next) {
	if (strcmp((*last)->pl_user_prop,
	nvpair_name(elem)) == 0)
	break;
	}

	/*
	* Not found: 'last' now refers to the terminating NULL
	* link, so storing through it appends the new entry.
	*/
	if (*last == NULL) {
	if ((entry = zfs_alloc(hdl,
	sizeof (zprop_list_t))) == NULL \|\|
	((entry->pl_user_prop = zfs_strdup(hdl,
	nvpair_name(elem)))) == NULL) {
	free(entry);
	return (-1);
	}

	entry->pl_prop = ZPROP_INVAL;
	entry->pl_width = strlen(nvpair_name(elem));
	entry->pl_all = B_TRUE;
	*last = entry;
	}
	}
	}

	/*
	* Now go through and check the width of any non-fixed columns
	*/
	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
	if (entry->pl_fixed && !literal)
	continue;

	if (entry->pl_prop != ZPROP_INVAL) {
	/* Native property: measure the value formatted into buf. */
	if (zfs_prop_get(zhp, entry->pl_prop,
	buf, sizeof (buf), NULL, NULL, 0, literal) == 0) {
	if (strlen(buf) > entry->pl_width)
	entry->pl_width = strlen(buf);
	}
	if (received && zfs_prop_get_recvd(zhp,
	zfs_prop_to_name(entry->pl_prop),
	buf, sizeof (buf), literal) == 0)
	if (strlen(buf) > entry->pl_recvd_width)
	entry->pl_recvd_width = strlen(buf);
	} else {
	/* User property: measure the string stored in userprops. */
	if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop,
	&propval) == 0) {
	verify(nvlist_lookup_string(propval,
	ZPROP_VALUE, &strval) == 0);
	if (strlen(strval) > entry->pl_width)
	entry->pl_width = strlen(strval);
	}
	if (received && zfs_prop_get_recvd(zhp,
	entry->pl_user_prop,
	buf, sizeof (buf), literal) == 0)
	if (strlen(buf) > entry->pl_recvd_width)
	entry->pl_recvd_width = strlen(buf);
	}
	}

	return (0);
	}

	/*
	 * Issue ZFS_IOC_SHARE for 'dataset'/'path'.
	 *
	 * The dataset name, path and (when non-NULL) resource are copied into the
	 * command structure; 'export' and 'sharetab' are passed as opaque
	 * pointers together with 'sharemax' and 'operation'.  Returns the value
	 * returned by ioctl().
	 */
	int
	zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
	    char *resource, void *export, void *sharetab,
	    int sharemax, zfs_share_op_t operation)
	{
		zfs_cmd_t zc = { 0 };

		/* Share parameters. */
		zc.zc_share.z_sharetype = operation;
		zc.zc_share.z_sharemax = sharemax;
		zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export;
		zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab;

		/* Names identifying what is being shared. */
		(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
		(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
		if (resource != NULL) {
			(void) strlcpy(zc.zc_string, resource,
			    sizeof (zc.zc_string));
		}

		return (ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc));
	}

	/*
	 * Remove from zhp->zfs_props every native property whose slot in the
	 * 'props' table is B_FALSE.  The table pointer is also remembered in
	 * zhp->zfs_props_table.
	 */
	void
	zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
	{
		nvpair_t *pair, *following;

		/*
		 * Keep a reference to the props-table against which we prune the
		 * properties.
		 */
		zhp->zfs_props_table = props;

		for (pair = nvlist_next_nvpair(zhp->zfs_props, NULL);
		    pair != NULL; pair = following) {
			zfs_prop_t id = zfs_name_to_prop(nvpair_name(pair));

			/* Fetch the successor first: 'pair' may be removed. */
			following = nvlist_next_nvpair(zhp->zfs_props, pair);

			/*
			 * ZPROP_INVAL means a user property or an unknown DSL
			 * property (e.g. when running older software).  Only
			 * standard ZFS properties can be pruned, so those
			 * always stay in the list.
			 */
			if (id == ZPROP_INVAL)
				continue;

			if (props[id] == B_FALSE) {
				(void) nvlist_remove(zhp->zfs_props,
				    nvpair_name(pair), nvpair_type(pair));
			}
		}
	}

	#ifdef illumos
	/*
	 * Common back end for the zfs_smb_acl_*() wrappers: issue ZFS_IOC_SMB_ACL
	 * for 'dataset'/'path' with the operation 'cmd'.
	 *
	 * ADD and REMOVE pass 'resource1' in zc_string.  RENAME packs 'resource1'
	 * (ZFS_SMB_ACL_SRC) and 'resource2' (ZFS_SMB_ACL_TARGET) into an nvlist
	 * that is written into the command as its source nvlist.  PURGE takes no
	 * resource.
	 *
	 * Returns the value returned by ioctl(), or -1 if 'cmd' is unknown or
	 * the request cannot be built.  errno as left by the ioctl is preserved
	 * across the cleanup.
	 */
	static int
	zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
	    zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
	{
		zfs_cmd_t zc = { 0 };
		nvlist_t *nvlist = NULL;
		int error;
		int saved_errno;

		(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
		(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
		zc.zc_cookie = (uint64_t)cmd;

		switch (cmd) {
		case ZFS_SMB_ACL_ADD:
		case ZFS_SMB_ACL_REMOVE:
			(void) strlcpy(zc.zc_string, resource1,
			    sizeof (zc.zc_string));
			break;
		case ZFS_SMB_ACL_RENAME:
			/* An allocation failure is an error, not success. */
			if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
				(void) no_memory(hdl);
				return (-1);
			}
			if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC,
			    resource1) != 0 ||
			    nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET,
			    resource2) != 0) {
				/* Do not leak the partially built list. */
				nvlist_free(nvlist);
				(void) no_memory(hdl);
				return (-1);
			}
			if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) {
				nvlist_free(nvlist);
				return (-1);
			}
			break;
		case ZFS_SMB_ACL_PURGE:
			break;
		default:
			return (-1);
		}

		error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
		saved_errno = errno;

		/* Release the packed source nvlist attached to zc, if any. */
		zcmd_free_nvlists(&zc);
		nvlist_free(nvlist);

		errno = saved_errno;
		return (error);
	}

	/*
	 * Run the ZFS_SMB_ACL_ADD operation for 'resource' on 'dataset'/'path'.
	 * Returns the result of zfs_smb_acl_mgmt().
	 */
	int
	zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset,
	    char *path, char *resource)
	{
		int rc;

		rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD,
		    resource, NULL);
		return (rc);
	}

	/*
	 * Run the ZFS_SMB_ACL_REMOVE operation for 'resource' on
	 * 'dataset'/'path'.  Returns the result of zfs_smb_acl_mgmt().
	 */
	int
	zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset,
	    char *path, char *resource)
	{
		int rc;

		rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE,
		    resource, NULL);
		return (rc);
	}

	/*
	 * Run the ZFS_SMB_ACL_PURGE operation on 'dataset'/'path'; no resource
	 * name is passed.  Returns the result of zfs_smb_acl_mgmt().
	 */
	int
	zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path)
	{
		int rc;

		rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE,
		    NULL, NULL);
		return (rc);
	}

	/*
	 * Run the ZFS_SMB_ACL_RENAME operation on 'dataset'/'path', passing
	 * 'oldname' and 'newname' as the two resource names.  Returns the result
	 * of zfs_smb_acl_mgmt().
	 */
	int
	zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path,
	    char *oldname, char *newname)
	{
		int rc;

		rc = zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
		    oldname, newname);
		return (rc);
	}
	#endif /* illumos */

	/*
	 * Walk the space-accounting records of a dataset.
	 *
	 * ZFS_IOC_USERSPACE_MANY is issued repeatedly for zhp's dataset, with
	 * 'type' passed in zc_objset_type, and 'func' is invoked with 'arg' and
	 * the domain, rid and space of every record returned.  The walk ends
	 * when the ioctl reports no more data.
	 *
	 * Returns 0 once all records were delivered, the first non-zero value
	 * returned by 'func', or the result of zfs_standard_error() when the
	 * ioctl fails.
	 */
	int
	zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
	    zfs_userspace_cb_t func, void *arg)
	{
		zfs_cmd_t zc = { 0 };
		zfs_useracct_t buf[100];
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		int ret;

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		zc.zc_objset_type = type;
		zc.zc_nvlist_dst = (uintptr_t)buf;

		for (;;) {
			zfs_useracct_t *zua = buf;

			zc.zc_nvlist_dst_size = sizeof (buf);
			if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
				/* Capture errno before anything can clobber it. */
				int ioc_errno = errno;
				char errbuf[1024];

				(void) snprintf(errbuf, sizeof (errbuf),
				    dgettext(TEXT_DOMAIN,
				    "cannot get used/quota for %s"), zc.zc_name);
				/*
				 * errbuf contains the dataset name, so it must be
				 * passed as a message, never as a format string.
				 */
				return (zfs_standard_error(hdl, ioc_errno, errbuf));
			}
			if (zc.zc_nvlist_dst_size == 0)
				break;

			/* zc_nvlist_dst_size is the number of bytes filled in. */
			while (zc.zc_nvlist_dst_size > 0) {
				if ((ret = func(arg, zua->zu_domain, zua->zu_rid,
				    zua->zu_space)) != 0)
					return (ret);
				zua++;
				zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
			}
		}

		return (0);
	}

	/*
	* State shared between zfs_hold()/zfs_release() and their per-dataset
	* callbacks zfs_hold_one()/zfs_release_one(), which receive it as 'arg'.
	*/
	struct holdarg {
	/* list built by the callbacks, keyed by full "dataset@snapname" name */
	nvlist_t *nvl;
	/* snapshot name appended after '@' to each visited dataset name */
	const char *snapname;
	/* hold tag to add or release */
	const char *tag;
	/* when set, the callbacks descend via zfs_iter_filesystems() */
	boolean_t recursive;
	/* set by zfs_release_one() (ENOENT or ESRCH); read by zfs_release() */
	int error;
	};

	/*
	 * Per-dataset callback for zfs_hold(): if "<dataset>@<snapname>" exists,
	 * record it with the requested tag in ha->nvl.  When the request is
	 * recursive the child filesystems are visited with this same callback.
	 * The handle is closed before returning.
	 */
	static int
	zfs_hold_one(zfs_handle_t *zhp, void *arg)
	{
		struct holdarg *ha = arg;
		char snap[ZFS_MAX_DATASET_NAME_LEN];
		int rv;

		(void) snprintf(snap, sizeof (snap),
		    "%s@%s", zhp->zfs_name, ha->snapname);

		if (lzc_exists(snap)) {
			fnvlist_add_string(ha->nvl, snap, ha->tag);
		}

		rv = ha->recursive ?
		    zfs_iter_filesystems(zhp, zfs_hold_one, ha) : 0;

		zfs_close(zhp);
		return (rv);
	}

	/*
	 * Place the user hold 'tag' on snapshot 'snapname' of zhp's dataset and,
	 * when 'recursive' is set, on the same-named snapshot of its descendant
	 * filesystems.  'cleanup_fd' is passed through to zfs_hold_nvl().
	 *
	 * Returns ENOENT (after reporting the error) if no matching snapshot
	 * exists, otherwise the result of zfs_hold_nvl().
	 */
	int
	zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
	    boolean_t recursive, int cleanup_fd)
	{
		struct holdarg ha;
		char errbuf[1024];
		int ret;

		ha.nvl = fnvlist_alloc();
		ha.snapname = snapname;
		ha.tag = tag;
		ha.recursive = recursive;

		/* The callback closes the handle it is given, so pass a dup. */
		(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);

		if (!nvlist_empty(ha.nvl)) {
			ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
			fnvlist_free(ha.nvl);
			return (ret);
		}

		/* Nothing was collected: no such snapshot anywhere. */
		fnvlist_free(ha.nvl);
		ret = ENOENT;
		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN,
		    "cannot hold snapshot '%s@%s'"),
		    zhp->zfs_name, snapname);
		(void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf);
		return (ret);
	}

	/*
	 * Apply the holds described by 'holds' through lzc_hold() and report any
	 * failure on zhp's library handle.
	 *
	 * Returns 0 on success, otherwise the value returned by lzc_hold().
	 * When lzc_hold() supplies per-snapshot errors each one is reported;
	 * when it supplies none, a single generic "cannot hold" error is
	 * reported instead.
	 */
	int
	zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		nvlist_t *errlist = NULL;
		nvpair_t *pair = NULL;
		char errbuf[1024];
		int rc;

		rc = lzc_hold(holds, cleanup_fd, &errlist);
		if (rc == 0) {
			/* There may be errors even in the success case. */
			fnvlist_free(errlist);
			return (0);
		}

		if (nvlist_empty(errlist)) {
			/* no hold-specific errors */
			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot hold"));
			if (rc == ENOTSUP) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded"));
				(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
			} else if (rc == EINVAL) {
				(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
			} else {
				(void) zfs_standard_error(hdl, rc, errbuf);
			}
		}

		while ((pair = nvlist_next_nvpair(errlist, pair)) != NULL) {
			int32_t code;

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN,
			    "cannot hold snapshot '%s'"), nvpair_name(pair));
			code = fnvpair_value_int32(pair);

			switch (code) {
			case E2BIG:
				/*
				 * Temporary tags wind up having the ds object id
				 * prepended. So even if we passed the length check
				 * above, it's still possible for the tag to wind
				 * up being slightly too long.
				 */
				(void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
				break;
			case EINVAL:
				(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
				break;
			case EEXIST:
				(void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
				break;
			default:
				(void) zfs_standard_error(hdl, code, errbuf);
			}
		}

		fnvlist_free(errlist);
		return (rc);
	}

	/*
	 * Per-dataset callback for zfs_release(): look up the holds on
	 * "<dataset>@<snapname>" and, if ha->tag is among them, queue the tag for
	 * release in ha->nvl.  ha->error is set to ENOENT when the holds cannot
	 * be retrieved and to ESRCH when the tag is not held.  When the request
	 * is recursive the child filesystems are visited with this same
	 * callback.  The handle is closed before returning.
	 */
	static int
	zfs_release_one(zfs_handle_t *zhp, void *arg)
	{
		struct holdarg *ha = arg;
		char name[ZFS_MAX_DATASET_NAME_LEN];
		int rv = 0;
		nvlist_t *existing_holds = NULL;

		(void) snprintf(name, sizeof (name),
		    "%s@%s", zhp->zfs_name, ha->snapname);

		if (lzc_get_holds(name, &existing_holds) != 0) {
			ha->error = ENOENT;
		} else {
			if (!nvlist_exists(existing_holds, ha->tag)) {
				ha->error = ESRCH;
			} else {
				nvlist_t *torelease = fnvlist_alloc();

				fnvlist_add_boolean(torelease, ha->tag);
				fnvlist_add_nvlist(ha->nvl, name, torelease);
				fnvlist_free(torelease);
			}
			/*
			 * The list returned by lzc_get_holds() belongs to the
			 * caller; free it so that it is not leaked for every
			 * dataset visited.
			 */
			fnvlist_free(existing_holds);
		}

		if (ha->recursive)
			rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
		zfs_close(zhp);
		return (rv);
	}

	/*
	 * Release the user hold 'tag' from snapshot 'snapname' of zhp's dataset
	 * and, when 'recursive' is set, from the same-named snapshot of its
	 * descendant filesystems.
	 *
	 * Returns 0 on success.  If nothing could be queued for release the
	 * error recorded by zfs_release_one() is reported and returned;
	 * otherwise the value returned by lzc_release() is returned after its
	 * errors have been reported.
	 */
	int
	zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
	    boolean_t recursive)
	{
		int ret;
		struct holdarg ha;
		nvlist_t *errors = NULL;
		nvpair_t *elem;
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		char errbuf[1024];

		ha.nvl = fnvlist_alloc();
		ha.snapname = snapname;
		ha.tag = tag;
		ha.recursive = recursive;
		ha.error = 0;
		/* The callback closes the handle it is given, so pass a dup. */
		(void) zfs_release_one(zfs_handle_dup(zhp), &ha);

		if (nvlist_empty(ha.nvl)) {
			fnvlist_free(ha.nvl);
			ret = ha.error;
			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN,
			    "cannot release hold from snapshot '%s@%s'"),
			    zhp->zfs_name, snapname);
			if (ret == ESRCH) {
				(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
			} else {
				(void) zfs_standard_error(hdl, ret, errbuf);
			}
			return (ret);
		}

		ret = lzc_release(ha.nvl, &errors);
		fnvlist_free(ha.nvl);

		if (ret == 0) {
			/* There may be errors even in the success case. */
			fnvlist_free(errors);
			return (0);
		}

		if (nvlist_empty(errors)) {
			/* no hold-specific errors */
			(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
			    "cannot release"));
			/*
			 * Dispatch on the value lzc_release() returned, as
			 * zfs_hold_nvl() does; errno may have been changed by
			 * the calls made since.
			 */
			switch (ret) {
			case ENOTSUP:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded"));
				(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
				break;
			default:
				(void) zfs_standard_error(hdl, ret, errbuf);
			}
		}

		for (elem = nvlist_next_nvpair(errors, NULL);
		    elem != NULL;
		    elem = nvlist_next_nvpair(errors, elem)) {
			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN,
			    "cannot release hold from snapshot '%s'"),
			    nvpair_name(elem));
			switch (fnvpair_value_int32(elem)) {
			case ESRCH:
				(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
				break;
			case EINVAL:
				(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
				break;
			default:
				/*
				 * errbuf embeds a snapshot name: pass it as a
				 * message, not as a format string.
				 */
				(void) zfs_standard_error(hdl,
				    fnvpair_value_int32(elem), errbuf);
			}
		}

		fnvlist_free(errors);
		return (ret);
	}

	/*
	 * Fetch the delegated-permission data of a filesystem or volume via
	 * ZFS_IOC_GET_FSACL and unpack it into '*nvl'.
	 *
	 * The destination buffer starts at 2048 bytes; when the ioctl fails with
	 * ENOMEM the buffer is reallocated with the size reported back in
	 * zc_nvlist_dst_size and the call is retried.
	 *
	 * Returns 0 on success, otherwise the value returned by the zfs_error()
	 * family for the failure.
	 */
	int
	zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
	{
		zfs_cmd_t zc = { 0 };
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		int nvsz = 2048;
		void *nvbuf;
		int err = 0;
		char errbuf[1024];

		assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
		    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);

	tryagain:

		nvbuf = malloc(nvsz);
		if (nvbuf == NULL) {
			err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
			goto out;
		}

		zc.zc_nvlist_dst_size = nvsz;
		zc.zc_nvlist_dst = (uintptr_t)nvbuf;

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) {
			/* Capture errno before snprintf()/dgettext() can touch it. */
			int ioc_errno = errno;

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
			    zc.zc_name);
			switch (ioc_errno) {
			case ENOMEM:
				free(nvbuf);
				nvsz = zc.zc_nvlist_dst_size;
				goto tryagain;

			case ENOTSUP:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded"));
				err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
				break;
			case EINVAL:
				err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
				break;
			case ENOENT:
				err = zfs_error(hdl, EZFS_NOENT, errbuf);
				break;
			default:
				/*
				 * errbuf embeds the dataset name: pass it as a
				 * message, not as a format string.
				 */
				err = zfs_standard_error(hdl, ioc_errno, errbuf);
				break;
			}
		} else {
			/* success */
			int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
			if (rc) {
				(void) snprintf(errbuf, sizeof (errbuf), dgettext(
				    TEXT_DOMAIN, "cannot get permissions on '%s'"),
				    zc.zc_name);
				err = zfs_standard_error(hdl, rc, errbuf);
			}
		}

		free(nvbuf);
	out:
		return (err);
	}

	/*
	 * Send the delegated-permission nvlist 'nvl' to a filesystem or volume
	 * via ZFS_IOC_SET_FSACL.  'un' is stored in zc_perm_action.
	 *
	 * Returns 0 on success, otherwise the value returned by the zfs_error()
	 * family for the failure.
	 */
	int
	zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
	{
		zfs_cmd_t zc = { 0 };
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		char *nvbuf;
		char errbuf[1024];
		size_t nvsz;
		int err;

		assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
		    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);

		err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
		assert(err == 0);

		/* Report allocation failure the same way zfs_get_fsacl() does. */
		nvbuf = malloc(nvsz);
		if (nvbuf == NULL)
			return (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));

		err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
		assert(err == 0);

		zc.zc_nvlist_src_size = nvsz;
		zc.zc_nvlist_src = (uintptr_t)nvbuf;
		zc.zc_perm_action = un;

		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
			/* Capture errno before snprintf()/dgettext() can touch it. */
			int ioc_errno = errno;

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
			    zc.zc_name);
			switch (ioc_errno) {
			case ENOTSUP:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded"));
				err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
				break;
			case EINVAL:
				err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
				break;
			case ENOENT:
				err = zfs_error(hdl, EZFS_NOENT, errbuf);
				break;
			default:
				/*
				 * errbuf embeds the dataset name: pass it as a
				 * message, not as a format string.
				 */
				err = zfs_standard_error(hdl, ioc_errno, errbuf);
				break;
			}
		}

		free(nvbuf);

		return (err);
	}

	/*
	 * Retrieve the user holds on zhp's dataset into '*nvl' using
	 * lzc_get_holds().
	 *
	 * Returns 0 on success; on failure the error is reported on the library
	 * handle and the value returned by the zfs_error() family is returned.
	 */
	int
	zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
	{
		int err;
		char errbuf[1024];

		err = lzc_get_holds(zhp->zfs_name, nvl);

		if (err != 0) {
			libzfs_handle_t *hdl = zhp->zfs_hdl;

			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
			    zhp->zfs_name);
			switch (err) {
			case ENOTSUP:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool must be upgraded"));
				err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
				break;
			case EINVAL:
				err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
				break;
			case ENOENT:
				err = zfs_error(hdl, EZFS_NOENT, errbuf);
				break;
			default:
				/*
				 * Report the code lzc_get_holds() returned (the
				 * one this switch dispatched on) rather than
				 * errno, and pass errbuf as a message, not as a
				 * format string.
				 */
				err = zfs_standard_error(hdl, err, errbuf);
				break;
			}
		}

		return (err);
	}

	/*
	 * Convert the zvol's volume size to an appropriate reservation.
	 *
	 * The "copies" and "volblocksize" values are taken from 'props' when
	 * present (defaults: 1 copy, ZVOL_DEFAULT_BLOCKSIZE).  The result is the
	 * volume size multiplied by the number of copies, plus an allowance for
	 * metadata blocks.
	 *
	 * Note: If this routine is updated, it is necessary to update the ZFS test
	 * suite's shell version in reservation.kshlib.
	 */
	uint64_t
	zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
	{
		uint64_t blksz, level_blocks, meta_blocks;
		char *copies_str;
		int ncopies = 1;

		if (nvlist_lookup_string(props,
		    zfs_prop_to_name(ZFS_PROP_COPIES), &copies_str) == 0) {
			ncopies = atoi(copies_str);
		}

		if (nvlist_lookup_uint64(props,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blksz) != 0) {
			blksz = ZVOL_DEFAULT_BLOCKSIZE;
		}

		/* start with metadnode L0-L6 */
		meta_blocks = 7;

		/* calculate number of indirects */
		for (level_blocks = volsize / blksz; level_blocks > 1; ) {
			level_blocks = (level_blocks + DNODES_PER_LEVEL - 1) /
			    DNODES_PER_LEVEL;
			meta_blocks += level_blocks;
		}
		meta_blocks *= MIN(SPA_DVAS_PER_BP, ncopies + 1);

		/*
		 * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
		 * compressed, but in practice they compress down to about
		 * 1100 bytes
		 */
		meta_blocks *= 1ULL << DN_MAX_INDBLKSHIFT;

		return (volsize * ncopies + meta_blocks);
	}

	/*
	 * Attach/detach the given filesystem to/from the given jail.
	 *
	 * A non-zero 'attach' issues ZFS_IOC_JAIL, zero issues ZFS_IOC_UNJAIL.
	 * Volumes and snapshots are rejected with EZFS_BADTYPE.  Returns the
	 * value returned by ioctl(), or the result of zfs_error() for a rejected
	 * dataset type.
	 */
	int
	zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
	{
		libzfs_handle_t *hdl = zhp->zfs_hdl;
		zfs_cmd_t zc = { 0 };
		char errbuf[1024];
		unsigned long request;
		int rc;

		(void) snprintf(errbuf, sizeof (errbuf), attach ?
		    dgettext(TEXT_DOMAIN, "cannot jail '%s'") :
		    dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);

		if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "volumes can not be jailed"));
			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
		}
		if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "snapshots can not be jailed"));
			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
		}
		assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);

		zc.zc_objset_type = DMU_OST_ZFS;
		zc.zc_jailid = jailid;
		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));

		if (attach)
			request = ZFS_IOC_JAIL;
		else
			request = ZFS_IOC_UNJAIL;

		rc = ioctl(hdl->libzfs_fd, request, &zc);
		if (rc != 0)
			zfs_standard_error(hdl, errno, errbuf);

		return (rc);
	}
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c (revision 332525)
	@@ -1,4284 +1,4356 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright 2016 Nexenta Systems, Inc.
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
	* Copyright (c) 2017 Datto Inc.
	*/

	#include <sys/types.h>
	#include <sys/stat.h>
	#include <ctype.h>
	#include <errno.h>
	#include <devid.h>
	#include <fcntl.h>
	#include <libintl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <strings.h>
	#include <unistd.h>
	#include <libgen.h>
	#include <sys/zfs_ioctl.h>
	#include <dlfcn.h>

	#include "zfs_namecheck.h"
	#include "zfs_prop.h"
	#include "libzfs_impl.h"
	#include "zfs_comutil.h"
	#include "zfeature_common.h"

	static int read_efi_label(nvlist_t , diskaddr_t , boolean_t *);
	static boolean_t zpool_vdev_is_interior(const char *name);

	#define BACKUP_SLICE "s2"

	/*
	 * Flags selecting the context in which pool properties are validated.
	 *
	 * The members are unsigned: a signed 1-bit bit-field cannot represent
	 * the value 1, so a flag that was set would read back as -1 (or an
	 * implementation-defined value) instead of 1.
	 */
	typedef struct prop_flags {
		unsigned int create:1;	/* Validate property on creation */
		unsigned int import:1;	/* Validate property on import */
	} prop_flags_t;

	/*
	* ====================================================================
	* zpool property functions
	* ====================================================================
	*/

	/*
	 * Fetch all properties of the pool with ZFS_IOC_POOL_GET_PROPS and store
	 * the resulting nvlist in zhp->zpool_props.  The destination buffer is
	 * expanded and the ioctl retried for as long as it fails with ENOMEM.
	 *
	 * Returns 0 on success and -1 on any failure.
	 */
	static int
	zpool_get_all_props(zpool_handle_t *zhp)
	{
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		zfs_cmd_t zc = { 0 };
		int rc;

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));

		if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
			return (-1);

		while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
			/* Only "buffer too small" is retried, after growing it. */
			if (errno != ENOMEM ||
			    zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
				zcmd_free_nvlists(&zc);
				return (-1);
			}
		}

		rc = (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) ?
		    -1 : 0;
		zcmd_free_nvlists(&zc);

		return (rc);
	}

	/*
	 * Re-read the pool's properties.  The previous property list is freed
	 * only after the new one has been fetched successfully.
	 *
	 * Returns 0 on success and -1 if zpool_get_all_props() fails.
	 */
	static int
	zpool_props_refresh(zpool_handle_t *zhp)
	{
		nvlist_t *stale = zhp->zpool_props;

		if (zpool_get_all_props(zhp) == 0) {
			nvlist_free(stale);
			return (0);
		}

		return (-1);
	}

	/*
	 * Return the string value of pool property 'prop' from zhp->zpool_props.
	 *
	 * When the property is not in the list, the property's default string is
	 * returned ("-" if it has none) and the source is ZPROP_SRC_DEFAULT.
	 * The source is stored through 'src' when 'src' is non-NULL.
	 */
	static char *
	zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
	    zprop_source_t *src)
	{
		nvlist_t *entry;
		zprop_source_t origin = ZPROP_SRC_DEFAULT;
		uint64_t raw_source;
		char *str;

		if (nvlist_lookup_nvlist(zhp->zpool_props,
		    zpool_prop_to_name(prop), &entry) != 0) {
			str = (char *)zpool_prop_default_string(prop);
			if (str == NULL)
				str = "-";
		} else {
			verify(nvlist_lookup_uint64(entry, ZPROP_SOURCE,
			    &raw_source) == 0);
			origin = raw_source;
			verify(nvlist_lookup_string(entry, ZPROP_VALUE,
			    &str) == 0);
		}

		if (src != NULL)
			*src = origin;

		return (str);
	}

	/*
	 * Return the numeric value of pool property 'prop'.
	 *
	 * The property list is loaded on demand.  If it cannot be loaded, the
	 * GUID property is answered from the vdev tree in the pool config when
	 * possible and every other property yields its numeric default; 'src'
	 * is left untouched on that path.  Otherwise the value comes from the
	 * property list (or the default when absent) and the source is stored
	 * through 'src' when 'src' is non-NULL.
	 */
	uint64_t
	zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop,
	    zprop_source_t *src)
	{
		nvlist_t *entry, *vdev_tree;
		zprop_source_t origin = ZPROP_SRC_DEFAULT;
		uint64_t val;

		if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
			/*
			 * zpool_get_all_props() has most likely failed because
			 * the pool is faulted, but if all we need is the top level
			 * vdev's guid then get it from the zhp config nvlist.
			 */
			if (prop == ZPOOL_PROP_GUID &&
			    nvlist_lookup_nvlist(zhp->zpool_config,
			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0 &&
			    nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_GUID,
			    &val) == 0) {
				return (val);
			}
			return (zpool_prop_default_numeric(prop));
		}

		if (nvlist_lookup_nvlist(zhp->zpool_props,
		    zpool_prop_to_name(prop), &entry) != 0) {
			val = zpool_prop_default_numeric(prop);
		} else {
			verify(nvlist_lookup_uint64(entry, ZPROP_SOURCE,
			    &val) == 0);
			origin = val;
			verify(nvlist_lookup_uint64(entry, ZPROP_VALUE,
			    &val) == 0);
		}

		if (src != NULL)
			*src = origin;

		return (val);
	}

	/*
	 * Map VDEV STATE to printed strings.
	 *
	 * For VDEV_STATE_CANT_OPEN the auxiliary state 'aux' selects between
	 * "FAULTED", "SPLIT" and "UNAVAIL".  Unrecognized states map to
	 * "UNKNOWN".
	 */
	const char *
	zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
	{
		switch (state) {
		case VDEV_STATE_CLOSED:
		case VDEV_STATE_OFFLINE:
			return (gettext("OFFLINE"));
		case VDEV_STATE_REMOVED:
			return (gettext("REMOVED"));
		case VDEV_STATE_CANT_OPEN:
			switch (aux) {
			case VDEV_AUX_CORRUPT_DATA:
			case VDEV_AUX_BAD_LOG:
				return (gettext("FAULTED"));
			case VDEV_AUX_SPLIT_POOL:
				return (gettext("SPLIT"));
			default:
				return (gettext("UNAVAIL"));
			}
		case VDEV_STATE_FAULTED:
			return (gettext("FAULTED"));
		case VDEV_STATE_DEGRADED:
			return (gettext("DEGRADED"));
		case VDEV_STATE_HEALTHY:
			return (gettext("ONLINE"));
		default:
			return (gettext("UNKNOWN"));
		}
	}

	/*
	 * Map POOL STATE to printed strings.  Unrecognized states map to
	 * "UNKNOWN".
	 */
	const char *
	zpool_pool_state_to_name(pool_state_t state)
	{
		const char *label;

		switch (state) {
		case POOL_STATE_ACTIVE:
			label = gettext("ACTIVE");
			break;
		case POOL_STATE_EXPORTED:
			label = gettext("EXPORTED");
			break;
		case POOL_STATE_DESTROYED:
			label = gettext("DESTROYED");
			break;
		case POOL_STATE_SPARE:
			label = gettext("SPARE");
			break;
		case POOL_STATE_L2CACHE:
			label = gettext("L2CACHE");
			break;
		case POOL_STATE_UNINITIALIZED:
			label = gettext("UNINITIALIZED");
			break;
		case POOL_STATE_UNAVAIL:
			label = gettext("UNAVAIL");
			break;
		case POOL_STATE_POTENTIALLY_ACTIVE:
			label = gettext("POTENTIALLY_ACTIVE");
			break;
		default:
			label = gettext("UNKNOWN");
			break;
		}

		return (label);
	}

	/*
	 * Get a zpool property value for 'prop' and return the value in
	 * a pre-allocated buffer.
	 *
	 * 'buf'/'len' describe the caller's buffer.  When 'srctype' is non-NULL
	 * the property source is stored through it.  'literal' requests raw
	 * numbers instead of human-readable sizes and percentages.
	 *
	 * For a pool in POOL_STATE_UNAVAIL only the name, health, guid and (when
	 * the property list can be loaded) altroot, cachefile and comment are
	 * available; everything else is reported as "-".
	 *
	 * Returns 0 on success and -1 if the property list cannot be loaded or
	 * an index value has no string form.
	 */
	int
	zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
	    zprop_source_t *srctype, boolean_t literal)
	{
		uint64_t intval;
		const char *strval;
		zprop_source_t src = ZPROP_SRC_NONE;
		nvlist_t *nvroot;
		vdev_stat_t *vs;
		uint_t vsc;

		if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
			switch (prop) {
			case ZPOOL_PROP_NAME:
				(void) strlcpy(buf, zpool_get_name(zhp), len);
				break;

			case ZPOOL_PROP_HEALTH:
				(void) strlcpy(buf,
				    zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len);
				break;

			case ZPOOL_PROP_GUID:
				intval = zpool_get_prop_int(zhp, prop, &src);
				/* %llu needs an unsigned long long argument. */
				(void) snprintf(buf, len, "%llu",
				    (u_longlong_t)intval);
				break;

			case ZPOOL_PROP_ALTROOT:
			case ZPOOL_PROP_CACHEFILE:
			case ZPOOL_PROP_COMMENT:
				if (zhp->zpool_props != NULL ||
				    zpool_get_all_props(zhp) == 0) {
					(void) strlcpy(buf,
					    zpool_get_prop_string(zhp, prop, &src),
					    len);
					break;
				}
				/* FALLTHROUGH */
			default:
				(void) strlcpy(buf, "-", len);
				break;
			}

			if (srctype != NULL)
				*srctype = src;
			return (0);
		}

		/* The pool name is the only property that needs no list. */
		if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
		    prop != ZPOOL_PROP_NAME)
			return (-1);

		switch (zpool_prop_get_type(prop)) {
		case PROP_TYPE_STRING:
			(void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
			    len);
			break;

		case PROP_TYPE_NUMBER:
			intval = zpool_get_prop_int(zhp, prop, &src);

			switch (prop) {
			case ZPOOL_PROP_SIZE:
			case ZPOOL_PROP_ALLOCATED:
			case ZPOOL_PROP_FREE:
			case ZPOOL_PROP_FREEING:
			case ZPOOL_PROP_LEAKED:
				if (literal) {
					(void) snprintf(buf, len, "%llu",
					    (u_longlong_t)intval);
				} else {
					(void) zfs_nicenum(intval, buf, len);
				}
				break;
			case ZPOOL_PROP_BOOTSIZE:
			case ZPOOL_PROP_EXPANDSZ:
				if (intval == 0) {
					(void) strlcpy(buf, "-", len);
				} else if (literal) {
					(void) snprintf(buf, len, "%llu",
					    (u_longlong_t)intval);
				} else {
					(void) zfs_nicenum(intval, buf, len);
				}
				break;
			case ZPOOL_PROP_CAPACITY:
				if (literal) {
					(void) snprintf(buf, len, "%llu",
					    (u_longlong_t)intval);
				} else {
					(void) snprintf(buf, len, "%llu%%",
					    (u_longlong_t)intval);
				}
				break;
			case ZPOOL_PROP_FRAGMENTATION:
				if (intval == UINT64_MAX) {
					(void) strlcpy(buf, "-", len);
				} else {
					(void) snprintf(buf, len, "%llu%%",
					    (u_longlong_t)intval);
				}
				break;
			case ZPOOL_PROP_DEDUPRATIO:
				/* The ratio is stored multiplied by 100. */
				(void) snprintf(buf, len, "%llu.%02llux",
				    (u_longlong_t)(intval / 100),
				    (u_longlong_t)(intval % 100));
				break;
			case ZPOOL_PROP_HEALTH:
				verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
				verify(nvlist_lookup_uint64_array(nvroot,
				    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
				    == 0);

				(void) strlcpy(buf, zpool_state_to_name(intval,
				    vs->vs_aux), len);
				break;
			case ZPOOL_PROP_VERSION:
				if (intval >= SPA_VERSION_FEATURES) {
					(void) snprintf(buf, len, "-");
					break;
				}
				/* FALLTHROUGH */
			default:
				/* %llu needs an unsigned long long argument. */
				(void) snprintf(buf, len, "%llu",
				    (u_longlong_t)intval);
			}
			break;

		case PROP_TYPE_INDEX:
			intval = zpool_get_prop_int(zhp, prop, &src);
			if (zpool_prop_index_to_string(prop, intval, &strval)
			    != 0)
				return (-1);
			(void) strlcpy(buf, strval, len);
			break;

		default:
			abort();
		}

		if (srctype)
			*srctype = src;

		return (0);
	}

	/*
	 * Check that a bootfs value names a dataset living in the given pool.
	 *
	 * 'bootfs' must pass zfs_name_valid() as a filesystem or snapshot name, and
	 * must either be exactly the pool name or continue with '/' immediately
	 * after it ("pool" or "pool/...").
	 *
	 * Returns B_TRUE when both conditions hold, B_FALSE otherwise.
	 */
	static boolean_t
	bootfs_name_valid(const char *pool, char *bootfs)
	{
		/* strlen() yields a size_t; do not narrow it to int. */
		size_t len = strlen(pool);

		if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
			return (B_FALSE);

		/*
		 * A bare prefix match is not enough: "tank2/fs" must not be
		 * accepted for pool "tank", so the byte after the prefix has to be
		 * a component separator or the end of the string.
		 */
		if (strncmp(pool, bootfs, len) == 0 &&
		    (bootfs[len] == '/' || bootfs[len] == '\0'))
			return (B_TRUE);

		return (B_FALSE);
	}

	/*
	 * Report whether the pool has a bootfs property set.
	 *
	 * Returns B_TRUE when the bootfs property can be read and its printed
	 * value is something other than "-"; B_FALSE otherwise.
	 */
	boolean_t
	zpool_is_bootable(zpool_handle_t *zhp)
	{
		char value[ZFS_MAX_DATASET_NAME_LEN];

		if (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, value, sizeof (value),
		    NULL, B_FALSE) != 0)
			return (B_FALSE);

		if (strncmp(value, "-", sizeof (value)) == 0)
			return (B_FALSE);

		return (B_TRUE);
	}


	/*
	 * Given an nvlist of zpool properties to be set, validate that they are
	 * correct, and parse any numeric properties (index, boolean, etc) if they
	 * are specified as strings.
	 *
	 *	hdl		library handle used for error reporting
	 *	poolname	pool the properties apply to (used by the bootfs
	 *			checks)
	 *	props		properties supplied by the caller
	 *	version		pool version the values are validated against
	 *	flags		whether this is a create and/or import operation
	 *	errbuf		message handed to zfs_error() on failure
	 *
	 * Returns a newly allocated nvlist with the parsed properties.  On any
	 * failure the partial result is freed, an error is recorded on 'hdl' and
	 * NULL is returned.
	 */
	static nvlist_t *
	zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
	    nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
	{
		nvpair_t *elem;
		nvlist_t *retprops;
		zpool_prop_t prop;
		char *strval;
		uint64_t intval;
		char *slash, *check;
		struct stat64 statbuf;
		zpool_handle_t *zhp;

		if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
			(void) no_memory(hdl);
			return (NULL);
		}

		elem = NULL;
		while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
			const char *propname = nvpair_name(elem);

			prop = zpool_name_to_prop(propname);
			if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) {
				int err;
				char *fname = strchr(propname, '@') + 1;

				err = zfeature_lookup_name(fname, NULL);
				if (err != 0) {
					ASSERT3U(err, ==, ENOENT);
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "invalid feature '%s'"), fname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				if (nvpair_type(elem) != DATA_TYPE_STRING) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' must be a string"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				(void) nvpair_value_string(elem, &strval);
				if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' can only be set to "
					    "'enabled'"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				/* Feature requests are recorded with the value 0. */
				if (nvlist_add_uint64(retprops, propname, 0) != 0) {
					(void) no_memory(hdl);
					goto error;
				}
				continue;
			}

			/*
			 * Make sure this property is valid and applies to this type.
			 */
			if (prop == ZPOOL_PROP_INVAL) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid property '%s'"), propname);
				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
				goto error;
			}

			if (zpool_prop_readonly(prop)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
				    "is readonly"), propname);
				(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
				goto error;
			}

			if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
			    &strval, &intval, errbuf) != 0)
				goto error;

			/*
			 * Perform additional checking for specific properties.
			 */
			switch (prop) {
			case ZPOOL_PROP_VERSION:
				if (intval < version ||
				    !SPA_VERSION_IS_SUPPORTED(intval)) {
					/*
					 * intval is a uint64_t, so it must be
					 * printed with a matching 64-bit
					 * conversion rather than %d.
					 */
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' number %llu is invalid."),
					    propname, (u_longlong_t)intval);
					(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
					goto error;
				}
				break;

			case ZPOOL_PROP_BOOTSIZE:
				if (!flags.create) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' can only be set during pool "
					    "creation"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
				break;

			case ZPOOL_PROP_BOOTFS:
				if (flags.create || flags.import) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' cannot be set at creation "
					    "or import time"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				if (version < SPA_VERSION_BOOTFS) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "pool must be upgraded to support "
					    "'%s' property"), propname);
					(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
					goto error;
				}

				/*
				 * bootfs property value has to be a dataset name and
				 * the dataset has to be in the same pool as it sets to.
				 */
				if (strval[0] != '\0' && !bootfs_name_valid(poolname,
				    strval)) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
					    "is an invalid name"), strval);
					(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
					goto error;
				}

				/* The pool only has to be openable; drop it again. */
				if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "could not open pool '%s'"), poolname);
					(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
					goto error;
				}
				zpool_close(zhp);
				break;

			case ZPOOL_PROP_ALTROOT:
				if (!flags.create && !flags.import) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' can only be set during pool "
					    "creation or import"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}

				if (strval[0] != '/') {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "bad alternate root '%s'"), strval);
					(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
					goto error;
				}
				break;

			case ZPOOL_PROP_CACHEFILE:
				if (strval[0] == '\0')
					break;

				if (strcmp(strval, "none") == 0)
					break;

				if (strval[0] != '/') {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' must be empty, an "
					    "absolute path, or 'none'"), propname);
					(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
					goto error;
				}

				/* strval starts with '/', so slash is never NULL. */
				slash = strrchr(strval, '/');

				if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
				    strcmp(slash, "/..") == 0) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' is not a valid file"), strval);
					(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
					goto error;
				}

				/*
				 * Temporarily cut the string at the last slash so
				 * that strval names the containing directory.
				 */
				*slash = '\0';

				if (strval[0] != '\0' &&
				    (stat64(strval, &statbuf) != 0 ||
				    !S_ISDIR(statbuf.st_mode))) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "'%s' is not a valid directory"),
					    strval);
					(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
					goto error;
				}

				*slash = '/';
				break;

			case ZPOOL_PROP_COMMENT:
				for (check = strval; *check != '\0'; check++) {
					/*
					 * <ctype.h> functions require a value
					 * representable as unsigned char; a plain
					 * char may be negative.
					 */
					if (!isprint((unsigned char)*check)) {
						zfs_error_aux(hdl,
						    dgettext(TEXT_DOMAIN,
						    "comment may only have printable "
						    "characters"));
						(void) zfs_error(hdl, EZFS_BADPROP,
						    errbuf);
						goto error;
					}
				}
				if (strlen(strval) > ZPROP_MAX_COMMENT) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "comment must not exceed %d characters"),
					    ZPROP_MAX_COMMENT);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
				break;
			case ZPOOL_PROP_READONLY:
				if (!flags.import) {
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "property '%s' can only be set at "
					    "import time"), propname);
					(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
					goto error;
				}
				break;

			default:
				/* Not treated as a failure: note it and carry on. */
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "property '%s'(%d) not defined"), propname, prop);
				break;
			}
		}

		return (retprops);
	error:
		nvlist_free(retprops);
		return (NULL);
	}

	/*
	 * Set zpool property : propname=propval.
	 *
	 * The single name/value pair is wrapped in an nvlist, validated with
	 * zpool_valid_proplist() against the pool's current version, and then
	 * handed to the kernel through ZFS_IOC_POOL_SET_PROPS.  On success the
	 * handle's cached properties are refreshed.
	 *
	 * Returns 0 on success; otherwise a non-zero value with the error
	 * recorded on the pool's library handle.
	 */
	int
	zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
	{
		zfs_cmd_t zc = { 0 };
		int ret = -1;
		char errbuf[1024];
		nvlist_t *nvl = NULL;
		nvlist_t *realprops;
		uint64_t version;
		prop_flags_t flags = { 0 };

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
		    zhp->zpool_name);

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
			return (no_memory(zhp->zpool_hdl));

		if (nvlist_add_string(nvl, propname, propval) != 0) {
			nvlist_free(nvl);
			return (no_memory(zhp->zpool_hdl));
		}

		version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
		realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name,
		    nvl, version, flags, errbuf);

		/* The raw input list is no longer needed either way. */
		nvlist_free(nvl);
		if (realprops == NULL)
			return (-1);
		nvl = realprops;

		/*
		 * Execute the corresponding ioctl() to set this property.
		 */
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));

		if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
			nvlist_free(nvl);
			return (-1);
		}

		ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);

		zcmd_free_nvlists(&zc);
		nvlist_free(nvl);

		if (ret)
			(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
		else
			(void) zpool_props_refresh(zhp);

		return (ret);
	}

	/*
	 * Expand a pool property list and size its columns for this pool.
	 *
	 * After zprop_expand_list() has filled in the regular pool properties,
	 * this appends:
	 *   - one "feature@<name>" entry per known feature, when the list asks
	 *     for all properties and was empty on entry;
	 *   - one "unsupported@<guid>" entry for every feature present on the
	 *     pool that zfeature_is_supported() rejects, unless an entry with the
	 *     same name is already on the list.
	 * Finally every non-fixed native entry has pl_width widened to fit this
	 * pool's current value.
	 *
	 * Returns 0 on success, -1 if the list could not be expanded or an
	 * allocation failed.
	 */
	int
	zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
	{
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		zprop_list_t *entry;
		char buf[ZFS_MAXPROPLEN];
		nvlist_t *features = NULL;
		zprop_list_t **last;
		boolean_t firstexpand = (NULL == *plp);

		if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
			return (-1);

		/* Find the tail link so new entries can be appended in order. */
		last = plp;
		while (*last != NULL)
			last = &(*last)->pl_next;

		if ((*plp)->pl_all)
			features = zpool_get_features(zhp);

		if ((*plp)->pl_all && firstexpand) {
			for (int i = 0; i < SPA_FEATURES; i++) {
				entry = zfs_alloc(hdl, sizeof (zprop_list_t));
				if (entry == NULL)
					return (-1);

				entry->pl_prop = ZPROP_INVAL;
				entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
				    spa_feature_table[i].fi_uname);
				/* presumably NULL on allocation failure */
				if (entry->pl_user_prop == NULL) {
					free(entry);
					return (-1);
				}
				entry->pl_width = strlen(entry->pl_user_prop);
				entry->pl_all = B_TRUE;

				*last = entry;
				last = &entry->pl_next;
			}
		}

		/* add any unsupported features */
		for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL);
		    nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
			char *propname;
			boolean_t found;

			if (zfeature_is_supported(nvpair_name(nvp)))
				continue;

			propname = zfs_asprintf(hdl, "unsupported@%s",
			    nvpair_name(nvp));
			/* presumably NULL on allocation failure */
			if (propname == NULL)
				return (-1);

			/*
			 * Before adding the property to the list make sure that no
			 * other pool already added the same property.
			 */
			found = B_FALSE;
			for (entry = *plp; entry != NULL; entry = entry->pl_next) {
				if (entry->pl_user_prop != NULL &&
				    strcmp(propname, entry->pl_user_prop) == 0) {
					found = B_TRUE;
					break;
				}
			}
			if (found) {
				free(propname);
				continue;
			}

			entry = zfs_alloc(hdl, sizeof (zprop_list_t));
			if (entry == NULL) {
				free(propname);
				return (-1);
			}
			entry->pl_prop = ZPROP_INVAL;
			entry->pl_user_prop = propname;
			entry->pl_width = strlen(entry->pl_user_prop);
			entry->pl_all = B_TRUE;

			*last = entry;
			last = &entry->pl_next;
		}

		/* Widen each adjustable column to fit this pool's value. */
		for (entry = *plp; entry != NULL; entry = entry->pl_next) {

			if (entry->pl_fixed)
				continue;

			if (entry->pl_prop != ZPROP_INVAL &&
			    zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
			    NULL, B_FALSE) == 0) {
				if (strlen(buf) > entry->pl_width)
					entry->pl_width = strlen(buf);
			}
		}

		return (0);
	}

	/*
	 * Get the state for the given feature on the given ZFS pool.
	 *
	 *	zhp		pool handle
	 *	propname	"feature@<name>" or "unsupported@<guid>"
	 *	buf, len	output buffer and its size
	 *
	 * For a feature@ property 'buf' receives ZFS_FEATURE_DISABLED,
	 * ZFS_FEATURE_ENABLED or ZFS_FEATURE_ACTIVE.  For an unsupported@
	 * property it receives ZFS_UNSUPPORTED_INACTIVE or
	 * ZFS_UNSUPPORTED_READONLY.
	 *
	 * Returns 0 on success.  Returns ENOTSUP, with "-" in 'buf', when the
	 * feature name is unknown or an unsupported@ feature is not present on
	 * the pool.
	 */
	int
	zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
	    size_t len)
	{
		uint64_t refcount;
		boolean_t found = B_FALSE;
		nvlist_t *features = zpool_get_features(zhp);
		boolean_t supported;
		const char *feature = strchr(propname, '@') + 1;

		supported = zpool_prop_feature(propname);
		ASSERT(supported || zpool_prop_unsupported(propname));

		/*
		 * Convert from feature name to feature guid. This conversion is
		 * unnecessary for unsupported@... properties because they already
		 * use guids.
		 */
		if (supported) {
			int ret;
			spa_feature_t fid;

			ret = zfeature_lookup_name(feature, &fid);
			if (ret != 0) {
				(void) strlcpy(buf, "-", len);
				return (ENOTSUP);
			}
			feature = spa_feature_table[fid].fi_guid;
		}

		if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
			found = B_TRUE;

		if (supported) {
			if (!found) {
				(void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
			} else {
				if (refcount == 0)
					(void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
				else
					(void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
			}
		} else {
			if (found) {
				/*
				 * Bounded copies: the caller told us how large
				 * 'buf' is, so never write past 'len' bytes.
				 */
				if (refcount == 0) {
					(void) strlcpy(buf, ZFS_UNSUPPORTED_INACTIVE,
					    len);
				} else {
					(void) strlcpy(buf, ZFS_UNSUPPORTED_READONLY,
					    len);
				}
			} else {
				(void) strlcpy(buf, "-", len);
				return (ENOTSUP);
			}
		}

		return (0);
	}

	/*
	* Don't start the slice at the default block of 34; many storage
	* devices will use a stripe width of 128k, so start there instead.
	*
	* NOTE(review): 256 blocks equals 128k only for 512-byte blocks; the
	* block size is not visible here -- confirm against the users of this
	* macro.
	*/
	#define NEW_START_BLOCK 256

	/*
	 * Validate the given pool name, optionally recording an extended error
	 * message on 'hdl'.
	 *
	 *	hdl	library handle that receives the auxiliary error text; may
	 *		be NULL, in which case no message is recorded
	 *	isopen	B_TRUE when an existing pool is being opened; B_FALSE for
	 *		create/import, which applies the extended reserved-name
	 *		check below
	 *	pool	candidate pool name
	 *
	 * Returns B_TRUE if the name is acceptable, B_FALSE otherwise.
	 */
	boolean_t
	zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
	{
		namecheck_err_t why;
		char what;
		int ret;

		ret = pool_namecheck(pool, &why, &what);

		/*
		 * The rules for reserved pool names were extended at a later point.
		 * But we need to support users with existing pools that may now be
		 * invalid. So we only check for this expanded set of names during a
		 * create (or import), and only in userland.
		 */
		if (ret == 0 && !isopen &&
		    (strncmp(pool, "mirror", 6) == 0 ||
		    strncmp(pool, "raidz", 5) == 0 ||
		    strncmp(pool, "spare", 5) == 0 ||
		    strcmp(pool, "log") == 0)) {
			if (hdl != NULL)
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "name is reserved"));
			return (B_FALSE);
		}

		if (ret == 0)
			return (B_TRUE);

		/* pool_namecheck() rejected the name; explain why if we can. */
		if (hdl != NULL) {
			switch (why) {
			case NAME_ERR_TOOLONG:
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "name is too long"));
				break;

			case NAME_ERR_INVALCHAR:
				zfs_error_aux(hdl,
				    dgettext(TEXT_DOMAIN, "invalid character "
				    "'%c' in pool name"), what);
				break;

			case NAME_ERR_NOLETTER:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "name must begin with a letter"));
				break;

			case NAME_ERR_RESERVED:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "name is reserved"));
				break;

			case NAME_ERR_DISKLIKE:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "pool name is reserved"));
				break;

			case NAME_ERR_LEADING_SLASH:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "leading slash in name"));
				break;

			case NAME_ERR_EMPTY_COMPONENT:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "empty component in name"));
				break;

			case NAME_ERR_TRAILING_SLASH:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "trailing slash in name"));
				break;

			case NAME_ERR_MULTIPLE_DELIMITERS:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "multiple '@' and/or '#' delimiters in "
				    "name"));
				break;

			default:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "(%d) not defined"), why);
				break;
			}
		}

		return (B_FALSE);
	}

	/*
	 * Open a handle to the given pool, even if the pool is currently in the
	 * FAULTED state.
	 *
	 * Returns the new handle, which the caller releases with zpool_close().
	 * Returns NULL if the name is invalid, memory cannot be allocated, the
	 * pool statistics cannot be refreshed, or the pool does not exist; the
	 * invalid-name and no-such-pool cases record an error on 'hdl' here.
	 */
	zpool_handle_t *
	zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
	{
		zpool_handle_t *zhp;
		boolean_t missing;

		/*
		 * Make sure the pool name is valid.
		 */
		if (!zpool_name_valid(hdl, B_TRUE, pool)) {
			(void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
			    dgettext(TEXT_DOMAIN, "cannot open '%s'"),
			    pool);
			return (NULL);
		}

		if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
			return (NULL);

		zhp->zpool_hdl = hdl;
		(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));

		if (zpool_refresh_stats(zhp, &missing) != 0) {
			zpool_close(zhp);
			return (NULL);
		}

		if (missing) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
			(void) zfs_error_fmt(hdl, EZFS_NOENT,
			    dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
			zpool_close(zhp);
			return (NULL);
		}

		return (zhp);
	}

	/*
	 * Like zpool_open_canfail(), but silent on error. Used when iterating
	 * over pools (because the configuration cache may be out of date).
	 *
	 * No name validation is performed here.  On success 0 is returned and
	 * '*ret' is set to the new handle, or to NULL when the pool turned out to
	 * be missing.  Returns -1, leaving '*ret' untouched, if the handle cannot
	 * be allocated or the statistics cannot be refreshed.
	 */
	int
	zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
	{
		zpool_handle_t *zhp;
		boolean_t missing;

		if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
			return (-1);

		zhp->zpool_hdl = hdl;
		(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));

		if (zpool_refresh_stats(zhp, &missing) != 0) {
			zpool_close(zhp);
			return (-1);
		}

		if (missing) {
			/* A vanished pool is not an error for the iterator. */
			zpool_close(zhp);
			*ret = NULL;
			return (0);
		}

		*ret = zhp;
		return (0);
	}

	/*
	 * Similar to zpool_open_canfail(), but refuses to open pools in the
	 * faulted state.
	 *
	 * Returns the handle on success.  Returns NULL if zpool_open_canfail()
	 * fails, or if the pool state is POOL_STATE_UNAVAIL, in which case
	 * EZFS_POOLUNAVAIL is recorded on 'hdl' and the handle is closed.
	 */
	zpool_handle_t *
	zpool_open(libzfs_handle_t *hdl, const char *pool)
	{
		zpool_handle_t *zhp;

		if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
			return (NULL);

		if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
			(void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
			    dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
			zpool_close(zhp);
			return (NULL);
		}

		return (zhp);
	}

	/*
	 * Release a pool handle: free the property, previous-config and config
	 * nvlists it holds, then the handle structure itself.
	 */
	void
	zpool_close(zpool_handle_t *zhp)
	{
		/* The three lists are independent; order does not matter. */
		nvlist_free(zhp->zpool_props);
		nvlist_free(zhp->zpool_old_config);
		nvlist_free(zhp->zpool_config);

		free(zhp);
	}

	/*
	 * Accessor for the pool name stored in the handle.  The returned pointer
	 * refers to storage inside 'zhp'.
	 */
	const char *
	zpool_get_name(zpool_handle_t *zhp)
	{
		const char *name = zhp->zpool_name;

		return (name);
	}


	/*
	 * Accessor for the pool state recorded in the handle (ACTIVE or
	 * UNAVAILABLE).
	 */
	int
	zpool_get_state(zpool_handle_t *zhp)
	{
		int state = zhp->zpool_state;

		return (state);
	}

	/*
	 * Create the named pool, using the provided vdev list. It is assumed
	 * that the consumer has already validated the contents of the nvlist, so
	 * we don't have to worry about error semantics.
	 *
	 *	hdl	library handle
	 *	pool	name of the pool to create
	 *	nvroot	vdev tree describing the pool layout
	 *	props	optional pool properties (may be NULL)
	 *	fsprops	optional properties for the root filesystem (may be NULL);
	 *		they are nested in the pool properties under
	 *		ZPOOL_ROOTFS_PROPS
	 *
	 * Returns 0 on success and non-zero on failure.
	 */
	int
	zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
	    nvlist_t *props, nvlist_t *fsprops)
	{
		zfs_cmd_t zc = { 0 };
		nvlist_t *zc_fsprops = NULL;
		nvlist_t *zc_props = NULL;
		char msg[1024];
		int ret = -1;

		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
		    "cannot create '%s'"), pool);

		if (!zpool_name_valid(hdl, B_FALSE, pool))
			return (zfs_error(hdl, EZFS_INVALIDNAME, msg));

		if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
			return (-1);

		if (props) {
			prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };

			if ((zc_props = zpool_valid_proplist(hdl, pool, props,
			    SPA_VERSION_1, flags, msg)) == NULL) {
				goto create_failed;
			}
		}

		if (fsprops) {
			uint64_t zoned;
			char *zonestr;

			zoned = ((nvlist_lookup_string(fsprops,
			    zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
			    strcmp(zonestr, "on") == 0);

			if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
			    fsprops, zoned, NULL, NULL, msg)) == NULL) {
				goto create_failed;
			}
			if (!zc_props &&
			    (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
				goto create_failed;
			}
			if (nvlist_add_nvlist(zc_props,
			    ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
				goto create_failed;
			}
		}

		if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
			goto create_failed;

		(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));

		if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
			/*
			 * Capture errno right away: the cleanup calls below may
			 * overwrite it before the failure is classified.
			 */
			int err = errno;

			zcmd_free_nvlists(&zc);
			nvlist_free(zc_props);
			nvlist_free(zc_fsprops);

			switch (err) {
			case EBUSY:
				/*
				 * This can happen if the user has specified the same
				 * device multiple times. We can't reliably detect this
				 * until we try to add it and see we already have a
				 * label.
				 */
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "one or more vdevs refer to the same device"));
				return (zfs_error(hdl, EZFS_BADDEV, msg));

			case ERANGE:
				/*
				 * This happens if the record size is smaller or larger
				 * than the allowed size range, or not a power of 2.
				 *
				 * NOTE: although zfs_valid_proplist is called earlier,
				 * this case may have slipped through since the
				 * pool does not exist yet and it is therefore
				 * impossible to read properties e.g. max blocksize
				 * from the pool.
				 */
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "record size invalid"));
				return (zfs_error(hdl, EZFS_BADPROP, msg));

			case EOVERFLOW:
				/*
				 * This occurs when one of the devices is below
				 * SPA_MINDEVSIZE. Unfortunately, we can't detect which
				 * device was the problem device since there's no
				 * reliable way to determine device size from userland.
				 */
				{
					char buf[64];

					zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));

					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "one or more devices is less than the "
					    "minimum size (%s)"), buf);
				}
				return (zfs_error(hdl, EZFS_BADDEV, msg));

			case ENOSPC:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "one or more devices is out of space"));
				return (zfs_error(hdl, EZFS_BADDEV, msg));

			case ENOTBLK:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "cache device must be a disk or disk slice"));
				return (zfs_error(hdl, EZFS_BADDEV, msg));

			default:
				return (zpool_standard_error(hdl, err, msg));
			}
		}

		/* Shared exit: also reached on success, with ret == 0. */
	create_failed:
		zcmd_free_nvlists(&zc);
		nvlist_free(zc_props);
		nvlist_free(zc_fsprops);
		return (ret);
	}

	/*
	 * Destroy the given pool. It is up to the caller to ensure that there are
	 * no datasets left in the pool.
	 *
	 *	zhp	handle of the pool to destroy
	 *	log_str	string passed to the kernel in zc_history
	 *
	 * When the pool is active its root filesystem is opened first so that
	 * the mountpoint can be removed after a successful destroy.
	 *
	 * Returns 0 on success, -1 on failure with the error recorded on the
	 * library handle.
	 */
	int
	zpool_destroy(zpool_handle_t *zhp, const char *log_str)
	{
		zfs_cmd_t zc = { 0 };
		zfs_handle_t *zfp = NULL;
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		char msg[1024];

		if (zhp->zpool_state == POOL_STATE_ACTIVE &&
		    (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
			return (-1);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_history = (uint64_t)(uintptr_t)log_str;

		if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
			/*
			 * Save errno before formatting the message; snprintf()
			 * and dgettext() are allowed to change it.
			 */
			int err = errno;

			(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
			    "cannot destroy '%s'"), zhp->zpool_name);

			if (err == EROFS) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "one or more devices is read only"));
				(void) zfs_error(hdl, EZFS_BADDEV, msg);
			} else {
				(void) zpool_standard_error(hdl, err, msg);
			}

			if (zfp)
				zfs_close(zfp);
			return (-1);
		}

		if (zfp) {
			remove_mountpoint(zfp);
			zfs_close(zfp);
		}

		return (0);
	}

	/*
	* Add the given vdevs to the pool. The caller must have already performed the
	* necessary verification to ensure that the vdev specification is well-formed.
	*
	* Hot spares and cache devices in 'nvroot' are rejected up front when the
	* pool version is below SPA_VERSION_SPARES / SPA_VERSION_L2CACHE.
	* Otherwise the config is passed to the kernel via ZFS_IOC_VDEV_ADD and
	* known errno values are translated into libzfs errors.
	*
	* Returns 0 on success, non-zero on failure.
	*
	* NOTE(review): the pointer declarators on 'zhp', 'nvroot', 'spares' and
	* 'l2cache' appear to have been lost in this rendering of the diff (the
	* body dereferences zhp) -- confirm against the real source.
	*/
	int
	zpool_add(zpool_handle_t zhp, nvlist_t nvroot)
	{
	zfs_cmd_t zc = { 0 };
	int ret;
	libzfs_handle_t *hdl = zhp->zpool_hdl;
	char msg[1024];
	nvlist_t spares, l2cache;
	uint_t nspares, nl2cache;

	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
	"cannot add to '%s'"), zhp->zpool_name);

	/* Spares present in the request but the pool is too old for them. */
	if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
	SPA_VERSION_SPARES &&
	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	&spares, &nspares) == 0) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
	"upgraded to add hot spares"));
	return (zfs_error(hdl, EZFS_BADVERSION, msg));
	}

	/* Same check for cache (L2ARC) devices. */
	if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
	SPA_VERSION_L2CACHE &&
	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	&l2cache, &nl2cache) == 0) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
	"upgraded to add cache devices"));
	return (zfs_error(hdl, EZFS_BADVERSION, msg));
	}

	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
	return (-1);
	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));

	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
	switch (errno) {
	case EBUSY:
	/*
	* This can happen if the user has specified the same
	* device multiple times. We can't reliably detect this
	* until we try to add it and see we already have a
	* label.
	*/
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"one or more vdevs refer to the same device"));
	(void) zfs_error(hdl, EZFS_BADDEV, msg);
	break;

	+ case EINVAL:
	+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	+ "invalid config; a pool with removing/removed "
	+ "vdevs does not support adding raidz vdevs"));
	+ (void) zfs_error(hdl, EZFS_BADDEV, msg);
	+ break;
	+
	case EOVERFLOW:
	/*
	* This occurs when one of the devices is below
	* SPA_MINDEVSIZE. Unfortunately, we can't detect which
	* device was the problem device since there's no
	* reliable way to determine device size from userland.
	*/
	{
	char buf[64];

	zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));

	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"device is less than the minimum "
	"size (%s)"), buf);
	}
	(void) zfs_error(hdl, EZFS_BADDEV, msg);
	break;

	case ENOTSUP:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"pool must be upgraded to add these vdevs"));
	(void) zfs_error(hdl, EZFS_BADVERSION, msg);
	break;

	case EDOM:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"root pool can not have multiple vdevs"
	" or separate logs"));
	(void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg);
	break;

	case ENOTBLK:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"cache device must be a disk or disk slice"));
	(void) zfs_error(hdl, EZFS_BADDEV, msg);
	break;

	default:
	(void) zpool_standard_error(hdl, errno, msg);
	}

	ret = -1;
	} else {
	ret = 0;
	}

	/* Runs on both the success and the failure path. */
	zcmd_free_nvlists(&zc);

	return (ret);
	}

	/*
	 * Export the pool from the system; the caller is responsible for making
	 * sure no datasets in the pool are still mounted.
	 *
	 * 'force' is passed to the kernel in zc_cookie, 'hardforce' in zc_guid
	 * and 'log_str' in zc_history.  EXDEV from the ioctl is reported as
	 * EZFS_ACTIVE_SPARE; any other failure goes through
	 * zpool_standard_error_fmt().
	 *
	 * Returns 0 on success, non-zero on failure.
	 */
	static int
	zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
	    const char *log_str)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];

		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
		    "cannot export '%s'"), zhp->zpool_name);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_cookie = force;
		zc.zc_guid = hardforce;
		zc.zc_history = (uint64_t)(uintptr_t)log_str;

		if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) == 0)
			return (0);

		if (errno == EXDEV) {
			zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
			    "use '-f' to override the following errors:\n"
			    "'%s' has an active shared spare which could be"
			    " used by other pools once '%s' is exported."),
			    zhp->zpool_name, zhp->zpool_name);
			return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
			    msg));
		}

		return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
		    msg));
	}

	/*
	 * Export the pool, optionally forcing it.  Thin wrapper around
	 * zpool_export_common() with 'hardforce' fixed to B_FALSE.
	 */
	int
	zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
	{
		return (zpool_export_common(zhp, force, B_FALSE, log_str));
	}

	/*
	 * Export the pool with both 'force' and 'hardforce' set.  Thin wrapper
	 * around zpool_export_common().
	 */
	int
	zpool_export_force(zpool_handle_t *zhp, const char *log_str)
	{
		return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
	}

	/*
	 * Print a description of a pool rewind to stdout.
	 *
	 *	hdl	library handle; nothing is printed unless its
	 *		libzfs_printerr flag is set
	 *	name	pool name used in the messages
	 *	dryrun	B_TRUE to phrase the output as what would happen
	 *	config	nvlist carrying ZPOOL_CONFIG_LOAD_INFO with nested
	 *		ZPOOL_CONFIG_REWIND_INFO; may be NULL
	 *
	 * Returns silently when the rewind information is absent or the
	 * timestamp cannot be formatted.
	 */
	static void
	zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
	    nvlist_t *config)
	{
		nvlist_t *nv = NULL;
		uint64_t rewindto;
		int64_t loss = -1;
		struct tm t;
		char timestr[128];
		time_t when;

		if (!hdl->libzfs_printerr || config == NULL)
			return;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
		    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
			return;
		}

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
			return;
		(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);

		/*
		 * Convert by value instead of reinterpreting the uint64_t storage
		 * as a time_t, which is wrong wherever time_t is not 64 bits wide.
		 * strftime() also needs a real format string; "%c" is the locale's
		 * date and time representation.
		 */
		when = (time_t)rewindto;
		if (localtime_r(&when, &t) == NULL ||
		    strftime(timestr, sizeof (timestr), "%c", &t) == 0)
			return;

		if (dryrun) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "Would be able to return %s "
			    "to its state as of %s.\n"),
			    name, timestr);
		} else {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "Pool %s returned to its state as of %s.\n"),
			    name, timestr);
		}

		/* Above two minutes, report the loss rounded to whole minutes. */
		if (loss > 120) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "%s approximately %lld "),
			    dryrun ? "Would discard" : "Discarded",
			    (long long)((loss + 30) / 60));
			(void) printf(dgettext(TEXT_DOMAIN,
			    "minutes of transactions.\n"));
		} else if (loss > 0) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "%s approximately %lld "),
			    dryrun ? "Would discard" : "Discarded",
			    (long long)loss);
			(void) printf(dgettext(TEXT_DOMAIN,
			    "seconds of transactions.\n"));
		}
	}

	/*
	 * Print advice on recovering a pool by rewinding it.
	 *
	 *	hdl	library handle; nothing is printed unless its
	 *		libzfs_printerr flag is set
	 *	name	pool name used in the suggested command
	 *	reason	when >= 0 the text is prefixed with "action: " and the
	 *		suggested command is 'zpool clear'; otherwise the prefix
	 *		is a tab and the command is 'zpool import'
	 *	config	nvlist carrying ZPOOL_CONFIG_LOAD_INFO with nested
	 *		ZPOOL_CONFIG_REWIND_INFO
	 *
	 * If no rewind information is available the user is told to destroy and
	 * re-create the pool from backup.
	 */
	void
	zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
	    nvlist_t *config)
	{
		nvlist_t *nv = NULL;
		int64_t loss = -1;
		uint64_t edata = UINT64_MAX;
		uint64_t rewindto;
		struct tm t;
		char timestr[128];
		time_t when;

		if (!hdl->libzfs_printerr)
			return;

		if (reason >= 0)
			(void) printf(dgettext(TEXT_DOMAIN, "action: "));
		else
			(void) printf(dgettext(TEXT_DOMAIN, "\t"));

		/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
		    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
			goto no_info;

		(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
		    &edata);

		(void) printf(dgettext(TEXT_DOMAIN,
		    "Recovery is possible, but will result in some data loss.\n"));

		/*
		 * Convert by value instead of reinterpreting the uint64_t storage
		 * as a time_t, and give strftime() a real format string ("%c" is
		 * the locale's date and time representation).
		 */
		when = (time_t)rewindto;
		if (localtime_r(&when, &t) != NULL &&
		    strftime(timestr, sizeof (timestr), "%c", &t) != 0) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "\tReturning the pool to its state as of %s\n"
			    "\tshould correct the problem. "),
			    timestr);
		} else {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "\tReverting the pool to an earlier state "
			    "should correct the problem.\n\t"));
		}

		/* Above two minutes, report the loss rounded to whole minutes. */
		if (loss > 120) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "Approximately %lld minutes of data\n"
			    "\tmust be discarded, irreversibly. "),
			    (long long)((loss + 30) / 60));
		} else if (loss > 0) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "Approximately %lld seconds of data\n"
			    "\tmust be discarded, irreversibly. "),
			    (long long)loss);
		}

		/* UINT64_MAX is the initial value, i.e. no error count found. */
		if (edata != 0 && edata != UINT64_MAX) {
			if (edata == 1) {
				(void) printf(dgettext(TEXT_DOMAIN,
				    "After rewind, at least\n"
				    "\tone persistent user-data error will remain. "));
			} else {
				(void) printf(dgettext(TEXT_DOMAIN,
				    "After rewind, several\n"
				    "\tpersistent user-data errors will remain. "));
			}
		}
		(void) printf(dgettext(TEXT_DOMAIN,
		    "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
		    reason >= 0 ? "clear" : "import", name);

		(void) printf(dgettext(TEXT_DOMAIN,
		    "A scrub of the pool\n"
		    "\tis strongly recommended after recovery.\n"));
		return;

	no_info:
		(void) printf(dgettext(TEXT_DOMAIN,
		    "Destroy and re-create the pool from\n\ta backup source.\n"));
	}

	/*
	 * zpool_import() is a contracted interface. Should be kept the same
	 * if possible.
	 *
	 * Applications should use zpool_import_props() to import a pool with
	 * new properties value to be set.
	 *
	 *	hdl	library handle
	 *	config	pool configuration to import
	 *	newname	name to import the pool under, or NULL to keep its name
	 *	altroot	alternate root, or NULL; when given, the altroot property
	 *		is set and the cachefile property is set to "none"
	 *
	 * Returns the result of zpool_import_props(), or the value of
	 * zfs_error_fmt() for EZFS_NOMEM when the property list cannot be built.
	 */
	int
	zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
	    char *altroot)
	{
		nvlist_t *props = NULL;
		const char *errname;
		char *origname;
		int ret;

		if (altroot != NULL) {
			if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
				goto nomem;

			if (nvlist_add_string(props,
			    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
			    nvlist_add_string(props,
			    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0)
				goto nomem;
		}

		ret = zpool_import_props(hdl, config, newname, props,
		    ZFS_IMPORT_NORMAL);
		nvlist_free(props);
		return (ret);

	nomem:
		/*
		 * 'newname' is NULL when the pool keeps its own name, and NULL
		 * must not be handed to a %s conversion.  Fall back to the name
		 * stored in the config, or to an empty string if there is none.
		 */
		errname = newname;
		if (errname == NULL) {
			origname = NULL;
			if (nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
			    &origname) == 0 && origname != NULL)
				errname = origname;
			else
				errname = "";
		}

		nvlist_free(props);
		return (zfs_error_fmt(hdl, EZFS_NOMEM,
		    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
		    errname));
	}

	/*
	 * Recursively print a vdev tree to stdout, one vdev per line.
	 *
	 *	hdl	library handle passed to zpool_vdev_name()
	 *	name	label for this vdev, or NULL to print only its children
	 *	nv	nvlist describing this vdev
	 *	indent	number of spaces printed after the leading tab
	 *
	 * Vdevs with ZPOOL_CONFIG_IS_LOG set are tagged " [log]".  Each level of
	 * children is indented two further columns.
	 */
	static void
	print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
	    int indent)
	{
		nvlist_t **child;
		uint_t c, children;
		char *vname;
		uint64_t is_log = 0;

		/* A missing key leaves is_log at 0. */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
		    &is_log);

		if (name != NULL)
			(void) printf("\t%*s%s%s\n", indent, "", name,
			    is_log ? " [log]" : "");

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			return;

		for (c = 0; c < children; c++) {
			/* The name is freed below, so it is owned by this loop. */
			vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
			print_vdev_tree(hdl, vname, child[c], indent + 2);
			free(vname);
		}
	}

	/*
	 * Print the unsupported features recorded in a pool configuration.
	 *
	 * 'config' must contain ZPOOL_CONFIG_LOAD_INFO with a nested
	 * ZPOOL_CONFIG_UNSUP_FEAT nvlist of string pairs; both lookups and the
	 * pair types are enforced with verify().  Each feature is printed on its
	 * own tab-indented line, followed by its description in parentheses when
	 * the description is non-empty.
	 */
	void
	zpool_print_unsup_feat(nvlist_t *config)
	{
		nvlist_t *nvinfo, *unsup_feat;

		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
		    0);
		verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
		    &unsup_feat) == 0);

		for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
			char *desc;

			verify(nvpair_type(nvp) == DATA_TYPE_STRING);
			verify(nvpair_value_string(nvp, &desc) == 0);

			if (strlen(desc) > 0)
				(void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
			else
				(void) printf("\t%s\n", nvpair_name(nvp));
		}
	}

	/*
	 * Import the given pool using the known configuration and a list of
	 * properties to be set.  The configuration should have come from
	 * zpool_find_import().  The 'newname' parameter controls whether the pool
	 * is imported with a different name.
	 *
	 * 'props' may be NULL; when it is not, it is validated with
	 * zpool_valid_proplist() and handed to the kernel as the source nvlist.
	 * 'flags' is passed to the kernel unchanged in zc_cookie.
	 *
	 * Returns 0 when the import ioctl succeeds.  On failure an error is
	 * reported through 'hdl' and -1 (or the result of zfs_error_fmt() for an
	 * invalid 'newname') is returned.
	 */
	int
	zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
	    nvlist_t *props, int flags)
	{
		zfs_cmd_t zc = { 0 };
		zpool_rewind_policy_t policy;
		nvlist_t *nv = NULL;
		nvlist_t *nvinfo = NULL;
		nvlist_t *missing = NULL;
		char *thename;
		char *origname;
		int ret;
		int error = 0;
		char errbuf[1024];

		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &origname) == 0);

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot import pool '%s'"), origname);

		if (newname != NULL) {
			if (!zpool_name_valid(hdl, B_FALSE, newname))
				return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
				    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
				    newname));
			thename = (char *)newname;
		} else {
			thename = origname;
		}

		if (props != NULL) {
			uint64_t version;
			/*
			 * Named 'pflags' so that it does not shadow the integer
			 * 'flags' parameter that is sent to the kernel below.
			 */
			prop_flags_t pflags = { .create = B_FALSE, .import = B_TRUE };

			verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
			    &version) == 0);

			/* From here on 'props' is the validated copy we must free. */
			if ((props = zpool_valid_proplist(hdl, origname,
			    props, version, pflags, errbuf)) == NULL)
				return (-1);
			if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
				nvlist_free(props);
				return (-1);
			}
			nvlist_free(props);
		}

		(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));

		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &zc.zc_guid) == 0);

		if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
			zcmd_free_nvlists(&zc);
			return (-1);
		}
		if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
			zcmd_free_nvlists(&zc);
			return (-1);
		}

		zc.zc_cookie = flags;
		/* Retry with a larger destination buffer for as long as ENOMEM. */
		while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
		    errno == ENOMEM) {
			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
				zcmd_free_nvlists(&zc);
				return (-1);
			}
		}
		/* Capture errno before any later call can overwrite it. */
		if (ret != 0)
			error = errno;

		(void) zcmd_read_dst_nvlist(hdl, &zc, &nv);

		zcmd_free_nvlists(&zc);

		zpool_get_rewind_policy(config, &policy);

		if (error) {
			char desc[1024];

			/*
			 * Dry-run failed, but we print out what success
			 * looks like if we found a best txg
			 */
			if (policy.zrp_request & ZPOOL_TRY_REWIND) {
				zpool_rewind_exclaim(hdl, newname ? origname : thename,
				    B_TRUE, nv);
				nvlist_free(nv);
				return (-1);
			}

			if (newname == NULL)
				(void) snprintf(desc, sizeof (desc),
				    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
				    thename);
			else
				(void) snprintf(desc, sizeof (desc),
				    dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
				    origname, thename);

			switch (error) {
			case ENOTSUP:
				if (nv != NULL && nvlist_lookup_nvlist(nv,
				    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
				    nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
					(void) printf(dgettext(TEXT_DOMAIN, "This "
					    "pool uses the following feature(s) not "
					    "supported by this system:\n"));
					zpool_print_unsup_feat(nv);
					if (nvlist_exists(nvinfo,
					    ZPOOL_CONFIG_CAN_RDONLY)) {
						(void) printf(dgettext(TEXT_DOMAIN,
						    "All unsupported features are only "
						    "required for writing to the pool."
						    "\nThe pool can be imported using "
						    "'-o readonly=on'.\n"));
					}
				}
				/*
				 * Unsupported version.
				 */
				(void) zfs_error(hdl, EZFS_BADVERSION, desc);
				break;

			case EINVAL:
				(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
				break;

			case EROFS:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "one or more devices is read only"));
				(void) zfs_error(hdl, EZFS_BADDEV, desc);
				break;

			case ENXIO:
				if (nv && nvlist_lookup_nvlist(nv,
				    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
				    nvlist_lookup_nvlist(nvinfo,
				    ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
					(void) printf(dgettext(TEXT_DOMAIN,
					    "The devices below are missing, use "
					    "'-m' to import the pool anyway:\n"));
					print_vdev_tree(hdl, NULL, missing, 2);
					(void) printf("\n");
				}
				(void) zpool_standard_error(hdl, error, desc);
				break;

			case EEXIST:
				(void) zpool_standard_error(hdl, error, desc);
				break;
			case ENAMETOOLONG:
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "new name of at least one dataset is longer than "
				    "the maximum allowable length"));
				(void) zfs_error(hdl, EZFS_NAMETOOLONG, desc);
				break;
			default:
				(void) zpool_standard_error(hdl, error, desc);
				zpool_explain_recover(hdl,
				    newname ? origname : thename, -error, nv);
				break;
			}

			nvlist_free(nv);
			ret = -1;
		} else {
			zpool_handle_t *zhp;

			/*
			 * This should never fail, but play it safe anyway.
			 *
			 * NOTE(review): 'ret' is set to -1 here on failure, yet this
			 * branch returns 0 unconditionally below, so a failed open
			 * is not reported to the caller.  Preserved as-is.
			 */
			if (zpool_open_silent(hdl, thename, &zhp) != 0)
				ret = -1;
			else if (zhp != NULL)
				zpool_close(zhp);
			if (policy.zrp_request &
			    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
				zpool_rewind_exclaim(hdl, newname ? origname : thename,
				    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
			}
			nvlist_free(nv);
			return (0);
		}

		return (ret);
	}

	/*
	 * Issue a scan request (ZFS_IOC_POOL_SCAN) against the pool.
	 *
	 * 'func' and 'cmd' are forwarded to the kernel in zc_cookie and zc_flags
	 * respectively.  Returns 0 when the ioctl succeeds, and also for two
	 * errno values that are tolerated when 'cmd' is POOL_SCRUB_NORMAL (see
	 * below).  Every other failure is reported through the pool's libzfs
	 * handle and the result of the reporting call is returned.
	 */
	int
	zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
	{
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		zfs_cmd_t zc = { 0 };
		char errbuf[1024];
		int saved_errno;

		zc.zc_cookie = func;
		zc.zc_flags = cmd;
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));

		if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
			return (0);

		saved_errno = errno;

		if (cmd == POOL_SCRUB_NORMAL) {
			/* ECANCELED on a scrub means we resumed a paused scrub */
			if (saved_errno == ECANCELED && func == POOL_SCAN_SCRUB)
				return (0);

			/* ENOENT is only an error when we were asked to cancel */
			if (saved_errno == ENOENT && func != POOL_SCAN_NONE)
				return (0);
		}

		/* Build the message prefix describing what was attempted. */
		switch (func) {
		case POOL_SCAN_SCRUB:
			if (cmd == POOL_SCRUB_PAUSE) {
				(void) snprintf(errbuf, sizeof (errbuf),
				    dgettext(TEXT_DOMAIN,
				    "cannot pause scrubbing %s"), zc.zc_name);
			} else {
				assert(cmd == POOL_SCRUB_NORMAL);
				(void) snprintf(errbuf, sizeof (errbuf),
				    dgettext(TEXT_DOMAIN,
				    "cannot scrub %s"), zc.zc_name);
			}
			break;

		case POOL_SCAN_NONE:
			(void) snprintf(errbuf, sizeof (errbuf),
			    dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
			    zc.zc_name);
			break;

		default:
			assert(!"unexpected result");
			break;
		}

		switch (saved_errno) {
		case EBUSY: {
			nvlist_t *nvroot;
			pool_scan_stat_t *ps = NULL;
			uint_t psc;

			verify(nvlist_lookup_nvlist(zhp->zpool_config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
			/* Best effort: 'ps' stays NULL when no stats are present. */
			(void) nvlist_lookup_uint64_array(nvroot,
			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);

			/* No scrub stats on record: report a resilver instead. */
			if (ps == NULL || ps->pss_func != POOL_SCAN_SCRUB)
				return (zfs_error(hdl, EZFS_RESILVERING, errbuf));

			if (cmd == POOL_SCRUB_PAUSE)
				return (zfs_error(hdl, EZFS_SCRUB_PAUSED, errbuf));

			return (zfs_error(hdl, EZFS_SCRUBBING, errbuf));
		}

		case ENOENT:
			return (zfs_error(hdl, EZFS_NO_SCRUB, errbuf));

		default:
			return (zpool_standard_error(hdl, saved_errno, errbuf));
		}
	}

	#ifdef illumos
	/*
	 * This provides a very minimal check whether a given string is likely a
	 * c#t#d# style string.  Users of this are expected to do their own
	 * verification of the s# part.
	 *
	 * The argument is evaluated more than once, so it must be free of side
	 * effects.  The character handed to isdigit() is converted to unsigned
	 * char first, because passing a negative plain char is undefined.
	 */
	#define	CTD_CHECK(str)	((str) != NULL && (str)[0] == 'c' && \
		    isdigit((unsigned char)(str)[1]))

	/*
	 * More elaborate version for ones which may start with "/dev/dsk/"
	 * and the like.
	 *
	 * For a string beginning with '/', the check is applied to the last path
	 * component, or to the second-to-last one when the string ends in "/old".
	 * Any other string (including NULL) is checked as-is.  Returns 1 when the
	 * selected component starts with 'c' followed by a digit, 0 otherwise.
	 */
	static int
	ctd_check_path(char *str)
	{
		/*
		 * If it starts with a slash, check the last component.
		 */
		if (str != NULL && str[0] == '/') {
			char *tmp = strrchr(str, '/');

			/*
			 * If it ends in "/old", check the second-to-last
			 * component of the string instead.  The backwards scan
			 * always terminates because str[0] is itself a '/'.
			 */
			if (tmp != str && strcmp(tmp, "/old") == 0) {
				for (tmp--; *tmp != '/'; tmp--)
					;
			}
			str = tmp + 1;
		}
		return (CTD_CHECK(str));
	}
	#endif

	/*
	 * Find a vdev that matches the search criteria specified.  We use the
	 * nvpair name to determine how we should look for the device.
	 * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
	 * spare; but FALSE if it's an INUSE spare.
	 *
	 * Only the first nvpair of 'search' is consulted.  The tree rooted at
	 * 'nv' is walked depth-first through ZPOOL_CONFIG_CHILDREN, then
	 * ZPOOL_CONFIG_SPARES, then ZPOOL_CONFIG_L2CACHE.  '*avail_spare' and
	 * '*l2cache' are set to B_TRUE when the match is found under the spares
	 * or l2cache array respectively; they are never cleared here, so callers
	 * initialize them.  'log' may be NULL; when it is not, '*log' is set to
	 * B_TRUE if the direct child of 'nv' containing the match has
	 * ZPOOL_CONFIG_IS_LOG set.
	 *
	 * Returns the matching nvlist (owned by the tree, not a copy) or NULL.
	 */
	static nvlist_t *
	vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
	    boolean_t *l2cache, boolean_t *log)
	{
		uint_t c, children;
		nvlist_t **child;
		nvlist_t *ret;
		uint64_t is_log;
		char *srchkey;
		nvpair_t *pair = nvlist_next_nvpair(search, NULL);

		/* Nothing to look for */
		if (search == NULL || pair == NULL)
			return (NULL);

		/* Obtain the key we will use to search */
		srchkey = nvpair_name(pair);

		switch (nvpair_type(pair)) {
		case DATA_TYPE_UINT64:
			if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
				uint64_t srchval, theguid;

				verify(nvpair_value_uint64(pair, &srchval) == 0);
				verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
				    &theguid) == 0);
				if (theguid == srchval)
					return (nv);
			}
			break;

		case DATA_TYPE_STRING: {
			char *srchval, *val;

			verify(nvpair_value_string(pair, &srchval) == 0);
			if (nvlist_lookup_string(nv, srchkey, &val) != 0)
				break;

			/*
			 * Search for the requested value. Special cases:
			 *
			 * - ZPOOL_CONFIG_PATH for whole disk entries.  To support
			 *   UEFI boot, these end in "s0" or "s0/old" or "s1" or
			 *   "s1/old".   The "s0" or "s1" part is hidden from the user,
			 *   but included in the string, so this matches around it.
			 * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
			 *
			 * Otherwise, all other searches are simple string compares.
			 */
	#ifdef illumos
			if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
			    ctd_check_path(val)) {
				uint64_t wholedisk = 0;

				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
				    &wholedisk);
				if (wholedisk) {
					int slen = strlen(srchval);
					int vlen = strlen(val);

					if (slen != vlen - 2)
						break;

					/*
					 * make_leaf_vdev() should only set
					 * wholedisk for ZPOOL_CONFIG_PATHs which
					 * will include "/dev/dsk/", giving plenty of
					 * room for the indices used next.
					 */
					ASSERT(vlen >= 6);

					/*
					 * strings identical except trailing "s0"
					 */
					if ((strcmp(&val[vlen - 2], "s0") == 0 ||
					    strcmp(&val[vlen - 2], "s1") == 0) &&
					    strncmp(srchval, val, slen) == 0)
						return (nv);

					/*
					 * strings identical except trailing "s0/old"
					 */
					if ((strcmp(&val[vlen - 6], "s0/old") == 0 ||
					    strcmp(&val[vlen - 6], "s1/old") == 0) &&
					    strcmp(&srchval[slen - 4], "/old") == 0 &&
					    strncmp(srchval, val, slen - 4) == 0)
						return (nv);

					break;
				}
			} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
	#else
			if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
	#endif
				char *type, *idx, *end, *p;
				uint64_t id, vdev_id;

				/*
				 * Determine our vdev type, keeping in mind
				 * that the srchval is composed of a type and
				 * vdev id pair (i.e. mirror-4).
				 */
				if ((type = strdup(srchval)) == NULL)
					return (NULL);

				if ((p = strrchr(type, '-')) == NULL) {
					free(type);
					break;
				}
				/* Split "type-id" in place at the last '-'. */
				idx = p + 1;
				*p = '\0';

				/*
				 * If the types don't match then keep looking.
				 */
				if (strncmp(val, type, strlen(val)) != 0) {
					free(type);
					break;
				}

				verify(zpool_vdev_is_interior(type));
				verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
				    &id) == 0);

				/* 'idx' points into 'type'; parse before freeing. */
				errno = 0;
				vdev_id = strtoull(idx, &end, 10);

				free(type);
				if (errno != 0)
					return (NULL);

				/*
				 * Now verify that we have the correct vdev id.
				 */
				if (vdev_id == id)
					return (nv);
			}

			/*
			 * Common case
			 */
			if (strcmp(srchval, val) == 0)
				return (nv);
			break;
		}

		default:
			break;
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0)
			return (NULL);

		for (c = 0; c < children; c++) {
			if ((ret = vdev_to_nvlist_iter(child[c], search,
			    avail_spare, l2cache, NULL)) != NULL) {
				/*
				 * The 'is_log' value is only set for the toplevel
				 * vdev, not the leaf vdevs.  So we always lookup the
				 * log device from the root of the vdev tree (where
				 * 'log' is non-NULL).
				 */
				if (log != NULL &&
				    nvlist_lookup_uint64(child[c],
				    ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
				    is_log) {
					*log = B_TRUE;
				}
				return (ret);
			}
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &child, &children) == 0) {
			for (c = 0; c < children; c++) {
				if ((ret = vdev_to_nvlist_iter(child[c], search,
				    avail_spare, l2cache, NULL)) != NULL) {
					*avail_spare = B_TRUE;
					return (ret);
				}
			}
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
		    &child, &children) == 0) {
			for (c = 0; c < children; c++) {
				if ((ret = vdev_to_nvlist_iter(child[c], search,
				    avail_spare, l2cache, NULL)) != NULL) {
					*l2cache = B_TRUE;
					return (ret);
				}
			}
		}

		return (NULL);
	}

	/*
	 * Given a physical path (minus the "/devices" prefix), find the
	 * associated vdev.
	 *
	 * '*avail_spare' and '*l2cache' are reset to B_FALSE before the search,
	 * as is '*log' when 'log' is non-NULL; vdev_to_nvlist_iter() then sets
	 * whichever apply.  Returns the matching nvlist from the pool's cached
	 * config, or NULL when nothing matches.
	 */
	nvlist_t *
	zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
	    boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
	{
		nvlist_t *search, *nvroot, *ret;

		/* Build a one-entry search list keyed on the physical path. */
		verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);

		verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);

		*avail_spare = B_FALSE;
		*l2cache = B_FALSE;
		if (log != NULL)
			*log = B_FALSE;
		ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
		nvlist_free(search);

		return (ret);
	}

	/*
	* Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
	*/
	static boolean_t
	zpool_vdev_is_interior(const char *name)
	{
	if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 \|\|
	strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 \|\|
	strncmp(name,
	VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 \|\|
	strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
	return (B_TRUE);
	return (B_FALSE);
	}

	/*
	 * Locate a vdev in the pool's cached config from a user-supplied string.
	 *
	 * 'path' is interpreted in this order:
	 *   - a non-zero decimal number with no trailing characters is a guid;
	 *   - a name accepted by zpool_vdev_is_interior() is searched by type;
	 *   - a name not starting with '/' is prefixed with _PATH_DEV and
	 *     searched as a device path;
	 *   - anything else is searched as a device path verbatim.
	 *
	 * '*avail_spare' and '*l2cache' are reset to B_FALSE before the search,
	 * as is '*log' when 'log' is non-NULL.  Returns the matching nvlist or
	 * NULL when nothing matches.
	 */
	nvlist_t *
	zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
	    boolean_t *l2cache, boolean_t *log)
	{
		char buf[MAXPATHLEN];
		char *end;
		nvlist_t *nvroot, *search, *ret;
		uint64_t guid;

		verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		guid = strtoull(path, &end, 10);
		if (guid != 0 && *end == '\0') {
			verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
		} else if (zpool_vdev_is_interior(path)) {
			verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
		} else if (path[0] != '/') {
			(void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path);
			verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
		} else {
			verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
		}

		verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);

		*avail_spare = B_FALSE;
		*l2cache = B_FALSE;
		if (log != NULL)
			*log = B_FALSE;
		ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
		nvlist_free(search);

		return (ret);
	}

	/*
	 * Report whether a vdev nvlist looks online.
	 *
	 * Returns 0 when any of ZPOOL_CONFIG_OFFLINE, ZPOOL_CONFIG_FAULTED or
	 * ZPOOL_CONFIG_REMOVED can be looked up as a uint64 in 'nv', and 1
	 * otherwise.  Only the presence of the key matters; the value read into
	 * 'ival' is not examined.
	 */
	static int
	vdev_online(nvlist_t *nv)
	{
		uint64_t ival;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
			return (0);

		return (1);
	}

	/*
	 * Helper function for zpool_get_physpaths().
	 *
	 * Appends the ZPOOL_CONFIG_PHYS_PATH string of 'config' to 'physpath'
	 * at offset '*bytes_written', preceded by a single space unless it is
	 * the first entry.  '*bytes_written' is advanced by the length snprintf()
	 * reports, which is the untruncated length even when the text did not
	 * fit.
	 *
	 * Returns 0 on success, EZFS_NODEVICE when 'config' has no physical
	 * path, or EZFS_NOSPC when the buffer is too small; in that last case
	 * the partial text is cut off again at the previous end of the string.
	 */
	static int
	vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
	    size_t *bytes_written)
	{
		size_t bytes_left, pos, rsz;
		char *tmppath;
		const char *format;

		if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
		    &tmppath) != 0)
			return (EZFS_NODEVICE);

		pos = *bytes_written;
		bytes_left = physpath_size - pos;
		format = (pos == 0) ? "%s" : " %s";

		rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
		*bytes_written += rsz;

		if (rsz >= bytes_left) {
			/* if physpath was not copied properly, clear it */
			if (bytes_left != 0) {
				physpath[pos] = 0;
			}
			return (EZFS_NOSPC);
		}
		return (0);
	}

	/*
	 * Recursively collect the physical paths of the online disks below 'nv'
	 * into 'physpath' (capacity 'phypath_size'), tracking the running length
	 * in '*rsz'.
	 *
	 * Disk vdevs are appended via vdev_get_one_physpath() when vdev_online()
	 * accepts them.  When 'is_spare' is set, a disk is only considered if it
	 * carries a non-zero ZPOOL_CONFIG_IS_SPARE.  Mirror, raidz, replacing and
	 * spare vdevs are descended into; 'is_spare' becomes true for the
	 * children of a spare vdev and is otherwise inherited from the caller.
	 *
	 * Returns EZFS_INVALCONFIG for a malformed nvlist or a rejected spare,
	 * and propagates any vdev_get_one_physpath() failure for a disk.  While
	 * iterating children only EZFS_NOSPC aborts the walk.
	 *
	 * NOTE(review): every path that does not exit early returns
	 * EZFS_POOL_INVALARG, including after a path was appended successfully.
	 * zpool_get_config_physpath() discards this return value and checks the
	 * byte count instead, so the value is preserved here unchanged.
	 */
	static int
	vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
	    size_t *rsz, boolean_t is_spare)
	{
		char *type;
		int ret;

		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
			return (EZFS_INVALCONFIG);

		if (strcmp(type, VDEV_TYPE_DISK) == 0) {
			/*
			 * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
			 * For a spare vdev, we only want to boot from the active
			 * spare device.
			 */
			if (is_spare) {
				uint64_t spare = 0;
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
				    &spare);
				if (!spare)
					return (EZFS_INVALCONFIG);
			}

			if (vdev_online(nv)) {
				if ((ret = vdev_get_one_physpath(nv, physpath,
				    phypath_size, rsz)) != 0)
					return (ret);
			}
		} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
		    strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
		    (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
			/*
			 * The assignment above only runs when the three earlier
			 * comparisons fail, so 'is_spare' keeps the caller's value
			 * for mirror/raidz/replacing vdevs.
			 */
			nvlist_t **child;
			uint_t count;
			uint_t i;

			if (nvlist_lookup_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
				return (EZFS_INVALCONFIG);

			for (i = 0; i < count; i++) {
				ret = vdev_get_physpaths(child[i], physpath,
				    phypath_size, rsz, is_spare);
				if (ret == EZFS_NOSPC)
					return (ret);
			}
		}

		return (EZFS_POOL_INVALARG);
	}

	/*
	 * Get phys_path for a root pool config.
	 * Return 0 on success; non-zero on failure.
	 *
	 * Fails with EZFS_INVALCONFIG when the vdev tree, its type or its
	 * children cannot be looked up, with EZFS_POOL_INVALARG when the tree is
	 * not a root vdev with exactly one child, and with EZFS_NODEVICE when no
	 * path was written to 'physpath'.
	 */
	static int
	zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
	{
		size_t rsz;
		nvlist_t *vdev_root;
		nvlist_t **child;
		uint_t count;
		char *type;

		rsz = 0;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &vdev_root) != 0)
			return (EZFS_INVALCONFIG);

		if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
		    nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
		    &child, &count) != 0)
			return (EZFS_INVALCONFIG);

		/*
		 * root pool can only have a single top-level vdev.
		 */
		if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1)
			return (EZFS_POOL_INVALARG);

		/* Success is judged by 'rsz' below, not by the return value. */
		(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
		    B_FALSE);

		/* No online devices */
		if (rsz == 0)
			return (EZFS_NODEVICE);

		return (0);
	}

	/*
	 * Get phys_path for a root pool
	 * Return 0 on success; non-zero on failure.
	 *
	 * Thin wrapper that applies zpool_get_config_physpath() to the config
	 * cached in the pool handle.
	 */
	int
	zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
	{
		return (zpool_get_config_physpath(zhp->zpool_config, physpath,
		    phypath_size));
	}

	/*
	 * If the device has been dynamically expanded then we need to relabel
	 * the disk to use the new unallocated space.
	 *
	 * Only does work when built with 'illumos' defined; otherwise it returns
	 * 0 without touching the device.  On illumos it resolves
	 * efi_use_whole_disk() at run time, opens the raw device under
	 * ZFS_RDISK_ROOT and invokes it.  Returns -1 when the symbol cannot be
	 * resolved, the result of zfs_error() when the device cannot be opened
	 * or relabelled, and 0 otherwise.
	 */
	static int
	zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
	{
	#ifdef illumos
		char path[MAXPATHLEN];
		char errbuf[1024];
		int fd, error;
		int (*_efi_use_whole_disk)(int);

		if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
		    "efi_use_whole_disk")) == NULL)
			return (-1);

		/*
		 * Fill in the message handed to zfs_error() below; previously
		 * 'errbuf' was passed along without ever being written.
		 */
		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot relabel '%s'"), name);

		(void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name);

		if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
			    "relabel '%s': unable to open device"), name);
			return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
		}

		/*
		 * It's possible that we might encounter an error if the device
		 * does not have any unallocated space left. If so, we simply
		 * ignore that error and continue on.
		 */
		error = _efi_use_whole_disk(fd);
		(void) close(fd);
		if (error && error != VT_ENOSPC) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
			    "relabel '%s': unable to read disk capacity"), name);
			return (zfs_error(hdl, EZFS_NOCAP, errbuf));
		}
	#endif	/* illumos */
		return (0);
	}

	/*
	 * Bring the specified vdev online.  The 'flags' parameter is a set of the
	 * ZFS_ONLINE_* flags.
	 *
	 * 'path' is resolved with zpool_find_vdev().  When expansion is requested
	 * (ZFS_ONLINE_EXPAND in 'flags', or the pool's autoexpand property is
	 * set) and the vdev has a path, cache devices are rejected and whole
	 * disks are relabelled first.  On success '*newstate' receives the
	 * zc_cookie value left by the ioctl and 0 is returned; otherwise the
	 * error is reported through the pool's libzfs handle and the result of
	 * that call is returned.
	 */
	int
	zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
	    vdev_state_t *newstate)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		char *pathname;
		nvlist_t *tgt;
		boolean_t avail_spare, l2cache, islog;
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		if (flags & ZFS_ONLINE_EXPAND) {
			(void) snprintf(msg, sizeof (msg),
			    dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
		} else {
			(void) snprintf(msg, sizeof (msg),
			    dgettext(TEXT_DOMAIN, "cannot online %s"), path);
		}

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
		    &islog)) == NULL)
			return (zfs_error(hdl, EZFS_NODEVICE, msg));

		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);

		if (avail_spare)
			return (zfs_error(hdl, EZFS_ISSPARE, msg));

		if ((flags & ZFS_ONLINE_EXPAND ||
		    zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) &&
		    nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) {
			uint64_t wholedisk = 0;

			(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
			    &wholedisk);

			/*
			 * XXX - L2ARC 1.0 devices can't support expansion.
			 */
			if (l2cache) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "cannot expand cache devices"));
				return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
			}

			if (wholedisk) {
				/* Skip the ZFS_DISK_ROOT prefix and its '/'. */
				pathname += strlen(ZFS_DISK_ROOT) + 1;
				(void) zpool_relabel_disk(hdl, pathname);
			}
		}

		zc.zc_cookie = VDEV_STATE_ONLINE;
		zc.zc_obj = flags;

		if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
			if (errno == EINVAL) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
				    "from this pool into a new one.  Use '%s' "
				    "instead"), "zpool detach");
				return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
			}
			return (zpool_standard_error(hdl, errno, msg));
		}

		*newstate = zc.zc_cookie;
		return (0);
	}

	/*
	 * Take the specified vdev offline
	 *
	 * 'path' is resolved with zpool_find_vdev().  When 'istmp' is set the
	 * request carries ZFS_OFFLINE_TEMPORARY.  Returns 0 on success;
	 * otherwise the error is reported through the pool's libzfs handle and
	 * the result of that call is returned.
	 */
	int
	zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		nvlist_t *tgt;
		boolean_t avail_spare, l2cache;
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot offline %s"), path);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
		    NULL)) == NULL)
			return (zfs_error(hdl, EZFS_NODEVICE, msg));

		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);

		if (avail_spare)
			return (zfs_error(hdl, EZFS_ISSPARE, msg));

		zc.zc_cookie = VDEV_STATE_OFFLINE;
		zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;

		if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
			return (0);

		switch (errno) {
		case EBUSY:

			/*
			 * There are no other replicas of this device.
			 */
			return (zfs_error(hdl, EZFS_NOREPLICAS, msg));

		case EEXIST:
			/*
			 * The log device has unplayed logs
			 */
			return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));

		default:
			return (zpool_standard_error(hdl, errno, msg));
		}
	}

	/*
	 * Mark the given vdev faulted.
	 *
	 * The vdev is identified by 'guid'; 'aux' is passed to the kernel in
	 * zc_obj.  Returns 0 on success; otherwise the error is reported through
	 * the pool's libzfs handle and the result of that call is returned.
	 */
	int
	zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		/*
		 * uint64_t is not necessarily 'unsigned long long', so cast to
		 * match the %llu conversion.
		 */
		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot fault %llu"),
		    (unsigned long long)guid);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_guid = guid;
		zc.zc_cookie = VDEV_STATE_FAULTED;
		zc.zc_obj = aux;

		if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
			return (0);

		switch (errno) {
		case EBUSY:

			/*
			 * There are no other replicas of this device.
			 */
			return (zfs_error(hdl, EZFS_NOREPLICAS, msg));

		default:
			return (zpool_standard_error(hdl, errno, msg));
		}

	}

	/*
	 * Mark the given vdev degraded.
	 *
	 * The vdev is identified by 'guid'; 'aux' is passed to the kernel in
	 * zc_obj.  Returns 0 on success, otherwise the result of
	 * zpool_standard_error() for the failing errno.
	 */
	int
	zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		/*
		 * uint64_t is not necessarily 'unsigned long long', so cast to
		 * match the %llu conversion.
		 */
		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot degrade %llu"),
		    (unsigned long long)guid);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_guid = guid;
		zc.zc_cookie = VDEV_STATE_DEGRADED;
		zc.zc_obj = aux;

		if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
			return (0);

		return (zpool_standard_error(hdl, errno, msg));
	}

	/*
	 * Returns TRUE if the given nvlist is a vdev that was originally swapped in as
	 * a hot spare.
	 *
	 * Walks the tree rooted at 'search' looking for a vdev of type
	 * VDEV_TYPE_SPARE that has exactly two children and whose child at index
	 * 'which' is the very nvlist 'tgt' (pointer comparison, not contents).
	 */
	static boolean_t
	is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
	{
		nvlist_t **child;
		uint_t c, children;
		char *type;

		if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
		    &children) == 0) {
			verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
			    &type) == 0);

			if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
			    children == 2 && child[which] == tgt)
				return (B_TRUE);

			for (c = 0; c < children; c++)
				if (is_replacing_spare(child[c], tgt, which))
					return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Attach new_disk (fully described by nvroot) to old_disk.
	 * If 'replacing' is specified, the new disk will replace the old one.
	 *
	 * 'nvroot' must describe exactly one child.  Available spares and l2cache
	 * devices are rejected as targets.  Returns 0 on success.  On failure the
	 * error is reported through the pool's libzfs handle; the return value is
	 * -1, or the result of zfs_error() for the checks made before the ioctl.
	 */
	int
	zpool_vdev_attach(zpool_handle_t *zhp,
	    const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		int ret;
		nvlist_t *tgt;
		boolean_t avail_spare, l2cache, islog;
		uint64_t val;
		char *newname;
		nvlist_t **child;
		uint_t children;
		nvlist_t *config_root;
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		boolean_t rootpool = zpool_is_bootable(zhp);

		if (replacing)
			(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
			    "cannot replace %s with %s"), old_disk, new_disk);
		else
			(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
			    "cannot attach %s to %s"), new_disk, old_disk);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
		    &islog)) == NULL)
			return (zfs_error(hdl, EZFS_NODEVICE, msg));

		if (avail_spare)
			return (zfs_error(hdl, EZFS_ISSPARE, msg));

		if (l2cache)
			return (zfs_error(hdl, EZFS_ISL2CACHE, msg));

		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
		zc.zc_cookie = replacing;

		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0 || children != 1) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "new device must be a single disk"));
			return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
		}

		verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
		    ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);

		if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
			return (-1);

		/*
		 * If the target is a hot spare that has been swapped in, we can only
		 * replace it with another hot spare.
		 *
		 * Note that the zpool_find_vdev() call below overwrites
		 * 'avail_spare' and 'l2cache' with the new device's status.
		 */
		if (replacing &&
		    nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
		    (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
		    NULL) == NULL || !avail_spare) &&
		    is_replacing_spare(config_root, tgt, 1)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "can only be replaced by another hot spare"));
			free(newname);
			return (zfs_error(hdl, EZFS_BADTARGET, msg));
		}

		free(newname);

		if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
			return (-1);

		ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);

		zcmd_free_nvlists(&zc);

		if (ret == 0) {
			if (rootpool) {
				/*
				 * XXX need a better way to prevent user from
				 * booting up a half-baked vdev.
				 */
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
				    "sure to wait until resilver is done "
				    "before rebooting.\n"));
				(void) fprintf(stderr, "\n");
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If "
				    "you boot from pool '%s', you may need to update\n"
				    "boot code on newly attached disk '%s'.\n\n"
				    "Assuming you use GPT partitioning and 'da0' is "
				    "your new boot disk\n"
				    "you may use the following command:\n\n"
				    "\tgpart bootcode -b /boot/pmbr -p "
				    "/boot/gptzfsboot -i 1 da0\n\n"),
				    zhp->zpool_name, new_disk);
			}
			return (0);
		}

		switch (errno) {
		case ENOTSUP:
			/*
			 * Can't attach to or replace this type of vdev.
			 */
			if (replacing) {
				uint64_t version = zpool_get_prop_int(zhp,
				    ZPOOL_PROP_VERSION, NULL);

				if (islog)
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "cannot replace a log with a spare"));
				else if (version >= SPA_VERSION_MULTI_REPLACE)
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "already in replacing/spare config; wait "
					    "for completion or use 'zpool detach'"));
				else
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "cannot replace a replacing device"));
			} else {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "can only attach to mirrors and top-level "
				    "disks"));
			}
			(void) zfs_error(hdl, EZFS_BADTARGET, msg);
			break;

		case EINVAL:
			/*
			 * The new device must be a single disk.
			 */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "new device must be a single disk"));
			(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
			break;

		case EBUSY:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
			    "or pool has removing/removed vdevs"),
			    new_disk);
			(void) zfs_error(hdl, EZFS_BADDEV, msg);
			break;

		case EOVERFLOW:
			/*
			 * The new device is too small.
			 */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "device is too small"));
			(void) zfs_error(hdl, EZFS_BADDEV, msg);
			break;

		case EDOM:
			/*
			 * The new device has a different alignment requirement.
			 */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "devices have different sector alignment"));
			(void) zfs_error(hdl, EZFS_BADDEV, msg);
			break;

		case ENAMETOOLONG:
			/*
			 * The resulting top-level vdev spec won't fit in the label.
			 */
			(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
			break;

		default:
			(void) zpool_standard_error(hdl, errno, msg);
		}

		return (-1);
	}

	/*
	 * Detach the specified device.
	 *
	 * 'path' is resolved with zpool_find_vdev(); available spares and
	 * l2cache devices are rejected.  Returns 0 on success.  On failure the
	 * error is reported through the pool's libzfs handle; the return value is
	 * -1 after a failed ioctl, or the result of zfs_error() for the checks
	 * made before it.
	 */
	int
	zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		nvlist_t *tgt;
		boolean_t avail_spare, l2cache;
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot detach %s"), path);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
		    NULL)) == NULL)
			return (zfs_error(hdl, EZFS_NODEVICE, msg));

		if (avail_spare)
			return (zfs_error(hdl, EZFS_ISSPARE, msg));

		if (l2cache)
			return (zfs_error(hdl, EZFS_ISL2CACHE, msg));

		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);

		if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
			return (0);

		switch (errno) {

		case ENOTSUP:
			/*
			 * Can't detach from this type of vdev.
			 */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
			    "applicable to mirror and replacing vdevs"));
			(void) zfs_error(hdl, EZFS_BADTARGET, msg);
			break;

		case EBUSY:
			/*
			 * There are no other replicas of this device.
			 */
			(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
			break;

		default:
			(void) zpool_standard_error(hdl, errno, msg);
		}

		return (-1);
	}

	/*
	 * Find a mirror vdev in the source nvlist.
	 *
	 * The mchild array contains a list of disks in one of the top-level mirrors
	 * of the source pool.  The schild array contains a list of disks that the
	 * user specified on the command line.  We loop over the mchild array to
	 * see if any entry in the schild array matches.
	 *
	 * If a disk in the mchild array is found in the schild array, we return
	 * the index of that entry (its position in mchild).  Otherwise we return
	 * -1.  Disks are compared by the names zpool_vdev_name() produces for
	 * them.
	 */
	static int
	find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
	    nvlist_t **schild, uint_t schildren)
	{
		uint_t mc;

		for (mc = 0; mc < mchildren; mc++) {
			uint_t sc;
			char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
			    mchild[mc], B_FALSE);

			for (sc = 0; sc < schildren; sc++) {
				char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
				    schild[sc], B_FALSE);
				boolean_t result = (strcmp(mpath, spath) == 0);

				/* Both names are heap strings owned by us. */
				free(spath);
				if (result) {
					free(mpath);
					return (mc);
				}
			}

			free(mpath);
		}

		return (-1);
	}

	/*
	* Split a mirror pool. If newroot points to null, then a new nvlist
	* is generated and it is the responsibility of the caller to free it.
	*/
	int
	zpool_vdev_split(zpool_handle_t zhp, char newname, nvlist_t **newroot,
	nvlist_t *props, splitflags_t flags)
	{
	zfs_cmd_t zc = { 0 };
	char msg[1024];
	nvlist_t tree, config, child, newchild, *newconfig = NULL;
	nvlist_t *varray = NULL, zc_props = NULL;
	uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
	libzfs_handle_t *hdl = zhp->zpool_hdl;
	uint64_t vers;
	boolean_t freelist = B_FALSE, memory_err = B_TRUE;
	int retval = 0;

	(void) snprintf(msg, sizeof (msg),
	dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);

	if (!zpool_name_valid(hdl, B_FALSE, newname))
	return (zfs_error(hdl, EZFS_INVALIDNAME, msg));

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
	(void) fprintf(stderr, gettext("Internal error: unable to "
	"retrieve pool configuration\n"));
	return (-1);
	}

	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
	== 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);

	if (props) {
	prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
	if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
	props, vers, flags, msg)) == NULL)
	return (-1);
	}

	if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
	&children) != 0) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"Source pool is missing vdev tree"));
	nvlist_free(zc_props);
	return (-1);
	}

	varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
	vcount = 0;

	if (*newroot == NULL \|\|
	nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
	&newchild, &newchildren) != 0)
	newchildren = 0;

	for (c = 0; c < children; c++) {
	uint64_t is_log = B_FALSE, is_hole = B_FALSE;
	char *type;
	nvlist_t *mchild, vdev;
	uint_t mchildren;
	int entry;

	/*
	* Unlike cache & spares, slogs are stored in the
	* ZPOOL_CONFIG_CHILDREN array. We filter them out here.
	*/
	(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
	&is_log);
	(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
	&is_hole);
	if (is_log \|\| is_hole) {
	/*
	* Create a hole vdev and put it in the config.
	*/
	if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
	goto out;
	if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
	VDEV_TYPE_HOLE) != 0)
	goto out;
	if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
	1) != 0)
	goto out;
	if (lastlog == 0)
	lastlog = vcount;
	varray[vcount++] = vdev;
	continue;
	}
	lastlog = 0;
	verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
	== 0);
	if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"Source pool must be composed only of mirrors\n"));
	retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
	goto out;
	}

	verify(nvlist_lookup_nvlist_array(child[c],
	ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);

	/* find or add an entry for this top-level vdev */
	if (newchildren > 0 &&
	(entry = find_vdev_entry(zhp, mchild, mchildren,
	newchild, newchildren)) >= 0) {
	/* We found a disk that the user specified. */
	vdev = mchild[entry];
	++found;
	} else {
	/* User didn't specify a disk for this vdev. */
	vdev = mchild[mchildren - 1];
	}

	if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
	goto out;
	}

	/* did we find every disk the user specified? */
	if (found != newchildren) {
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
	"include at most one disk from each mirror"));
	retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
	goto out;
	}

	/* Prepare the nvlist for populating. */
	if (*newroot == NULL) {
	if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
	goto out;
	freelist = B_TRUE;
	if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
	VDEV_TYPE_ROOT) != 0)
	goto out;
	} else {
	verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
	}

	/* Add all the children we found */
	if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
	lastlog == 0 ? vcount : lastlog) != 0)
	goto out;

	/*
	* If we're just doing a dry run, exit now with success.
	*/
	if (flags.dryrun) {
	memory_err = B_FALSE;
	freelist = B_FALSE;
	goto out;
	}

	/* now build up the config list & call the ioctl */
	if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
	goto out;

	if (nvlist_add_nvlist(newconfig,
	ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 \|\|
	nvlist_add_string(newconfig,
	ZPOOL_CONFIG_POOL_NAME, newname) != 0 \|\|
	nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
	goto out;

	/*
	* The new pool is automatically part of the namespace unless we
	* explicitly export it.
	*/
	if (!flags.import)
	zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
	(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
	if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
	goto out;
	if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
	goto out;

	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
	retval = zpool_standard_error(hdl, errno, msg);
	goto out;
	}

	freelist = B_FALSE;
	memory_err = B_FALSE;

	out:
	if (varray != NULL) {
	int v;

	for (v = 0; v < vcount; v++)
	nvlist_free(varray[v]);
	free(varray);
	}
	zcmd_free_nvlists(&zc);
	nvlist_free(zc_props);
	nvlist_free(newconfig);
	if (freelist) {
	nvlist_free(*newroot);
	*newroot = NULL;
	}

	if (retval != 0)
	return (retval);

	if (memory_err)
	return (no_memory(hdl));

	return (0);
	}

	/*
	 * Remove the given device.
	 *
	 * 'path' is resolved with zpool_find_vdev().  Log devices can only be
	 * removed once the pool version is at least SPA_VERSION_HOLES, and
	 * devices that are neither log, spare nor cache are refused on a
	 * bootable pool.
	 *
	 * Returns 0 when the ioctl succeeds.  Validation failures return the
	 * value of zfs_error(); an ioctl failure is reported through the handle
	 * and yields -1.
	 */
	int
	zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		nvlist_t *tgt;
		boolean_t avail_spare, l2cache, islog;
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		uint64_t version;

		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot remove %s"), path);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
		    &islog)) == NULL)
			return (zfs_error(hdl, EZFS_NODEVICE, msg));

		version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
		if (islog && version < SPA_VERSION_HOLES) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "pool must be upgraded to support log removal"));
			return (zfs_error(hdl, EZFS_BADVERSION, msg));
		}

		if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "root pool can not have removed devices, "
			    "because GRUB does not understand them"));
			/*
			 * NOTE(review): the other zfs_error() calls here pass an
			 * EZFS_* code; a raw errno looks inconsistent -- confirm.
			 */
			return (zfs_error(hdl, EINVAL, msg));
		}

		zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);

		if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
			return (0);

		/* Translate the failures that have a removal-specific meaning. */
		switch (errno) {

		case EINVAL:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "invalid config; all top-level vdevs must "
			    "have the same sector size and not be raidz."));
			(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
			break;

		case EBUSY:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "Pool busy; removal may already be in progress"));
			(void) zfs_error(hdl, EZFS_BUSY, msg);
			break;

		default:
			(void) zpool_standard_error(hdl, errno, msg);
		}
		return (-1);
	}
	+
	+int
	+zpool_vdev_remove_cancel(zpool_handle_t *zhp)
	+{
	+ zfs_cmd_t zc = { 0 };
	+ char msg[1024];
	+ libzfs_handle_t *hdl = zhp->zpool_hdl;
	+
	+ (void) snprintf(msg, sizeof (msg),
	+ dgettext(TEXT_DOMAIN, "cannot cancel removal"));
	+
	+ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
	+ zc.zc_cookie = 1;
	+
	+ if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
	+ return (0);
	+
	return (zpool_standard_error(hdl, errno, msg));
	}

	+int
	+zpool_vdev_indirect_size(zpool_handle_t zhp, const char path,
	+ uint64_t *sizep)
	+{
	+ char msg[1024];
	+ nvlist_t *tgt;
	+ boolean_t avail_spare, l2cache, islog;
	+ libzfs_handle_t *hdl = zhp->zpool_hdl;
	+
	+ (void) snprintf(msg, sizeof (msg),
	+ dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"),
	+ path);
	+
	+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
	+ &islog)) == NULL)
	+ return (zfs_error(hdl, EZFS_NODEVICE, msg));
	+
	+ if (avail_spare \|\| l2cache \|\| islog) {
	+ *sizep = 0;
	+ return (0);
	+ }
	+
	+ if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) {
	+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	+ "indirect size not available"));
	+ return (zfs_error(hdl, EINVAL, msg));
	+ }
	+ return (0);
	+}
	+
	/*
	 * Clear the errors for the pool, or the particular device if specified.
	 *
	 * 'path' may be NULL to address the whole pool.  'rewindnvl' supplies
	 * the rewind policy and is also passed to the kernel as the source
	 * nvlist.  Returns 0 on success (including an accepted try-rewind),
	 * non-zero on failure.
	 */
	int
	zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		nvlist_t *tgt;
		zpool_rewind_policy_t policy;
		boolean_t avail_spare, l2cache;
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		nvlist_t *nvi = NULL;
		int error;

		if (path)
			(void) snprintf(msg, sizeof (msg),
			    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
			    path);
		else
			(void) snprintf(msg, sizeof (msg),
			    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
			    zhp->zpool_name);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		if (path) {
			if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
			    &l2cache, NULL)) == NULL)
				return (zfs_error(hdl, EZFS_NODEVICE, msg));

			/*
			 * Don't allow error clearing for hot spares. Do allow
			 * error clearing for l2cache devices.
			 */
			if (avail_spare)
				return (zfs_error(hdl, EZFS_ISSPARE, msg));

			verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
			    &zc.zc_guid) == 0);
		}

		zpool_get_rewind_policy(rewindnvl, &policy);
		zc.zc_cookie = policy.zrp_request;

		if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
			return (-1);

		if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) {
			/* Release the destination buffer allocated just above. */
			zcmd_free_nvlists(&zc);
			return (-1);
		}

		/* Grow the destination buffer for as long as the kernel asks. */
		while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
		    errno == ENOMEM) {
			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
				zcmd_free_nvlists(&zc);
				return (-1);
			}
		}

		/*
		 * A try-rewind request counts as success unless the failure was
		 * a permission problem.
		 */
		if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
		    errno != EPERM && errno != EACCES)) {
			if (policy.zrp_request &
			    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
				(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
				zpool_rewind_exclaim(hdl, zc.zc_name,
				    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
				    nvi);
				nvlist_free(nvi);
			}
			zcmd_free_nvlists(&zc);
			return (0);
		}

		zcmd_free_nvlists(&zc);
		return (zpool_standard_error(hdl, errno, msg));
	}

	/*
	 * Similar to zpool_clear(), but takes a GUID (used by fmd).
	 *
	 * No rewind is requested (ZPOOL_NO_REWIND).  Returns 0 on success,
	 * otherwise the value of zpool_standard_error().
	 */
	int
	zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
	{
		zfs_cmd_t zc = { 0 };
		char msg[1024];
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		/* %llx needs an unsigned long long; uint64_t may be a plain long. */
		(void) snprintf(msg, sizeof (msg),
		    dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
		    (u_longlong_t)guid);

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_guid = guid;
		zc.zc_cookie = ZPOOL_NO_REWIND;

		if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
			return (0);

		return (zpool_standard_error(hdl, errno, msg));
	}

	/*
	 * Ask the kernel to assign a new GUID to the pool.
	 *
	 * Returns 0 on success, otherwise the value of zpool_standard_error().
	 */
	int
	zpool_reguid(zpool_handle_t *zhp)
	{
		libzfs_handle_t *lzh = zhp->zpool_hdl;
		zfs_cmd_t cmd = { 0 };
		char errbuf[1024];

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
		(void) strlcpy(cmd.zc_name, zhp->zpool_name, sizeof (cmd.zc_name));

		if (zfs_ioctl(lzh, ZFS_IOC_POOL_REGUID, &cmd) != 0)
			return (zpool_standard_error(lzh, errno, errbuf));

		return (0);
	}

	/*
	 * Issue ZFS_IOC_POOL_REOPEN for the pool.
	 *
	 * Returns 0 on success, otherwise the value of zpool_standard_error().
	 */
	int
	zpool_reopen(zpool_handle_t *zhp)
	{
		libzfs_handle_t *lzh = zhp->zpool_hdl;
		zfs_cmd_t cmd = { 0 };
		char errbuf[1024];

		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot reopen '%s'"),
		    zhp->zpool_name);
		(void) strlcpy(cmd.zc_name, zhp->zpool_name, sizeof (cmd.zc_name));

		if (zfs_ioctl(lzh, ZFS_IOC_POOL_REOPEN, &cmd) != 0)
			return (zpool_standard_error(lzh, errno, errbuf));

		return (0);
	}

	/*
	 * Translate a devid string into a device path under "/dev".
	 *
	 * Returns a strdup()ed copy of the first matching device name, or NULL
	 * when the string cannot be decoded, the lookup fails, or the copy
	 * cannot be allocated.
	 */
	static char *
	devid_to_path(char *devid_str)
	{
		devid_nmlist_t *matches = NULL;
		ddi_devid_t id;
		char *minor_name;
		char *result;
		int rc;

		if (devid_str_decode(devid_str, &id, &minor_name) != 0)
			return (NULL);

		rc = devid_deviceid_to_nmlist("/dev", id, minor_name, &matches);

		/* The decoded pieces are not needed once the lookup has run. */
		devid_str_free(minor_name);
		devid_free(id);

		if (rc != 0)
			return (NULL);

		/* A failed strdup() simply propagates NULL to the caller. */
		result = strdup(matches[0].devname);
		devid_free_nmlist(matches);

		return (result);
	}

	/*
	 * Convert from a path to a devid string.
	 *
	 * With have_devid defined, the device is opened read-only and its devid
	 * and minor name are encoded into a string; NULL is returned when any
	 * step fails.  Without have_devid the function always returns NULL.
	 */
	static char *
	path_to_devid(const char *path)
	{
	#ifdef have_devid
		int fd;
		ddi_devid_t devid;
		char *minor, *ret;

		if ((fd = open(path, O_RDONLY)) < 0)
			return (NULL);

		minor = NULL;
		ret = NULL;
		if (devid_get(fd, &devid) == 0) {
			if (devid_get_minor_name(fd, &minor) == 0)
				ret = devid_str_encode(devid, minor);
			if (minor != NULL)
				devid_str_free(minor);
			devid_free(devid);
		}
		(void) close(fd);

		return (ret);
	#else
		return (NULL);
	#endif
	}

	/*
	 * Issue the necessary ioctl() to update the stored path value for the vdev. We
	 * ignore any failure here, since a common case is for an unprivileged user to
	 * type 'zpool status', and we'll display the correct information anyway.
	 *
	 * The vdev is identified by the ZPOOL_CONFIG_GUID entry of 'nv'.
	 */
	static void
	set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
	{
		zfs_cmd_t zc = { 0 };

		/* strlcpy() always terminates, even if the source is too long. */
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
		verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
		    &zc.zc_guid) == 0);

		(void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
	}

	/*
	 * Given a vdev, return the name to display in iostat. If the vdev has a path,
	 * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
	 * We also check if this is a whole disk, in which case we strip off the
	 * trailing 's0' slice name.
	 *
	 * This routine is also responsible for identifying when disks have been
	 * reconfigured in a new location. The kernel will have opened the device by
	 * devid, but the path will still refer to the old location. To catch this, we
	 * first do a path -> devid translation (which is fast for the common case). If
	 * the devid matches, we're done. If not, we do a reverse devid -> path
	 * translation and issue the appropriate ioctl() to update the path of the vdev.
	 * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
	 * of these checks.
	 *
	 * The result is a copy made with zfs_strdup().
	 */
	char *
	zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
	    boolean_t verbose)
	{
		char *path, *devid;
		uint64_t value;
		char buf[64];
		char tmpbuf[64];
		vdev_stat_t *vs;
		uint_t vsc;
		int have_stats;
		int have_path;

		have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &vsc) == 0;
		have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0;

		/*
		 * If the device is not currently present, assume it will not
		 * come back at the same device path. Display the device by GUID.
		 */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
		    (have_path && have_stats &&
		    vs->vs_state <= VDEV_STATE_CANT_OPEN)) {
			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
			    &value) == 0);
			(void) snprintf(buf, sizeof (buf), "%llu",
			    (u_longlong_t)value);
			path = buf;
		} else if (have_path) {

			/*
			 * If the device is dead (faulted, offline, etc) then don't
			 * bother opening it. Otherwise we may be forcing the user to
			 * open a misbehaving device, which can have undesirable
			 * effects.
			 */
			if ((have_stats == 0 ||
			    vs->vs_state >= VDEV_STATE_DEGRADED) &&
			    zhp != NULL &&
			    nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
				/*
				 * Determine if the current path is correct.
				 */
				char *newdevid = path_to_devid(path);

				if (newdevid == NULL ||
				    strcmp(devid, newdevid) != 0) {
					char *newpath;

					if ((newpath = devid_to_path(devid)) != NULL) {
						/*
						 * Update the path appropriately.
						 */
						set_path(zhp, nv, newpath);
						if (nvlist_add_string(nv,
						    ZPOOL_CONFIG_PATH, newpath) == 0)
							verify(nvlist_lookup_string(nv,
							    ZPOOL_CONFIG_PATH,
							    &path) == 0);
						free(newpath);
					}
				}

				if (newdevid)
					devid_str_free(newdevid);
			}

	#ifdef illumos
			if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0)
				path += strlen(ZFS_DISK_ROOTD);

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
			    &value) == 0 && value) {
				int pathlen = strlen(path);
				char *tmp = zfs_strdup(hdl, path);

				/*
				 * If it starts with c#, and ends with "s0" or "s1",
				 * chop the slice off, or if it ends with "s0/old" or
				 * "s1/old", remove the slice from the middle.
				 */
				if (CTD_CHECK(tmp)) {
					if (strcmp(&tmp[pathlen - 2], "s0") == 0 ||
					    strcmp(&tmp[pathlen - 2], "s1") == 0) {
						tmp[pathlen - 2] = '\0';
					} else if (pathlen > 6 &&
					    (strcmp(&tmp[pathlen - 6], "s0/old") == 0 ||
					    strcmp(&tmp[pathlen - 6], "s1/old") == 0)) {
						(void) strcpy(&tmp[pathlen - 6],
						    "/old");
					}
				}
				return (tmp);
			}
	#else	/* !illumos */
			if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
				path += sizeof(_PATH_DEV) - 1;
	#endif	/* illumos */
		} else {
			verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);

			/*
			 * If it's a raidz device, we need to stick in the parity level.
			 */
			if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
				verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
				    &value) == 0);
				(void) snprintf(buf, sizeof (buf), "%s%llu", path,
				    (u_longlong_t)value);
				path = buf;
			}

			/*
			 * We identify each top-level vdev by using a <type-id>
			 * naming convention.
			 */
			if (verbose) {
				uint64_t id;

				verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
				    &id) == 0);
				/*
				 * 'path' may already point at 'buf' (raidz case), so
				 * format into a second buffer: snprintf() with
				 * overlapping source and destination is undefined.
				 */
				(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu",
				    path, (u_longlong_t)id);
				path = tmpbuf;
			}
		}

		return (zfs_strdup(hdl, path));
	}

	/*
	 * qsort() comparator: orders two zbookmark_phys_t records by a raw
	 * byte-wise memcmp() over the whole structure.
	 */
	static int
	zbookmark_mem_compare(const void *a, const void *b)
	{
		return (memcmp(a, b, sizeof (zbookmark_phys_t)));
	}

	/*
	 * Retrieve the persistent error log, uniquify the members, and return to the
	 * caller.
	 *
	 * When the pool config reports zero errors, 0 is returned and
	 * '*nverrlistp' is left untouched.  Otherwise '*nverrlistp' receives a
	 * newly allocated nvlist holding one (dataset, object) entry per
	 * distinct pair.  Returns -1 if the log cannot be read, or the value of
	 * no_memory() if building the list fails.
	 */
	int
	zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
	{
		zfs_cmd_t zc = { 0 };
		uint64_t count;
		zbookmark_phys_t *zb = NULL;
		uint64_t i;

		/*
		 * Retrieve the raw error list from the kernel. If the number of errors
		 * has increased, allocate more space and continue until we get the
		 * entire list.
		 */
		verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
		    &count) == 0);
		if (count == 0)
			return (0);
		if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
		    count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL)
			return (-1);
		zc.zc_nvlist_dst_size = count;
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		for (;;) {
			if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
			    &zc) != 0) {
				/* free() may disturb errno, so capture it first. */
				int ioctl_errno = errno;

				free((void *)(uintptr_t)zc.zc_nvlist_dst);
				if (ioctl_errno == ENOMEM) {
					void *dst;

					count = zc.zc_nvlist_dst_size;
					dst = zfs_alloc(zhp->zpool_hdl, count *
					    sizeof (zbookmark_phys_t));
					if (dst == NULL)
						return (-1);
					zc.zc_nvlist_dst = (uintptr_t)dst;
				} else {
					return (-1);
				}
			} else {
				break;
			}
		}

		/*
		 * Sort the resulting bookmarks. This is a little confusing due to the
		 * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
		 * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks
		 * _not_ copied as part of the process. So we point the start of our
		 * array appropriately and decrement the total number of elements.
		 */
		zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
		    zc.zc_nvlist_dst_size;
		count -= zc.zc_nvlist_dst_size;

		qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);

		verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);

		/*
		 * Fill in the nverrlistp with nvlist's of dataset and object numbers.
		 */
		for (i = 0; i < count; i++) {
			nvlist_t *nv;

			/* ignoring zb_blkid and zb_level for now */
			if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
			    zb[i-1].zb_object == zb[i].zb_object)
				continue;

			if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
				goto nomem;
			if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
			    zb[i].zb_objset) != 0) {
				nvlist_free(nv);
				goto nomem;
			}
			if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
			    zb[i].zb_object) != 0) {
				nvlist_free(nv);
				goto nomem;
			}
			if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
				nvlist_free(nv);
				goto nomem;
			}
			nvlist_free(nv);
		}

		free((void *)(uintptr_t)zc.zc_nvlist_dst);
		return (0);

	nomem:
		free((void *)(uintptr_t)zc.zc_nvlist_dst);
		return (no_memory(zhp->zpool_hdl));
	}

	/*
	 * Upgrade a ZFS pool to the latest on-disk version.
	 *
	 * 'new_version' is passed to the kernel in zc_cookie.  Returns 0 on
	 * success, otherwise the value of zpool_standard_error_fmt().
	 */
	int
	zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
	{
		zfs_cmd_t zc = { 0 };
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		/* Bounded copy, matching the other ioctl wrappers in this file. */
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_cookie = new_version;

		if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
			return (zpool_standard_error_fmt(hdl, errno,
			    dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
			    zhp->zpool_name));
		return (0);
	}

	/*
	 * Build a single command-line string in 'string' (capacity 'len'): the
	 * basename of argv[0] followed by each remaining argument, separated by
	 * single spaces.  strlcpy()/strlcat() truncate if it does not fit.
	 */
	void
	zfs_save_arguments(int argc, char **argv, char *string, int len)
	{
		(void) strlcpy(string, basename(argv[0]), len);
		for (int i = 1; i < argc; i++) {
			(void) strlcat(string, " ", len);
			(void) strlcat(string, argv[i], len);
		}
	}

	/*
	 * Send 'message' to the kernel via ZFS_IOC_LOG_HISTORY.
	 *
	 * The message is wrapped in an nvlist under the key "message".  Returns
	 * 0 on success, otherwise the non-zero result of writing the nvlist or
	 * of the ioctl.
	 */
	int
	zpool_log_history(libzfs_handle_t *hdl, const char *message)
	{
		zfs_cmd_t zc = { 0 };
		nvlist_t *args;
		int err;

		args = fnvlist_alloc();
		fnvlist_add_string(args, "message", message);
		err = zcmd_write_src_nvlist(hdl, &zc, args);
		if (err == 0)
			err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc);
		nvlist_free(args);
		zcmd_free_nvlists(&zc);
		return (err);
	}

	/*
	 * Perform ioctl to get some command history of a pool.
	 *
	 * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
	 * logical offset of the history buffer to start reading from.
	 *
	 * Upon return, 'off' is the next logical offset to read from and
	 * 'len' is the actual amount of bytes read into 'buf'.
	 *
	 * Returns 0 on success; on failure the value of zfs_error_fmt() or
	 * zpool_standard_error_fmt(), with '*off' and '*len' left unchanged.
	 */
	static int
	get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
	{
		zfs_cmd_t zc = { 0 };
		libzfs_handle_t *hdl = zhp->zpool_hdl;

		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));

		zc.zc_history = (uint64_t)(uintptr_t)buf;
		zc.zc_history_len = *len;
		zc.zc_history_offset = *off;

		if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
			switch (errno) {
			case EPERM:
				return (zfs_error_fmt(hdl, EZFS_PERM,
				    dgettext(TEXT_DOMAIN,
				    "cannot show history for pool '%s'"),
				    zhp->zpool_name));
			case ENOENT:
				return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
				    dgettext(TEXT_DOMAIN, "cannot get history for pool "
				    "'%s'"), zhp->zpool_name));
			case ENOTSUP:
				return (zfs_error_fmt(hdl, EZFS_BADVERSION,
				    dgettext(TEXT_DOMAIN, "cannot get history for pool "
				    "'%s', pool must be upgraded"), zhp->zpool_name));
			default:
				return (zpool_standard_error_fmt(hdl, errno,
				    dgettext(TEXT_DOMAIN,
				    "cannot get history for '%s'"), zhp->zpool_name));
			}
		}

		*len = zc.zc_history_len;
		*off = zc.zc_history_offset;

		return (0);
	}

	/*
	 * Process the buffer of nvlists, unpacking and storing each nvlist record
	 * into 'records'. 'leftover' is set to the number of bytes that weren't
	 * processed as there wasn't a complete record.
	 *
	 * Each record is an 8-byte little-endian length followed by that many
	 * bytes of packed nvlist.  '*records' is grown as needed and
	 * '*numrecords' counts its valid entries.  Returns 0 on success or
	 * ENOMEM if a record cannot be unpacked or the array cannot grow; in
	 * the ENOMEM case '*leftover' is not written.
	 */
	int
	zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
	    nvlist_t ***records, uint_t *numrecords)
	{
		uint64_t reclen;
		nvlist_t *nv;
		uint_t i;

		while (bytes_read > sizeof (reclen)) {

			/* get length of packed record (stored as little endian) */
			for (i = 0, reclen = 0; i < sizeof (reclen); i++)
				reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8 * i);

			if (bytes_read < sizeof (reclen) + reclen)
				break;

			/* unpack record */
			if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
				return (ENOMEM);
			bytes_read -= sizeof (reclen) + reclen;
			buf += sizeof (reclen) + reclen;

			/* add record to nvlist array */
			(*numrecords)++;
			if (ISP2(*numrecords + 1)) {
				/*
				 * Grow through a temporary so the existing array
				 * (and the records it holds) stays reachable if
				 * realloc() fails.
				 */
				nvlist_t **grown = realloc(*records,
				    *numrecords * 2 * sizeof (nvlist_t *));

				if (grown == NULL) {
					nvlist_free(nv);
					(*numrecords)--;
					return (ENOMEM);
				}
				*records = grown;
			}
			(*records)[*numrecords - 1] = nv;
		}

		*leftover = bytes_read;
		return (0);
	}

	/* from spa_history.c: spa_history_create_obj() */
	#define HIS_BUF_LEN_DEF (128 << 10)
	#define HIS_BUF_LEN_MAX (1 << 30)

	/*
	 * Retrieve the command history of a pool.
	 *
	 * History is read in chunks, starting with a HIS_BUF_LEN_DEF byte
	 * buffer that is doubled whenever a chunk holds no complete record.
	 * Reaching HIS_BUF_LEN_MAX is reported as ENOMEM.  On success
	 * '*nvhisp' receives a new nvlist holding the records under
	 * ZPOOL_HIST_RECORD.  Returns 0 on success or a non-zero error.
	 */
	int
	zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
	{
		char *buf;
		uint64_t buflen = HIS_BUF_LEN_DEF;
		uint64_t off = 0;
		nvlist_t **records = NULL;
		uint_t numrecords = 0;
		uint_t i;
		int err;

		buf = malloc(buflen);
		if (buf == NULL)
			return (ENOMEM);
		do {
			uint64_t bytes_read = buflen;
			uint64_t leftover;

			if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
				break;

			/* if nothing else was read in, we're at EOF, just return */
			if (bytes_read == 0)
				break;

			if ((err = zpool_history_unpack(buf, bytes_read,
			    &leftover, &records, &numrecords)) != 0)
				break;
			/* Re-read the unconsumed tail on the next pass. */
			off -= leftover;
			if (leftover == bytes_read) {
				/*
				 * no progress made, because buffer is not big enough
				 * to hold this record; resize and retry.
				 */
				buflen *= 2;
				free(buf);
				buf = NULL;
				if ((buflen >= HIS_BUF_LEN_MAX) ||
				    ((buf = malloc(buflen)) == NULL)) {
					err = ENOMEM;
					break;
				}
			}

			/* CONSTCOND */
		} while (1);

		free(buf);

		if (!err) {
			verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
			verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
			    records, numrecords) == 0);
		}
		/* The local records are released whether or not they were added. */
		for (i = 0; i < numrecords; i++)
			nvlist_free(records[i]);
		free(records);

		return (err);
	}

	/*
	 * Format a human-readable location for object 'obj' of dataset object
	 * 'dsobj' into 'pathname' (capacity 'len').
	 *
	 * Output forms, in order of preference:
	 *   "<mountpoint><path>"       dataset mounted and path resolved
	 *   "<dataset>:<path>"         path resolved, dataset not mounted
	 *   "<dataset>:<0xOBJ>"        path lookup failed
	 *   "<0xDSOBJ>:<0xOBJ>"        dataset name lookup failed
	 *   "<metadata>:<0xOBJ>"       dsobj == 0 (the MOS)
	 */
	void
	zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
	    char *pathname, size_t len)
	{
		zfs_cmd_t zc = { 0 };
		boolean_t mounted = B_FALSE;
		char *mntpnt = NULL;
		char dsname[ZFS_MAX_DATASET_NAME_LEN];

		if (dsobj == 0) {
			/* special case for the MOS */
			(void) snprintf(pathname, len, "<metadata>:<0x%llx>",
			    (u_longlong_t)obj);
			return;
		}

		/* get the dataset's name */
		(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
		zc.zc_obj = dsobj;
		if (ioctl(zhp->zpool_hdl->libzfs_fd,
		    ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
			/* just write out a path of two object numbers */
			(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
			    (u_longlong_t)dsobj, (u_longlong_t)obj);
			return;
		}
		(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));

		/* find out if the dataset is mounted */
		mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);

		/* get the corrupted object's path */
		(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
		zc.zc_obj = obj;
		if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
		    &zc) == 0) {
			if (mounted) {
				(void) snprintf(pathname, len, "%s%s", mntpnt,
				    zc.zc_value);
			} else {
				(void) snprintf(pathname, len, "%s:%s",
				    dsname, zc.zc_value);
			}
		} else {
			(void) snprintf(pathname, len, "%s:<0x%llx>", dsname,
			    (u_longlong_t)obj);
		}
		free(mntpnt);
	}

	#ifdef illumos
	/*
	 * Read the EFI label from the config, if a label does not exist then
	 * pass back the error to the caller. If the caller has passed a non-NULL
	 * diskaddr argument then we set it to the starting address of the EFI
	 * partition. If the caller has passed a non-NULL boolean argument, then
	 * we set it to indicate if the disk does have efi system partition.
	 *
	 * Returns -1 when the config has no usable path or the raw device
	 * cannot be opened; otherwise the result of efi_alloc_and_read().
	 * '*sb' is only written when a V_USR slice is found.
	 */
	static int
	read_efi_label(nvlist_t *config, diskaddr_t *sb, boolean_t *system)
	{
		char *path;
		char *leaf;
		int fd;
		char diskname[MAXPATHLEN];
		boolean_t boot = B_FALSE;
		int err = -1;
		int slice;

		if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
			return (err);

		/* A path with no '/' has no leaf to append to the raw-disk root. */
		if ((leaf = strrchr(path, '/')) == NULL)
			return (err);

		(void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT,
		    leaf);
		if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
			struct dk_gpt *vtoc;

			if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
				for (slice = 0; slice < vtoc->efi_nparts; slice++) {
					if (vtoc->efi_parts[slice].p_tag == V_SYSTEM)
						boot = B_TRUE;
					if (vtoc->efi_parts[slice].p_tag == V_USR)
						break;
				}
				/*
				 * If the loop ran off the end, no V_USR slice
				 * exists and efi_parts[slice] is out of range.
				 */
				if (sb != NULL && slice < vtoc->efi_nparts &&
				    vtoc->efi_parts[slice].p_tag == V_USR)
					*sb = vtoc->efi_parts[slice].p_start;
				if (system != NULL)
					*system = boot;
				efi_free(vtoc);
			}
			(void) close(fd);
		}
		return (err);
	}

	/*
	 * Walk the vdev tree depth-first and return the start block of the
	 * first whole-disk leaf whose EFI label can be read.  MAXOFFSET_T is
	 * returned when no such leaf yields a start block.
	 */
	static diskaddr_t
	find_start_block(nvlist_t *config)
	{
		nvlist_t **kids;
		uint_t nkids, idx;
		diskaddr_t start = MAXOFFSET_T;
		uint64_t whole;

		if (nvlist_lookup_nvlist_array(config,
		    ZPOOL_CONFIG_CHILDREN, &kids, &nkids) == 0) {
			/* Interior vdev: the first child with an answer wins. */
			for (idx = 0; idx < nkids; idx++) {
				start = find_start_block(kids[idx]);
				if (start != MAXOFFSET_T)
					return (start);
			}
			return (MAXOFFSET_T);
		}

		/* Leaf vdev: only whole disks are consulted. */
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_WHOLE_DISK,
		    &whole) != 0 || !whole)
			return (MAXOFFSET_T);

		if (read_efi_label(config, &start, NULL) < 0)
			start = MAXOFFSET_T;

		return (start);
	}
	#endif /* illumos */

	/*
	 * Label an individual disk. The name provided is the short name,
	 * stripped of any leading /dev path.
	 *
	 * 'zhp' may be NULL for a new pool.  With ZPOOL_CREATE_BOOT_LABEL,
	 * 'boot_size' bytes are set aside for an EFI System partition ahead of
	 * the ZFS slice.  If 'slice' is non-NULL it receives the index of the
	 * ZFS slice (0 for the default layout, 1 when a system partition
	 * precedes it).  Returns 0 on success, otherwise the value of
	 * zfs_error().  Outside illumos builds this is a no-op returning 0.
	 */
	int
	zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name,
	    zpool_boot_label_t boot_type, uint64_t boot_size, int *slice)
	{
	#ifdef illumos
		char path[MAXPATHLEN];
		struct dk_gpt *vtoc;
		int fd;
		size_t resv = EFI_MIN_RESV_SIZE;
		uint64_t slice_size;
		diskaddr_t start_block;
		char errbuf[1024];

		/* prepare an error message just in case */
		(void) snprintf(errbuf, sizeof (errbuf),
		    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);

		if (zhp) {
			nvlist_t *nvroot;

			verify(nvlist_lookup_nvlist(zhp->zpool_config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);

			/* Cache the discovered start block on the pool handle. */
			if (zhp->zpool_start_block == 0)
				start_block = find_start_block(nvroot);
			else
				start_block = zhp->zpool_start_block;
			zhp->zpool_start_block = start_block;
		} else {
			/* new pool */
			start_block = NEW_START_BLOCK;
		}

		(void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name,
		    BACKUP_SLICE);

		if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
			/*
			 * This shouldn't happen. We've long since verified that this
			 * is a valid device.
			 */
			zfs_error_aux(hdl,
			    dgettext(TEXT_DOMAIN, "unable to open device"));
			return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
		}

		if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
			/*
			 * The only way this can fail is if we run out of memory, or we
			 * were unable to read the disk's capacity
			 */
			if (errno == ENOMEM)
				(void) no_memory(hdl);

			(void) close(fd);
			/* The format has no conversions, so no argument is passed. */
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "unable to read disk capacity"));

			return (zfs_error(hdl, EZFS_NOCAP, errbuf));
		}

		/*
		 * Why we use V_USR: V_BACKUP confuses users, and is considered
		 * disposable by some EFI utilities (since EFI doesn't have a backup
		 * slice). V_UNASSIGNED is supposed to be used only for zero size
		 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
		 * etc. were all pretty specific. V_USR is as close to reality as we
		 * can get, in the absence of V_OTHER.
		 */
		/* first fix the partition start block */
		if (start_block == MAXOFFSET_T)
			start_block = NEW_START_BLOCK;

		/*
		 * EFI System partition is using slice 0.
		 * ZFS is on slice 1 and slice 8 is reserved.
		 * We assume the GPT partition table without system
		 * partition has zfs p_start == NEW_START_BLOCK.
		 * If start_block != NEW_START_BLOCK, it means we have
		 * system partition. Correct solution would be to query/cache vtoc
		 * from existing vdev member.
		 */
		if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
			if (boot_size % vtoc->efi_lbasize != 0) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "boot partition size must be a multiple of %d"),
				    vtoc->efi_lbasize);
				(void) close(fd);
				efi_free(vtoc);
				return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
			}
			/*
			 * System partition size checks.
			 * Note the 1MB is quite arbitrary value, since we
			 * are creating dedicated pool, it should be enough
			 * to hold fat + efi bootloader. May need to be
			 * adjusted if the bootloader size will grow.
			 */
			if (boot_size < 1024 * 1024) {
				char buf[64];
				zfs_nicenum(boot_size, buf, sizeof (buf));
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "Specified size %s for EFI System partition is too "
				    "small, the minimum size is 1MB."), buf);
				(void) close(fd);
				efi_free(vtoc);
				return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
			}
			/* 33MB is tested with mkfs -F pcfs */
			if (hdl->libzfs_printerr &&
			    ((vtoc->efi_lbasize == 512 &&
			    boot_size < 33 * 1024 * 1024) ||
			    (vtoc->efi_lbasize == 4096 &&
			    boot_size < 256 * 1024 * 1024))) {
				char buf[64];
				zfs_nicenum(boot_size, buf, sizeof (buf));
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
				    "Warning: EFI System partition size %s is "
				    "not allowing to create FAT32 file\nsystem, which "
				    "may result in unbootable system.\n"), buf);
			}
			/* Adjust zfs partition start by size of system partition. */
			start_block += boot_size / vtoc->efi_lbasize;
		}

		if (start_block == NEW_START_BLOCK) {
			/*
			 * Use default layout.
			 * ZFS is on slice 0 and slice 8 is reserved.
			 */
			slice_size = vtoc->efi_last_u_lba + 1;
			slice_size -= EFI_MIN_RESV_SIZE;
			slice_size -= start_block;
			if (slice != NULL)
				*slice = 0;

			vtoc->efi_parts[0].p_start = start_block;
			vtoc->efi_parts[0].p_size = slice_size;

			vtoc->efi_parts[0].p_tag = V_USR;
			(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");

			vtoc->efi_parts[8].p_start = slice_size + start_block;
			vtoc->efi_parts[8].p_size = resv;
			vtoc->efi_parts[8].p_tag = V_RESERVED;
		} else {
			/* Slice 0 becomes the system partition ahead of ZFS. */
			slice_size = start_block - NEW_START_BLOCK;
			vtoc->efi_parts[0].p_start = NEW_START_BLOCK;
			vtoc->efi_parts[0].p_size = slice_size;
			vtoc->efi_parts[0].p_tag = V_SYSTEM;
			(void) strcpy(vtoc->efi_parts[0].p_name, "loader");
			if (slice != NULL)
				*slice = 1;
			/* prepare slice 1 */
			slice_size = vtoc->efi_last_u_lba + 1 - slice_size;
			slice_size -= resv;
			slice_size -= NEW_START_BLOCK;
			vtoc->efi_parts[1].p_start = start_block;
			vtoc->efi_parts[1].p_size = slice_size;
			vtoc->efi_parts[1].p_tag = V_USR;
			(void) strcpy(vtoc->efi_parts[1].p_name, "zfs");

			vtoc->efi_parts[8].p_start = slice_size + start_block;
			vtoc->efi_parts[8].p_size = resv;
			vtoc->efi_parts[8].p_tag = V_RESERVED;
		}

		if (efi_write(fd, vtoc) != 0) {
			/*
			 * Some block drivers (like pcata) may not support EFI
			 * GPT labels. Print out a helpful error message dir-
			 * ecting the user to manually label the disk and give
			 * a specific slice.
			 */
			(void) close(fd);
			efi_free(vtoc);

			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "try using fdisk(1M) and then provide a specific slice"));
			return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
		}

		(void) close(fd);
		efi_free(vtoc);
	#endif /* illumos */
		return (0);
	}

	/*
	 * Recursively check that neither 'config' nor any of its children is a
	 * file, hole or missing vdev.
	 *
	 * Returns B_TRUE when every vdev in the subtree is acceptable.  On the
	 * first unsupported vdev an EZFS_VDEVNOTSUP error is recorded with
	 * 'errbuf' as its message and B_FALSE is returned.
	 */
	static boolean_t
	supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
	{
		char *type;
		nvlist_t **child;
		uint_t children, c;

		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
		if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
		    strcmp(type, VDEV_TYPE_HOLE) == 0 ||
		    strcmp(type, VDEV_TYPE_MISSING) == 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "vdev type '%s' is not supported"), type);
			(void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf);
			return (B_FALSE);
		}
		if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) == 0) {
			for (c = 0; c < children; c++) {
				if (!supported_dump_vdev_type(hdl, child[c], errbuf))
					return (B_FALSE);
			}
		}
		return (B_TRUE);
	}

	/*
	 * Check if this zvol is allowable for use as a dump device; zero if
	 * it is, > 0 if it isn't, < 0 if it isn't a zvol.
	 *
	 * Allowable storage configurations include mirrors, all raidz variants, and
	 * pools with log, cache, and spare devices. Pools which are backed by files or
	 * have missing/hole vdevs are not suitable.
	 *
	 * 'arg' is a device path; anything that does not start with
	 * ZVOL_FULL_DEV_DIR is reported as "not a zvol" (-1). A private libzfs
	 * handle is created for the check and is released on every path that
	 * obtained it.
	 */
	int
	zvol_check_dump_config(char *arg)
	{
		zpool_handle_t *zhp = NULL;
		nvlist_t *config, *nvroot;
		char *p, *volname;
		nvlist_t **top;
		uint_t toplevels;
		libzfs_handle_t *hdl;
		char errbuf[1024];
		char poolname[ZFS_MAX_DATASET_NAME_LEN];
		int pathlen = strlen(ZVOL_FULL_DEV_DIR);
		int ret = 1;

		if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) {
			return (-1);
		}

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "dump is not supported on device '%s'"), arg);

		if ((hdl = libzfs_init()) == NULL)
			return (1);
		libzfs_print_on_error(hdl, B_TRUE);

		volname = arg + pathlen;

		/*
		 * check the configuration of the pool; the pool name is everything
		 * up to the first '/' of the dataset name
		 */
		if ((p = strchr(volname, '/')) == NULL) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "malformed dataset name"));
			(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
			/* go through 'out' so the handle is not leaked */
			goto out;
		} else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "dataset name is too long"));
			(void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
			goto out;
		} else {
			(void) strncpy(poolname, volname, p - volname);
			poolname[p - volname] = '\0';
		}

		if ((zhp = zpool_open(hdl, poolname)) == NULL) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "could not open pool '%s'"), poolname);
			(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
			goto out;
		}
		config = zpool_get_config(zhp, NULL);
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) != 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "could not obtain vdev configuration for '%s'"), poolname);
			(void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf);
			goto out;
		}

		verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    &top, &toplevels) == 0);

		/* Only the first top-level vdev (and its subtree) is inspected. */
		if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
			goto out;
		}
		ret = 0;

	out:
		if (zhp)
			zpool_close(zhp);
		libzfs_fini(hdl);
		return (ret);
	}

	/*
	 * Issue ZFS_IOC_NEXTBOOT for the given pool and device.
	 *
	 * The pool guid, device guid and 'command' string are packed into an
	 * nvlist that is passed to the kernel as the source nvlist of the ioctl.
	 *
	 * Returns 0 on success; otherwise the non-zero result of
	 * zcmd_write_src_nvlist() or ioctl().
	 */
	int
	zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid,
	    const char *command)
	{
		zfs_cmd_t zc = { 0 };
		nvlist_t *args;
		int error;

		args = fnvlist_alloc();
		fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid);
		fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid);
		fnvlist_add_string(args, "command", command);
		error = zcmd_write_src_nvlist(hdl, &zc, args);
		if (error == 0)
			error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc);
		/* release the packed copy and the nvlist on success and failure */
		zcmd_free_nvlists(&zc);
		nvlist_free(args);
		return (error);
	}
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c (revision 332525)
	@@ -1,1528 +1,1535 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
	* Copyright (c) 2017 Datto Inc.
	*/

	/*
	* Internal utility routines for the ZFS library.
	*/

	#include <sys/param.h>
	#include <sys/linker.h>
	#include <sys/module.h>
	#include <sys/stat.h>

	#include <errno.h>
	#include <fcntl.h>
	#include <libintl.h>
	#include <stdarg.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <strings.h>
	#include <unistd.h>
	#include <ctype.h>
	#include <math.h>
	#include <sys/mnttab.h>
	#include <sys/mntent.h>
	#include <sys/types.h>
	#include <libcmdutils.h>

	#include <libzfs.h>
	#include <libzfs_core.h>

	#include "libzfs_impl.h"
	#include "zfs_prop.h"
	#include "zfeature_common.h"


	/*
	 * Return the error code currently stored in the handle's libzfs_error
	 * field.
	 */
	int
	libzfs_errno(libzfs_handle_t *hdl)
	{
	return (hdl->libzfs_error);
	}

	/*
	 * Return the action string stored in the handle's libzfs_action buffer.
	 * The pointer refers to storage inside the handle.
	 */
	const char *
	libzfs_error_action(libzfs_handle_t *hdl)
	{
	return (hdl->libzfs_action);
	}

	/*
	 * Return a human-readable description of the error stored in the handle.
	 *
	 * If hdl->libzfs_desc is non-empty it is returned as-is. Otherwise the
	 * libzfs_error code is mapped to a dgettext()-translated message; a code
	 * that matches no case is asserted to be 0 and reported as "no error".
	 */
	const char *
	libzfs_error_description(libzfs_handle_t *hdl)
	{
	/* an explicit auxiliary description takes precedence over the table */
	if (hdl->libzfs_desc[0] != '\0')
	return (hdl->libzfs_desc);

	switch (hdl->libzfs_error) {
	case EZFS_NOMEM:
	return (dgettext(TEXT_DOMAIN, "out of memory"));
	case EZFS_BADPROP:
	return (dgettext(TEXT_DOMAIN, "invalid property value"));
	case EZFS_PROPREADONLY:
	return (dgettext(TEXT_DOMAIN, "read-only property"));
	case EZFS_PROPTYPE:
	return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
	"datasets of this type"));
	case EZFS_PROPNONINHERIT:
	return (dgettext(TEXT_DOMAIN, "property cannot be inherited"));
	case EZFS_PROPSPACE:
	return (dgettext(TEXT_DOMAIN, "invalid quota or reservation"));
	case EZFS_BADTYPE:
	return (dgettext(TEXT_DOMAIN, "operation not applicable to "
	"datasets of this type"));
	case EZFS_BUSY:
	return (dgettext(TEXT_DOMAIN, "pool or dataset is busy"));
	case EZFS_EXISTS:
	return (dgettext(TEXT_DOMAIN, "pool or dataset exists"));
	case EZFS_NOENT:
	return (dgettext(TEXT_DOMAIN, "no such pool or dataset"));
	case EZFS_BADSTREAM:
	return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
	case EZFS_DSREADONLY:
	return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
	case EZFS_VOLTOOBIG:
	return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
	"this system"));
	case EZFS_INVALIDNAME:
	return (dgettext(TEXT_DOMAIN, "invalid name"));
	case EZFS_BADRESTORE:
	return (dgettext(TEXT_DOMAIN, "unable to restore to "
	"destination"));
	case EZFS_BADBACKUP:
	return (dgettext(TEXT_DOMAIN, "backup failed"));
	case EZFS_BADTARGET:
	return (dgettext(TEXT_DOMAIN, "invalid target vdev"));
	case EZFS_NODEVICE:
	return (dgettext(TEXT_DOMAIN, "no such device in pool"));
	case EZFS_BADDEV:
	return (dgettext(TEXT_DOMAIN, "invalid device"));
	case EZFS_NOREPLICAS:
	return (dgettext(TEXT_DOMAIN, "no valid replicas"));
	case EZFS_RESILVERING:
	return (dgettext(TEXT_DOMAIN, "currently resilvering"));
	case EZFS_BADVERSION:
	return (dgettext(TEXT_DOMAIN, "unsupported version or "
	"feature"));
	case EZFS_POOLUNAVAIL:
	return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
	case EZFS_DEVOVERFLOW:
	return (dgettext(TEXT_DOMAIN, "too many devices in one vdev"));
	case EZFS_BADPATH:
	return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
	case EZFS_CROSSTARGET:
	return (dgettext(TEXT_DOMAIN, "operation crosses datasets or "
	"pools"));
	case EZFS_ZONED:
	return (dgettext(TEXT_DOMAIN, "dataset in use by local zone"));
	case EZFS_MOUNTFAILED:
	return (dgettext(TEXT_DOMAIN, "mount failed"));
	case EZFS_UMOUNTFAILED:
	return (dgettext(TEXT_DOMAIN, "umount failed"));
	case EZFS_UNSHARENFSFAILED:
	return (dgettext(TEXT_DOMAIN, "unshare(1M) failed"));
	case EZFS_SHARENFSFAILED:
	return (dgettext(TEXT_DOMAIN, "share(1M) failed"));
	case EZFS_UNSHARESMBFAILED:
	return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
	case EZFS_SHARESMBFAILED:
	return (dgettext(TEXT_DOMAIN, "smb add share failed"));
	case EZFS_PERM:
	return (dgettext(TEXT_DOMAIN, "permission denied"));
	case EZFS_NOSPC:
	return (dgettext(TEXT_DOMAIN, "out of space"));
	case EZFS_FAULT:
	return (dgettext(TEXT_DOMAIN, "bad address"));
	case EZFS_IO:
	return (dgettext(TEXT_DOMAIN, "I/O error"));
	case EZFS_INTR:
	return (dgettext(TEXT_DOMAIN, "signal received"));
	case EZFS_ISSPARE:
	return (dgettext(TEXT_DOMAIN, "device is reserved as a hot "
	"spare"));
	case EZFS_INVALCONFIG:
	return (dgettext(TEXT_DOMAIN, "invalid vdev configuration"));
	case EZFS_RECURSIVE:
	return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
	case EZFS_NOHISTORY:
	return (dgettext(TEXT_DOMAIN, "no history available"));
	case EZFS_POOLPROPS:
	return (dgettext(TEXT_DOMAIN, "failed to retrieve "
	"pool properties"));
	case EZFS_POOL_NOTSUP:
	return (dgettext(TEXT_DOMAIN, "operation not supported "
	"on this type of pool"));
	case EZFS_POOL_INVALARG:
	return (dgettext(TEXT_DOMAIN, "invalid argument for "
	"this pool operation"));
	case EZFS_NAMETOOLONG:
	return (dgettext(TEXT_DOMAIN, "dataset name is too long"));
	case EZFS_OPENFAILED:
	return (dgettext(TEXT_DOMAIN, "open failed"));
	case EZFS_NOCAP:
	return (dgettext(TEXT_DOMAIN,
	"disk capacity information could not be retrieved"));
	case EZFS_LABELFAILED:
	return (dgettext(TEXT_DOMAIN, "write of label failed"));
	case EZFS_BADWHO:
	return (dgettext(TEXT_DOMAIN, "invalid user/group"));
	case EZFS_BADPERM:
	return (dgettext(TEXT_DOMAIN, "invalid permission"));
	case EZFS_BADPERMSET:
	return (dgettext(TEXT_DOMAIN, "invalid permission set name"));
	case EZFS_NODELEGATION:
	return (dgettext(TEXT_DOMAIN, "delegated administration is "
	"disabled on pool"));
	case EZFS_BADCACHE:
	return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
	case EZFS_ISL2CACHE:
	return (dgettext(TEXT_DOMAIN, "device is in use as a cache"));
	case EZFS_VDEVNOTSUP:
	return (dgettext(TEXT_DOMAIN, "vdev specification is not "
	"supported"));
	case EZFS_NOTSUP:
	return (dgettext(TEXT_DOMAIN, "operation not supported "
	"on this dataset"));
	case EZFS_ACTIVE_SPARE:
	return (dgettext(TEXT_DOMAIN, "pool has active shared spare "
	"device"));
	case EZFS_UNPLAYED_LOGS:
	return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
	"logs"));
	case EZFS_REFTAG_RELE:
	return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
	case EZFS_REFTAG_HOLD:
	return (dgettext(TEXT_DOMAIN, "tag already exists on this "
	"dataset"));
	case EZFS_TAGTOOLONG:
	return (dgettext(TEXT_DOMAIN, "tag too long"));
	case EZFS_PIPEFAILED:
	return (dgettext(TEXT_DOMAIN, "pipe create failed"));
	case EZFS_THREADCREATEFAILED:
	return (dgettext(TEXT_DOMAIN, "thread create failed"));
	case EZFS_POSTSPLIT_ONLINE:
	return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
	"into a new one"));
	case EZFS_SCRUB_PAUSED:
	return (dgettext(TEXT_DOMAIN, "scrub is paused; "
	"use 'zpool scrub' to resume"));
	case EZFS_SCRUBBING:
	return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
	"use 'zpool scrub -s' to cancel current scrub"));
	case EZFS_NO_SCRUB:
	return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
	case EZFS_DIFF:
	return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
	case EZFS_DIFFDATA:
	return (dgettext(TEXT_DOMAIN, "invalid diff data"));
	case EZFS_POOLREADONLY:
	return (dgettext(TEXT_DOMAIN, "pool is read-only"));
	+ case EZFS_NO_PENDING:
	+ return (dgettext(TEXT_DOMAIN, "operation is not "
	+ "in progress"));
	case EZFS_UNKNOWN:
	return (dgettext(TEXT_DOMAIN, "unknown error"));
	default:
	/* any code not listed above is expected to be "no error" (0) */
	assert(hdl->libzfs_error == 0);
	return (dgettext(TEXT_DOMAIN, "no error"));
	}
	}

	/PRINTFLIKE2/
	void
	zfs_error_aux(libzfs_handle_t hdl, const char fmt, ...)
	{
	va_list ap;

	va_start(ap, fmt);

	(void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc),
	fmt, ap);
	hdl->libzfs_desc_active = 1;

	va_end(ap);
	}

	static void
	zfs_verror(libzfs_handle_t hdl, int error, const char fmt, va_list ap)
	{
	(void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action),
	fmt, ap);
	hdl->libzfs_error = error;

	if (hdl->libzfs_desc_active)
	hdl->libzfs_desc_active = 0;
	else
	hdl->libzfs_desc[0] = '\0';

	if (hdl->libzfs_printerr) {
	if (error == EZFS_UNKNOWN) {
	(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal "
	"error: %s\n"), libzfs_error_description(hdl));
	abort();
	}

	(void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action,
	libzfs_error_description(hdl));
	if (error == EZFS_NOMEM)
	exit(1);
	}
	}

	int
	zfs_error(libzfs_handle_t hdl, int error, const char msg)
	{
	return (zfs_error_fmt(hdl, error, "%s", msg));
	}

	/PRINTFLIKE3/
	int
	zfs_error_fmt(libzfs_handle_t hdl, int error, const char fmt, ...)
	{
	va_list ap;

	va_start(ap, fmt);

	zfs_verror(hdl, error, fmt, ap);

	va_end(ap);

	return (-1);
	}

	static int
	zfs_common_error(libzfs_handle_t hdl, int error, const char fmt,
	va_list ap)
	{
	switch (error) {
	case EPERM:
	case EACCES:
	zfs_verror(hdl, EZFS_PERM, fmt, ap);
	return (-1);

	case ECANCELED:
	zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap);
	return (-1);

	case EIO:
	zfs_verror(hdl, EZFS_IO, fmt, ap);
	return (-1);

	case EFAULT:
	zfs_verror(hdl, EZFS_FAULT, fmt, ap);
	return (-1);

	case EINTR:
	zfs_verror(hdl, EZFS_INTR, fmt, ap);
	return (-1);
	}

	return (0);
	}

	int
	zfs_standard_error(libzfs_handle_t hdl, int error, const char msg)
	{
	return (zfs_standard_error_fmt(hdl, error, "%s", msg));
	}

	/PRINTFLIKE3/
	int
	zfs_standard_error_fmt(libzfs_handle_t hdl, int error, const char fmt, ...)
	{
	va_list ap;

	va_start(ap, fmt);

	if (zfs_common_error(hdl, error, fmt, ap) != 0) {
	va_end(ap);
	return (-1);
	}

	switch (error) {
	case ENXIO:
	case ENODEV:
	case EPIPE:
	zfs_verror(hdl, EZFS_IO, fmt, ap);
	break;

	case ENOENT:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"dataset does not exist"));
	zfs_verror(hdl, EZFS_NOENT, fmt, ap);
	break;

	case ENOSPC:
	case EDQUOT:
	zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
	va_end(ap);
	return (-1);

	case EEXIST:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"dataset already exists"));
	zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
	break;

	case EBUSY:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"dataset is busy"));
	zfs_verror(hdl, EZFS_BUSY, fmt, ap);
	break;
	case EROFS:
	zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
	break;
	case ENAMETOOLONG:
	zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
	break;
	case ENOTSUP:
	zfs_verror(hdl, EZFS_BADVERSION, fmt, ap);
	break;
	case EAGAIN:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"pool I/O is currently suspended"));
	zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
	break;
	default:
	zfs_error_aux(hdl, strerror(error));
	zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
	break;
	}

	va_end(ap);
	return (-1);
	}

	int
	zpool_standard_error(libzfs_handle_t hdl, int error, const char msg)
	{
	return (zpool_standard_error_fmt(hdl, error, "%s", msg));
	}

	/*
	 * Map an errno value from a pool operation to a libzfs error, with a
	 * printf-style action string.
	 *
	 * zfs_common_error() gets the first chance to handle 'error'; the switch
	 * below covers the remaining values, and anything unrecognized is raised
	 * as EZFS_UNKNOWN with strerror(error) as the description.
	 * Always returns -1.
	 */
	/PRINTFLIKE3/
	int
	zpool_standard_error_fmt(libzfs_handle_t hdl, int error, const char fmt, ...)
	{
	va_list ap;

	va_start(ap, fmt);

	/* errno values shared with the dataset path are handled first */
	if (zfs_common_error(hdl, error, fmt, ap) != 0) {
	va_end(ap);
	return (-1);
	}

	switch (error) {
	case ENODEV:
	zfs_verror(hdl, EZFS_NODEVICE, fmt, ap);
	break;

	case ENOENT:
	zfs_error_aux(hdl,
	dgettext(TEXT_DOMAIN, "no such pool or dataset"));
	zfs_verror(hdl, EZFS_NOENT, fmt, ap);
	break;

	case EEXIST:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"pool already exists"));
	zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
	break;

	case EBUSY:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy"));
	zfs_verror(hdl, EZFS_BUSY, fmt, ap);
	break;

	case ENXIO:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"one or more devices is currently unavailable"));
	zfs_verror(hdl, EZFS_BADDEV, fmt, ap);
	break;

	case ENAMETOOLONG:
	zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap);
	break;

	case ENOTSUP:
	zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap);
	break;

	case EINVAL:
	zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap);
	break;

	case ENOSPC:
	case EDQUOT:
	zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
	va_end(ap);
	return (-1);

	case EAGAIN:
	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
	"pool I/O is currently suspended"));
	zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
	break;

	case EROFS:
	zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
	+ break;
	+ /* There is no pending operation to cancel */
	+ case ESRCH:
	+ zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
	break;

	default:
	zfs_error_aux(hdl, strerror(error));
	zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
	}

	va_end(ap);
	return (-1);
	}

	/*
	 * Report an out of memory condition: raise EZFS_NOMEM through zfs_error()
	 * with the action string "internal error" and return its result.
	 */
	int
	no_memory(libzfs_handle_t *hdl)
	{
	return (zfs_error(hdl, EZFS_NOMEM, "internal error"));
	}

	/*
	 * Allocate 'size' bytes of zero-filled memory with calloc().
	 * On allocation failure the condition is reported through no_memory()
	 * and NULL is returned to the caller.
	 */
	void *
	zfs_alloc(libzfs_handle_t *hdl, size_t size)
	{
		void *buf = calloc(1, size);

		if (buf != NULL)
			return (buf);

		(void) no_memory(hdl);
		return (NULL);
	}

	/*
	* A safe form of asprintf() which will die if the allocation fails.
	*/
	/PRINTFLIKE2/
	char *
	zfs_asprintf(libzfs_handle_t hdl, const char fmt, ...)
	{
	va_list ap;
	char *ret;
	int err;

	va_start(ap, fmt);

	err = vasprintf(&ret, fmt, ap);

	va_end(ap);

	if (err < 0)
	(void) no_memory(hdl);

	return (ret);
	}

	/*
	 * A form of realloc() which also zeroes newly allocated space.
	 *
	 * ptr     - buffer to resize
	 * oldsize - current size of 'ptr' in bytes
	 * newsize - requested size in bytes
	 *
	 * Returns the resized buffer, or NULL after reporting the failure through
	 * no_memory(); in that case 'ptr' has not been released by this function.
	 */
	void *
	zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize)
	{
		void *ret;

		if ((ret = realloc(ptr, newsize)) == NULL) {
			(void) no_memory(hdl);
			return (NULL);
		}

		/*
		 * Only a growing buffer has a fresh tail to clear. Without this
		 * check a shrinking request would underflow the size_t length.
		 */
		if (newsize > oldsize)
			bzero((char *)ret + oldsize, (newsize - oldsize));
		return (ret);
	}

	/*
	 * A form of strdup() that reports allocation failure through no_memory().
	 * Returns the duplicate, or NULL when strdup() fails.
	 */
	char *
	zfs_strdup(libzfs_handle_t *hdl, const char *str)
	{
		char *ret;

		if ((ret = strdup(str)) == NULL)
			(void) no_memory(hdl);

		return (ret);
	}

	/*
	 * Convert a number to an appropriately human-readable output.
	 * This is a direct pass-through to nicenum(); the result is written to
	 * 'buf', whose size is given by 'buflen'.
	 */
	void
	zfs_nicenum(uint64_t num, char *buf, size_t buflen)
	{
	nicenum(num, buf, buflen);
	}

	/*
	 * Set the handle's libzfs_printerr flag to 'printerr'.
	 */
	void
	libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
	{
	hdl->libzfs_printerr = printerr;
	}

	/*
	 * Make sure the "zfs" kernel module is available.
	 *
	 * If modfind() does not find the module, kldload() is attempted and the
	 * lookup repeated. Returns 0 when the module is found, or when the failure
	 * left errno at EEXIST; returns -1 otherwise.
	 */
	static int
	libzfs_load(void)
	{
		if (modfind("zfs") < 0) {
			/* Not present in kernel, try loading it. */
			if (kldload("zfs") < 0 || modfind("zfs") < 0) {
				if (errno != EEXIST)
					return (-1);
			}
		}
		return (0);
	}

	/*
	 * Create and initialize a libzfs handle.
	 *
	 * Loads the zfs module if needed, opens ZFS_DEV and MNTTAB, tries to open
	 * ZFS_EXPORTS_PATH (its absence is tolerated), initializes libzfs_core
	 * and the property/feature tables, and enables property debugging when
	 * the ZFS_PROP_DEBUG environment variable is set.
	 *
	 * Returns the new handle, or NULL on failure with every resource acquired
	 * so far released again. Release a handle with libzfs_fini().
	 */
	libzfs_handle_t *
	libzfs_init(void)
	{
		libzfs_handle_t *hdl;

		if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
			return (NULL);
		}

		if (libzfs_load() < 0) {
			free(hdl);
			return (NULL);
		}

		if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
			free(hdl);
			return (NULL);
		}

		if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) {
			(void) close(hdl->libzfs_fd);
			free(hdl);
			return (NULL);
		}

		/* optional: libzfs_sharetab stays NULL when the file is missing */
		hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r");

		if (libzfs_core_init() != 0) {
			(void) close(hdl->libzfs_fd);
			(void) fclose(hdl->libzfs_mnttab);
			/* fclose(NULL) is undefined, so only close what was opened */
			if (hdl->libzfs_sharetab != NULL)
				(void) fclose(hdl->libzfs_sharetab);
			free(hdl);
			return (NULL);
		}

		zfs_prop_init();
		zpool_prop_init();
		zpool_feature_init();
		libzfs_mnttab_init(hdl);

		if (getenv("ZFS_PROP_DEBUG") != NULL) {
			hdl->libzfs_prop_debug = B_TRUE;
		}

		return (hdl);
	}

	/*
	 * Tear down a libzfs handle: close its descriptor and the mnttab and
	 * sharetab streams (when open), release the per-handle state through the
	 * helper routines below, shut down libzfs_core, and free the handle.
	 * 'hdl' must not be used after this call.
	 */
	void
	libzfs_fini(libzfs_handle_t *hdl)
	{
	(void) close(hdl->libzfs_fd);
	if (hdl->libzfs_mnttab)
	(void) fclose(hdl->libzfs_mnttab);
	if (hdl->libzfs_sharetab)
	(void) fclose(hdl->libzfs_sharetab);
	zfs_uninit_libshare(hdl);
	zpool_free_handles(hdl);
	#ifdef illumos
	libzfs_fru_clear(hdl, B_TRUE);
	#endif
	namespace_clear(hdl);
	libzfs_mnttab_fini(hdl);
	libzfs_core_fini();
	free(hdl);
	}

	/*
	 * Return the libzfs handle stored in the pool handle (zpool_hdl field).
	 */
	libzfs_handle_t *
	zpool_get_handle(zpool_handle_t *zhp)
	{
	return (zhp->zpool_hdl);
	}

	/*
	 * Return the libzfs handle stored in the dataset handle (zfs_hdl field).
	 */
	libzfs_handle_t *
	zfs_get_handle(zfs_handle_t *zhp)
	{
	return (zhp->zfs_hdl);
	}

	/*
	 * Return the pool handle stored in the dataset handle (zpool_hdl field).
	 */
	zpool_handle_t *
	zfs_get_pool_handle(const zfs_handle_t *zhp)
	{
	return (zhp->zpool_hdl);
	}

	/*
	 * Given a name, determine whether or not it's a valid path
	 * (starts with '/' or "./"). If so, find the mount entry that backs the
	 * path (by walking the mnttab on illumos, via statfs() otherwise) and
	 * open the corresponding filesystem. If not, treat the path as an
	 * fs/vol/snap/bkmark name and open it as type 'argtype'.
	 *
	 * Returns the opened handle, or NULL when the path cannot be stat'ed, no
	 * mount entry is found, or the entry is not a ZFS filesystem. Path
	 * related failures are reported on stderr.
	 */
	zfs_handle_t *
	zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
	{
		struct stat64 statbuf;
		struct extmnttab entry;
		int ret;

		if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) {
			/*
			 * It's not a valid path, assume it's a name of type 'argtype'.
			 */
			return (zfs_open(hdl, path, argtype));
		}

		if (stat64(path, &statbuf) != 0) {
			(void) fprintf(stderr, "%s: %s\n", path, strerror(errno));
			return (NULL);
		}

	#ifdef illumos
		rewind(hdl->libzfs_mnttab);
		while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) {
			if (makedevice(entry.mnt_major, entry.mnt_minor) ==
			    statbuf.st_dev) {
				break;
			}
		}
	#else
		{
			struct statfs sfs;

			ret = statfs(path, &sfs);
			if (ret == 0)
				statfs2mnttab(&sfs, &entry);
			else {
				(void) fprintf(stderr, "%s: %s\n", path,
				    strerror(errno));
			}
		}
	#endif /* illumos */
		if (ret != 0) {
			return (NULL);
		}

		if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
			(void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
			    path);
			return (NULL);
		}

		return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM));
	}

	/*
	 * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
	 * an ioctl().
	 *
	 * A 'len' of 0 selects a default buffer size of 16 KB. Returns 0 on
	 * success and -1 when the buffer cannot be allocated.
	 */
	int
	zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
	{
		if (len == 0)
			len = 16 * 1024;
		zc->zc_nvlist_dst_size = len;
		/* the command structure carries the buffer address as an integer */
		zc->zc_nvlist_dst =
		    (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
		if (zc->zc_nvlist_dst == 0)
			return (-1);

		return (0);
	}

	/*
	 * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will
	 * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was
	 * filled in by the kernel to indicate the actual required size.
	 *
	 * The old buffer is freed before the new one is allocated, so its
	 * contents are not preserved. Returns 0 on success and -1 when the
	 * allocation fails.
	 */
	int
	zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc)
	{
		free((void *)(uintptr_t)zc->zc_nvlist_dst);
		zc->zc_nvlist_dst =
		    (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
		if (zc->zc_nvlist_dst == 0)
			return (-1);

		return (0);
	}

	/*
	 * Called to free the conf, src and dst nvlists stored in the command
	 * structure. The fields are reset afterwards so that a second call is
	 * harmless.
	 */
	void
	zcmd_free_nvlists(zfs_cmd_t *zc)
	{
		free((void *)(uintptr_t)zc->zc_nvlist_conf);
		free((void *)(uintptr_t)zc->zc_nvlist_src);
		free((void *)(uintptr_t)zc->zc_nvlist_dst);
		/*
		 * These fields hold addresses as integers (see the uintptr_t
		 * casts above), so clear them with 0 rather than NULL.
		 */
		zc->zc_nvlist_conf = 0;
		zc->zc_nvlist_src = 0;
		zc->zc_nvlist_dst = 0;
	}

	/*
	 * Pack 'nvl' in native encoding into a freshly allocated buffer.
	 *
	 * On success the buffer address is stored in '*outnv' (as an integer) and
	 * its length in '*outlen', and 0 is returned. Returns -1 when the buffer
	 * cannot be allocated. Sizing or packing failures trip verify().
	 */
	static int
	zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen,
	    nvlist_t *nvl)
	{
		char *packed;
		size_t len;

		verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0);

		if ((packed = zfs_alloc(hdl, len)) == NULL)
			return (-1);

		verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);

		*outnv = (uint64_t)(uintptr_t)packed;
		*outlen = len;

		return (0);
	}

	/*
	 * Pack 'nvl' into the zc_nvlist_conf / zc_nvlist_conf_size members of the
	 * command structure. Returns 0 on success, -1 on allocation failure.
	 */
	int
	zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
	{
		return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf,
		    &zc->zc_nvlist_conf_size, nvl));
	}

	/*
	 * Pack 'nvl' into the zc_nvlist_src / zc_nvlist_src_size members of the
	 * command structure. Returns 0 on success, -1 on allocation failure.
	 */
	int
	zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
	{
		return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src,
		    &zc->zc_nvlist_src_size, nvl));
	}

	/*
	 * Unpacks an nvlist from the ZFS ioctl command structure.
	 *
	 * The buffer described by zc_nvlist_dst / zc_nvlist_dst_size is unpacked
	 * into '*nvlp'. Returns 0 on success; any unpack failure is reported as
	 * an out of memory condition through no_memory().
	 */
	int
	zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp)
	{
		if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst,
		    zc->zc_nvlist_dst_size, nvlp, 0) != 0)
			return (no_memory(hdl));

		return (0);
	}

	/*
	 * Issue 'request' with command structure 'zc' on the handle's libzfs_fd
	 * descriptor and return the result of ioctl() unchanged.
	 */
	int
	zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc)
	{
		return (ioctl(hdl->libzfs_fd, request, zc));
	}

	/*
	* ================================================================
	* API shared by zfs and zpool property management
	* ================================================================
	*/

	/*
	 * Compute the column widths for property output and print the header
	 * line.
	 *
	 * cb_first is cleared on entry. In scripted mode nothing else happens:
	 * no widths are computed and no header is printed. Otherwise the widths
	 * in cb_colwidths start at the length of each column title and are
	 * widened from the entries of cb_proplist before the titles are printed.
	 */
	static void
	zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
	{
		zprop_list_t *pl;
		int i;
		char *title;
		size_t len;

		cbp->cb_first = B_FALSE;
		if (cbp->cb_scripted)
			return;

		/*
		 * Start with the length of the column headers.
		 */
		cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME"));
		cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN,
		    "PROPERTY"));
		cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
		    "VALUE"));
		cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
		    "RECEIVED"));
		cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
		    "SOURCE"));

		/* first property is always NAME */
		assert(cbp->cb_proplist->pl_prop ==
		    ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME));

		/*
		 * Go through and calculate the widths for each column. For the
		 * 'source' column, we kludge it up by taking the worst-case scenario of
		 * inheriting from the longest name. This is acceptable because in the
		 * majority of cases 'SOURCE' is the last column displayed, and we don't
		 * use the width anyway. Note that the 'VALUE' column can be oversized,
		 * if the name of the property is much longer than any values we find.
		 */
		for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
			/*
			 * 'PROPERTY' column
			 */
			if (pl->pl_prop != ZPROP_INVAL) {
				const char *propname = (type == ZFS_TYPE_POOL) ?
				    zpool_prop_to_name(pl->pl_prop) :
				    zfs_prop_to_name(pl->pl_prop);

				len = strlen(propname);
				if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
					cbp->cb_colwidths[GET_COL_PROPERTY] = len;
			} else {
				len = strlen(pl->pl_user_prop);
				if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
					cbp->cb_colwidths[GET_COL_PROPERTY] = len;
			}

			/*
			 * 'VALUE' column. The first property is always the 'name'
			 * property that was tacked on either by /sbin/zfs's
			 * zfs_do_get() or when calling zprop_expand_list(), so we
			 * ignore its width. If the user specified the name property
			 * to display, then it will be later in the list in any case.
			 */
			if (pl != cbp->cb_proplist &&
			    pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
				cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;

			/* 'RECEIVED' column. */
			if (pl != cbp->cb_proplist &&
			    pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
				cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;

			/*
			 * 'NAME' and 'SOURCE' columns
			 */
			if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME :
			    ZFS_PROP_NAME) &&
			    pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) {
				cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width;
				cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width +
				    strlen(dgettext(TEXT_DOMAIN, "inherited from"));
			}
		}

		/*
		 * Now go through and print the headers.
		 */
		for (i = 0; i < ZFS_GET_NCOLS; i++) {
			switch (cbp->cb_columns[i]) {
			case GET_COL_NAME:
				title = dgettext(TEXT_DOMAIN, "NAME");
				break;
			case GET_COL_PROPERTY:
				title = dgettext(TEXT_DOMAIN, "PROPERTY");
				break;
			case GET_COL_VALUE:
				title = dgettext(TEXT_DOMAIN, "VALUE");
				break;
			case GET_COL_RECVD:
				title = dgettext(TEXT_DOMAIN, "RECEIVED");
				break;
			case GET_COL_SOURCE:
				title = dgettext(TEXT_DOMAIN, "SOURCE");
				break;
			default:
				title = NULL;
			}

			if (title != NULL) {
				/* the last printed column is not padded */
				if (i == (ZFS_GET_NCOLS - 1) ||
				    cbp->cb_columns[i + 1] == GET_COL_NONE)
					(void) printf("%s", title);
				else
					(void) printf("%-*s ",
					    cbp->cb_colwidths[cbp->cb_columns[i]],
					    title);
			}
		}
		(void) printf("\n");
	}

	/*
	 * Display a single line of output, according to the settings in the callback
	 * structure.
	 *
	 * name        - text for the NAME column
	 * cbp         - callback data: selected columns, widths, source filter
	 * propname    - text for the PROPERTY column
	 * value       - text for the VALUE column
	 * sourcetype  - where the value came from; rows whose source type is not
	 *               selected in cbp->cb_sources are skipped entirely
	 * source      - name shown after "inherited from" for inherited values
	 * recvd_value - text for the RECEIVED column, or NULL to print "-"
	 *
	 * The header line is printed first if cbp->cb_first is still set.
	 */
	void
	zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
	    const char *propname, const char *value, zprop_source_t sourcetype,
	    const char *source, const char *recvd_value)
	{
		int i;
		const char *str = NULL;
		char buf[128];

		/*
		 * Ignore those source types that the user has chosen to ignore.
		 */
		if ((sourcetype & cbp->cb_sources) == 0)
			return;

		if (cbp->cb_first)
			zprop_print_headers(cbp, cbp->cb_type);

		for (i = 0; i < ZFS_GET_NCOLS; i++) {
			switch (cbp->cb_columns[i]) {
			case GET_COL_NAME:
				str = name;
				break;

			case GET_COL_PROPERTY:
				str = propname;
				break;

			case GET_COL_VALUE:
				str = value;
				break;

			case GET_COL_SOURCE:
				switch (sourcetype) {
				case ZPROP_SRC_NONE:
					str = "-";
					break;

				case ZPROP_SRC_DEFAULT:
					str = "default";
					break;

				case ZPROP_SRC_LOCAL:
					str = "local";
					break;

				case ZPROP_SRC_TEMPORARY:
					str = "temporary";
					break;

				case ZPROP_SRC_INHERITED:
					(void) snprintf(buf, sizeof (buf),
					    "inherited from %s", source);
					str = buf;
					break;
				case ZPROP_SRC_RECEIVED:
					str = "received";
					break;

				default:
					str = NULL;
					assert(!"unhandled zprop_source_t");
				}
				break;

			case GET_COL_RECVD:
				str = (recvd_value == NULL ? "-" : recvd_value);
				break;

			default:
				continue;
			}

			/*
			 * The last printed column is not padded. Check the index
			 * first so that cb_columns[i + 1] is never read past the
			 * ZFS_GET_NCOLS entries walked by this loop.
			 */
			if (i == (ZFS_GET_NCOLS - 1) ||
			    cbp->cb_columns[i + 1] == GET_COL_NONE)
				(void) printf("%s", str);
			else if (cbp->cb_scripted)
				(void) printf("%s\t", str);
			else
				(void) printf("%-*s ",
				    cbp->cb_colwidths[cbp->cb_columns[i]],
				    str);
		}

		(void) printf("\n");
	}

	/*
	 * Given a numeric suffix, convert the value into a number of bits that the
	 * resulting value must be shifted.
	 *
	 * An empty suffix yields 0. Otherwise the first character selects the
	 * magnitude from "BKMGTPEZ" (case-insensitive, 10 bits per step) and may
	 * be followed by a single 'b'/'B', except after 'B' itself.
	 *
	 * Returns the shift count, or -1 for an invalid suffix. The error text is
	 * recorded in 'hdl' when one is supplied; 'hdl' may be NULL.
	 */
	static int
	str2shift(libzfs_handle_t *hdl, const char *buf)
	{
		const char *ends = "BKMGTPEZ";
		size_t nends = strlen(ends);
		size_t i;

		if (buf[0] == '\0')
			return (0);
		/* <ctype.h> functions need an unsigned char value */
		for (i = 0; i < nends; i++) {
			if (toupper((unsigned char)buf[0]) == ends[i])
				break;
		}
		if (i == nends) {
			if (hdl)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "invalid numeric suffix '%s'"), buf);
			return (-1);
		}

		/*
		 * We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't
		 * allow 'BB' - that's just weird.
		 */
		if (buf[1] == '\0' ||
		    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0' &&
		    toupper((unsigned char)buf[0]) != 'B'))
			return ((int)(10 * i));

		if (hdl)
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "invalid numeric suffix '%s'"), buf);
		return (-1);
	}

	/*
	 * Convert a string of the form '100G' into a real number. Used when setting
	 * properties or creating a volume.
	 *
	 * hdl   - optional library handle; when non-NULL an extended error
	 *         message is recorded in it on failure
	 * value - text to parse: a decimal number, optionally with a fraction,
	 *         followed by an optional size suffix
	 * num   - receives the result; set to 0 before parsing starts
	 *
	 * Returns 0 on success and -1 when the text is not numeric, the suffix is
	 * invalid, or the result does not fit in 64 bits.
	 */
	int
	zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
	{
		char *end;
		int shift;

		*num = 0;

		/* Check to see if this looks like a number. */
		if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
			if (hdl)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "bad numeric value '%s'"), value);
			return (-1);
		}

		/* Rely on strtoull() to process the numeric portion. */
		errno = 0;
		*num = strtoull(value, &end, 10);

		/*
		 * Check for ERANGE, which indicates that the value is too large to fit
		 * in a 64-bit value.
		 */
		if (errno == ERANGE) {
			if (hdl)
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "numeric value is too large"));
			return (-1);
		}

		/*
		 * If we have a decimal value, then do the computation with floating
		 * point arithmetic. Otherwise, use standard arithmetic.
		 */
		if (*end == '.') {
			double fval = strtod(value, &end);

			if ((shift = str2shift(hdl, end)) == -1)
				return (-1);

			fval *= pow(2, shift);

			if (fval > UINT64_MAX) {
				if (hdl)
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "numeric value is too large"));
				return (-1);
			}

			*num = (uint64_t)fval;
		} else {
			if ((shift = str2shift(hdl, end)) == -1)
				return (-1);

			/*
			 * Check for overflow: the shift must be narrower than the
			 * type, and shifting back must give the original value.
			 */
			if (shift >= 64 || (*num << shift) >> shift != *num) {
				if (hdl)
					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
					    "numeric value is too large"));
				return (-1);
			}

			*num <<= shift;
		}

		return (0);
	}

	/*
	* Given a propname=value nvpair to set, parse any numeric properties
	* (index, boolean, etc) if they are specified as strings and add the
	* resulting nvpair to the returned nvlist.
	*
	* At the DSL layer, all properties are either 64-bit numbers or strings.
	* We want the user to be able to ignore this fact and specify properties
	* as native values (numbers, for example) or as strings (to simplify
	* command line utilities). This also handles converting index types
	* (compression, checksum, etc) from strings to their on-disk index.
	*/
	int
	zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
	    zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp,
	    const char *errbuf)
	{
		data_type_t datatype = nvpair_type(elem);
		zprop_type_t proptype;
		const char *propname;
		char *value;
		boolean_t isnone = B_FALSE;

		/* Pool and dataset properties live in separate tables. */
		if (type == ZFS_TYPE_POOL) {
			proptype = zpool_prop_get_type(prop);
			propname = zpool_prop_to_name(prop);
		} else {
			proptype = zfs_prop_get_type(prop);
			propname = zfs_prop_to_name(prop);
		}

		/*
		 * Convert any properties to the internal DSL value types.
		 * *svalp stays NULL unless the property is a string; that is how
		 * the code below decides which kind of nvpair to add.
		 */
		*svalp = NULL;
		*ivalp = 0;

		switch (proptype) {
		case PROP_TYPE_STRING:
			if (datatype != DATA_TYPE_STRING) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' must be a string"), nvpair_name(elem));
				goto error;
			}
			(void) nvpair_value_string(elem, svalp);
			if (strlen(*svalp) >= ZFS_MAXPROPLEN) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' is too long"), nvpair_name(elem));
				goto error;
			}
			break;

		case PROP_TYPE_NUMBER:
			if (datatype == DATA_TYPE_STRING) {
				(void) nvpair_value_string(elem, &value);
				if (strcmp(value, "none") == 0) {
					/* *ivalp keeps the 0 assigned above. */
					isnone = B_TRUE;
				} else if (zfs_nicestrtonum(hdl, value, ivalp)
				    != 0) {
					goto error;
				}
			} else if (datatype == DATA_TYPE_UINT64) {
				(void) nvpair_value_uint64(elem, ivalp);
			} else {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' must be a number"), nvpair_name(elem));
				goto error;
			}

			/*
			 * Quota special: force 'none' and don't allow 0.
			 */
			if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
			    (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "use 'none' to disable quota/refquota"));
				goto error;
			}

			/*
			 * Special handling for "*_limit=none". In this case it's
			 * not 0 but UINT64_MAX.
			 */
			if ((type & ZFS_TYPE_DATASET) && isnone &&
			    (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
			    prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
				*ivalp = UINT64_MAX;
			}
			break;

		case PROP_TYPE_INDEX:
			if (datatype != DATA_TYPE_STRING) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' must be a string"), nvpair_name(elem));
				goto error;
			}

			(void) nvpair_value_string(elem, &value);

			if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' must be one of '%s'"), propname,
				    zprop_values(prop, type));
				goto error;
			}
			break;

		default:
			abort();
		}

		/*
		 * Add the result to our return set of properties.
		 */
		if (*svalp != NULL) {
			if (nvlist_add_string(ret, propname, *svalp) != 0) {
				(void) no_memory(hdl);
				return (-1);
			}
		} else {
			if (nvlist_add_uint64(ret, propname, *ivalp) != 0) {
				(void) no_memory(hdl);
				return (-1);
			}
		}

		return (0);
	error:
		/* The specific reason was already recorded via zfs_error_aux(). */
		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
		return (-1);
	}

	/*
	 * Allocate one zprop_list_t entry for 'propname' and store it in *listp.
	 *
	 * A name with no usable property table entry is kept as a copied string
	 * in pl_user_prop when the checks below allow it; otherwise the failure
	 * is reported through zfs_error(). Returns 0 on success, -1 when an
	 * allocation fails, or the zfs_error() result for a rejected name.
	 */
	static int
	addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
	    zfs_type_t type)
	{
		int prop;
		zprop_list_t *entry;

		prop = zprop_name_to_prop(propname, type);

		/* A property that exists but not for this type counts as unknown. */
		if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type))
			prop = ZPROP_INVAL;

		/*
		 * When no property table entry can be found, return failure if
		 * this is a pool property or if this isn't a user-defined
		 * dataset property,
		 */
		if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL &&
		    !zpool_prop_feature(propname) &&
		    !zpool_prop_unsupported(propname)) ||
		    (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) &&
		    !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "invalid property '%s'"), propname);
			return (zfs_error(hdl, EZFS_BADPROP,
			    dgettext(TEXT_DOMAIN, "bad property list")));
		}

		if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
			return (-1);

		entry->pl_prop = prop;
		if (prop == ZPROP_INVAL) {
			/* Keep a private copy of the name; width is its length. */
			if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) ==
			    NULL) {
				free(entry);
				return (-1);
			}
			entry->pl_width = strlen(propname);
		} else {
			entry->pl_width = zprop_width(prop, &entry->pl_fixed,
			    type);
		}

		*listp = entry;

		return (0);
	}

	/*
	* Given a comma-separated list of properties, construct a property list
	* containing both user-defined and native properties. This function will
	* return a NULL list if 'all' is specified, which can later be expanded
	* by zprop_expand_list().
	*/
	int
	zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp,
	    zfs_type_t type)
	{
		*listp = NULL;

		/*
		 * If 'all' is specified, return a NULL list.
		 */
		if (strcmp(props, "all") == 0)
			return (0);

		/*
		 * If no props were specified, return an error.
		 */
		if (props[0] == '\0') {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "no properties specified"));
			return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
			    "bad property list")));
		}

		/*
		 * It would be nice to use getsubopt() here, but the inclusion of
		 * column aliases makes this more effort than it's worth.
		 */
		while (*props != '\0') {
			size_t len;
			char *p;
			char c;

			/* Find the end of the current name: next comma or end. */
			if ((p = strchr(props, ',')) == NULL) {
				len = strlen(props);
				p = props + len;
			} else {
				len = p - props;
			}

			/*
			 * Check for empty options.
			 */
			if (len == 0) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "empty property name"));
				return (zfs_error(hdl, EZFS_BADPROP,
				    dgettext(TEXT_DOMAIN, "bad property list")));
			}

			/*
			 * Check all regular property names.
			 *
			 * The separator is overwritten with a NUL so the name can be
			 * used as a string; note that the caller's buffer is modified
			 * and the separator is not put back.
			 */
			c = props[len];
			props[len] = '\0';

			if (strcmp(props, "space") == 0) {
				/* 'space' is shorthand for this fixed set of columns. */
				static char *spaceprops[] = {
					"name", "avail", "used", "usedbysnapshots",
					"usedbydataset", "usedbyrefreservation",
					"usedbychildren", NULL
				};
				int i;

				for (i = 0; spaceprops[i]; i++) {
					if (addlist(hdl, spaceprops[i], listp, type))
						return (-1);
					listp = &(*listp)->pl_next;
				}
			} else {
				if (addlist(hdl, props, listp, type))
					return (-1);
				listp = &(*listp)->pl_next;
			}

			/* Step past the separator only if there was one. */
			props = p;
			if (c == ',')
				props++;
		}

		return (0);
	}

	/*
	 * Release every entry of a property list, including the pl_user_prop
	 * string attached to each entry. A NULL list is a no-op.
	 */
	void
	zprop_free_list(zprop_list_t *pl)
	{
		zprop_list_t *cur;
		zprop_list_t *succ;

		for (cur = pl; cur != NULL; cur = succ) {
			/* Read the link before the node holding it is freed. */
			succ = cur->pl_next;
			free(cur->pl_user_prop);
			free(cur);
		}
	}

	/* State handed to zprop_expand_list_cb() through its void * argument. */
	typedef struct expand_data {
	zprop_list_t **last;	/* link slot where the next new entry is stored */
	libzfs_handle_t *hdl;	/* handle passed to zfs_alloc() */
	zfs_type_t type;	/* type passed to zprop_width() */
	} expand_data_t;

	/*
	 * Callback that appends one entry for 'prop' to the list tracked by the
	 * expand_data_t passed in 'cb'. Returns ZPROP_CONT after appending, or
	 * ZPROP_INVAL if the entry cannot be allocated.
	 */
	int
	zprop_expand_list_cb(int prop, void *cb)
	{
		expand_data_t *state = cb;
		zprop_list_t *node;

		node = zfs_alloc(state->hdl, sizeof (zprop_list_t));
		if (node == NULL)
			return (ZPROP_INVAL);

		node->pl_prop = prop;
		node->pl_width = zprop_width(prop, &node->pl_fixed, state->type);
		node->pl_all = B_TRUE;

		/* Link the node in, then make its own link the new tail slot. */
		*(state->last) = node;
		state->last = &node->pl_next;

		return (ZPROP_CONT);
	}

	/*
	 * If *plp is NULL (the 'all' case), fill it with an entry for each
	 * property visited by zprop_iter_common(), preceded by a 'name' entry.
	 * A non-NULL list is left untouched. Returns 0 on success, -1 on failure.
	 */
	int
	zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type)
	{
		zprop_list_t *entry;
		expand_data_t exp;

		if (*plp == NULL) {
			/*
			 * If this is the very first time we've been called for an
			 * 'all' specification, expand the list to include all native
			 * properties.
			 */
			exp.last = plp;
			exp.hdl = hdl;
			exp.type = type;

			if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE,
			    B_FALSE, type) == ZPROP_INVAL)
				return (-1);

			/*
			 * Add 'name' to the beginning of the list, which is handled
			 * specially.
			 */
			if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
				return (-1);

			entry->pl_prop = (type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME :
			    ZFS_PROP_NAME;
			entry->pl_width = zprop_width(entry->pl_prop,
			    &entry->pl_fixed, type);
			entry->pl_all = B_TRUE;
			entry->pl_next = *plp;
			*plp = entry;
		}
		return (0);
	}

	/*
	 * Forwards all arguments unchanged to zprop_iter_common() and returns
	 * whatever it returns.
	 */
	int
	zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered,
	    zfs_type_t type)
	{
		int rv;

		rv = zprop_iter_common(func, cb, show_all, ordered, type);
		return (rv);
	}
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c (revision 332525)
	@@ -1,1009 +1,1019 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 RackTop Systems.
	*/

	/*
	* LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
	* It has the following characteristics:
	*
	* - Thread Safe. libzfs_core is accessible concurrently from multiple
	* threads. This is accomplished primarily by avoiding global data
	* (e.g. caching). Since it's thread-safe, there is no reason for a
	* process to have multiple libzfs "instances". Therefore, we store
	* our few pieces of data (e.g. the file descriptor) in global
	* variables. The fd is reference-counted so that the libzfs_core
	* library can be "initialized" multiple times (e.g. by different
	* consumers within the same process).
	*
	* - Committed Interface. The libzfs_core interface will be committed,
	* therefore consumers can compile against it and be confident that
	* their code will continue to work on future releases of this code.
	* Currently, the interface is Evolving (not Committed), but we intend
	* to commit to it once it is more complete and we determine that it
	* meets the needs of all consumers.
	*
	* - Programmatic Error Handling. libzfs_core communicates errors with
	* defined error numbers, and doesn't print anything to stdout/stderr.
	*
	* - Thin Layer. libzfs_core is a thin layer, marshaling arguments
	* to/from the kernel ioctls. There is generally a 1:1 correspondence
	* between libzfs_core functions and ioctls to /dev/zfs.
	*
	* - Clear Atomicity. Because libzfs_core functions are generally 1:1
	* with kernel ioctls, and kernel ioctls are generally atomic, each
	* libzfs_core function is atomic. For example, creating multiple
	* snapshots with a single call to lzc_snapshot() is atomic -- it
	* can't fail with only some of the requested snapshots created, even
	* in the event of power loss or system crash.
	*
	* - Continued libzfs Support. Some higher-level operations (e.g.
	* support for "zfs send -R") are too complicated to fit the scope of
	* libzfs_core. This functionality will continue to live in libzfs.
	* Where appropriate, libzfs will use the underlying atomic operations
	* of libzfs_core. For example, libzfs may implement "zfs send -R |
	* zfs receive" by using individual "send one snapshot", rename,
	* destroy, and "receive one snapshot" operations in libzfs_core.
	* /sbin/zfs and /sbin/zpool will link with both libzfs and
	* libzfs_core. Other consumers should aim to use only libzfs_core,
	* since that will be the supported, stable interface going forwards.
	*/

	#define _IN_LIBZFS_CORE_

	#include <libzfs_core.h>
	#include <ctype.h>
	#include <unistd.h>
	#include <stdlib.h>
	#include <string.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <pthread.h>
	#include <sys/nvpair.h>
	#include <sys/param.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <sys/zfs_ioctl.h>
	#include "libzfs_core_compat.h"
	#include "libzfs_compat.h"

	#ifdef __FreeBSD__
	extern int zfs_ioctl_version;
	#endif

	static int g_fd = -1;
	static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
	static int g_refcount;

	/*
	 * Take a reference on the library, opening /dev/zfs when the reference
	 * count is zero. Returns 0 on success or the errno left by open().
	 */
	int
	libzfs_core_init(void)
	{
		int error = 0;

		(void) pthread_mutex_lock(&g_lock);
		if (g_refcount == 0) {
			g_fd = open("/dev/zfs", O_RDWR);
			if (g_fd < 0) {
				/*
				 * Capture errno right away; the unlock below is
				 * allowed to modify it.
				 */
				error = errno;
			}
		}
		/* Only a successful (or already open) descriptor is counted. */
		if (error == 0)
			g_refcount++;
		(void) pthread_mutex_unlock(&g_lock);

		return (error);
	}

	/*
	 * Drop one reference taken by libzfs_core_init(); once the count reaches
	 * zero the shared descriptor is closed and reset to -1.
	 */
	void
	libzfs_core_fini(void)
	{
		(void) pthread_mutex_lock(&g_lock);
		ASSERT3S(g_refcount, >, 0);

		/* Never let the count go negative, even if the assert is off. */
		if (g_refcount > 0)
			g_refcount -= 1;

		if (g_refcount == 0) {
			if (g_fd != -1) {
				(void) close(g_fd);
				g_fd = -1;
			}
		}
		(void) pthread_mutex_unlock(&g_lock);
	}

	/*
	 * Issue ioctl 'ioc' on the shared descriptor g_fd.
	 *
	 * 'name' is copied into zc_name and 'source' is packed and passed as the
	 * input nvlist. When 'resultp' is non-NULL a destination buffer is
	 * supplied, and if the kernel reports it filled, the buffer is unpacked
	 * into *resultp (which is otherwise left NULL).
	 *
	 * Returns 0 on success, ENOMEM if the destination buffer cannot be
	 * allocated, or the errno from the failed ioctl().
	 */
	static int
	lzc_ioctl(zfs_ioc_t ioc, const char *name,
	    nvlist_t *source, nvlist_t **resultp)
	{
		zfs_cmd_t zc = { 0 };
		int error = 0;
		char *packed;
	#ifdef __FreeBSD__
		nvlist_t *oldsource;
	#endif
		size_t size;

		ASSERT3S(g_refcount, >, 0);
		VERIFY3S(g_fd, !=, -1);

		(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));

	#ifdef __FreeBSD__
		if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
			zfs_ioctl_version = get_zfs_ioctl_version();

		/*
		 * For ioctl versions older than ZFS_IOCVER_LZC the compat layer
		 * gets a chance to rewrite 'ioc' and 'source'; the caller's list
		 * is remembered so a replacement can be freed at 'out'.
		 */
		if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
			oldsource = source;
			error = lzc_compat_pre(&zc, &ioc, &source);
			if (error)
				return (error);
		}
	#endif

		packed = fnvlist_pack(source, &size);
		zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
		zc.zc_nvlist_src_size = size;

		if (resultp != NULL) {
			*resultp = NULL;
			if (ioc == ZFS_IOC_CHANNEL_PROGRAM) {
				/* Output size comes from the request itself. */
				zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source,
				    ZCP_ARG_MEMLIMIT);
			} else {
				zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
			}
			zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
			    malloc(zc.zc_nvlist_dst_size);
	#ifdef illumos
			if (zc.zc_nvlist_dst == NULL) {
	#else
			if (zc.zc_nvlist_dst == 0) {
	#endif
				error = ENOMEM;
				goto out;
			}
		}

		while (ioctl(g_fd, ioc, &zc) != 0) {
			/*
			 * If ioctl exited with ENOMEM, we retry the ioctl after
			 * increasing the size of the destination nvlist.
			 *
			 * Channel programs that exit with ENOMEM ran over the
			 * lua memory sandbox; they should not be retried.
			 */
			if (errno == ENOMEM && resultp != NULL &&
			    ioc != ZFS_IOC_CHANNEL_PROGRAM) {
				free((void *)(uintptr_t)zc.zc_nvlist_dst);
				zc.zc_nvlist_dst_size *= 2;
				zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
				    malloc(zc.zc_nvlist_dst_size);
	#ifdef illumos
				if (zc.zc_nvlist_dst == NULL) {
	#else
				if (zc.zc_nvlist_dst == 0) {
	#endif
					error = ENOMEM;
					goto out;
				}
			} else {
				error = errno;
				break;
			}
		}

	#ifdef __FreeBSD__
		if (zfs_ioctl_version < ZFS_IOCVER_LZC)
			lzc_compat_post(&zc, ioc);
	#endif
		/*
		 * Only hand back a result when the caller asked for one; without
		 * the resultp check a filled flag would lead to a NULL store.
		 */
		if (zc.zc_nvlist_dst_filled && resultp != NULL) {
			*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
			    zc.zc_nvlist_dst_size);
		}
	#ifdef __FreeBSD__
		if (zfs_ioctl_version < ZFS_IOCVER_LZC)
			lzc_compat_outnvl(&zc, ioc, resultp);
	#endif
	out:
	#ifdef __FreeBSD__
		if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
			/* Free a list substituted by the compat layer, not ours. */
			if (source != oldsource)
				nvlist_free(source);
			source = oldsource;
		}
	#endif
		fnvlist_pack_free(packed, size);
		free((void *)(uintptr_t)zc.zc_nvlist_dst);
		return (error);
	}

	/*
	 * Issue ZFS_IOC_CREATE for 'fsname' with the given dataset type and,
	 * when non-NULL, a "props" nvlist. Returns the lzc_ioctl() result.
	 */
	int
	lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
	{
		int error;
		nvlist_t *args = fnvlist_alloc();

		fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
		if (props != NULL)
			fnvlist_add_nvlist(args, "props", props);
		error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
		nvlist_free(args);
		return (error);
	}

	/*
	 * Issue ZFS_IOC_CLONE for 'fsname', passing 'origin' as the "origin"
	 * argument and, when non-NULL, a "props" nvlist. Returns the
	 * lzc_ioctl() result.
	 */
	int
	lzc_clone(const char *fsname, const char *origin,
	    nvlist_t *props)
	{
		int error;
		nvlist_t *args = fnvlist_alloc();

		fnvlist_add_string(args, "origin", origin);
		if (props != NULL)
			fnvlist_add_nvlist(args, "props", props);
		error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
		nvlist_free(args);
		return (error);
	}

	/*
	 * Issue ZFS_IOC_PROMOTE for 'fsname'. Returns 0 on success or the errno
	 * from the ioctl. On EEXIST, if 'snapnamebuf' is non-NULL, the string
	 * returned in zc_string (presumably the conflicting snapshot name) is
	 * copied into it, bounded by 'snapnamelen'.
	 */
	int
	lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen)
	{
		/*
		 * The promote ioctl is still legacy, so we need to construct our
		 * own zfs_cmd_t rather than using lzc_ioctl().
		 */
		zfs_cmd_t zc = { 0 };

		ASSERT3S(g_refcount, >, 0);
		VERIFY3S(g_fd, !=, -1);

		(void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name));
		if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) {
			/* Save errno before strlcpy() has a chance to change it. */
			int error = errno;

			if (error == EEXIST && snapnamebuf != NULL)
				(void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen);
			return (error);
		}
		return (0);
	}

	+int
	+lzc_remap(const char *fsname)	/* issues ZFS_IOC_REMAP against fsname */
	+{
	+ int error;
	+ nvlist_t *args = fnvlist_alloc();	/* left empty: no arguments are added */
	+ error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL);	/* NULL: no result nvlist requested */
	+ nvlist_free(args);
	+ return (error);	/* the lzc_ioctl() result, unchanged */
	+}
	+
	/*
	* Creates snapshots.
	*
	* The keys in the snaps nvlist are the snapshots to be created.
	* They must all be in the same pool.
	*
	* The props nvlist is properties to set. Currently only user properties
	* are supported. { user:prop_name -> string value }
	*
	* The returned results nvlist will have an entry for each snapshot that failed.
	* The value will be the (int32) error code.
	*
	* The return value will be 0 if all snapshots were created, otherwise it will
	* be the errno of a (unspecified) snapshot that failed.
	*/
	int
	lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
	{
		nvpair_t *elem;
		nvlist_t *args;
		int error;
		char pool[ZFS_MAX_DATASET_NAME_LEN];

		*errlist = NULL;

		/* determine the pool name */
		elem = nvlist_next_nvpair(snaps, NULL);
		if (elem == NULL)
			return (0);	/* empty request: nothing to create */
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the first name at its first '/' or '@'. */
		pool[strcspn(pool, "/@")] = '\0';

		args = fnvlist_alloc();
		fnvlist_add_nvlist(args, "snaps", snaps);
		if (props != NULL)
			fnvlist_add_nvlist(args, "props", props);

		error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
		nvlist_free(args);

		return (error);
	}

	/*
	* Destroys snapshots.
	*
	* The keys in the snaps nvlist are the snapshots to be destroyed.
	* They must all be in the same pool.
	*
	* Snapshots that do not exist will be silently ignored.
	*
	* If 'defer' is not set, and a snapshot has user holds or clones, the
	* destroy operation will fail and none of the snapshots will be
	* destroyed.
	*
	* If 'defer' is set, and a snapshot has user holds or clones, it will be
	* marked for deferred destruction, and will be destroyed when the last hold
	* or clone is removed/destroyed.
	*
	* The return value will be 0 if all snapshots were destroyed (or marked for
	* later destruction if 'defer' is set) or didn't exist to begin with.
	*
	* Otherwise the return value will be the errno of a (unspecified) snapshot
	* that failed, no snapshots will be destroyed, and the errlist will have an
	* entry for each snapshot that failed. The value in the errlist will be
	* the (int32) error code.
	*/
	int
	lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
	{
		nvpair_t *elem;
		nvlist_t *args;
		int error;
		char pool[ZFS_MAX_DATASET_NAME_LEN];

		/* determine the pool name */
		elem = nvlist_next_nvpair(snaps, NULL);
		if (elem == NULL)
			return (0);	/* empty request: *errlist is not touched */
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the first name at its first '/' or '@'. */
		pool[strcspn(pool, "/@")] = '\0';

		args = fnvlist_alloc();
		fnvlist_add_nvlist(args, "snaps", snaps);
		if (defer)
			fnvlist_add_boolean(args, "defer");

		error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
		nvlist_free(args);

		return (error);
	}

	/*
	 * Issue ZFS_IOC_SPACE_SNAPS against 'lastsnap' with 'firstsnap' as the
	 * "firstsnap" argument, and store the "used" value from the result in
	 * *usedp. Returns EINVAL if no '@' is found in the (possibly truncated)
	 * copy of 'firstsnap', otherwise the lzc_ioctl() result.
	 */
	int
	lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
	    uint64_t *usedp)
	{
		nvlist_t *args;
		nvlist_t *result;
		int err;
		char fs[ZFS_MAX_DATASET_NAME_LEN];
		char *atp;

		/*
		 * determine the fs name
		 *
		 * Note that 'fs' only serves to validate 'firstsnap' here; the
		 * ioctl below is issued against 'lastsnap'.
		 */
		(void) strlcpy(fs, firstsnap, sizeof (fs));
		atp = strchr(fs, '@');
		if (atp == NULL)
			return (EINVAL);
		*atp = '\0';

		args = fnvlist_alloc();
		fnvlist_add_string(args, "firstsnap", firstsnap);

		err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
		nvlist_free(args);
		if (err == 0)
			*usedp = fnvlist_lookup_uint64(result, "used");
		fnvlist_free(result);

		return (err);
	}

	/*
	 * Reports whether the ZFS_IOC_OBJSET_STATS ioctl succeeds for 'dataset'.
	 */
	boolean_t
	lzc_exists(const char *dataset)
	{
		/*
		 * The objset_stats ioctl is still legacy, so a zfs_cmd_t is built
		 * by hand here instead of going through lzc_ioctl().
		 */
		zfs_cmd_t zc = { 0 };
		int rc;

		ASSERT3S(g_refcount, >, 0);
		VERIFY3S(g_fd, !=, -1);

		(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
		rc = ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc);
		return (rc == 0);
	}

	/*
	* Create "user holds" on snapshots. If there is a hold on a snapshot,
	* the snapshot can not be destroyed. (However, it can be marked for deletion
	* by lzc_destroy_snaps(defer=B_TRUE).)
	*
	* The keys in the nvlist are snapshot names.
	* The snapshots must all be in the same pool.
	* The value is the name of the hold (string type).
	*
	* If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
	* In this case, when the cleanup_fd is closed (including on process
	* termination), the holds will be released. If the system is shut down
	* uncleanly, the holds will be released when the pool is next opened
	* or imported.
	*
	* Holds for snapshots which don't exist will be skipped and have an entry
	* added to errlist, but will not cause an overall failure.
	*
	* The return value will be 0 if all holds, for snapshots that existed,
	* were successfully created.
	*
	* Otherwise the return value will be the errno of a (unspecified) hold that
	* failed and no holds will be created.
	*
	* In all cases the errlist will have an entry for each hold that failed
	* (name = snapshot), with its value being the error code (int32).
	*/
	int
	lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
	{
		char pool[ZFS_MAX_DATASET_NAME_LEN];
		nvlist_t *args;
		nvpair_t *elem;
		int error;

		/* determine the pool name */
		elem = nvlist_next_nvpair(holds, NULL);
		if (elem == NULL)
			return (0);	/* empty request: *errlist is not touched */
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the first name at its first '/' or '@'. */
		pool[strcspn(pool, "/@")] = '\0';

		args = fnvlist_alloc();
		fnvlist_add_nvlist(args, "holds", holds);
		/* -1 means no cleanup descriptor; the argument is then omitted. */
		if (cleanup_fd != -1)
			fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);

		error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
		nvlist_free(args);
		return (error);
	}

	/*
	* Release "user holds" on snapshots. If the snapshot has been marked for
	* deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
	* any clones, and all the user holds are removed, then the snapshot will be
	* destroyed.
	*
	* The keys in the nvlist are snapshot names.
	* The snapshots must all be in the same pool.
	* The value is a nvlist whose keys are the holds to remove.
	*
	* Holds which failed to release because they didn't exist will have an entry
	* added to errlist, but will not cause an overall failure.
	*
	* The return value will be 0 if the nvl holds was empty or all holds that
	* existed, were successfully removed.
	*
	* Otherwise the return value will be the errno of a (unspecified) hold that
	* failed to release and no holds will be released.
	*
	* In all cases the errlist will have an entry for each hold that failed
	* to release.
	*/
	int
	lzc_release(nvlist_t *holds, nvlist_t **errlist)
	{
		char pool[ZFS_MAX_DATASET_NAME_LEN];
		nvpair_t *elem;

		/* determine the pool name */
		elem = nvlist_next_nvpair(holds, NULL);
		if (elem == NULL)
			return (0);	/* empty request: *errlist is not touched */
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the first name at its first '/' or '@'. */
		pool[strcspn(pool, "/@")] = '\0';

		/* 'holds' is passed directly as the ioctl's input nvlist. */
		return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
	}

	/*
	* Retrieve list of user holds on the specified snapshot.
	*
	* On success, *holdsp will be set to a nvlist which the caller must free.
	* The keys are the names of the holds, and the value is the creation time
	* of the hold (uint64) in seconds since the epoch.
	*/
	int
	lzc_get_holds(const char *snapname, nvlist_t **holdsp)
	{
		int error;
		/* The ioctl takes no arguments, so an empty nvlist is sent. */
		nvlist_t *innvl = fnvlist_alloc();

		error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
		fnvlist_free(innvl);
		return (error);
	}

	/*
	* Generate a zfs send stream for the specified snapshot and write it to
	* the specified file descriptor.
	*
	* "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
	*
	* If "from" is NULL, a full (non-incremental) stream will be sent.
	* If "from" is non-NULL, it must be the full name of a snapshot or
	* bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
	* "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or
	* bookmark must represent an earlier point in the history of "snapname").
	* It can be an earlier snapshot in the same filesystem or zvol as "snapname",
	* or it can be the origin of "snapname"'s filesystem, or an earlier
	* snapshot in the origin, etc.
	*
	* "fd" is the file descriptor to write the send stream to.
	*
	* If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
	* to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
	* records with drr_blksz > 128K.
	*
	* If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
	* to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
	* which the receiving system must support (as indicated by support
	* for the "embedded_data" feature).
	*/
	int
	lzc_send(const char *snapname, const char *from, int fd,
	    enum lzc_send_flags flags)
	{
		/* Zero resume object and offset: lzc_send_resume() adds neither. */
		return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
	}

	/*
	 * Issue ZFS_IOC_SEND_NEW for 'snapname', writing to 'fd'. 'from' and the
	 * flag bits are translated into optional arguments; "resume_object" and
	 * "resume_offset" are added only when at least one of them is non-zero.
	 * Returns the lzc_ioctl() result.
	 */
	int
	lzc_send_resume(const char *snapname, const char *from, int fd,
	    enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
	{
		nvlist_t *args;
		int err;

		args = fnvlist_alloc();
		fnvlist_add_int32(args, "fd", fd);
		if (from != NULL)
			fnvlist_add_string(args, "fromsnap", from);
		if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
			fnvlist_add_boolean(args, "largeblockok");
		if (flags & LZC_SEND_FLAG_EMBED_DATA)
			fnvlist_add_boolean(args, "embedok");
		if (flags & LZC_SEND_FLAG_COMPRESS)
			fnvlist_add_boolean(args, "compressok");
		if (resumeobj != 0 || resumeoff != 0) {
			fnvlist_add_uint64(args, "resume_object", resumeobj);
			fnvlist_add_uint64(args, "resume_offset", resumeoff);
		}
		err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
		nvlist_free(args);
		return (err);
	}

	/*
	* "from" can be NULL, a snapshot, or a bookmark.
	*
	* If from is NULL, a full (non-incremental) stream will be estimated. This
	* is calculated very efficiently.
	*
	* If from is a snapshot, lzc_send_space uses the deadlists attached to
	* each snapshot to efficiently estimate the stream size.
	*
	* If from is a bookmark, the indirect blocks in the destination snapshot
	* are traversed, looking for blocks with a birth time since the creation TXG of
	* the snapshot this bookmark was created from. This will result in
	* significantly more I/O and be less efficient than a send space estimation on
	* an equivalent snapshot.
	*/
	int
	lzc_send_space(const char *snapname, const char *from,
	    enum lzc_send_flags flags, uint64_t *spacep)
	{
		nvlist_t *args;
		nvlist_t *result;
		int err;

		args = fnvlist_alloc();
		if (from != NULL)
			fnvlist_add_string(args, "from", from);
		if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
			fnvlist_add_boolean(args, "largeblockok");
		if (flags & LZC_SEND_FLAG_EMBED_DATA)
			fnvlist_add_boolean(args, "embedok");
		if (flags & LZC_SEND_FLAG_COMPRESS)
			fnvlist_add_boolean(args, "compressok");
		err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
		nvlist_free(args);
		/* *spacep is written only on success. */
		if (err == 0)
			*spacep = fnvlist_lookup_uint64(result, "space");
		nvlist_free(result);
		return (err);
	}

	/*
	 * Read exactly 'ilen' bytes from 'fd' into 'buf', looping over short
	 * reads. Returns 0 once the buffer is full, or EIO if read() fails or
	 * end-of-file arrives first.
	 */
	static int
	recv_read(int fd, void *buf, int ilen)
	{
		char *cp = buf;
		int len = ilen;

		while (len > 0) {
			ssize_t rv = read(fd, cp, (size_t)len);

			/*
			 * Stop on error or end-of-file before touching cp/len, so
			 * the cursor never moves by a negative amount.
			 */
			if (rv <= 0)
				return (EIO);

			cp += rv;
			len -= (int)rv;
		}

		/* A negative ilen leaves len != 0 and is reported as EIO. */
		if (len != 0)
			return (EIO);

		return (0);
	}

	/*
	 * Common implementation of the receive entry points: fills in a
	 * zfs_cmd_t from the arguments and issues ZFS_IOC_RECV on g_fd.
	 *
	 * When 'begin_record' is NULL the BEGIN record is read from 'fd';
	 * otherwise the supplied record is copied. Returns 0 on success, EINVAL
	 * if 'snapname' has no '@', ENOENT if the filesystem does not exist and
	 * its name has no '/', EIO if the BEGIN record cannot be read, or the
	 * errno from the ioctl.
	 */
	static int
	recv_impl(const char *snapname, nvlist_t *props, const char *origin,
	    boolean_t force, boolean_t resumable, int fd,
	    const dmu_replay_record_t *begin_record)
	{
		/*
		 * The receive ioctl is still legacy, so we need to construct our
		 * own zfs_cmd_t rather than using lzc_ioctl().
		 */
		zfs_cmd_t zc = { 0 };
		char *atp;
		char *packed = NULL;
		size_t size;
		int error;

		ASSERT3S(g_refcount, >, 0);
		VERIFY3S(g_fd, !=, -1);

		/* zc_name is name of containing filesystem */
		(void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
		atp = strchr(zc.zc_name, '@');
		if (atp == NULL)
			return (EINVAL);
		*atp = '\0';

		/* if the fs does not exist, try its parent. */
		if (!lzc_exists(zc.zc_name)) {
			char *slashp = strrchr(zc.zc_name, '/');

			if (slashp == NULL)
				return (ENOENT);
			*slashp = '\0';
		}

		/* zc_value is full name of the snapshot to create */
		(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));

		if (props != NULL) {
			/* zc_nvlist_src is props to set */
			packed = fnvlist_pack(props, &size);
			zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
			zc.zc_nvlist_src_size = size;
		}

		/* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
		if (origin != NULL)
			(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));

		/* zc_begin_record is non-byteswapped BEGIN record */
		if (begin_record == NULL) {
			error = recv_read(fd, &zc.zc_begin_record,
			    sizeof (zc.zc_begin_record));
			if (error != 0)
				goto out;
		} else {
			zc.zc_begin_record = *begin_record;
		}

		/* zc_cookie is fd to read from */
		zc.zc_cookie = fd;

		/* zc guid is force flag */
		zc.zc_guid = force;

		zc.zc_resumable = resumable;

		/* zc_cleanup_fd is unused */
		zc.zc_cleanup_fd = -1;

		error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
		if (error != 0)
			error = errno;

	out:
		if (packed != NULL)
			fnvlist_pack_free(packed, size);
		/* zc_nvlist_dst is never assigned here; this frees NULL. */
		free((void *)(uintptr_t)zc.zc_nvlist_dst);
		return (error);
	}

	/*
	* The simplest receive case: receive from the specified fd, creating the
	* specified snapshot. Apply the specified properties as "received" properties
	* (which can be overridden by locally-set properties). If the stream is a
	* clone, its origin snapshot must be specified by 'origin'. The 'force'
	* flag will cause the target filesystem to be rolled back or destroyed if
	* necessary to receive.
	*
	* Return 0 on success or an errno on failure.
	*
	* Note: this interface does not work on dedup'd streams
	* (those with DMU_BACKUP_FEATURE_DEDUP).
	*/
	int
	lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
	    boolean_t force, int fd)
	{
		/* Not resumable; recv_impl() reads the BEGIN record from fd. */
		return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
	}

	/*
	* Like lzc_receive, but if the receive fails due to premature stream
	* termination, the intermediate state will be preserved on disk. In this
	* case, ECKSUM will be returned. The receive may subsequently be resumed
	* with a resuming send stream generated by lzc_send_resume().
	*/
	int
	lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
	    boolean_t force, int fd)
	{
		/* Resumable; recv_impl() reads the BEGIN record from fd. */
		return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
	}

	/*
	* Like lzc_receive, but allows the caller to read the begin record and then to
	* pass it in. That could be useful if the caller wants to derive, for example,
	* the snapname or the origin parameters based on the information contained in
	* the begin record.
	* The begin record must be in its original form as read from the stream,
	* in other words, it should not be byteswapped.
	*
	* The 'resumable' parameter allows to obtain the same behavior as with
	* lzc_receive_resumable.
	*/
	int
	lzc_receive_with_header(const char *snapname, nvlist_t *props,
	    const char *origin, boolean_t force, boolean_t resumable, int fd,
	    const dmu_replay_record_t *begin_record)
	{
		/*
		 * A NULL record would make recv_impl() read one from fd, which
		 * is not what this entry point is for.
		 */
		if (begin_record == NULL)
			return (EINVAL);
		return (recv_impl(snapname, props, origin, force, resumable, fd,
		    begin_record));
	}

	/*
	* Roll back this filesystem or volume to its most recent snapshot.
	* If snapnamebuf is not NULL, it will be filled in with the name
	* of the most recent snapshot.
	* Note that the latest snapshot may change if a new one is concurrently
	* created or the current one is destroyed. lzc_rollback_to can be used
	* to roll back to a specific latest snapshot.
	*
	* Return 0 on success or an errno on failure.
	*/
	int
	lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
	{
		nvlist_t *args;
		nvlist_t *result;
		int err;

		/* No "target" argument is added to the request. */
		args = fnvlist_alloc();
		err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
		nvlist_free(args);
		if (err == 0 && snapnamebuf != NULL) {
			/* The result's "target" string is copied out for the caller. */
			const char *snapname = fnvlist_lookup_string(result, "target");

			(void) strlcpy(snapnamebuf, snapname, snapnamelen);
		}
		nvlist_free(result);

		return (err);
	}

	/*
	* Roll back this filesystem or volume to the specified snapshot,
	* if possible.
	*
	* Return 0 on success or an errno on failure.
	*/
	int
	lzc_rollback_to(const char *fsname, const char *snapname)
	{
		nvlist_t *args;
		nvlist_t *result;
		int err;

		args = fnvlist_alloc();
		fnvlist_add_string(args, "target", snapname);
		err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
		nvlist_free(args);
		/* The result nvlist is not used; it is only released. */
		nvlist_free(result);
		return (err);
	}

	/*
	 * Creates bookmarks.
	 *
	 * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark")
	 * to the name of the snapshot (e.g. "pool/fs@snap").  All the bookmarks and
	 * snapshots must be in the same pool.
	 *
	 * The returned results nvlist will have an entry for each bookmark that
	 * failed.  The value will be the (int32) error code.
	 *
	 * The return value will be 0 if all bookmarks were created (or if the
	 * bookmarks list is empty), otherwise it will be the errno of an
	 * (undetermined) bookmark that failed.
	 */
	int
	lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
	{
		nvpair_t *elem;
		int error;
		char pool[ZFS_MAX_DATASET_NAME_LEN];

		/* determine the pool name */
		elem = nvlist_next_nvpair(bookmarks, NULL);
		if (elem == NULL)
			return (0);
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the name at the first '/' or '#', leaving only the pool. */
		pool[strcspn(pool, "/#")] = '\0';

		error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);

		return (error);
	}

	/*
	 * Retrieve bookmarks.
	 *
	 * Retrieve the list of bookmarks for the given file system.  The props
	 * parameter is an nvlist of property names (with no values) that will be
	 * returned for each bookmark.
	 *
	 * The following are valid properties on bookmarks, all of which are numbers
	 * (represented as uint64 in the nvlist)
	 *
	 * "guid" - globally unique identifier of the snapshot it refers to
	 * "createtxg" - txg when the snapshot it refers to was created
	 * "creation" - timestamp when the snapshot it refers to was created
	 *
	 * The format of the returned nvlist is as follows:
	 * <short name of bookmark> -> {
	 *     <name of property> -> {
	 *         "value" -> uint64
	 *     }
	 * }
	 *
	 * Returns the result of the ZFS_IOC_GET_BOOKMARKS ioctl.
	 */
	int
	lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
	{
		return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
	}

	/*
	 * Destroys bookmarks.
	 *
	 * The keys in the bmarks nvlist are the bookmarks to be destroyed.
	 * They must all be in the same pool.  Bookmarks are specified as
	 * <fs>#<bmark>.
	 *
	 * Bookmarks that do not exist will be silently ignored.
	 *
	 * The return value will be 0 if all bookmarks that existed were destroyed
	 * (or if the bmarks list is empty).
	 *
	 * Otherwise the return value will be the errno of an (undetermined)
	 * bookmark that failed, no bookmarks will be destroyed, and the errlist
	 * will have an entry for each bookmark that failed.  The value in the
	 * errlist will be the (int32) error code.
	 */
	int
	lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
	{
		nvpair_t *elem;
		int error;
		char pool[ZFS_MAX_DATASET_NAME_LEN];

		/* determine the pool name */
		elem = nvlist_next_nvpair(bmarks, NULL);
		if (elem == NULL)
			return (0);
		(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
		/* Cut the name at the first '/' or '#', leaving only the pool. */
		pool[strcspn(pool, "/#")] = '\0';

		error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);

		return (error);
	}

	/*
	 * Common implementation of lzc_channel_program() and
	 * lzc_channel_program_nosync().  Packs the program text, its argument
	 * list, the sync flag and the instruction/memory limits into an nvlist
	 * and issues ZFS_IOC_CHANNEL_PROGRAM against the given pool.
	 *
	 * Returns the result of the ioctl; any output is returned through outnvl.
	 */
	static int
	lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
	    uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
	{
		int error;
		nvlist_t *args;

		args = fnvlist_alloc();
		fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
		fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
		fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
		fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
		fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
		error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
		fnvlist_free(args);

		return (error);
	}

	/*
	 * Executes a channel program.
	 *
	 * If this function returns 0 the channel program was successfully loaded and
	 * ran without failing.  Note that individual commands the channel program ran
	 * may have failed and the channel program is responsible for reporting such
	 * errors through outnvl if they are important.
	 *
	 * This method may also return:
	 *
	 * EINVAL     The program contains syntax errors, or an invalid memory or time
	 *            limit was given.  No part of the channel program was executed.
	 *            If caused by syntax errors, 'outnvl' contains information about
	 *            the errors.
	 *
	 * EDOM       The program was executed, but encountered a runtime error, such
	 *            as calling a function with incorrect arguments, invoking the
	 *            error() function directly, failing an assert() command, etc.
	 *            Some portion of the channel program may have executed and
	 *            committed changes.  Information about the failure can be found
	 *            in 'outnvl'.
	 *
	 * ENOMEM     The program fully executed, but the output buffer was not large
	 *            enough to store the returned value.  No output is returned
	 *            through 'outnvl'.
	 *
	 * ENOSPC     The program was terminated because it exceeded its memory usage
	 *            limit.  Some portion of the channel program may have executed
	 *            and committed changes to disk.  No output is returned through
	 *            'outnvl'.
	 *
	 * ETIMEDOUT  The program was terminated because it exceeded its Lua
	 *            instruction limit.  Some portion of the channel program may
	 *            have executed and committed changes to disk.  No output is
	 *            returned through 'outnvl'.
	 */
	int
	lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
	    uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
	{
		/* B_TRUE selects the syncing variant of the shared implementation. */
		return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
		    memlimit, argnvl, outnvl));
	}

	/*
	 * Executes a read-only channel program.
	 *
	 * A read-only channel program works programmatically the same way as a
	 * normal channel program executed with lzc_channel_program().  The only
	 * difference is it runs exclusively in open-context and therefore can
	 * return faster.  The downside to that is that the program cannot change
	 * on-disk state by calling functions from the zfs.sync submodule.
	 *
	 * The 'timeout' argument is forwarded as the instruction limit of the
	 * shared implementation.
	 *
	 * The return values of this function (and their meaning) are exactly the
	 * same as the ones described in lzc_channel_program().
	 */
	int
	lzc_channel_program_nosync(const char *pool, const char *program,
	    uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
	{
		/* B_FALSE selects the non-syncing (read-only) variant. */
		return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
		    memlimit, argnvl, outnvl));
	}
	Index: stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
	===================================================================
	--- stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h (revision 332524)
	+++ stable/11/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h (revision 332525)
	@@ -1,98 +1,99 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright 2017 RackTop Systems.
	*/

	#ifndef	_LIBZFS_CORE_H
	#define	_LIBZFS_CORE_H

	#include <libnvpair.h>
	#include <sys/param.h>
	#include <sys/types.h>

	#ifdef	__cplusplus
	extern "C" {
	#endif

	int libzfs_core_init(void);
	void libzfs_core_fini(void);

	/*
	 * NB: this type should be kept binary compatible with dmu_objset_type_t.
	 */
	enum lzc_dataset_type {
		LZC_DATSET_TYPE_ZFS = 2,
		LZC_DATSET_TYPE_ZVOL
	};

	/*
	 * Dataset, snapshot and bookmark operations.  Name arguments are C strings;
	 * nvlist_t * arguments are inputs and nvlist_t ** arguments are outputs.
	 */
	+int lzc_remap(const char *fsname);
	int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **);
	int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *);
	int lzc_clone(const char *, const char *, nvlist_t *);
	int lzc_promote(const char *, char *, int);
	int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
	int lzc_bookmark(nvlist_t *, nvlist_t **);
	int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
	int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);

	int lzc_snaprange_space(const char *, const char *, uint64_t *);

	int lzc_hold(nvlist_t *, int, nvlist_t **);
	int lzc_release(nvlist_t *, nvlist_t **);
	int lzc_get_holds(const char *, nvlist_t **);

	/* Bit flags accepted by the lzc_send*() family. */
	enum lzc_send_flags {
		LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
		LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
		LZC_SEND_FLAG_COMPRESS = 1 << 2
	};

	int lzc_send(const char *, const char *, int, enum lzc_send_flags);
	int lzc_send_resume(const char *, const char *, int,
	    enum lzc_send_flags, uint64_t, uint64_t);
	int lzc_send_space(const char *, const char *, enum lzc_send_flags,
	    uint64_t *);

	struct dmu_replay_record;

	int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
	int lzc_receive_resumable(const char *, nvlist_t *, const char *,
	    boolean_t, int);
	int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t,
	    boolean_t, int, const struct dmu_replay_record *);

	boolean_t lzc_exists(const char *);

	int lzc_rollback(const char *, char *, int);
	int lzc_rollback_to(const char *, const char *);

	int lzc_channel_program(const char *, const char *, uint64_t,
	    uint64_t, nvlist_t *, nvlist_t **);
	int lzc_channel_program_nosync(const char *, const char *, uint64_t,
	    uint64_t, nvlist_t *, nvlist_t **);

	#ifdef	__cplusplus
	}
	#endif

	#endif	/* _LIBZFS_CORE_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c (revision 332525)
	@@ -1,250 +1,266 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#ifdef _KERNEL
	#include <sys/systm.h>
	#else
	#include <errno.h>
	#include <string.h>
	#endif
	#include <sys/debug.h>
	#include <sys/fs/zfs.h>
	#include <sys/types.h>
	#include "zfeature_common.h"

	/*
	* Set to disable all feature checks while opening pools, allowing pools with
	* unsupported features to be opened. Set for testing only.
	*/
	boolean_t zfeature_checks_disable = B_FALSE;

	/*
	 * Feature descriptor table, indexed by spa_feature_t.  Entries are filled
	 * in by zfeature_register().
	 */
	zfeature_info_t spa_feature_table[SPA_FEATURES];

	/*
	 * Valid characters for feature guids.  This list is mainly for aesthetic
	 * purposes and could be expanded in the future.  There are different allowed
	 * characters in the guid's reverse dns portion (before the colon) and its
	 * short name (after the colon):
	 *
	 *   both parts:        'a'-'z' and '0'-'9'
	 *   before the colon:  additionally '.' and '-'
	 *   after the colon:   additionally '_'
	 *
	 * Returns non-zero if c is acceptable in the given part, 0 otherwise.
	 */
	static int
	valid_char(char c, boolean_t after_colon)
	{
		return ((c >= 'a' && c <= 'z') ||
		    (c >= '0' && c <= '9') ||
		    (after_colon && c == '_') ||
		    (!after_colon && (c == '.' || c == '-')));
	}

	/*
	 * A feature guid is valid when it contains exactly one colon, separating
	 * a reverse dns organization name from the feature's "short" name (e.g.
	 * "com.company:feature_name"), and every other character is accepted by
	 * valid_char() for the part of the guid it appears in.
	 */
	boolean_t
	zfeature_is_valid_guid(const char *name)
	{
		boolean_t seen_colon = B_FALSE;

		for (const char *p = name; *p != '\0'; p++) {
			if (*p != ':') {
				/* Ordinary character: check it against its part. */
				if (!valid_char(*p, seen_colon))
					return (B_FALSE);
			} else if (seen_colon) {
				/* A second colon is never allowed. */
				return (B_FALSE);
			} else {
				seen_colon = B_TRUE;
			}
		}

		/* A guid without any colon is invalid as well. */
		return (seen_colon);
	}

	/*
	 * Report whether the feature identified by guid appears in
	 * spa_feature_table.  When zfeature_checks_disable is set every guid is
	 * reported as supported.
	 */
	boolean_t
	zfeature_is_supported(const char *guid)
	{
		spa_feature_t fid;

		if (zfeature_checks_disable)
			return (B_TRUE);

		fid = 0;
		while (fid < SPA_FEATURES) {
			if (strcmp(guid, spa_feature_table[fid].fi_guid) == 0)
				return (B_TRUE);
			fid++;
		}

		return (B_FALSE);
	}

	/*
	 * Look up a feature by its user-facing name (fi_uname).
	 *
	 * On a match, stores the feature's index in *res (when res is not NULL)
	 * and returns 0.  Returns ENOENT if no feature has that name.
	 */
	int
	zfeature_lookup_name(const char *name, spa_feature_t *res)
	{
		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
			zfeature_info_t *feature = &spa_feature_table[i];
			if (strcmp(name, feature->fi_uname) == 0) {
				if (res != NULL)
					*res = i;
				return (0);
			}
		}

		return (ENOENT);
	}

	/*
	 * Report whether feature 'fid' lists 'check' among its direct
	 * dependencies.  The dependency array is walked until its
	 * SPA_FEATURE_NONE terminator.
	 */
	boolean_t
	zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
	{
		const spa_feature_t *dep = spa_feature_table[fid].fi_depends;

		while (*dep != SPA_FEATURE_NONE) {
			if (*dep == check)
				return (B_TRUE);
			dep++;
		}

		return (B_FALSE);
	}

	/*
	 * Fill in the spa_feature_table entry for feature 'fid'.
	 *
	 *   guid  - on-disk identifier; must satisfy zfeature_is_valid_guid()
	 *   name  - user-facing feature name
	 *   desc  - human-readable description
	 *   flags - zfeature_flags_t bits; READONLY_COMPAT and MOS are asserted to
	 *           be mutually exclusive
	 *   deps  - dependency array terminated by SPA_FEATURE_NONE, or NULL for
	 *           a feature without dependencies
	 *
	 * The strings and the dependency array are stored by reference, not
	 * copied, so they must outlive the table entry.
	 */
	static void
	zfeature_register(spa_feature_t fid, const char *guid, const char *name,
	    const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
	{
		zfeature_info_t *feature = &spa_feature_table[fid];
		static const spa_feature_t nodeps[] = { SPA_FEATURE_NONE };

		ASSERT(name != NULL);
		ASSERT(desc != NULL);
		ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
		    (flags & ZFEATURE_FLAG_MOS) == 0);
		ASSERT3U(fid, <, SPA_FEATURES);
		ASSERT(zfeature_is_valid_guid(guid));

		/* Always store a terminated array so callers can iterate blindly. */
		if (deps == NULL)
			deps = nodeps;

		feature->fi_feature = fid;
		feature->fi_guid = guid;
		feature->fi_uname = name;
		feature->fi_desc = desc;
		feature->fi_flags = flags;
		feature->fi_depends = deps;
	}

	/*
	 * Populate spa_feature_table by registering every pool feature known to
	 * this build.  Features with dependencies pass an array terminated by
	 * SPA_FEATURE_NONE; features without dependencies pass NULL.
	 */
	void
	zpool_feature_init(void)
	{
		zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
		    "com.delphix:async_destroy", "async_destroy",
		    "Destroy filesystems asynchronously.",
		    ZFEATURE_FLAG_READONLY_COMPAT, NULL);

		zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
		    "com.delphix:empty_bpobj", "empty_bpobj",
		    "Snapshots use less space.",
		    ZFEATURE_FLAG_READONLY_COMPAT, NULL);

		zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
		    "org.illumos:lz4_compress", "lz4_compress",
		    "LZ4 compression algorithm support.",
		    ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);

		zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
		    "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
		    "Crash dumps to multiple vdev pools.",
		    0, NULL);

		zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
		    "com.delphix:spacemap_histogram", "spacemap_histogram",
		    "Spacemaps maintain space histograms.",
		    ZFEATURE_FLAG_READONLY_COMPAT, NULL);

		zfeature_register(SPA_FEATURE_ENABLED_TXG,
		    "com.delphix:enabled_txg", "enabled_txg",
		    "Record txg at which a feature is enabled",
		    ZFEATURE_FLAG_READONLY_COMPAT, NULL);

		static const spa_feature_t hole_birth_deps[] = {
			SPA_FEATURE_ENABLED_TXG,
			SPA_FEATURE_NONE
		};
		zfeature_register(SPA_FEATURE_HOLE_BIRTH,
		    "com.delphix:hole_birth", "hole_birth",
		    "Retain hole birth txg for more precise zfs send",
		    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
		    hole_birth_deps);

		zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
		    "com.delphix:extensible_dataset", "extensible_dataset",
		    "Enhanced dataset functionality, used by other features.",
		    0, NULL);

		static const spa_feature_t bookmarks_deps[] = {
			SPA_FEATURE_EXTENSIBLE_DATASET,
			SPA_FEATURE_NONE
		};
		zfeature_register(SPA_FEATURE_BOOKMARKS,
		    "com.delphix:bookmarks", "bookmarks",
		    "\"zfs bookmark\" command",
		    ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);

		static const spa_feature_t filesystem_limits_deps[] = {
			SPA_FEATURE_EXTENSIBLE_DATASET,
			SPA_FEATURE_NONE
		};
		zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
		    "com.joyent:filesystem_limits", "filesystem_limits",
		    "Filesystem and snapshot limits.",
		    ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);

		zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
		    "com.delphix:embedded_data", "embedded_data",
		    "Blocks which compress very well use even less space.",
		    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
		    NULL);

		static const spa_feature_t large_blocks_deps[] = {
			SPA_FEATURE_EXTENSIBLE_DATASET,
			SPA_FEATURE_NONE
		};
		zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
		    "org.open-zfs:large_blocks", "large_blocks",
		    "Support for blocks larger than 128KB.",
		    ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
		zfeature_register(SPA_FEATURE_SHA512,
		    "org.illumos:sha512", "sha512",
		    "SHA-512/256 hash algorithm.",
		    ZFEATURE_FLAG_PER_DATASET, NULL);
		zfeature_register(SPA_FEATURE_SKEIN,
		    "org.illumos:skein", "skein",
		    "Skein hash algorithm.",
		    ZFEATURE_FLAG_PER_DATASET, NULL);

	#ifdef illumos
		zfeature_register(SPA_FEATURE_EDONR,
		    "org.illumos:edonr", "edonr",
		    "Edon-R hash algorithm.",
		    ZFEATURE_FLAG_PER_DATASET, NULL);
	#endif
	+
	+	zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
	+	    "com.delphix:device_removal", "device_removal",
	+	    "Top-level vdevs can be removed, reducing logical pool size.",
	+	    ZFEATURE_FLAG_MOS, NULL);
	+
	+	static const spa_feature_t obsolete_counts_deps[] = {
	+		SPA_FEATURE_EXTENSIBLE_DATASET,
	+		SPA_FEATURE_DEVICE_REMOVAL,
	+		SPA_FEATURE_NONE
	+	};
	+	zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
	+	    "com.delphix:obsolete_counts", "obsolete_counts",
	+	    "Reduce memory used by removed devices when their blocks are "
	+	    "freed or remapped.",
	+	    ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h (revision 332525)
	@@ -1,104 +1,106 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#ifndef _ZFEATURE_COMMON_H
	#define	_ZFEATURE_COMMON_H

	#include <sys/fs/zfs.h>
	#include <sys/types.h>

	#ifdef	__cplusplus
	extern "C" {
	#endif

	struct zfeature_info;

	/*
	 * Feature identifiers.  Non-negative values index spa_feature_table;
	 * SPA_FEATURES is the number of features and SPA_FEATURE_NONE terminates
	 * dependency arrays.
	 */
	typedef enum spa_feature {
		SPA_FEATURE_NONE = -1,
		SPA_FEATURE_ASYNC_DESTROY,
		SPA_FEATURE_EMPTY_BPOBJ,
		SPA_FEATURE_LZ4_COMPRESS,
		SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
		SPA_FEATURE_SPACEMAP_HISTOGRAM,
		SPA_FEATURE_ENABLED_TXG,
		SPA_FEATURE_HOLE_BIRTH,
		SPA_FEATURE_EXTENSIBLE_DATASET,
		SPA_FEATURE_EMBEDDED_DATA,
		SPA_FEATURE_BOOKMARKS,
		SPA_FEATURE_FS_SS_LIMIT,
		SPA_FEATURE_LARGE_BLOCKS,
		SPA_FEATURE_SHA512,
		SPA_FEATURE_SKEIN,
	#ifdef illumos
		SPA_FEATURE_EDONR,
	#endif
	+	SPA_FEATURE_DEVICE_REMOVAL,
	+	SPA_FEATURE_OBSOLETE_COUNTS,
		SPA_FEATURES
	} spa_feature_t;

	#define	SPA_FEATURE_DISABLED	(-1ULL)

	typedef enum zfeature_flags {
		/* Can open pool readonly even if this feature is not supported. */
		ZFEATURE_FLAG_READONLY_COMPAT =		(1 << 0),
		/* Is this feature necessary to read the MOS? */
		ZFEATURE_FLAG_MOS =			(1 << 1),
		/* Activate this feature at the same time it is enabled. */
		ZFEATURE_FLAG_ACTIVATE_ON_ENABLE =	(1 << 2),
		/* Each dataset has a field set if it has ever used this feature. */
		ZFEATURE_FLAG_PER_DATASET =		(1 << 3)
	} zfeature_flags_t;

	typedef struct zfeature_info {
		spa_feature_t fi_feature;
		const char *fi_uname;	/* User-facing feature name */
		const char *fi_guid;	/* On-disk feature identifier */
		const char *fi_desc;	/* Feature description */
		zfeature_flags_t fi_flags;
		/* array of dependencies, terminated by SPA_FEATURE_NONE */
		const spa_feature_t *fi_depends;
	} zfeature_info_t;

	typedef int (zfeature_func_t)(zfeature_info_t *, void *);

	#define	ZFS_FEATURE_DEBUG

	extern zfeature_info_t spa_feature_table[SPA_FEATURES];

	extern boolean_t zfeature_is_valid_guid(const char *);

	extern boolean_t zfeature_is_supported(const char *);
	extern int zfeature_lookup_name(const char *, spa_feature_t *);
	extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);

	extern void zpool_feature_init(void);

	#ifdef	__cplusplus
	}
	#endif

	#endif	/* _ZFEATURE_COMMON_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c (revision 332525)
	@@ -1,234 +1,235 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
	- * Copyright (c) 2013 by Delphix. All rights reserved.
	+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
	*/

	#include <sys/zfs_context.h>

	#if defined(_KERNEL)
	#include <sys/systm.h>
	#include <sys/sunddi.h>
	#include <sys/ctype.h>
	#else
	#include <stdio.h>
	#include <unistd.h>
	#include <strings.h>
	#include <libnvpair.h>
	#include <ctype.h>
	#endif
	#include <sys/dsl_deleg.h>
	#include "zfs_prop.h"
	#include "zfs_deleg.h"
	#include "zfs_namecheck.h"

	/*
	 * Table of delegatable permission names.  Each entry initializes only
	 * z_perm; the list is terminated by an entry whose z_perm is NULL, which
	 * is what zfs_deleg_canonicalize_perm() uses to stop iterating.
	 */
	zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
	{ZFS_DELEG_PERM_ALLOW},
	{ZFS_DELEG_PERM_BOOKMARK},
	{ZFS_DELEG_PERM_CLONE},
	{ZFS_DELEG_PERM_CREATE},
	{ZFS_DELEG_PERM_DESTROY},
	{ZFS_DELEG_PERM_DIFF},
	{ZFS_DELEG_PERM_MOUNT},
	{ZFS_DELEG_PERM_PROMOTE},
	{ZFS_DELEG_PERM_RECEIVE},
	+ {ZFS_DELEG_PERM_REMAP},
	{ZFS_DELEG_PERM_RENAME},
	{ZFS_DELEG_PERM_ROLLBACK},
	{ZFS_DELEG_PERM_SNAPSHOT},
	{ZFS_DELEG_PERM_SHARE},
	{ZFS_DELEG_PERM_SEND},
	{ZFS_DELEG_PERM_USERPROP},
	{ZFS_DELEG_PERM_USERQUOTA},
	{ZFS_DELEG_PERM_GROUPQUOTA},
	{ZFS_DELEG_PERM_USERUSED},
	{ZFS_DELEG_PERM_GROUPUSED},
	{ZFS_DELEG_PERM_HOLD},
	{ZFS_DELEG_PERM_RELEASE},
	{NULL}
	};

	/*
	 * Check a permission name.  Returns 0 when zfs_deleg_canonicalize_perm()
	 * recognizes it; otherwise returns the result of permset_namecheck().
	 */
	static int
	zfs_valid_permission_name(const char *perm)
	{
		return (zfs_deleg_canonicalize_perm(perm) != NULL ?
		    0 : permset_namecheck(perm, NULL, NULL));
	}

	const char *
	zfs_deleg_canonicalize_perm(const char *perm)
	{
	int i;
	zfs_prop_t prop;

	for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
	if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
	return (perm);
	}

	prop = zfs_name_to_prop(perm);
	if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
	return (zfs_prop_to_name(prop));
	return (NULL);

	}

	/*
	 * Validate a "who" key of the form
	 *
	 *     <type char><inheritance char><ZFS_DELEG_FIELD_SEP_CHR><payload>
	 *
	 * The payload rules depend on the type character:
	 *   user/group (and their *_SETS forms): decimal digits only
	 *   named sets: checked with permset_namecheck()
	 *   create / everyone (and their *_SETS forms): must be empty
	 *
	 * Returns 0 if the key is well formed.  Returns -1 otherwise, except for
	 * named sets, where the result of permset_namecheck() is returned.
	 */
	static int
	zfs_validate_who(char *who)
	{
		char *p;

		/*
		 * Make sure the string really has three characters before looking
		 * at who[2]; a shorter string would otherwise be read past its
		 * terminator.
		 */
		if (who[0] == '\0' || who[1] == '\0' ||
		    who[2] != ZFS_DELEG_FIELD_SEP_CHR)
			return (-1);

		switch (who[0]) {
		case ZFS_DELEG_USER:
		case ZFS_DELEG_GROUP:
		case ZFS_DELEG_USER_SETS:
		case ZFS_DELEG_GROUP_SETS:
			if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
				return (-1);
			/* isdigit() requires a value representable as unsigned char. */
			for (p = &who[3]; *p; p++)
				if (!isdigit((unsigned char)*p))
					return (-1);
			break;

		case ZFS_DELEG_NAMED_SET:
		case ZFS_DELEG_NAMED_SET_SETS:
			if (who[1] != ZFS_DELEG_NA)
				return (-1);
			return (permset_namecheck(&who[3], NULL, NULL));

		case ZFS_DELEG_CREATE:
		case ZFS_DELEG_CREATE_SETS:
			if (who[1] != ZFS_DELEG_NA)
				return (-1);
			if (who[3] != '\0')
				return (-1);
			break;

		case ZFS_DELEG_EVERYONE:
		case ZFS_DELEG_EVERYONE_SETS:
			if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
				return (-1);
			if (who[3] != '\0')
				return (-1);
			break;

		default:
			return (-1);
		}

		return (0);
	}

	/*
	 * Verify a delegation nvlist: every top-level pair name must be a valid
	 * "who" key, and every such key that maps to a nested nvlist must contain
	 * at least one entry, each with a valid permission name.
	 *
	 * Returns 0 if the whole list is valid, -1 otherwise (including when nvp
	 * is NULL or empty).
	 */
	int
	zfs_deleg_verify_nvlist(nvlist_t *nvp)
	{
		nvpair_t *who, *perm_name;
		nvlist_t *perms;
		int error;

		if (nvp == NULL)
			return (-1);

		who = nvlist_next_nvpair(nvp, NULL);
		if (who == NULL)
			return (-1);

		do {
			if (zfs_validate_who(nvpair_name(who)))
				return (-1);

			error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);

			if (error && error != ENOENT)
				return (-1);
			/* No nested list under this key: nothing more to check. */
			if (error == ENOENT)
				continue;

			perm_name = nvlist_next_nvpair(perms, NULL);
			if (perm_name == NULL) {
				return (-1);
			}
			do {
				error = zfs_valid_permission_name(
				    nvpair_name(perm_name));
				if (error)
					return (-1);
			} while ((perm_name = nvlist_next_nvpair(perms, perm_name))
			    != NULL);
		} while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
		return (0);
	}

	/*
	 * Build the base attribute name for a delegation entry.  Base attribute
	 * names are the "key" used to locate the jump objects which hold the
	 * actual permissions; they encode the type of entry and whether the
	 * permission is local or descendent.
	 *
	 * Arguments:
	 *	attr - output buffer for the attribute name; assumed to be
	 *		ZFS_MAX_DELEG_NAME bytes long.
	 *	type - type of entry to construct
	 *	inheritchr - inheritance type (local, descendent, or NA for create
	 *		and permission set definitions)
	 *	data - either a permission set name or a pointer to a 64 bit
	 *		uid/gid, depending on type.
	 */
	void
	zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
	    char inheritchr, void *data)
	{
		switch (type) {
		case ZFS_DELEG_CREATE:
		case ZFS_DELEG_CREATE_SETS:
			/* "<type>-<sep>": no inheritance character, no payload. */
			(void) snprintf(attr, ZFS_MAX_DELEG_NAME, "%c-%c", type,
			    ZFS_DELEG_FIELD_SEP_CHR);
			break;
		case ZFS_DELEG_EVERYONE:
		case ZFS_DELEG_EVERYONE_SETS:
			/* "<type><inherit><sep>": no payload. */
			(void) snprintf(attr, ZFS_MAX_DELEG_NAME, "%c%c%c", type,
			    inheritchr, ZFS_DELEG_FIELD_SEP_CHR);
			break;
		case ZFS_DELEG_NAMED_SET:
		case ZFS_DELEG_NAMED_SET_SETS:
			/* "<type>-<sep><set name>": data is the set name. */
			(void) snprintf(attr, ZFS_MAX_DELEG_NAME, "%c-%c%s", type,
			    ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
			break;
		case ZFS_DELEG_USER:
		case ZFS_DELEG_GROUP:
		case ZFS_DELEG_USER_SETS:
		case ZFS_DELEG_GROUP_SETS:
			/* "<type><inherit><sep><id>": data points at the id. */
			(void) snprintf(attr, ZFS_MAX_DELEG_NAME, "%c%c%c%lld", type,
			    inheritchr, ZFS_DELEG_FIELD_SEP_CHR,
			    (longlong_t)*(uint64_t *)data);
			break;
		default:
			ASSERT(!"bad zfs_deleg_who_type_t");
		}
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h (revision 332525)
	@@ -1,89 +1,90 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
	- * Copyright (c) 2013 by Delphix. All rights reserved.
	+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#ifndef	_ZFS_DELEG_H
	#define	_ZFS_DELEG_H

	#include <sys/fs/zfs.h>

	#ifdef	__cplusplus
	extern "C" {
	#endif

	#define	ZFS_DELEG_SET_NAME_CHR		'@'		/* set name lead char */
	#define	ZFS_DELEG_FIELD_SEP_CHR		'$'		/* field separator */

	/*
	 * Max name length for a delegation attribute
	 */
	#define	ZFS_MAX_DELEG_NAME	128

	/* Inheritance characters used in delegation attribute names. */
	#define	ZFS_DELEG_LOCAL		'l'
	#define	ZFS_DELEG_DESCENDENT	'd'
	#define	ZFS_DELEG_NA		'-'

	typedef enum {
		ZFS_DELEG_NOTE_CREATE,
		ZFS_DELEG_NOTE_DESTROY,
		ZFS_DELEG_NOTE_SNAPSHOT,
		ZFS_DELEG_NOTE_ROLLBACK,
		ZFS_DELEG_NOTE_CLONE,
		ZFS_DELEG_NOTE_PROMOTE,
		ZFS_DELEG_NOTE_RENAME,
		ZFS_DELEG_NOTE_SEND,
		ZFS_DELEG_NOTE_RECEIVE,
		ZFS_DELEG_NOTE_ALLOW,
		ZFS_DELEG_NOTE_USERPROP,
		ZFS_DELEG_NOTE_MOUNT,
		ZFS_DELEG_NOTE_SHARE,
		ZFS_DELEG_NOTE_USERQUOTA,
		ZFS_DELEG_NOTE_GROUPQUOTA,
		ZFS_DELEG_NOTE_USERUSED,
		ZFS_DELEG_NOTE_GROUPUSED,
		ZFS_DELEG_NOTE_HOLD,
		ZFS_DELEG_NOTE_RELEASE,
		ZFS_DELEG_NOTE_DIFF,
		ZFS_DELEG_NOTE_BOOKMARK,
	+	ZFS_DELEG_NOTE_REMAP,
		ZFS_DELEG_NOTE_NONE
	} zfs_deleg_note_t;

	/* One entry of the delegatable-permission table. */
	typedef struct zfs_deleg_perm_tab {
		char *z_perm;
		zfs_deleg_note_t z_note;
	} zfs_deleg_perm_tab_t;

	extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[];

	int zfs_deleg_verify_nvlist(nvlist_t *nvlist);
	void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
	    char checkflag, void *data);
	const char *zfs_deleg_canonicalize_perm(const char *perm);

	#ifdef	__cplusplus
	}
	#endif

	#endif	/* _ZFS_DELEG_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 332525)
	@@ -1,698 +1,700 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/zio.h>
	#include <sys/spa.h>
	#include <sys/u8_textprep.h>
	#include <sys/zfs_acl.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/zfs_znode.h>

	#include "zfs_prop.h"
	#include "zfs_deleg.h"

	#if defined(_KERNEL)
	#include <sys/systm.h>
	#else
	#include <stdlib.h>
	#include <string.h>
	#include <ctype.h>
	#endif

	/* Dataset property descriptor table; handed out by zfs_prop_get_table(). */
	static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];

	/*
	 * Name prefixes of the user/group space properties; the part after the
	 * '@' is supplied by the caller.
	 *
	 * Note this is indexed by zfs_userquota_prop_t, keep the order the same.
	 * zfs_prop_userquota() below walks the first ZFS_NUM_USERQUOTA_PROPS
	 * entries of this array.
	 */
	const char *zfs_userquota_prop_prefixes[] = {
	"userused@",
	"userquota@",
	"groupused@",
	"groupquota@"
	};

	/*
	 * Hand back the base address of the file-private dataset property
	 * table, zfs_prop_table.
	 */
	zprop_desc_t *
	zfs_prop_get_table(void)
	{
		return (&zfs_prop_table[0]);
	}

	/*
	 * Register every ZFS dataset property.
	 *
	 * Each zprop_register_*() call below supplies the property ID, its
	 * user-visible name, a default value, an attribute (PROP_INHERIT,
	 * PROP_DEFAULT, PROP_READONLY or PROP_ONETIME), the dataset types the
	 * property applies to, a description of the accepted values and a column
	 * header.  Index-type properties additionally pass one of the
	 * name/value tables declared at the top of the function.
	 *
	 * NOTE(review): the tables are function-local statics, presumably
	 * because the registration code keeps pointers to them -- confirm in
	 * zprop_common.c.
	 */
	void
	zfs_prop_init(void)
	{
		static zprop_index_t checksum_table[] = {
			{ "on", ZIO_CHECKSUM_ON },
			{ "off", ZIO_CHECKSUM_OFF },
			{ "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
			{ "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
			{ "sha256", ZIO_CHECKSUM_SHA256 },
			{ "noparity", ZIO_CHECKSUM_NOPARITY },
			{ "sha512", ZIO_CHECKSUM_SHA512 },
			{ "skein", ZIO_CHECKSUM_SKEIN },
	#ifdef illumos
			{ "edonr", ZIO_CHECKSUM_EDONR },
	#endif
			{ NULL }
		};

		static zprop_index_t dedup_table[] = {
			{ "on", ZIO_CHECKSUM_ON },
			{ "off", ZIO_CHECKSUM_OFF },
			{ "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
			{ "sha256", ZIO_CHECKSUM_SHA256 },
			{ "sha256,verify",
			    ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
			{ "sha512", ZIO_CHECKSUM_SHA512 },
			{ "sha512,verify",
			    ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
			{ "skein", ZIO_CHECKSUM_SKEIN },
			{ "skein,verify",
			    ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
	#ifdef illumos
			{ "edonr,verify",
			    ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
	#endif
			{ NULL }
		};

		static zprop_index_t compress_table[] = {
			{ "on", ZIO_COMPRESS_ON },
			{ "off", ZIO_COMPRESS_OFF },
			{ "lzjb", ZIO_COMPRESS_LZJB },
			{ "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
			{ "gzip-1", ZIO_COMPRESS_GZIP_1 },
			{ "gzip-2", ZIO_COMPRESS_GZIP_2 },
			{ "gzip-3", ZIO_COMPRESS_GZIP_3 },
			{ "gzip-4", ZIO_COMPRESS_GZIP_4 },
			{ "gzip-5", ZIO_COMPRESS_GZIP_5 },
			{ "gzip-6", ZIO_COMPRESS_GZIP_6 },
			{ "gzip-7", ZIO_COMPRESS_GZIP_7 },
			{ "gzip-8", ZIO_COMPRESS_GZIP_8 },
			{ "gzip-9", ZIO_COMPRESS_GZIP_9 },
			{ "zle", ZIO_COMPRESS_ZLE },
			{ "lz4", ZIO_COMPRESS_LZ4 },
			{ NULL }
		};

		static zprop_index_t snapdir_table[] = {
			{ "hidden", ZFS_SNAPDIR_HIDDEN },
			{ "visible", ZFS_SNAPDIR_VISIBLE },
			{ NULL }
		};

		static zprop_index_t acl_mode_table[] = {
			{ "discard", ZFS_ACL_DISCARD },
			{ "groupmask", ZFS_ACL_GROUPMASK },
			{ "passthrough", ZFS_ACL_PASSTHROUGH },
			{ "restricted", ZFS_ACL_RESTRICTED },
			{ NULL }
		};

		static zprop_index_t acl_inherit_table[] = {
			{ "discard", ZFS_ACL_DISCARD },
			{ "noallow", ZFS_ACL_NOALLOW },
			{ "restricted", ZFS_ACL_RESTRICTED },
			{ "passthrough", ZFS_ACL_PASSTHROUGH },
			{ "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */
			{ "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
			{ NULL }
		};

		static zprop_index_t case_table[] = {
			{ "sensitive", ZFS_CASE_SENSITIVE },
			{ "insensitive", ZFS_CASE_INSENSITIVE },
			{ "mixed", ZFS_CASE_MIXED },
			{ NULL }
		};

		static zprop_index_t copies_table[] = {
			{ "1", 1 },
			{ "2", 2 },
			{ "3", 3 },
			{ NULL }
		};

		/*
		 * Use the unique flags we have to send to u8_strcmp() and/or
		 * u8_textprep() to represent the various normalization property
		 * values.
		 */
		static zprop_index_t normalize_table[] = {
			{ "none", 0 },
			{ "formD", U8_TEXTPREP_NFD },
			{ "formKC", U8_TEXTPREP_NFKC },
			{ "formC", U8_TEXTPREP_NFC },
			{ "formKD", U8_TEXTPREP_NFKD },
			{ NULL }
		};

		static zprop_index_t version_table[] = {
			{ "1", 1 },
			{ "2", 2 },
			{ "3", 3 },
			{ "4", 4 },
			{ "5", 5 },
			{ "current", ZPL_VERSION },
			{ NULL }
		};

		static zprop_index_t boolean_table[] = {
			{ "off", 0 },
			{ "on", 1 },
			{ NULL }
		};

		static zprop_index_t logbias_table[] = {
			{ "latency", ZFS_LOGBIAS_LATENCY },
			{ "throughput", ZFS_LOGBIAS_THROUGHPUT },
			{ NULL }
		};

		static zprop_index_t canmount_table[] = {
			{ "off", ZFS_CANMOUNT_OFF },
			{ "on", ZFS_CANMOUNT_ON },
			{ "noauto", ZFS_CANMOUNT_NOAUTO },
			{ NULL }
		};

		static zprop_index_t cache_table[] = {
			{ "none", ZFS_CACHE_NONE },
			{ "metadata", ZFS_CACHE_METADATA },
			{ "all", ZFS_CACHE_ALL },
			{ NULL }
		};

		static zprop_index_t sync_table[] = {
			{ "standard", ZFS_SYNC_STANDARD },
			{ "always", ZFS_SYNC_ALWAYS },
			{ "disabled", ZFS_SYNC_DISABLED },
			{ NULL }
		};

		static zprop_index_t volmode_table[] = {
			{ "default", ZFS_VOLMODE_DEFAULT },
			{ "geom", ZFS_VOLMODE_GEOM },
			{ "dev", ZFS_VOLMODE_DEV },
			{ "none", ZFS_VOLMODE_NONE },
			{ NULL }
		};

		static zprop_index_t redundant_metadata_table[] = {
			{ "all", ZFS_REDUNDANT_METADATA_ALL },
			{ "most", ZFS_REDUNDANT_METADATA_MOST },
			{ NULL }
		};

		/* inherit index properties */
		zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
		    ZFS_REDUNDANT_METADATA_ALL,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "all | most", "REDUND_MD",
		    redundant_metadata_table);
		zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "standard | always | disabled", "SYNC",
		    sync_table);
		zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
		    ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
		    ZFS_TYPE_VOLUME,
		    "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
		    "skein", "CHECKSUM", checksum_table);
		zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "on | off | verify | sha256[,verify], sha512[,verify], "
		    "skein[,verify]", "DEDUP", dedup_table);
		zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
		    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
		    "COMPRESS", compress_table);
		zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
		    "hidden | visible", "SNAPDIR", snapdir_table);
		zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
		    "discard | groupmask | passthrough | restricted", "ACLMODE",
		    acl_mode_table);
		zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
		    ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
		    "discard | noallow | restricted | passthrough | passthrough-x",
		    "ACLINHERIT", acl_inherit_table);
		zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "1 | 2 | 3", "COPIES", copies_table);
		zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
		    ZFS_CACHE_ALL, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
		    "all | none | metadata", "PRIMARYCACHE", cache_table);
		zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
		    ZFS_CACHE_ALL, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
		    "all | none | metadata", "SECONDARYCACHE", cache_table);
		zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "latency | throughput", "LOGBIAS", logbias_table);
		zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
		    ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
		    "default | geom | dev | none", "VOLMODE", volmode_table);

		/* inherit index (boolean) properties */
		zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
		zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
		    boolean_table);
		zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
		    boolean_table);
		zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
		    boolean_table);
		zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
		    boolean_table);
		zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
		zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
		    boolean_table);
		zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
		    boolean_table);
		zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
		    boolean_table);

		/* default index properties */
		zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
		    "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
		zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
		    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
		    "CANMOUNT", canmount_table);

		/* readonly index (boolean) properties */
		zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
		    ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
		zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
		    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
		    boolean_table);

		/* set once index properties */
		zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
		    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
		    "none | formC | formD | formKC | formKD", "NORMALIZATION",
		    normalize_table);
		zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
		    ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
		    ZFS_TYPE_SNAPSHOT,
		    "sensitive | insensitive | mixed", "CASE", case_table);

		/* set once index (boolean) properties */
		zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
		    "on | off", "UTF8ONLY", boolean_table);

		/* string properties */
		zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
		zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
		    ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
		zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
		    "MOUNTPOINT");
		zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
		    "SHARENFS");
		zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
		    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
		    "filesystem | volume | snapshot | bookmark", "TYPE");
		zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
		    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
		    "on | off | sharemgr(1M) options", "SHARESMB");
		zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
		    ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
		    "<sensitivity label>", "MLSLABEL");
		zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
		    "receive_resume_token",
		    NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "<string token>", "RESUMETOK");

		/* readonly number properties */
		zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
		    ZFS_TYPE_DATASET, "<size>", "USED");
		zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
		zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
		    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
		zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
		    PROP_READONLY, ZFS_TYPE_DATASET,
		    "<1.00x or higher if compressed>", "RATIO");
		zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
		    PROP_READONLY, ZFS_TYPE_DATASET,
		    "<1.00x or higher if compressed>", "REFRATIO");
		zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
		    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
		    ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
		zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
		    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
		    "USEDSNAP");
		zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
		    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
		    "USEDDS");
		zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
		    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
		    "USEDCHILD");
		zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
		    PROP_READONLY,
		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
		zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
		    ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
		zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
		    ZFS_TYPE_DATASET, "<size>", "WRITTEN");
		zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
		    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
		    "LUSED");
		zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
		    0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");

		/* default number properties */
		zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
		    ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
		zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
		    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "<size> | none", "RESERV");
		zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
		    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
		zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
		    ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
		zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
		    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "<size> | none", "REFRESERV");
		zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
		    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
		    "<count> | none", "FSLIMIT");
		zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
		    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "<count> | none", "SSLIMIT");
		zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
		    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
		    "<count>", "FSCOUNT");
		zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
		    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
		    "<count>", "SSCOUNT");

		/* inherit number properties */
		zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
		    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
		    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");

		/* hidden properties */
		zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
		    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
	+	zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
	+	    PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
		zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
		    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
		zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
		    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
		zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
		    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
		zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
		    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
		    "STMF_SBD_LU");
		zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
		    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID");
		zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
		    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
		    "USERACCOUNTING");
		zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
		    PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
		zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
		    PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
		zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
		    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
		zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
		    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");

		/* oddball properties */
		zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
		    NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
		    "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
	}

	/*
	 * Report whether a property may be delegated: anything whose table
	 * attribute is not PROP_READONLY, with mlslabel always excluded.
	 */
	boolean_t
	zfs_prop_delegatable(zfs_prop_t prop)
	{
		/* The mlslabel property is never delegatable. */
		if (prop == ZFS_PROP_MLSLABEL)
			return (B_FALSE);

		return (zfs_prop_table[prop].pd_attr != PROP_READONLY);
	}

	/*
	 * Translate a zfs dataset property name into its property ID by asking
	 * zprop_name_to_prop() to look it up for ZFS_TYPE_DATASET.
	 */
	zfs_prop_t
	zfs_name_to_prop(const char *propname)
	{
		zfs_prop_t id;

		id = zprop_name_to_prop(propname, ZFS_TYPE_DATASET);
		return (id);
	}

	/*
	 * For user property names, we allow all lowercase alphanumeric characters,
	 * plus a few useful punctuation characters ('-', '_', '.' and ':').
	 *
	 * Returns non-zero when c is one of those characters, zero otherwise.
	 */
	static int
	valid_char(char c)
	{
		return ((c >= 'a' && c <= 'z') ||
		    (c >= '0' && c <= '9') ||
		    c == '-' || c == '_' || c == '.' || c == ':');
	}

	/*
	 * Returns true if this is a valid user-defined property name: every
	 * character must pass valid_char() and at least one ':' must be present.
	 * An empty name is rejected because it contains no ':'.
	 */
	boolean_t
	zfs_prop_user(const char *name)
	{
		const char *p;
		boolean_t foundsep = B_FALSE;

		/*
		 * Walk to the terminating NUL directly instead of re-evaluating
		 * strlen() on every iteration and comparing it to a signed index.
		 */
		for (p = name; *p != '\0'; p++) {
			if (!valid_char(*p))
				return (B_FALSE);
			if (*p == ':')
				foundsep = B_TRUE;
		}

		return (foundsep);
	}

	/*
	 * Returns true if name begins with one of the prefixes listed in
	 * zfs_userquota_prop_prefixes (each of which ends in '@').  Only the
	 * prefix is compared, so any characters may follow the '@' (eg, another
	 * @, for SID user@domain).
	 */
	boolean_t
	zfs_prop_userquota(const char *name)
	{
		zfs_userquota_prop_t which;

		for (which = 0; which < ZFS_NUM_USERQUOTA_PROPS; which++) {
			const char *prefix = zfs_userquota_prop_prefixes[which];
			size_t prefixlen = strlen(prefix);

			if (strncmp(name, prefix, prefixlen) == 0)
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Returns true if name starts with "written@".  Nothing after the '@'
	 * is inspected, so any character is accepted there (eg, another @, for
	 * written@pool/fs@origin).
	 */
	boolean_t
	zfs_prop_written(const char *name)
	{
		static const char written_prefix[] = "written@";

		/* sizeof counts the terminating NUL, which is not compared. */
		return (strncmp(name, written_prefix,
		    sizeof (written_prefix) - 1) == 0);
	}

	/*
	 * Tables of index types, plus functions to convert between the user view
	 * (strings) and internal representation (uint64_t).
	 *
	 * Convert the user-visible string for an index property into its
	 * numeric value, stored through *index.  The lookup is delegated to
	 * zprop_string_to_index() for ZFS_TYPE_DATASET and its return value is
	 * passed back unchanged.
	 */
	int
	zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
	{
		return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
	}

	/*
	 * Convert the numeric value of an index property into its user-visible
	 * string, returned through *string.  The lookup is delegated to
	 * zprop_index_to_string() for ZFS_TYPE_DATASET.
	 */
	int
	zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
	{
		int err;

		err = zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET);
		return (err);
	}

	/*
	 * Dataset-property front end for zprop_random_value(): forwards prop and
	 * seed along with ZFS_TYPE_DATASET and returns the result.
	 */
	uint64_t
	zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
	{
		uint64_t value;

		value = zprop_random_value(prop, seed, ZFS_TYPE_DATASET);
		return (value);
	}

	/*
	 * Returns TRUE if the property applies to any of the given dataset
	 * types.  The check itself is performed by zprop_valid_for_type().
	 */
	boolean_t
	zfs_prop_valid_for_type(int prop, zfs_type_t types)
	{
		boolean_t applies;

		applies = zprop_valid_for_type(prop, types);
		return (applies);
	}

	/*
	 * Returns the pd_proptype recorded for the property in zfs_prop_table.
	 * prop is used as a table index without a range check.
	 */
	zprop_type_t
	zfs_prop_get_type(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_proptype);
	}

	/*
	 * Returns TRUE if the property is readonly.  Set-once properties
	 * (PROP_ONETIME) are reported as readonly as well.
	 */
	boolean_t
	zfs_prop_readonly(zfs_prop_t prop)
	{
		return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
		    zfs_prop_table[prop].pd_attr == PROP_ONETIME);
	}

	/*
	 * Returns TRUE if the property is visible (not hidden), as recorded in
	 * the pd_visible field of its table entry.
	 */
	boolean_t
	zfs_prop_visible(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_visible);
	}

	/*
	 * Returns TRUE if the property is only allowed to be set once, i.e. its
	 * table attribute is PROP_ONETIME.
	 */
	boolean_t
	zfs_prop_setonce(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_attr == PROP_ONETIME);
	}

	/*
	 * Returns the pd_strdefault field of the property's table entry.
	 */
	const char *
	zfs_prop_default_string(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_strdefault);
	}

	/*
	 * Returns the pd_numdefault field of the property's table entry.
	 */
	uint64_t
	zfs_prop_default_numeric(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_numdefault);
	}

	/*
	 * Given a dataset property ID, returns the corresponding name (the
	 * pd_name field of its table entry).  The ID is assumed to be valid; it
	 * is used as a table index without a range check.
	 */
	const char *
	zfs_prop_to_name(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_name);
	}

	/*
	 * Returns TRUE if the property is inheritable.  Both PROP_INHERIT and
	 * set-once (PROP_ONETIME) properties are reported as inheritable.
	 */
	boolean_t
	zfs_prop_inheritable(zfs_prop_t prop)
	{
		return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
		    zfs_prop_table[prop].pd_attr == PROP_ONETIME);
	}

	#ifndef _KERNEL

	/*
	 * Returns a string describing the set of acceptable values for the given
	 * zfs property (the pd_values field of its table entry), or NULL if it
	 * cannot be set.
	 */
	const char *
	zfs_prop_values(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_values);
	}

	/*
	 * Returns TRUE if this property is a string type.  Note that index types
	 * (compression, checksum) are treated as strings in userland, even though
	 * they are stored numerically on disk, so PROP_TYPE_INDEX counts too.
	 */
	int
	zfs_prop_is_string(zfs_prop_t prop)
	{
		return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
		    zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
	}

	/*
	 * Returns the column header for the given property (the pd_colname field
	 * of its table entry).  Used only in 'zfs list -o', but centralized here
	 * with the other property information.
	 */
	const char *
	zfs_prop_column_name(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_colname);
	}

	/*
	 * Returns whether the given property should be displayed right-justified
	 * for 'zfs list' (the pd_rightalign field of its table entry).
	 */
	boolean_t
	zfs_prop_align_right(zfs_prop_t prop)
	{
		const zprop_desc_t *pd = &zfs_prop_table[prop];

		return (pd->pd_rightalign);
	}

	#endif
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 332525)
	@@ -1,173 +1,177 @@
	#
	# CDDL HEADER START
	#
	# The contents of this file are subject to the terms of the
	# Common Development and Distribution License (the "License").
	# You may not use this file except in compliance with the License.
	#
	# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	# or http://www.opensolaris.org/os/licensing.
	# See the License for the specific language governing permissions
	# and limitations under the License.
	#
	# When distributing Covered Code, include this CDDL HEADER in each
	# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	# If applicable, add the following below this CDDL HEADER, with the
	# fields enclosed by brackets "[]" replaced with your own identifying
	# information: Portions Copyright [yyyy] [name of copyright owner]
	#
	# CDDL HEADER END
	#

	#
	# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
	# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
	# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
	# Copyright (c) 2012 Joyent, Inc. All rights reserved.
	# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
	# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	#
	#
	# This Makefile defines all file modules for the directory uts/common
	# and its children. These are the source files which may be considered
	# common to all SunOS systems.

	# Object files appended to LUA_OBJS.
	# NOTE(review): presumably the Lua interpreter objects -- confirm.
	LUA_OBJS += \
	ldo.o \
	lvm.o \
	lbitlib.o \
	lopcodes.o \
	lstring.o \
	ltable.o \
	ltm.o \
	lcorolib.o \
	lauxlib.o \
	ldebug.o \
	lstate.o \
	lgc.o \
	lmem.o \
	lctype.o \
	lfunc.o \
	ldump.o \
	lundump.o \
	lstrlib.o \
	ltablib.o \
	lapi.o \
	lobject.o \
	lbaselib.o \
	lcompat.o \
	lzio.o \
	lcode.o \
	llex.o \
	lparser.o

	# Object files appended to ZFS_COMMON_OBJS; the ZFS_OBJS definition later
	# in this file pulls this list in.
	ZFS_COMMON_OBJS += \
	abd.o \
	arc.o \
	bplist.o \
	blkptr.o \
	bpobj.o \
	bptree.o \
	bqueue.o \
	dbuf.o \
	ddt.o \
	ddt_zap.o \
	dmu.o \
	dmu_diff.o \
	dmu_send.o \
	dmu_object.o \
	dmu_objset.o \
	dmu_traverse.o \
	dmu_tx.o \
	dnode.o \
	dnode_sync.o \
	dsl_bookmark.o \
	dsl_dir.o \
	dsl_dataset.o \
	dsl_deadlist.o \
	dsl_destroy.o \
	dsl_pool.o \
	dsl_synctask.o \
	dsl_userhold.o \
	dmu_zfetch.o \
	dsl_deleg.o \
	dsl_prop.o \
	dsl_scan.o \
	zfeature.o \
	gzip.o \
	lz4.o \
	lzjb.o \
	metaslab.o \
	multilist.o \
	range_tree.o \
	refcount.o \
	rrwlock.o \
	sa.o \
	sha256.o \
	skein_zfs.o \
	spa.o \
	spa_config.o \
	spa_errlog.o \
	spa_history.o \
	spa_misc.o \
	space_map.o \
	space_reftree.o \
	txg.o \
	uberblock.o \
	unique.o \
	vdev.o \
	vdev_cache.o \
	vdev_file.o \
	+ vdev_indirect.o \
	+ vdev_indirect_births.o \
	+ vdev_indirect_mapping.o \
	vdev_label.o \
	vdev_mirror.o \
	vdev_missing.o \
	vdev_queue.o \
	vdev_raidz.o \
	+ vdev_removal.o \
	vdev_root.o \
	zap.o \
	zap_leaf.o \
	zap_micro.o \
	zcp.o \
	zcp_get.o \
	zcp_global.o \
	zcp_iter.o \
	zcp_synctask.o \
	zfs_byteswap.o \
	zfs_debug.o \
	zfs_fm.o \
	zfs_fuid.o \
	zfs_sa.o \
	zfs_znode.o \
	zil.o \
	zio.o \
	zio_checksum.o \
	zio_compress.o \
	zio_inject.o \
	zle.o \
	zrlock.o

	# Object files appended to ZFS_SHARED_OBJS; the ZFS_OBJS definition later
	# in this file pulls this list in.
	ZFS_SHARED_OBJS += \
	zfeature_common.o \
	zfs_comutil.o \
	zfs_deleg.o \
	zfs_fletcher.o \
	zfs_namecheck.o \
	zfs_prop.o \
	zpool_prop.o \
	zprop_common.o

	# ZFS_OBJS gains the expansions of ZFS_COMMON_OBJS and ZFS_SHARED_OBJS
	# plus the additional objects listed here.
	ZFS_OBJS += \
	$(ZFS_COMMON_OBJS) \
	$(ZFS_SHARED_OBJS) \
	zfs_acl.o \
	zfs_ctldir.o \
	zfs_dir.o \
	zfs_ioctl.o \
	zfs_ioctl_compat.o \
	zfs_log.o \
	zfs_onexit.o \
	zfs_replay.o \
	zfs_rlock.o \
	zfs_vfsops.o \
	zfs_vnops.o \
	zvol.o
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 332525)
	@@ -1,7857 +1,7857 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
	* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
	*/

	/*
	* DVA-based Adjustable Replacement Cache
	*
	* While much of the theory of operation used here is
	* based on the self-tuning, low overhead replacement cache
	* presented by Megiddo and Modha at FAST 2003, there are some
	* significant differences:
	*
	* 1. The Megiddo and Modha model assumes any page is evictable.
	* Pages in its cache cannot be "locked" into memory. This makes
	* the eviction algorithm simple: evict the last page in the list.
	* This also makes the performance characteristics easy to reason
	* about. Our cache is not so simple. At any given moment, some
	* subset of the blocks in the cache are un-evictable because we
	* have handed out a reference to them. Blocks are only evictable
	* when there are no external references active. This makes
	* eviction far more problematic: we choose to evict the evictable
	* blocks that are the "lowest" in the list.
	*
	* There are times when it is not possible to evict the requested
	* space. In these circumstances we are unable to adjust the cache
	* size. To prevent the cache growing unbounded at these times we
	* implement a "cache throttle" that slows the flow of new data
	* into the cache until we can make space available.
	*
	* 2. The Megiddo and Modha model assumes a fixed cache size.
	* Pages are evicted when the cache is full and there is a cache
	* miss. Our model has a variable sized cache. It grows with
	* high use, but also tries to react to memory pressure from the
	* operating system: decreasing its size when system memory is
	* tight.
	*
	* 3. The Megiddo and Modha model assumes a fixed page size. All
	* elements of the cache are therefore exactly the same size. So
	* when adjusting the cache size following a cache miss, it's simply
	* a matter of choosing a single page to evict. In our model, we
	* have variable sized cache blocks (ranging from 512 bytes to
	* 128K bytes). We therefore choose a set of blocks to evict to make
	* space for a cache miss that approximates as closely as possible
	* the space used by the new block.
	*
	* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
	* by N. Megiddo & D. Modha, FAST 2003
	*/

	/*
	* The locking model:
	*
	* A new reference to a cache buffer can be obtained in two
	* ways: 1) via a hash table lookup using the DVA as a key,
	* or 2) via one of the ARC lists. The arc_read() interface
	* uses method 1, while the internal ARC algorithms for
	* adjusting the cache use method 2. We therefore provide two
	* types of locks: 1) the hash table lock array, and 2) the
	* ARC list locks.
	*
	* Buffers do not have their own mutexes, rather they rely on the
	* hash table mutexes for the bulk of their protection (i.e. most
	* fields in the arc_buf_hdr_t are protected by these mutexes).
	*
	* buf_hash_find() returns the appropriate mutex (held) when it
	* locates the requested buffer in the hash table. It returns
	* NULL for the mutex if the buffer was not in the table.
	*
	* buf_hash_remove() expects the appropriate hash mutex to be
	* already held before it is invoked.
	*
	* Each ARC state also has a mutex which is used to protect the
	* buffer list associated with the state. When attempting to
	* obtain a hash table lock while holding an ARC list lock you
	* must use: mutex_tryenter() to avoid deadlock. Also note that
	* the active state mutex must be held before the ghost state mutex.
	*
	* Note that the majority of the performance stats are manipulated
	* with atomic operations.
	*
	* The L2ARC uses the l2ad_mtx on each vdev for the following:
	*
	* - L2ARC buflist creation
	* - L2ARC buflist eviction
	* - L2ARC write completion, which walks L2ARC buflists
	* - ARC header destruction, as it removes from L2ARC buflists
	* - ARC header release, as it removes from L2ARC buflists
	*/

	/*
	* ARC operation:
	*
	* Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
	* This structure can point either to a block that is still in the cache or to
	* one that is only accessible in an L2 ARC device, or it can provide
	* information about a block that was recently evicted. If a block is
	* only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
	* information to retrieve it from the L2ARC device. This information is
	* stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
	* that is in this state cannot access the data directly.
	*
	* Blocks that are actively being referenced or have not been evicted
	* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
	* the arc_buf_hdr_t that will point to the data block in memory. A block can
	* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
	* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
	* also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
	*
	* The L1ARC's data pointer may or may not be uncompressed. The ARC has the
	* ability to store the physical data (b_pabd) associated with the DVA of the
	* arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
	* it will match its on-disk compression characteristics. This behavior can be
	* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
	* compressed ARC functionality is disabled, the b_pabd will point to an
	* uncompressed version of the on-disk data.
	*
	* Data in the L1ARC is not accessed by consumers of the ARC directly. Each
	* arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
	* Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
	* consumer. The ARC will provide references to this data and will keep it
	* cached until it is no longer in use. The ARC caches only the L1ARC's physical
	* data block and will evict any arc_buf_t that is no longer referenced. The
	* amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
	* "overhead_size" kstat.
	*
	* Depending on the consumer, an arc_buf_t can be requested in uncompressed or
	* compressed form. The typical case is that consumers will want uncompressed
	* data, and when that happens a new data buffer is allocated where the data is
	* decompressed for them to use. Currently the only consumer who wants
	* compressed arc_buf_t's is "zfs send", when it streams data exactly as it
	* exists on disk. When this happens, the arc_buf_t's data buffer is shared
	* with the arc_buf_hdr_t.
	*
	* Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
	* first one is owned by a compressed send consumer (and therefore references
	* the same compressed data buffer as the arc_buf_hdr_t) and the second could be
	* used by any other consumer (and has its own uncompressed copy of the data
	* buffer).
	*
	*                arc_buf_hdr_t
	*                +-----------+
	*                | fields    |
	*                | common to |
	*                | L1- and   |
	*                | L2ARC     |
	*                +-----------+
	*                | l2arc_buf_hdr_t
	*                |           |
	*                +-----------+
	*                | l1arc_buf_hdr_t
	*                |           |              arc_buf_t
	*                | b_buf     +------------>+-----------+      arc_buf_t
	*                | b_pabd    +-+           |b_next     +---->+-----------+
	*                +-----------+ |           |-----------|     |b_next     +-->NULL
	*                              |           |b_comp = T |     +-----------+
	*                              |           |b_data     +-+   |b_comp = F |
	*                              |           +-----------+ |   |b_data     +-+
	*                              +->+------+               |   +-----------+ |
	*                   compressed    |      |               |                 |
	*                      data       |      |<--------------+                 | uncompressed
	*                                 +------+          compressed,            |     data
	*                                                     shared               +-->+------+
	*                                                      data                    |      |
	*                                                                              |      |
	*                                                                              +------+
	*
	* When a consumer reads a block, the ARC must first look to see if the
	* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
	* arc_buf_t and either copies uncompressed data into a new data buffer from an
	* existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
	* new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
	* hdr is compressed and the desired compression characteristics of the
	* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
	* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
	* the last buffer in the hdr's b_buf list, however a shared compressed buf can
	* be anywhere in the hdr's list.
	*
	* The diagram below shows an example of an uncompressed ARC hdr that is
	* sharing its data with an arc_buf_t (note that the shared uncompressed buf is
	* the last element in the buf list):
	*
	*                arc_buf_hdr_t
	*                +-----------+
	*                |           |
	*                |           |
	*                |           |
	*                +-----------+
	* l2arc_buf_hdr_t|           |
	*                |           |
	*                +-----------+
	* l1arc_buf_hdr_t|           |
	*                |           |              arc_buf_t (shared)
	*                | b_buf     +------------>+---------+      arc_buf_t
	*                |           |             |b_next   +---->+---------+
	*                | b_pabd    +-+           |---------|     |b_next   +-->NULL
	*                +-----------+ |           |         |     +---------+
	*                              |           |b_data   +-+   |         |
	*                              |           +---------+ |   |b_data   +-+
	*                              +->+------+             |   +---------+ |
	*                                 |      |             |               |
	*                   uncompressed  |      |             |               |
	*                       data      +------+             |               |
	*                                    ^                 +->+------+     |
	*                                    |       uncompressed |      |     |
	*                                    |           data     |      |     |
	*                                    |                    +------+     |
	*                                    +---------------------------------+
	*
	* Writing to the ARC requires that the ARC first discard the hdr's b_pabd
	* since the physical block is about to be rewritten. The new data contents
	* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
	* it may compress the data before writing it to disk. The ARC will be called
	* with the transformed data and will bcopy the transformed on-disk block into
	* a newly allocated b_pabd. Writes are always done into buffers which have
	* either been loaned (and hence are new and don't have other readers) or
	* buffers which have been released (and hence have their own hdr, if there
	* were originally other readers of the buf's original hdr). This ensures that
	* the ARC only needs to update a single buf and its hdr after a write occurs.
	*
	* When the L2ARC is in use, it will also take advantage of the b_pabd. The
	* L2ARC will always write the contents of b_pabd to the L2ARC. This means
	* that when compressed ARC is enabled, the L2ARC blocks are identical
	* to the on-disk block in the main data pool. This provides a significant
	* advantage since the ARC can leverage the bp's checksum when reading from the
	* L2ARC to determine if the contents are valid. However, if the compressed
	* ARC is disabled, then the L2ARC's block must be transformed to look
	* like the physical block in the main data pool before comparing the
	* checksum and determining its validity.
	*/

	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/spa_impl.h>
	#include <sys/zio_compress.h>
	#include <sys/zio_checksum.h>
	#include <sys/zfs_context.h>
	#include <sys/arc.h>
	#include <sys/refcount.h>
	#include <sys/vdev.h>
	#include <sys/vdev_impl.h>
	#include <sys/dsl_pool.h>
	#include <sys/zio_checksum.h>
	#include <sys/multilist.h>
	#include <sys/abd.h>
	#ifdef _KERNEL
	#include <sys/dnlc.h>
	#include <sys/racct.h>
	#endif
	#include <sys/callb.h>
	#include <sys/kstat.h>
	#include <sys/trim_map.h>
	#include <zfs_fletcher.h>
	#include <sys/sdt.h>

	#include <machine/vmparam.h>

	#ifdef illumos
	#ifndef _KERNEL
	/*
	* Userland-only debugging state: these two objects exist only when
	* illumos is defined and _KERNEL is not.
	*/
	/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
	boolean_t arc_watch = B_FALSE;
	/* NOTE(review): presumably the fd used to install the watchpoints -- confirm */
	int arc_procfd;
	#endif
	#endif /* illumos */

	/*
	* Synchronization objects for the ARC reclaim thread.
	* NOTE(review): presumably arc_reclaim_lock guards both condition
	* variables and the exit flag -- confirm against the reclaim thread code.
	*/
	static kmutex_t arc_reclaim_lock;
	static kcondvar_t arc_reclaim_thread_cv;
	static boolean_t arc_reclaim_thread_exit;
	static kcondvar_t arc_reclaim_waiters_cv;

	/*
	* Synchronization objects for the DNLC eviction thread.
	* NOTE(review): presumably used the same way as the reclaim set above --
	* confirm against the thread that waits on arc_dnlc_evicts_cv.
	*/
	static kmutex_t arc_dnlc_evicts_lock;
	static kcondvar_t arc_dnlc_evicts_cv;
	static boolean_t arc_dnlc_evicts_thread_exit;

	/* NOTE(review): presumably the percentage of the DNLC to purge per reclaim pass -- confirm */
	uint_t arc_reduce_dnlc_percent = 3;

	/*
	* The number of headers to evict in arc_evict_state_impl() before
	* dropping the sublist lock and evicting from another sublist. A lower
	* value means we're more likely to evict the "correct" header (i.e. the
	* oldest header in the arc state), but comes with higher overhead
	* (i.e. more invocations of arc_evict_state_impl()).
	*/
	int zfs_arc_evict_batch_limit = 10;

	/* number of seconds before growing cache again */
	static int arc_grow_retry = 60;

	/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
	int zfs_arc_overflow_shift = 8;

	/* shift of arc_c for calculating both min and max arc_p */
	static int arc_p_min_shift = 4;

	/* log2(fraction of arc to reclaim) */
	static int arc_shrink_shift = 7;

	/*
	* log2(fraction of ARC which must be free to allow growing).
	* I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
	* when reading a new block into the ARC, we will evict an equal-sized block
	* from the ARC.
	*
	* This must be less than arc_shrink_shift, so that when we shrink the ARC,
	* we will still not allow it to grow.
	*/
	int arc_no_grow_shift = 5;


	/*
	* minimum lifespan of a prefetch block in clock ticks
	* (initialized in arc_init())
	*/
	static int arc_min_prefetch_lifespan;

	/*
	* If this percent of memory is free, don't throttle.
	*/
	int arc_lotsfree_percent = 10;

	/* NOTE(review): presumably set once the ARC has been shut down -- confirm */
	static int arc_dead;
	/* Defined in another translation unit; only declared here. */
	extern boolean_t zfs_prefetch_disable;

	/*
	* The arc has filled available memory and has now warmed up.
	*/
	static boolean_t arc_warm;

	/*
	* log2 fraction of the zio arena to keep free.
	*/
	int arc_zio_arena_free_shift = 2;

	/*
	* These tunables are for performance analysis.
	*
	* All of the zfs_arc_* values below start out as zero (explicitly or by
	* static initialization) except zfs_arc_average_blocksize.
	* NOTE(review): presumably zero means "keep the built-in default" --
	* confirm in arc_init().
	*/
	uint64_t zfs_arc_max;
	uint64_t zfs_arc_min;
	uint64_t zfs_arc_meta_limit = 0;
	uint64_t zfs_arc_meta_min = 0;
	int zfs_arc_grow_retry = 0;
	int zfs_arc_shrink_shift = 0;
	int zfs_arc_no_grow_shift = 0;
	int zfs_arc_p_min_shift = 0;
	uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
	u_int zfs_arc_free_target = 0;

	/* Absolute min for arc min / max is 16MB. */
	static uint64_t arc_abs_min = 16 << 20;

	/*
	* When B_TRUE the hdr's b_pabd keeps the on-disk (possibly compressed)
	* form of a block; when B_FALSE it holds an uncompressed copy. See the
	* overview comment at the top of the file.
	*/
	boolean_t zfs_compressed_arc_enabled = B_TRUE;

	/*
	* Forward declarations of the sysctl handlers that the SYSCTL_PROC()
	* declarations below refer to by name.
	*/
	static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
	static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
	static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
	static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
	static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);

	#if defined(__FreeBSD__) && defined(_KERNEL)
	/*
	 * Boot-time hook, registered through SYSINIT() below, that seeds
	 * zfs_arc_free_target with the value of vm_pageout_wakeup_thresh.
	 * The argument is unused.
	 */
	static void
	arc_free_target_init(void *unused __unused)
	{

		zfs_arc_free_target = vm_pageout_wakeup_thresh;
	}
	SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
	    arc_free_target_init, NULL);

	/*
	 * Loader tunables and sysctl nodes for the ARC.
	 *
	 * Note that the arc_shrink_shift and arc_grow_retry loader tunables are
	 * bound to the zfs_arc_* variables, while the sysctl nodes of the same
	 * names are bound to the static arc_shrink_shift / arc_grow_retry
	 * variables.
	 */
	TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
	TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
	TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
	TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
	TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
	    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
	    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
	    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
	    "log2(fraction of ARC which must be free to allow growing)");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
	    &zfs_arc_average_blocksize, 0,
	    "ARC average blocksize");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
	    &arc_shrink_shift, 0,
	    "log2(fraction of arc to reclaim)");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
	    &arc_grow_retry, 0,
	    "Wait in seconds before considering growing ARC");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
	    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");

	/*
	 * We don't have a tunable for arc_free_target due to the dependency on
	 * pagedaemon initialisation; its initial value is installed by
	 * arc_free_target_init() instead.  Writes are validated by
	 * sysctl_vfs_zfs_arc_free_target().
	 */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
	    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
	    sysctl_vfs_zfs_arc_free_target, "IU",
	    "Desired number of free pages below which ARC triggers reclaim");

	/*
	 * Sysctl handler for vfs.zfs.arc_free_target.
	 *
	 * Hands the current zfs_arc_free_target to sysctl_handle_int() and, when
	 * the request carries a new value (req->newptr != NULL), stores it only
	 * if it lies within [minfree, vm_cnt.v_page_count].
	 *
	 * Returns the error from sysctl_handle_int() if it failed, EINVAL for an
	 * out-of-range value, and 0 otherwise.
	 */
	static int
	sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
	{
		u_int val;
		int err;

		val = zfs_arc_free_target;
		err = sysctl_handle_int(oidp, &val, 0, req);
		if (err != 0 || req->newptr == NULL)
			return (err);

		if (val < minfree)
			return (EINVAL);
		if (val > vm_cnt.v_page_count)
			return (EINVAL);

		zfs_arc_free_target = val;

		return (0);
	}

	/*
	 * This must be declared here, before the kstat alias macro of the same
	 * name (arc_meta_limit) is defined further down; once that macro exists
	 * the node name would be macro-expanded and confuse the compiler.
	 */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
	    sysctl_vfs_zfs_arc_meta_limit, "QU",
	    "ARC metadata limit");
	#endif

	/*
	* Note that buffers can be in one of 6 states:
	* ARC_anon - anonymous (discussed below)
	* ARC_mru - recently used, currently cached
	* ARC_mru_ghost - recently used, no longer in cache
	* ARC_mfu - frequently used, currently cached
	* ARC_mfu_ghost - frequently used, no longer in cache
	* ARC_l2c_only - exists in L2ARC but not other states
	* When there are no active references to the buffer, they are
	* linked onto a list in one of these arc states. These are
	* the only buffers that can be evicted or deleted. Within each
	* state there are multiple lists, one for meta-data and one for
	* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
	* etc.) is tracked separately so that it can be managed more
	* explicitly: favored over data, limited explicitly.
	*
	* Anonymous buffers are buffers that are not associated with
	* a DVA. These are buffers that hold dirty block copies
	* before they are written to stable storage. By definition,
	* they are "ref'd" and are considered part of arc_mru
	* that cannot be freed. Generally, they will acquire a DVA
	* as they are written and migrate onto the arc_mru list.
	*
	* The ARC_l2c_only state is for buffers that are in the second
	* level ARC but no longer in any of the ARC_m* lists. The second
	* level ARC itself may also contain buffers that are in any of
	* the ARC_m* states - meaning that a buffer can exist in two
	* places. The reason for the ARC_l2c_only state is to keep the
	* buffer header in the hash table, so that reads that hit the
	* second level ARC benefit from these fast lookups.
	*/

	/*
	* Bookkeeping for a single ARC state (see the list of states in the
	* comment above). The two arrays have one slot per buffer contents type
	* (ARC_BUFC_NUMTYPES entries).
	*/
	typedef struct arc_state {
	/*
	* list of evictable buffers
	*/
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	* total amount of evictable data in this state
	*/
	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	* total amount of data in this state; this includes: evictable,
	* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	*/
	refcount_t arcs_size;
	} arc_state_t;

	/*
	* The 6 states:
	* NOTE(review): the rest of the file presumably reaches these objects
	* through the lower-case arc_* pointers declared further down -- confirm
	* where those pointers are assigned.
	*/
	static arc_state_t ARC_anon;
	static arc_state_t ARC_mru;
	static arc_state_t ARC_mru_ghost;
	static arc_state_t ARC_mfu;
	static arc_state_t ARC_mfu_ghost;
	static arc_state_t ARC_l2c_only;

	/*
	* Layout of the ARC statistics block: one kstat_named_t per statistic.
	* The order of the fields matters, because the arc_stats initializer that
	* follows this type supplies the exported names positionally.
	*/
	typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	* Number of buffers that could not be evicted because the hash lock
	* was held by another thread. The lock may not necessarily be held
	* by something using the same buffer, since hash locks are shared
	* by multiple buffers.
	*/
	kstat_named_t arcstat_mutex_miss;
	/*
	* Number of buffers skipped because they have I/O in progress, are
	* indirect prefetch buffers that have not lived long enough, or are
	* not from the spa we're trying to evict from.
	*/
	kstat_named_t arcstat_evict_skip;
	/*
	* Number of times arc_evict_state() was unable to evict enough
	* buffers to reach its target amount.
	*/
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
	* Note that the compressed bytes may match the uncompressed bytes
	* if the block is either not compressed or compressed arc is disabled.
	*/
	kstat_named_t arcstat_compressed_size;
	/*
	* Uncompressed size of the data stored in b_pabd. If compressed
	* arc is disabled then this value will be identical to the stat
	* above.
	*/
	kstat_named_t arcstat_uncompressed_size;
	/*
	* Number of bytes stored in all the arc_buf_t's. This is classified
	* as "overhead" since this data is typically short-lived and will
	* be evicted from the arc when it becomes unreferenced unless the
	* zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
	* values have been set (see comment in dbuf.c for more information).
	*/
	kstat_named_t arcstat_overhead_size;
	/*
	* Number of bytes consumed by internal ARC structures necessary
	* for tracking purposes; these structures are not actually
	* backed by ARC buffers. This includes arc_buf_hdr_t structures
	* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	* caches), and arc_buf_t structures (allocated via arc_buf_t
	* cache).
	*/
	kstat_named_t arcstat_hdr_size;
	/*
	* Number of bytes consumed by ARC buffers of type equal to
	* ARC_BUFC_DATA. This is generally consumed by buffers backing
	* on disk user data (e.g. plain file contents).
	*/
	kstat_named_t arcstat_data_size;
	/*
	* Number of bytes consumed by ARC buffers of type equal to
	* ARC_BUFC_METADATA. This is generally consumed by buffers
	* backing on disk data that is used for internal ZFS
	* structures (e.g. ZAP, dnode, indirect blocks, etc).
	*/
	kstat_named_t arcstat_metadata_size;
	/*
	* Number of bytes consumed by various buffers and structures
	* not actually backed with ARC buffers. This includes bonus
	* buffers (allocated directly via zio_buf_* functions),
	* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	* cache), and dnode_t structures (allocated via dnode_t cache).
	*/
	kstat_named_t arcstat_other_size;
	/*
	* Total number of bytes consumed by ARC buffers residing in the
	* arc_anon state. This includes all buffers in the arc_anon
	* state; e.g. data, metadata, evictable, and unevictable buffers
	* are all included in this value.
	*/
	kstat_named_t arcstat_anon_size;
	/*
	* Number of bytes consumed by ARC buffers that meet the
	* following criteria: backing buffers of type ARC_BUFC_DATA,
	* residing in the arc_anon state, and are eligible for eviction
	* (e.g. have no outstanding holds on the buffer).
	*/
	kstat_named_t arcstat_anon_evictable_data;
	/*
	* Number of bytes consumed by ARC buffers that meet the
	* following criteria: backing buffers of type ARC_BUFC_METADATA,
	* residing in the arc_anon state, and are eligible for eviction
	* (e.g. have no outstanding holds on the buffer).
	*/
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	* Total number of bytes consumed by ARC buffers residing in the
	* arc_mru state. This includes all buffers in the arc_mru
	* state; e.g. data, metadata, evictable, and unevictable buffers
	* are all included in this value.
	*/
	kstat_named_t arcstat_mru_size;
	/*
	* Number of bytes consumed by ARC buffers that meet the
	* following criteria: backing buffers of type ARC_BUFC_DATA,
	* residing in the arc_mru state, and are eligible for eviction
	* (e.g. have no outstanding holds on the buffer).
	*/
	kstat_named_t arcstat_mru_evictable_data;
	/*
	* Number of bytes consumed by ARC buffers that meet the
	* following criteria: backing buffers of type ARC_BUFC_METADATA,
	* residing in the arc_mru state, and are eligible for eviction
	* (e.g. have no outstanding holds on the buffer).
	*/
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	* Total number of bytes that would have been consumed by ARC
	* buffers in the arc_mru_ghost state. The key thing to note
	* here, is the fact that this size doesn't actually indicate
	* RAM consumption. The ghost lists only consist of headers and
	* don't actually have ARC buffers linked off of these headers.
	* Thus, if the headers had associated ARC buffers, these
	* buffers would have consumed this number of bytes.
	*/
	kstat_named_t arcstat_mru_ghost_size;
	/*
	* Number of bytes that would have been consumed by ARC
	* buffers that are eligible for eviction, of type
	* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	*/
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	* Number of bytes that would have been consumed by ARC
	* buffers that are eligible for eviction, of type
	* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	*/
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	* Total number of bytes consumed by ARC buffers residing in the
	* arc_mfu state. This includes all buffers in the arc_mfu
	* state; e.g. data, metadata, evictable, and unevictable buffers
	* are all included in this value.
	*/
	kstat_named_t arcstat_mfu_size;
	/*
	* Number of bytes consumed by ARC buffers that are eligible for
	* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	* state.
	*/
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	* Number of bytes consumed by ARC buffers that are eligible for
	* eviction, of type ARC_BUFC_METADATA, and reside in the
	* arc_mfu state.
	*/
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	* Total number of bytes that would have been consumed by ARC
	* buffers in the arc_mfu_ghost state. See the comment above
	* arcstat_mru_ghost_size for more details.
	*/
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	* Number of bytes that would have been consumed by ARC
	* buffers that are eligible for eviction, of type
	* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	*/
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	* Number of bytes that would have been consumed by ARC
	* buffers that are eligible for eviction, of type
	* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	*/
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_lsize;
	kstat_named_t arcstat_l2_psize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_sync_wait_for_async;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	} arc_stats_t;

	/*
	* The one instance of the statistics block, with the name and data type
	* of every statistic. The initializers are positional, so their order
	* must match the field order of arc_stats_t exactly.
	*
	* The exported names do not always mirror the field names:
	* arcstat_l2_lsize and arcstat_l2_psize are published as "l2_size" and
	* "l2_asize", arcstat_l2_write_hdr_io_in_progress as
	* "l2_write_io_in_progress", and the arcstat_meta_* fields carry an
	* "arc_" prefix.
	*/
	static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "allocated", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "compressed_size", KSTAT_DATA_UINT64 },
	{ "uncompressed_size", KSTAT_DATA_UINT64 },
	{ "overhead_size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "other_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
	{ "l2_write_full", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_pios", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	};

	/* The 64-bit value of the named arc_stats field, usable as an lvalue. */
	#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

	/* Add val to the named statistic using atomic_add_64(). */
	#define ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

	/* Increment / decrement the named statistic by one via ARCSTAT_INCR(). */
	#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
	#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)

	/*
	* Raise the named statistic to val when val is larger than the value
	* currently stored, retrying the compare-and-swap for as long as val is
	* still larger and the swap did not take effect.
	*
	* NOTE(review): val is expanded (and therefore evaluated) more than once,
	* and the expansion is a bare brace block rather than do { } while (0);
	* keep side effects out of val and do not use this as the unbraced body
	* of an if that has an else.
	*/
	#define ARCSTAT_MAX(stat, val) { \
	uint64_t m; \
	while ((val) > (m = arc_stats.stat.value.ui64) && \
	(m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
	continue; \
	}

	/* Fold the current value of stat into its companion stat##_max field. */
	#define ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

	/*
	* We define a macro to allow ARC hits/misses to be easily broken down by
	* two separate conditions, giving a total of four different subtypes for
	* each of hits and misses (so eight statistics total).
	*
	* The statistic that gets bumped is named
	* arcstat_<stat1|notstat1>_<stat2|notstat2>_<stat>, chosen by cond1 and
	* cond2 respectively. The expansion is a bare if/else statement, so the
	* same caution about unbraced if/else bodies as for ARCSTAT_MAX applies.
	*/
	#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) { \
	if (cond2) { \
	ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
	} else { \
	ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
	} \
	} else { \
	if (cond2) { \
	ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
	} else { \
	ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
	} \
	}

	/* NOTE(review): presumably the kstat handle that publishes arc_stats -- confirm */
	kstat_t *arc_ksp;
	/*
	* Pointers to the ARC state objects.
	* NOTE(review): presumably pointed at the ARC_* objects during ARC
	* initialization -- confirm where they are assigned.
	*/
	static arc_state_t *arc_anon;
	static arc_state_t *arc_mru;
	static arc_state_t *arc_mru_ghost;
	static arc_state_t *arc_mfu;
	static arc_state_t *arc_mfu_ghost;
	static arc_state_t *arc_l2c_only;

	/*
	* There are several ARC variables that are critical to export as kstats --
	* but we don't want to have to grovel around in the kstat whenever we wish to
	* manipulate them. For these variables, we therefore define them to be in
	* terms of the statistic variable. This assures that we are not introducing
	* the possibility of inconsistency by having shadow copies of the variables,
	* while still allowing the code to be readable.
	*/
	#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
	#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
	#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
	#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
	#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
	#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
	#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
	#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
	#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */

	/* compressed size of entire arc */
	#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
	/* uncompressed size of entire arc */
	#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
	/* number of bytes in the arc from arc_buf_t's */
	#define arc_overhead_size ARCSTAT(arcstat_overhead_size)

	static int arc_no_grow; /* Don't try to grow cache size */
	/* NOTE(review): presumably bytes reserved for in-flight writes -- confirm against its users */
	static uint64_t arc_tempreserve;
	/* NOTE(review): presumably bytes currently held in loaned buffers -- confirm against its users */
	static uint64_t arc_loaned_bytes;

	typedef struct arc_callback arc_callback_t;

	/*
	* One entry in a singly linked list of callbacks (chained through
	* acb_next); l1arc_buf_hdr_t.b_acb points at such an entry.
	* NOTE(review): presumably one entry per consumer waiting on a read of
	* the header -- confirm in the read path.
	*/
	struct arc_callback {
	void *acb_private; /* NOTE(review): presumably passed back to acb_done -- confirm */
	arc_done_func_t *acb_done;
	arc_buf_t *acb_buf;
	boolean_t acb_compressed;
	zio_t *acb_zio_dummy;
	arc_callback_t *acb_next; /* next entry in the list */
	};

	typedef struct arc_write_callback arc_write_callback_t;

	/*
	* Bundle of callbacks (ready, children_ready, physdone, done), a private
	* pointer and a buffer, all belonging to one write.
	* NOTE(review): presumably each callback corresponds to the zio stage of
	* the same name -- confirm in the write path.
	*/
	struct arc_write_callback {
	void *awcb_private;
	arc_done_func_t *awcb_ready;
	arc_done_func_t *awcb_children_ready;
	arc_done_func_t *awcb_physdone;
	arc_done_func_t *awcb_done;
	arc_buf_t *awcb_buf;
	};

	/*
	* ARC buffers are separated into multiple structs as a memory saving measure:
	* - Common fields struct, always defined, and embedded within it:
	* - L2-only fields, always allocated but undefined when not in L2ARC
	* - L1-only fields, only allocated when in L1ARC
	*
	*           Buffer in L1                 Buffer only in L2
	*    +------------------------+      +------------------------+
	*    | arc_buf_hdr_t          |      | arc_buf_hdr_t          |
	*    |                        |      |                        |
	*    |                        |      |                        |
	*    |                        |      |                        |
	*    +------------------------+      +------------------------+
	*    | l2arc_buf_hdr_t        |      | l2arc_buf_hdr_t        |
	*    | (undefined if L1-only) |      |                        |
	*    +------------------------+      +------------------------+
	*    | l1arc_buf_hdr_t        |
	*    |                        |
	*    |                        |
	*    |                        |
	*    |                        |
	*    +------------------------+
	*
	* Because it's possible for the L2ARC to become extremely large, we can wind
	* up eating a lot of memory in L2ARC buffer headers, so the size of a header
	* is minimized by only allocating the fields necessary for an L1-cached buffer
	* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
	* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
	* words in pointers. arc_hdr_realloc() is used to switch a header between
	* these two allocation states.
	*/
	/*
	* L1-only portion of an ARC buffer header. It is embedded in
	* struct arc_buf_hdr as b_l1hdr and, per the layout comment above, is
	* only allocated while the header is in the L1 cache.
	*/
	typedef struct l1arc_buf_hdr {
	kmutex_t b_freeze_lock;
	zio_cksum_t *b_freeze_cksum;
	#ifdef ZFS_DEBUG
	/*
	* Used for debugging with kmem_flags - by allocating and freeing
	* b_thawed when the buffer is thawed, we get a record of the stack
	* trace that thawed it.
	*/
	void *b_thawed;
	#endif

	/* head of the list of arc_buf_t's attached to this header */
	arc_buf_t *b_buf;
	/* NOTE(review): presumably the number of entries on the b_buf list -- confirm */
	uint32_t b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t b_cv;
	uint8_t b_byteswap;

	/* protected by arc state mutex */
	arc_state_t *b_state;
	multilist_node_t b_arc_node;

	/* updated atomically */
	clock_t b_arc_access;

	/* self protecting */
	refcount_t b_refcnt;

	arc_callback_t *b_acb;
	/* the header's own copy of the physical (on-disk form) data */
	abd_t *b_pabd;
	} l1arc_buf_hdr_t;

	typedef struct l2arc_dev l2arc_dev_t;

	typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t b_dev; / L2ARC device */
	uint64_t b_daddr; /* disk address, offset byte */

	list_node_t b_l2node;
	} l2arc_buf_hdr_t;

	/*
	* The ARC buffer header: the fields common to every header, followed by
	* the embedded L2ARC sub-header and, last, the embedded L1ARC sub-header
	* (see the layout comment above for why the L1 part comes last).
	*/
	struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t b_dva;
	uint64_t b_birth;

	arc_buf_contents_t b_type;
	arc_buf_hdr_t *b_hash_next;
	arc_flags_t b_flags;

	/*
	* This field stores the size of the data buffer after
	* compression, and is set in the arc's zio completion handlers.
	* It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	*
	* While the block pointers can store up to 32MB in their psize
	* field, we can only store up to 32MB minus 512B. This is due
	* to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	* a field of zeros represents 512B in the bp). We can't use a
	* bias of 1 since we need to reserve a psize of zero, here, to
	* represent holes and embedded blocks.
	*
	* This isn't a problem in practice, since the maximum size of a
	* buffer is limited to 16MB, so we never need to store 32MB in
	* this field. Even in the upstream illumos code base, the
	* maximum size of a buffer is limited to 16MB.
	*/
	uint16_t b_psize;

	/*
	* This field stores the size of the data buffer before
	* compression, and cannot change once set. It is in units
	* of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	*/
	uint16_t b_lsize; /* immutable */
	uint64_t b_spa; /* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t b_l1hdr;
	};

	#if defined(__FreeBSD__) && defined(_KERNEL)
	static int
	sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val <= 0 \|\| val > arc_c_max)
	return (EINVAL);

	arc_meta_limit = val;
	return (0);
	}

	static int
	sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
	{
	uint32_t val;
	int err;

	val = arc_no_grow_shift;
	err = sysctl_handle_32(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val >= arc_shrink_shift)
	return (EINVAL);

	arc_no_grow_shift = val;
	return (0);
	}

	static int
	sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (zfs_arc_max == 0) {
	/* Loader tunable so blindly set */
	zfs_arc_max = val;
	return (0);
	}

	if (val < arc_abs_min \|\| val > kmem_size())
	return (EINVAL);
	if (val < arc_c_min)
	return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
	return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
	arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
	}

	static int
	sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (zfs_arc_min == 0) {
	/* Loader tunable so blindly set */
	zfs_arc_min = val;
	return (0);
	}

	if (val < arc_abs_min \|\| val > arc_c_max)
	return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
	arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
	arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
	}
	#endif

	/* True when state is one of the ghost states or the l2c_only state. */
	#define	GHOST_STATE(state)	\
		((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
		(state) == arc_l2c_only)

	#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
	#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
	#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
	#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
	#define HDR_COMPRESSION_ENABLED(hdr) \
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

	#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
	#define HDR_L2_READING(hdr) \
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
	#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
	#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
	#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
	#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

	#define HDR_ISTYPE_METADATA(hdr) \
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
	#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))

	#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
	#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

	/* For storing compression mode in b_flags */
	#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

	/* Read the compression mode held in the hdr's b_flags bit-field. */
	#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
		HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
	/*
	 * Store a compression mode in the hdr's b_flags bit-field.  The expansion
	 * deliberately carries no trailing semicolon: the caller supplies it, so
	 * the macro behaves as a single statement (e.g. inside an unbraced
	 * if/else).
	 */
	#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
		HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))

	/* Predicates over an arc_buf_t: last on its b_next chain, and b_flags bits. */
	#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
	#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
	#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)

	/*
	* Other sizes
	*/

	#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
	#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

	/*
	* Hash table routines
	*/

	#define HT_LOCK_PAD CACHE_LINE_SIZE

	/*
	* One hash-table lock. In the kernel build each lock is padded out to
	* HT_LOCK_PAD (CACHE_LINE_SIZE) bytes - presumably so neighbouring locks
	* do not share a cache line.
	*/
	struct ht_lock {
	kmutex_t ht_lock;
	#ifdef _KERNEL
	unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
	#endif
	};

	/*
	* The ARC header hash table. buf_init() sets ht_mask to the table size
	* minus one, so "hash & ht_mask" selects a bucket; bucket idx is guarded
	* by ht_locks[idx & (BUF_LOCKS - 1)] (see BUF_HASH_LOCK_NTRY).
	*/
	#define BUF_LOCKS 256
	typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
	} buf_hash_table_t;

	static buf_hash_table_t buf_hash_table;

	/*
	 * Hash-table helpers.  BUF_HASH_INDEX() maps a block identity to a bucket,
	 * BUF_HASH_LOCK() maps a bucket index to the mutex guarding it, and
	 * HDR_LOCK() combines the two for a header.  Macro arguments are
	 * parenthesized so that arbitrary expressions can be passed safely.
	 */
	#define	BUF_HASH_INDEX(spa, dva, birth) \
		(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
	#define	BUF_HASH_LOCK_NTRY(idx) \
		(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
	#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
	#define	HDR_LOCK(hdr) \
		(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
		(hdr)->b_birth)))

	uint64_t zfs_crc64_table[256];

	/*
	* Level 2 ARC
	*/

	#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
	#define L2ARC_HEADROOM 2 /* num of writes */
	/*
	* If we discover during ARC scan any buffers to be compressed, we boost
	* our headroom for the next scanning cycle by this percentage multiple.
	*/
	#define L2ARC_HEADROOM_BOOST 200
	#define L2ARC_FEED_SECS 1 /* caching interval secs */
	#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */

	#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
	#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)

	/* L2ARC Performance Tunables */
	uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
	uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
	uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
	uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
	uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
	uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
	boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
	boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
	boolean_t l2arc_norw = B_TRUE; /* no reads during writes */

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
	&l2arc_write_max, 0, "max write size");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
	&l2arc_write_boost, 0, "extra write during warmup");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
	&l2arc_headroom, 0, "number of dev writes");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
	&l2arc_feed_secs, 0, "interval seconds");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
	&l2arc_feed_min_ms, 0, "min interval milliseconds");

	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
	&l2arc_noprefetch, 0, "don't cache prefetch bufs");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
	&l2arc_feed_again, 0, "turbo warmup");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
	&l2arc_norw, 0, "no reads during writes");

	/*
	 * Read-only counters for the anonymous state.  Each description names the
	 * specific counter, in line with the mru/mfu entries.
	 */
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
	    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
	    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
	    "size of metadata in anonymous state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
	    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
	    "size of data in anonymous state");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
	&ARC_mru.arcs_size.rc_count, 0, "size of mru state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
	&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
	"size of metadata in mru state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
	&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
	"size of data in mru state");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
	&ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
	&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
	"size of metadata in mru ghost state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
	&ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
	"size of data in mru ghost state");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
	&ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
	&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
	"size of metadata in mfu state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
	&ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
	"size of data in mfu state");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
	&ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
	"size of metadata in mfu ghost state");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
	"size of data in mfu ghost state");

	/* Read-only size counter for the l2c_only state (not the mru state). */
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
	    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");

	/*
	* L2ARC Internals
	*/
	struct l2arc_dev {
	vdev_t l2ad_vdev; / vdev */
	spa_t l2ad_spa; / spa */
	uint64_t l2ad_hand; /* next write location */
	uint64_t l2ad_start; /* first addr on device */
	uint64_t l2ad_end; /* last addr on device */
	boolean_t l2ad_first; /* first sweep through */
	boolean_t l2ad_writing; /* currently writing */
	kmutex_t l2ad_mtx; /* lock for buffer list */
	list_t l2ad_buflist; /* buffer list */
	list_node_t l2ad_node; /* device list node */
	refcount_t l2ad_alloc; /* allocated bytes */
	};

	static list_t L2ARC_dev_list; /* device list */
	static list_t l2arc_dev_list; / device list pointer */
	static kmutex_t l2arc_dev_mtx; /* device list mutex */
	static l2arc_dev_t l2arc_dev_last; / last device used */
	static list_t L2ARC_free_on_write; /* free after write buf list */
	static list_t l2arc_free_on_write; / free after write list ptr */
	static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
	static uint64_t l2arc_ndev; /* number of devices */

	typedef struct l2arc_read_callback {
	arc_buf_hdr_t l2rcb_hdr; / read header */
	blkptr_t l2rcb_bp; /* original blkptr */
	zbookmark_phys_t l2rcb_zb; /* original bookmark */
	int l2rcb_flags; /* original flags */
	abd_t l2rcb_abd; / temporary buffer */
	} l2arc_read_callback_t;

	typedef struct l2arc_write_callback {
	l2arc_dev_t l2wcb_dev; / device info */
	arc_buf_hdr_t l2wcb_head; / head of write buflist */
	} l2arc_write_callback_t;

	/*
	* One entry describing an abd together with its size and contents type.
	* NOTE(review): presumably queued on the l2arc_free_on_write list until an
	* L2ARC write completes - confirm against the users of this type.
	*/
	typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	abd_t *l2df_abd;
	size_t l2df_size;
	arc_buf_contents_t l2df_type;
	list_node_t l2df_list_node;
	} l2arc_data_free_t;

	static kmutex_t l2arc_feed_thr_lock;
	static kcondvar_t l2arc_feed_thr_cv;
	static uint8_t l2arc_thread_exit;

	static abd_t arc_get_data_abd(arc_buf_hdr_t , uint64_t, void *);
	static void arc_get_data_buf(arc_buf_hdr_t , uint64_t, void *);
	static void arc_get_data_impl(arc_buf_hdr_t , uint64_t, void );
	static void arc_free_data_abd(arc_buf_hdr_t , abd_t , uint64_t, void *);
	static void arc_free_data_buf(arc_buf_hdr_t , void , uint64_t, void *);
	static void arc_free_data_impl(arc_buf_hdr_t hdr, uint64_t size, void tag);
	static void arc_hdr_free_pabd(arc_buf_hdr_t *);
	static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
	static void arc_access(arc_buf_hdr_t , kmutex_t );
	static boolean_t arc_is_overflowing();
	static void arc_buf_watch(arc_buf_t *);

	static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
	static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
	static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
	static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);

	static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
	static void l2arc_read_done(zio_t *);

	/*
	 * Hand the on-device range of an L2ARC-resident header to the trim map.
	 * The header must carry an L2 header and the caller must hold the owning
	 * device's l2ad_mtx.  Headers with a physical size of zero are ignored.
	 */
	static void
	l2arc_trim(const arc_buf_hdr_t *hdr)
	{
		l2arc_dev_t *l2dev = hdr->b_l2hdr.b_dev;

		ASSERT(HDR_HAS_L2HDR(hdr));
		ASSERT(MUTEX_HELD(&l2dev->l2ad_mtx));

		/* A zero physical size leaves nothing to give to the trim map. */
		if (HDR_GET_PSIZE(hdr) == 0)
			return;

		trim_map_free(l2dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
		    HDR_GET_PSIZE(hdr), 0);
	}

	/*
	 * Hash a block identity (spa, dva, birth) to a 64-bit value.
	 *
	 * The bytes of the DVA are run through the table-driven CRC-64 built by
	 * buf_init(), and the result is then mixed with the spa identifier
	 * (shifted right by 8) and the birth txg.  Callers mask the result with
	 * ht_mask to pick a bucket (see BUF_HASH_INDEX).
	 */
	static uint64_t
	buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
	{
		/* View the DVA as raw bytes; keep the const qualifier intact. */
		const uint8_t *vdva = (const uint8_t *)dva;
		uint64_t crc = -1ULL;
		size_t i;

		/* Fails if the CRC table has not been initialized yet. */
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		for (i = 0; i < sizeof (dva_t); i++)
			crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

		crc ^= (spa >> 8) ^ birth;

		return (crc);
	}

	/* True when the hdr has no identity, i.e. both DVA words are zero. */
	#define	HDR_EMPTY(hdr)	\
		((hdr)->b_dva.dva_word[0] == 0 &&	\
		(hdr)->b_dva.dva_word[1] == 0)

	/*
	 * True when hdr identifies the block (spa, dva, birth).  The whole
	 * expansion and every argument are parenthesized so that the macro is
	 * safe under operators such as '!' and with expression arguments.
	 */
	#define	HDR_EQUAL(spa, dva, birth, hdr)	\
		(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
		((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
		((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))

	/*
	 * Strip a header of its identity by zeroing the birth txg and both DVA
	 * words, after which HDR_EMPTY(hdr) holds.
	 */
	static void
	buf_discard_identity(arc_buf_hdr_t *hdr)
	{
		dva_t *ident = &hdr->b_dva;

		hdr->b_birth = 0;
		ident->dva_word[1] = 0;
		ident->dva_word[0] = 0;
	}

	/*
	 * Look up the header for block pointer bp of pool spa.
	 *
	 * On a hit the header is returned with its hash lock HELD, and *lockp is
	 * set to that lock so the caller can release it.  On a miss the lock is
	 * dropped, *lockp is set to NULL and NULL is returned.
	 */
	static arc_buf_hdr_t *
	buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
	{
		const dva_t *dva = BP_IDENTITY(bp);
		uint64_t birth = BP_PHYSICAL_BIRTH(bp);
		uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
		kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
		arc_buf_hdr_t *hdr;

		mutex_enter(hash_lock);
		for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
		    hdr = hdr->b_hash_next) {
			if (HDR_EQUAL(spa, dva, birth, hdr)) {
				/* Found: return with the lock still held. */
				*lockp = hash_lock;
				return (hdr);
			}
		}
		mutex_exit(hash_lock);
		*lockp = NULL;
		return (NULL);
	}

	/*
	 * Insert an entry into the hash table.  If there is already an element
	 * equal to elem in the hash table, then the already existing element
	 * will be returned and the new element will not be inserted.
	 * Otherwise returns NULL.
	 * If lockp == NULL, the caller is assumed to already hold the hash lock.
	 *
	 * When lockp != NULL the hash lock is taken here, stored in *lockp, and
	 * is still held on return in both the "inserted" and "already present"
	 * cases.
	 */
	static arc_buf_hdr_t *
	buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
	{
		uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
		kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
		arc_buf_hdr_t *fhdr;
		uint32_t i;

		ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
		ASSERT(hdr->b_birth != 0);
		ASSERT(!HDR_IN_HASH_TABLE(hdr));

		if (lockp != NULL) {
			*lockp = hash_lock;
			mutex_enter(hash_lock);
		} else {
			ASSERT(MUTEX_HELD(hash_lock));
		}

		/* Walk the chain; i ends up as the pre-insert chain length. */
		for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
		    fhdr = fhdr->b_hash_next, i++) {
			if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
				return (fhdr);
		}

		/* Not present: push the new header at the head of the chain. */
		hdr->b_hash_next = buf_hash_table.ht_table[idx];
		buf_hash_table.ht_table[idx] = hdr;
		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

		/* collect some hash table performance data */
		if (i > 0) {
			ARCSTAT_BUMP(arcstat_hash_collisions);
			if (i == 1)
				ARCSTAT_BUMP(arcstat_hash_chains);

			ARCSTAT_MAX(arcstat_hash_chain_max, i);
		}

		ARCSTAT_BUMP(arcstat_hash_elements);
		ARCSTAT_MAXSTAT(arcstat_hash_elements);

		return (NULL);
	}

	/*
	 * Unlink hdr from its hash chain and clear ARC_FLAG_IN_HASH_TABLE.  The
	 * caller must hold the hash lock for hdr's bucket and hdr must currently
	 * be in the table.
	 */
	static void
	buf_hash_remove(arc_buf_hdr_t *hdr)
	{
		arc_buf_hdr_t *fhdr, **hdrp;
		uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

		ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
		ASSERT(HDR_IN_HASH_TABLE(hdr));

		/* hdrp tracks the link that points at the current element. */
		hdrp = &buf_hash_table.ht_table[idx];
		while ((fhdr = *hdrp) != hdr) {
			ASSERT3P(fhdr, !=, NULL);
			hdrp = &fhdr->b_hash_next;
		}
		*hdrp = hdr->b_hash_next;
		hdr->b_hash_next = NULL;
		arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

		/* collect some hash table performance data */
		ARCSTAT_BUMPDOWN(arcstat_hash_elements);

		/* The bucket shrank to a single element, so it is no longer a chain. */
		if (buf_hash_table.ht_table[idx] &&
		    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
			ARCSTAT_BUMPDOWN(arcstat_hash_chains);
	}

	/*
	* Global data structures and functions for the buf kmem cache.
	*/
	static kmem_cache_t *hdr_full_cache;
	static kmem_cache_t *hdr_l2only_cache;
	static kmem_cache_t *buf_cache;

	/*
	 * Tear down what buf_init() built: free the hash table array, destroy
	 * every hash lock, then destroy the three kmem caches.
	 */
	static void
	buf_fini(void)
	{
		int lock;

		kmem_free(buf_hash_table.ht_table,
		    (buf_hash_table.ht_mask + 1) * sizeof (void *));

		lock = 0;
		while (lock < BUF_LOCKS) {
			mutex_destroy(&buf_hash_table.ht_locks[lock].ht_lock);
			lock++;
		}

		kmem_cache_destroy(hdr_full_cache);
		kmem_cache_destroy(hdr_l2only_cache);
		kmem_cache_destroy(buf_cache);
	}

	/*
	 * Constructor callback - called when the cache is empty
	 * and a new buf is requested.
	 *
	 * Zeroes a full-size header, initializes the L1 condvar, refcount, freeze
	 * lock and multilist link, and charges HDR_FULL_SIZE to ARC_SPACE_HDRS.
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	hdr_full_cons(void *vbuf, void *unused, int kmflag)
	{
		arc_buf_hdr_t *hdr = vbuf;

		bzero(hdr, HDR_FULL_SIZE);
		cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
		refcount_create(&hdr->b_l1hdr.b_refcnt);
		mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
		multilist_link_init(&hdr->b_l1hdr.b_arc_node);
		arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

		return (0);
	}

	/*
	 * Constructor for L2-only headers: zeroes just the first HDR_L2ONLY_SIZE
	 * bytes (everything ahead of b_l1hdr) and charges them to
	 * ARC_SPACE_L2HDRS.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
	{
		arc_buf_hdr_t *hdr = vbuf;

		bzero(hdr, HDR_L2ONLY_SIZE);
		arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

		return (0);
	}

	/*
	 * Constructor for arc_buf_t objects: zeroes the buf, initializes its
	 * eviction lock and charges its size to ARC_SPACE_HDRS.  Always
	 * returns 0.
	 */
	/* ARGSUSED */
	static int
	buf_cons(void *vbuf, void *unused, int kmflag)
	{
		arc_buf_t *buf = vbuf;

		bzero(buf, sizeof (arc_buf_t));
		mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
		arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

		return (0);
	}

	/*
	 * Destructor callback - called when a cached buf is
	 * no longer required.
	 *
	 * Undoes hdr_full_cons(): destroys the condvar, refcount and freeze lock
	 * and returns HDR_FULL_SIZE to ARC_SPACE_HDRS.  The header must already
	 * have an empty identity and be off its multilist.
	 */
	/* ARGSUSED */
	static void
	hdr_full_dest(void *vbuf, void *unused)
	{
		arc_buf_hdr_t *hdr = vbuf;

		ASSERT(HDR_EMPTY(hdr));
		cv_destroy(&hdr->b_l1hdr.b_cv);
		refcount_destroy(&hdr->b_l1hdr.b_refcnt);
		mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
	}

	/*
	 * Destructor for L2-only headers: returns HDR_L2ONLY_SIZE to
	 * ARC_SPACE_L2HDRS.  The header must already have an empty identity.
	 */
	/* ARGSUSED */
	static void
	hdr_l2only_dest(void *vbuf, void *unused)
	{
		arc_buf_hdr_t *hdr = vbuf;

		ASSERT(HDR_EMPTY(hdr));
		arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
	}

	/*
	 * Destructor for arc_buf_t objects: destroys the eviction lock and
	 * returns the buf's size to ARC_SPACE_HDRS.
	 */
	/* ARGSUSED */
	static void
	buf_dest(void *vbuf, void *unused)
	{
		arc_buf_t *buf = vbuf;

		mutex_destroy(&buf->b_evict_lock);
		arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
	}

	/*
	 * Reclaim callback -- invoked when memory is low.
	 *
	 * Wakes the ARC reclaim thread, unless the ARC has already been torn
	 * down.
	 */
	/* ARGSUSED */
	static void
	hdr_recl(void *unused)
	{
		dprintf("hdr_recl called\n");

		/*
		 * umem calls the reclaim func when we destroy the buf cache,
		 * which is after we do arc_fini().
		 */
		if (arc_dead)
			return;

		cv_signal(&arc_reclaim_thread_cv);
	}

	/*
	 * Set up the header hash table, the header/buf kmem caches, the CRC-64
	 * lookup table used by buf_hash(), and the hash locks.
	 */
	static void
	buf_init(void)
	{
		uint64_t *ct;
		uint64_t hsize = 1ULL << 12;
		int i, j;

		/*
		 * The hash table is big enough to fill all of physical memory
		 * with an average block size of zfs_arc_average_blocksize (default 8K).
		 * By default, the table will take up
		 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
		 */
		while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
			hsize <<= 1;
	retry:
		buf_hash_table.ht_mask = hsize - 1;
		buf_hash_table.ht_table =
		    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
		if (buf_hash_table.ht_table == NULL) {
			/* Allocation failed: halve the table and try again. */
			ASSERT(hsize > (1ULL << 8));
			hsize >>= 1;
			goto retry;
		}

		hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
		    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
		hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
		    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
		    NULL, NULL, 0);
		buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
		    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

		/*
		 * Build the CRC-64 table: entry i starts as i and then takes
		 * eight shift-right steps, XORing in the polynomial whenever the
		 * bit shifted out is set.
		 */
		for (i = 0; i < 256; i++) {
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
		}

		for (i = 0; i < BUF_LOCKS; i++) {
			mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
			    NULL, MUTEX_DEFAULT, NULL);
		}
	}

	/*
	 * Size the buf occupies in memory: the header's physical size for a
	 * compressed buf, the logical size otherwise.  Prefer this over
	 * arc_buf_lsize() unless the logical size is specifically required.
	 */
	int32_t
	arc_buf_size(arc_buf_t *buf)
	{
		if (ARC_BUF_COMPRESSED(buf))
			return (HDR_GET_PSIZE(buf->b_hdr));

		return (HDR_GET_LSIZE(buf->b_hdr));
	}

	/* Logical size of the buf, as recorded in its header. */
	int32_t
	arc_buf_lsize(arc_buf_t *buf)
	{
		arc_buf_hdr_t *owner = buf->b_hdr;

		return (HDR_GET_LSIZE(owner));
	}

	/*
	 * Compression mode of the data held by buf: the header's mode when the
	 * buf is compressed, ZIO_COMPRESS_OFF otherwise.
	 */
	enum zio_compress
	arc_get_compression(arc_buf_t *buf)
	{
		if (!ARC_BUF_COMPRESSED(buf))
			return (ZIO_COMPRESS_OFF);

		return (HDR_GET_COMPRESS(buf->b_hdr));
	}

	#define ARC_MINTIME (hz>>4) /* 62 ms */

	/*
	 * Report whether buf shares its data with the header, i.e. whether the
	 * header's b_pabd is a linear abd whose backing buffer is buf->b_data.
	 * A shared buf must have the shared flags set on both hdr and buf, and
	 * must be compressed or the last buf on the hdr.
	 */
	static inline boolean_t
	arc_buf_is_shared(arc_buf_t *buf)
	{
		boolean_t shared = (buf->b_data != NULL &&
		    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
		    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
		    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
		IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
		IMPLY(shared, ARC_BUF_SHARED(buf));
		IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));

		/*
		 * It would be nice to assert arc_can_share() too, but the "hdr isn't
		 * already being shared" requirement prevents us from doing that.
		 */

		return (shared);
	}

	/*
	 * Drop the freeze checksum attached to this header, if any, under the
	 * header's freeze lock.  Does nothing when no checksum is present.
	 */
	static inline void
	arc_cksum_free(arc_buf_hdr_t *hdr)
	{
		l1arc_buf_hdr_t *l1;

		ASSERT(HDR_HAS_L1HDR(hdr));
		l1 = &hdr->b_l1hdr;

		mutex_enter(&l1->b_freeze_lock);
		if (l1->b_freeze_cksum != NULL) {
			kmem_free(l1->b_freeze_cksum, sizeof (zio_cksum_t));
			l1->b_freeze_cksum = NULL;
		}
		mutex_exit(&l1->b_freeze_lock);
	}

	/*
	 * Walk the hdr's buf list and report B_TRUE as soon as an uncompressed
	 * buf is seen; B_FALSE if every buf is compressed or the list is empty.
	 */
	static boolean_t
	arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
	{
		arc_buf_t *cur = hdr->b_l1hdr.b_buf;

		while (cur != NULL) {
			if (!ARC_BUF_COMPRESSED(cur))
				return (B_TRUE);
			cur = cur->b_next;
		}

		return (B_FALSE);
	}

	/*
	 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
	 * matches the checksum that is stored in the hdr. If there is no checksum,
	 * or if the buf is compressed, this is a no-op.
	 *
	 * Panics with "buffer modified while frozen!" on a mismatch.  Headers
	 * flagged with an I/O error are not checked.
	 */
	static void
	arc_cksum_verify(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		zio_cksum_t zc;

		if (!(zfs_flags & ZFS_DEBUG_MODIFY))
			return;

		if (ARC_BUF_COMPRESSED(buf)) {
			/* Any checksum present belongs to an uncompressed sibling. */
			ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
			    arc_hdr_has_uncompressed_buf(hdr));
			return;
		}

		ASSERT(HDR_HAS_L1HDR(hdr));

		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
		if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
			mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
			return;
		}

		fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
		if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
			panic("buffer modified while frozen!");
		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
	}

	/*
	 * Validate the data carried by zio against the checksum in zio->io_bp.
	 * The zio's block pointer must not be embedded and its psize must match
	 * the hdr's.  Returns B_TRUE when the checksum matches.
	 */
	static boolean_t
	arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
	{
		enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
		boolean_t valid_cksum;

		ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
		VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));

		/*
		 * We rely on the blkptr's checksum to determine if the block
		 * is valid or not. When compressed arc is enabled, the l2arc
		 * writes the block to the l2arc just as it appears in the pool.
		 * This allows us to use the blkptr's checksum to validate the
		 * data that we just read off of the l2arc without having to store
		 * a separate checksum in the arc_buf_hdr_t. However, if compressed
		 * arc is disabled, then the data written to the l2arc is always
		 * uncompressed and won't match the block as it exists in the main
		 * pool. When this is the case, we must first compress it if it is
		 * compressed on the main pool before we can validate the checksum.
		 */
		if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
			ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
			uint64_t lsize = HDR_GET_LSIZE(hdr);
			uint64_t csize;

			abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
			csize = zio_compress_data(compress, zio->io_abd,
			    abd_to_buf(cdata), lsize);

			ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
			if (csize < HDR_GET_PSIZE(hdr)) {
				/*
				 * Compressed blocks are always a multiple of the
				 * smallest ashift in the pool. Ideally, we would
				 * like to round up the csize to the next
				 * spa_min_ashift but that value may have changed
				 * since the block was last written. Instead,
				 * we rely on the fact that the hdr's psize
				 * was set to the psize of the block when it was
				 * last written. We set the csize to that value
				 * and zero out any part that should not contain
				 * data.
				 */
				abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
				csize = HDR_GET_PSIZE(hdr);
			}
			zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
		}

		/*
		 * Block pointers always store the checksum for the logical data.
		 * If the block pointer has the gang bit set, then the checksum
		 * it represents is for the reconstituted data and not for an
		 * individual gang member. The zio pipeline, however, must be able to
		 * determine the checksum of each of the gang constituents so it
		 * treats the checksum comparison differently than what we need
		 * for l2arc blocks. This prevents us from using the
		 * zio_checksum_error() interface directly. Instead we must call the
		 * zio_checksum_error_impl() so that we can ensure the checksum is
		 * generated using the correct checksum algorithm and accounts for the
		 * logical I/O size and not just a gang fragment.
		 */
		valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
		    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
		    zio->io_offset, NULL) == 0);
		/* Undo any transform pushed above before returning. */
		zio_pop_transforms(zio);
		return (valid_cksum);
	}

	/*
	 * With ZFS_DEBUG_MODIFY enabled, checksum the buf's data and attach the
	 * result to the hdr so that later modification can be detected.  Nothing
	 * happens when the hdr already has a checksum or the buf is compressed
	 * (only uncompressed bufs are checksummed).
	 */
	static void
	arc_cksum_compute(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		kmutex_t *freeze_lock;

		if (!(zfs_flags & ZFS_DEBUG_MODIFY))
			return;

		ASSERT(HDR_HAS_L1HDR(hdr));
		freeze_lock = &hdr->b_l1hdr.b_freeze_lock;

		mutex_enter(freeze_lock);
		if (hdr->b_l1hdr.b_freeze_cksum != NULL ||
		    ARC_BUF_COMPRESSED(buf)) {
			/* An existing checksum implies an uncompressed buf exists. */
			IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL,
			    arc_hdr_has_uncompressed_buf(hdr));
			mutex_exit(freeze_lock);
			return;
		}

		ASSERT(!ARC_BUF_COMPRESSED(buf));
		hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
		    KM_SLEEP);
		fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
		    hdr->b_l1hdr.b_freeze_cksum);
		mutex_exit(freeze_lock);
	#ifdef illumos
		arc_buf_watch(buf);
	#endif
	}

	#ifdef illumos
	#ifndef _KERNEL
	typedef struct procctl {
	long cmd;
	prwatch_t prwatch;
	} procctl_t;
	#endif

	/*
	 * Userland only: when arc_watch is set, write a PCWATCH request with zero
	 * size and zero flags for the buf's data address to arc_procfd.
	 * NOTE(review): presumably this clears the watchpoint that
	 * arc_buf_watch() installed - confirm against the proc(4) documentation.
	 */
	/* ARGSUSED */
	static void
	arc_buf_unwatch(arc_buf_t *buf)
	{
	#ifndef _KERNEL
		int result;
		procctl_t ctl;

		if (!arc_watch)
			return;

		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = 0;
		ctl.prwatch.pr_wflags = 0;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	#endif
	}

	/*
	 * Userland only: when arc_watch is set, write a PCWATCH request with the
	 * WA_WRITE flag covering the buf's data (arc_buf_size() bytes) to
	 * arc_procfd.
	 */
	/* ARGSUSED */
	static void
	arc_buf_watch(arc_buf_t *buf)
	{
	#ifndef _KERNEL
		int result;
		procctl_t ctl;

		if (!arc_watch)
			return;

		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = arc_buf_size(buf);
		ctl.prwatch.pr_wflags = WA_WRITE;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	#endif
	}
	#endif /* illumos */

	/*
	 * Derive the contents type (data or metadata) from the hdr's flags and
	 * verify that it agrees with the type stored in hdr->b_type.
	 */
	static arc_buf_contents_t
	arc_buf_type(arc_buf_hdr_t *hdr)
	{
		arc_buf_contents_t type = HDR_ISTYPE_METADATA(hdr) ?
		    ARC_BUFC_METADATA : ARC_BUFC_DATA;

		VERIFY3U(hdr->b_type, ==, type);
		return (type);
	}

	/* Report whether the buf's header is flagged as holding metadata. */
	boolean_t
	arc_is_metadata(arc_buf_t *buf)
	{
		arc_buf_hdr_t *owner = buf->b_hdr;

		return (HDR_ISTYPE_METADATA(owner) != 0);
	}

	/*
	 * Translate a buffer contents type into the matching header flag bits.
	 * Panics on a type that is neither data nor metadata.
	 */
	static uint32_t
	arc_bufc_to_flags(arc_buf_contents_t type)
	{
		/* metadata field is 0 if buffer contains normal data */
		if (type == ARC_BUFC_DATA)
			return (0);

		if (type == ARC_BUFC_METADATA)
			return (ARC_FLAG_BUFC_METADATA);

		panic("undefined ARC buffer type!");
		return ((uint32_t)-1);
	}

	/*
	 * Thaw an anonymous buf so that it may be modified: verify its current
	 * checksum, then discard the checksum.  Under ZFS_DEBUG with
	 * ZFS_DEBUG_MODIFY, b_thawed is reallocated.  The hdr must be in the
	 * anonymous state with no I/O in progress.
	 */
	void
	arc_buf_thaw(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;

		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));

		arc_cksum_verify(buf);

		/*
		 * Compressed buffers do not manipulate the b_freeze_cksum or
		 * allocate b_thawed.
		 */
		if (ARC_BUF_COMPRESSED(buf)) {
			ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
			    arc_hdr_has_uncompressed_buf(hdr));
			return;
		}

		ASSERT(HDR_HAS_L1HDR(hdr));
		arc_cksum_free(hdr);

		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
	#ifdef ZFS_DEBUG
		if (zfs_flags & ZFS_DEBUG_MODIFY) {
			if (hdr->b_l1hdr.b_thawed != NULL)
				kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
		}
	#endif

		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);

	#ifdef illumos
		arc_buf_unwatch(buf);
	#endif
	}

	/*
	 * Freeze a buf: with ZFS_DEBUG_MODIFY enabled, record a checksum of its
	 * data (under the hdr's hash lock) so that later modification can be
	 * detected.  Compressed bufs are left alone.
	 */
	void
	arc_buf_freeze(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		kmutex_t *hash_lock;

		if (!(zfs_flags & ZFS_DEBUG_MODIFY))
			return;

		if (ARC_BUF_COMPRESSED(buf)) {
			ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
			    arc_hdr_has_uncompressed_buf(hdr));
			return;
		}

		hash_lock = HDR_LOCK(hdr);
		mutex_enter(hash_lock);

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
		    hdr->b_l1hdr.b_state == arc_anon);
		arc_cksum_compute(buf);
		mutex_exit(hash_lock);
	}

	/*
	 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
	 * the following functions should be used to ensure that the flags are
	 * updated in a thread-safe way. When manipulating the flags either
	 * the hash_lock must be held or the hdr must be undiscoverable. This
	 * ensures that we're not racing with any other threads when updating
	 * the flags.
	 *
	 * arc_hdr_set_flags() ORs the given bits into hdr->b_flags.
	 */
	static inline void
	arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
	{
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
		hdr->b_flags |= flags;
	}

	/*
	 * Clear the given bits in hdr->b_flags.  The caller must hold the hdr's
	 * hash lock, or the hdr must have an empty identity.
	 */
	static inline void
	arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
	{
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
		hdr->b_flags &= ~flags;
	}

	/*
	 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
	 * done in a special way since we have to clear and set bits
	 * at the same time. Consumers that wish to set the compression bits
	 * must use this function to ensure that the flags are updated in
	 * thread-safe manner.
	 *
	 * The caller must hold the hdr's hash lock, or the hdr must have an
	 * empty identity.
	 */
	static void
	arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
	{
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		/*
		 * Holes and embedded blocks will always have a psize = 0 so
		 * we ignore the compression of the blkptr and set the
		 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
		 * Holes and embedded blocks remain anonymous so we don't
		 * want to uncompress them. Mark them as uncompressed.
		 */
		if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
			arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
			ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
			ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
		} else {
			arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
			HDR_SET_COMPRESS(hdr, cmp);
			ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
			ASSERT(HDR_COMPRESSION_ENABLED(hdr));
		}
	}

	/*
	 * Search the hdr's buf list for a sibling of 'buf' that already holds
	 * decompressed data. If one is found, copy its contents into buf->b_data
	 * and return B_TRUE; otherwise leave buf untouched and return B_FALSE.
	 */
	static boolean_t
	arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		arc_buf_t *src;

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT3P(buf->b_data, !=, NULL);
		ASSERT(!ARC_BUF_COMPRESSED(buf));

		/*
		 * Stop at the first buf that is not ourselves (we can't use our
		 * own data buffer) and is not compressed.
		 */
		for (src = hdr->b_l1hdr.b_buf; src != NULL; src = src->b_next) {
			if (src != buf && !ARC_BUF_COMPRESSED(src))
				break;
		}

		boolean_t copied = (src != NULL) ? B_TRUE : B_FALSE;
		if (copied)
			bcopy(src->b_data, buf->b_data, arc_buf_size(buf));

		/*
		 * If there were no decompressed bufs, there should not be a
		 * checksum on the hdr either (and vice versa).
		 */
		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);

		return (copied);
	}

	/*
	* Given a buf that has a data buffer attached to it, this function will
	* efficiently fill the buf with data of the specified compression setting from
	* the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
	* are already sharing a data buf, no copy is performed.
	*
	* If the buf is marked as compressed but uncompressed data was requested, this
	* will allocate a new data buffer for the buf, remove that flag, and fill the
	* buf with uncompressed data. You can't request a compressed buf on a hdr with
	* uncompressed data, and (since we haven't added support for it yet) if you
	* want compressed data your buf must already be marked as compressed and have
	* the correct-sized data buffer.
	*
	* Returns 0 on success, or EIO (via SET_ERROR) if zio_decompress_data()
	* reports a failure.
	*/
	static int
	arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
	{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;

	ASSERT3P(buf->b_data, !=, NULL);
	IMPLY(compressed, hdr_compressed);
	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));

	/*
	* Case 1: the hdr already holds data in the requested form, so a plain
	* copy suffices (and is skipped entirely when the buffer is shared).
	*/
	if (hdr_compressed == compressed) {
	if (!arc_buf_is_shared(buf)) {
	abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
	arc_buf_size(buf));
	}
	} else {
	/*
	* Case 2: the hdr is compressed but uncompressed data was
	* requested.
	*/
	ASSERT(hdr_compressed);
	ASSERT(!compressed);
	ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));

	/*
	* If the buf is sharing its data with the hdr, unlink it and
	* allocate a new data buffer for the buf.
	*/
	if (arc_buf_is_shared(buf)) {
	ASSERT(ARC_BUF_COMPRESSED(buf));

	/* We need to give the buf its own b_data */
	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
	buf->b_data =
	arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);

	/* Previously overhead was 0; just add new overhead */
	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
	} else if (ARC_BUF_COMPRESSED(buf)) {
	/* We need to reallocate the buf's b_data */
	arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
	buf);
	buf->b_data =
	arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);

	/* We increased the size of b_data; update overhead */
	ARCSTAT_INCR(arcstat_overhead_size,
	HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
	}

	/*
	* Regardless of the buf's previous compression settings, it
	* should not be compressed at the end of this function.
	*/
	buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;

	/*
	* Try copying the data from another buf which already has a
	* decompressed version. If that's not possible, it's time to
	* bite the bullet and decompress the data from the hdr.
	*/
	if (arc_buf_try_copy_decompressed_data(buf)) {
	/* Skip byteswapping and checksumming (already done) */
	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
	return (0);
	} else {
	int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
	hdr->b_l1hdr.b_pabd, buf->b_data,
	HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));

	/*
	* Absent hardware errors or software bugs, this should
	* be impossible, but log it anyway so we can debug it.
	*/
	if (error != 0) {
	zfs_dbgmsg(
	"hdr %p, compress %d, psize %d, lsize %d",
	hdr, HDR_GET_COMPRESS(hdr),
	HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
	return (SET_ERROR(EIO));
	}
	}
	}

	/*
	* Byteswap the buf's data if necessary. DMU_BSWAP_NUMFUNCS is used
	* here as the "no byteswap needed" value.
	*/
	if (bswap != DMU_BSWAP_NUMFUNCS) {
	ASSERT(!HDR_SHARED_DATA(hdr));
	ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
	dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
	}

	/* Compute the hdr's checksum if necessary */
	arc_cksum_compute(buf);

	return (0);
	}

	/*
	 * Fill 'buf' with uncompressed data from its hdr. Returns whatever
	 * arc_buf_fill() returns for an uncompressed request.
	 */
	int
	arc_decompress(arc_buf_t *buf)
	{
		int error = arc_buf_fill(buf, B_FALSE);

		return (error);
	}

	/*
	 * Report how large the block stored in the hdr's b_pabd is: the physical
	 * size when the hdr is compressed and has a non-zero psize, otherwise the
	 * logical size.
	 */
	static uint64_t
	arc_hdr_size(arc_buf_hdr_t *hdr)
	{
		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
		    HDR_GET_PSIZE(hdr) > 0)
			return (HDR_GET_PSIZE(hdr));

		/* Uncompressed (or zero-psize) hdrs are sized by lsize. */
		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
		return (HDR_GET_LSIZE(hdr));
	}

	/*
	 * Increment the amount of evictable space in the arc_state_t's refcount.
	 * We account for the space used by the hdr and the arc buf individually
	 * so that we can add and remove them from the refcount individually.
	 *
	 * hdr   - header (must have an L1 hdr) whose space is being accounted
	 * state - state whose arcs_esize[] counter for the hdr's type is bumped
	 */
	static void
	arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
	{
		arc_buf_contents_t type = arc_buf_type(hdr);

		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * Ghost-state hdrs carry no data (no bufs, no b_pabd), so they
		 * are accounted by their logical size, tagged with the hdr.
		 */
		if (GHOST_STATE(state)) {
			ASSERT0(hdr->b_l1hdr.b_bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			(void) refcount_add_many(&state->arcs_esize[type],
			    HDR_GET_LSIZE(hdr), hdr);
			return;
		}

		ASSERT(!GHOST_STATE(state));
		if (hdr->b_l1hdr.b_pabd != NULL) {
			(void) refcount_add_many(&state->arcs_esize[type],
			    arc_hdr_size(hdr), hdr);
		}
		for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
		    buf = buf->b_next) {
			/* Shared bufs are already covered by the hdr's b_pabd. */
			if (arc_buf_is_shared(buf))
				continue;
			(void) refcount_add_many(&state->arcs_esize[type],
			    arc_buf_size(buf), buf);
		}
	}

	/*
	 * Decrement the amount of evictable space in the arc_state_t's refcount.
	 * We account for the space used by the hdr and the arc buf individually
	 * so that we can add and remove them from the refcount individually.
	 *
	 * hdr   - header (must have an L1 hdr) whose space is being released
	 * state - state whose arcs_esize[] counter for the hdr's type is reduced
	 */
	static void
	arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
	{
		arc_buf_contents_t type = arc_buf_type(hdr);

		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * Ghost-state hdrs carry no data (no bufs, no b_pabd), so they
		 * were accounted by their logical size, tagged with the hdr.
		 */
		if (GHOST_STATE(state)) {
			ASSERT0(hdr->b_l1hdr.b_bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			(void) refcount_remove_many(&state->arcs_esize[type],
			    HDR_GET_LSIZE(hdr), hdr);
			return;
		}

		ASSERT(!GHOST_STATE(state));
		if (hdr->b_l1hdr.b_pabd != NULL) {
			(void) refcount_remove_many(&state->arcs_esize[type],
			    arc_hdr_size(hdr), hdr);
		}
		for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
		    buf = buf->b_next) {
			/* Shared bufs are already covered by the hdr's b_pabd. */
			if (arc_buf_is_shared(buf))
				continue;
			(void) refcount_remove_many(&state->arcs_esize[type],
			    arc_buf_size(buf), buf);
		}
	}

	/*
	 * Add a reference to this hdr indicating that someone is actively
	 * referencing that memory. When the refcount transitions from 0 to 1,
	 * we remove it from the respective arc_state_t list to indicate that
	 * it is not evictable.
	 *
	 * hdr - header (must have an L1 hdr) gaining a reference
	 * tag - refcount tag identifying the holder
	 */
	static void
	add_reference(arc_buf_hdr_t *hdr, void *tag)
	{
		ASSERT(HDR_HAS_L1HDR(hdr));
		/*
		 * Without the hash lock held, the hdr must be a brand new
		 * anonymous one: no holds and no bufs yet.
		 */
		if (!MUTEX_HELD(HDR_LOCK(hdr))) {
			ASSERT(hdr->b_l1hdr.b_state == arc_anon);
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
		}

		arc_state_t *state = hdr->b_l1hdr.b_state;

		if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
		    (state != arc_anon)) {
			/* We don't use the L2-only state list. */
			if (state != arc_l2c_only) {
				multilist_remove(state->arcs_list[arc_buf_type(hdr)],
				    hdr);
				arc_evictable_space_decrement(hdr, state);
			}
			/* remove the prefetch flag if we get a reference */
			arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
		}
	}

	/*
	 * Remove a reference from this hdr. When the reference transitions from
	 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
	 * list making it eligible for eviction.
	 *
	 * hdr       - header (must have an L1 hdr) losing a reference
	 * hash_lock - the hdr's hash lock; must be held unless the hdr is anonymous
	 * tag       - refcount tag identifying the holder being dropped
	 *
	 * Returns the reference count remaining after the removal.
	 */
	static int
	remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
	{
		int cnt;
		arc_state_t *state = hdr->b_l1hdr.b_state;

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
		ASSERT(!GHOST_STATE(state));

		/*
		 * arc_l2c_only counts as a ghost state so we don't need to explicitly
		 * check to prevent usage of the arc_l2c_only list.
		 */
		if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
		    (state != arc_anon)) {
			multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
			ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
			arc_evictable_space_increment(hdr, state);
		}
		return (cnt);
	}

	/*
	 * Move the supplied buffer to the indicated state. The hash lock
	 * for the buffer must be held by the caller.
	 *
	 * new_state - state the hdr is moving to (must differ from its current one)
	 * hdr       - header being moved; may lack an L1 hdr, in which case its
	 *             old state is treated as arc_l2c_only
	 * hash_lock - the hdr's hash lock (asserted held)
	 */
	static void
	arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
	    kmutex_t *hash_lock)
	{
		arc_state_t *old_state;
		int64_t refcnt;
		uint32_t bufcnt;
		boolean_t update_old, update_new;
		arc_buf_contents_t buftype = arc_buf_type(hdr);

		/*
		 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
		 * in arc_read() when bringing a buffer out of the L2ARC. However, the
		 * L1 hdr doesn't always exist when we change state to arc_anon before
		 * destroying a header, in which case reallocating to add the L1 hdr is
		 * pointless.
		 */
		if (HDR_HAS_L1HDR(hdr)) {
			old_state = hdr->b_l1hdr.b_state;
			refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
			bufcnt = hdr->b_l1hdr.b_bufcnt;
			update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
		} else {
			old_state = arc_l2c_only;
			refcnt = 0;
			bufcnt = 0;
			update_old = B_FALSE;
		}
		update_new = update_old;

		ASSERT(MUTEX_HELD(hash_lock));
		ASSERT3P(new_state, !=, old_state);
		ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
		ASSERT(old_state != arc_anon || bufcnt <= 1);

		/*
		 * If this buffer is evictable, transfer it from the
		 * old state list to the new state list.
		 */
		if (refcnt == 0) {
			if (old_state != arc_anon && old_state != arc_l2c_only) {
				ASSERT(HDR_HAS_L1HDR(hdr));
				multilist_remove(old_state->arcs_list[buftype], hdr);

				if (GHOST_STATE(old_state)) {
					ASSERT0(bufcnt);
					ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
					update_old = B_TRUE;
				}
				arc_evictable_space_decrement(hdr, old_state);
			}
			if (new_state != arc_anon && new_state != arc_l2c_only) {

				/*
				 * An L1 header always exists here, since if we're
				 * moving to some L1-cached state (i.e. not l2c_only or
				 * anonymous), we realloc the header to add an L1hdr
				 * beforehand.
				 */
				ASSERT(HDR_HAS_L1HDR(hdr));
				multilist_insert(new_state->arcs_list[buftype], hdr);

				if (GHOST_STATE(new_state)) {
					ASSERT0(bufcnt);
					ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
					update_new = B_TRUE;
				}
				arc_evictable_space_increment(hdr, new_state);
			}
		}

		ASSERT(!HDR_EMPTY(hdr));
		if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);

		/* adjust state sizes (ignore arc_l2c_only) */

		if (update_new && new_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			if (GHOST_STATE(new_state)) {
				ASSERT0(bufcnt);

				/*
				 * When moving a header to a ghost state, we first
				 * remove all arc buffers. Thus, we'll have a
				 * bufcnt of zero, and no arc buffer to use for
				 * the reference. As a result, we use the arc
				 * header pointer for the reference.
				 */
				(void) refcount_add_many(&new_state->arcs_size,
				    HDR_GET_LSIZE(hdr), hdr);
				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			} else {
				uint32_t buffers = 0;

				/*
				 * Each individual buffer holds a unique reference,
				 * thus we must add each of these references one
				 * at a time.
				 */
				for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
				    buf = buf->b_next) {
					ASSERT3U(bufcnt, !=, 0);
					buffers++;

					/*
					 * When the arc_buf_t is sharing the data
					 * block with the hdr, the owner of the
					 * reference belongs to the hdr. Only
					 * add to the refcount if the arc_buf_t is
					 * not shared.
					 */
					if (arc_buf_is_shared(buf))
						continue;

					(void) refcount_add_many(&new_state->arcs_size,
					    arc_buf_size(buf), buf);
				}
				ASSERT3U(bufcnt, ==, buffers);

				if (hdr->b_l1hdr.b_pabd != NULL) {
					(void) refcount_add_many(&new_state->arcs_size,
					    arc_hdr_size(hdr), hdr);
				} else {
					ASSERT(GHOST_STATE(old_state));
				}
			}
		}

		if (update_old && old_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			if (GHOST_STATE(old_state)) {
				ASSERT0(bufcnt);
				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);

				/*
				 * When moving a header off of a ghost state,
				 * the header will not contain any arc buffers.
				 * We use the arc header pointer for the reference
				 * which is exactly what we did when we put the
				 * header on the ghost state.
				 */

				(void) refcount_remove_many(&old_state->arcs_size,
				    HDR_GET_LSIZE(hdr), hdr);
			} else {
				uint32_t buffers = 0;

				/*
				 * Each individual buffer holds a unique reference,
				 * thus we must remove each of these references one
				 * at a time.
				 */
				for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
				    buf = buf->b_next) {
					ASSERT3U(bufcnt, !=, 0);
					buffers++;

					/*
					 * When the arc_buf_t is sharing the data
					 * block with the hdr, the owner of the
					 * reference belongs to the hdr. Only
					 * remove from the refcount if the arc_buf_t
					 * is not shared.
					 */
					if (arc_buf_is_shared(buf))
						continue;

					(void) refcount_remove_many(
					    &old_state->arcs_size, arc_buf_size(buf),
					    buf);
				}
				ASSERT3U(bufcnt, ==, buffers);
				ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
				(void) refcount_remove_many(
				    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
			}
		}

		/* Only L1 hdrs have a b_state field to record the new state in. */
		if (HDR_HAS_L1HDR(hdr))
			hdr->b_l1hdr.b_state = new_state;

		/*
		 * L2 headers should never be on the L2 state list since they don't
		 * have L1 headers allocated.
		 */
		ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
		    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
	}

	/*
	 * Charge 'space' bytes to the ARC: bump the per-type kstat, the
	 * meta-used kstat for anything other than ARC_SPACE_DATA, and arc_size.
	 */
	void
	arc_space_consume(uint64_t space, arc_space_type_t type)
	{
		ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

		/* Per-type accounting. */
		if (type == ARC_SPACE_DATA) {
			ARCSTAT_INCR(arcstat_data_size, space);
		} else if (type == ARC_SPACE_META) {
			ARCSTAT_INCR(arcstat_metadata_size, space);
		} else if (type == ARC_SPACE_OTHER) {
			ARCSTAT_INCR(arcstat_other_size, space);
		} else if (type == ARC_SPACE_HDRS) {
			ARCSTAT_INCR(arcstat_hdr_size, space);
		} else if (type == ARC_SPACE_L2HDRS) {
			ARCSTAT_INCR(arcstat_l2_hdr_size, space);
		}

		/* Everything except plain data counts towards meta usage. */
		if (type != ARC_SPACE_DATA)
			ARCSTAT_INCR(arcstat_meta_used, space);

		atomic_add_64(&arc_size, space);
	}

	/*
	 * Give back 'space' bytes previously charged with arc_space_consume():
	 * reduce the per-type kstat, the meta-used kstat for anything other than
	 * ARC_SPACE_DATA (recording the high-water mark first), and arc_size.
	 */
	void
	arc_space_return(uint64_t space, arc_space_type_t type)
	{
		ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

		/* Per-type accounting. */
		if (type == ARC_SPACE_DATA) {
			ARCSTAT_INCR(arcstat_data_size, -space);
		} else if (type == ARC_SPACE_META) {
			ARCSTAT_INCR(arcstat_metadata_size, -space);
		} else if (type == ARC_SPACE_OTHER) {
			ARCSTAT_INCR(arcstat_other_size, -space);
		} else if (type == ARC_SPACE_HDRS) {
			ARCSTAT_INCR(arcstat_hdr_size, -space);
		} else if (type == ARC_SPACE_L2HDRS) {
			ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
		}

		if (type != ARC_SPACE_DATA) {
			ASSERT(arc_meta_used >= space);
			/* Remember the peak before lowering the current value. */
			if (arc_meta_max < arc_meta_used)
				arc_meta_max = arc_meta_used;
			ARCSTAT_INCR(arcstat_meta_used, -space);
		}

		ASSERT(arc_size >= space);
		atomic_add_64(&arc_size, -space);
	}

	/*
	 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
	 * with the hdr's b_pabd.
	 *
	 * hdr - header the buf belongs to (asserted: buf->b_hdr == hdr)
	 * buf - candidate buf
	 */
	static boolean_t
	arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
	{
		/*
		 * The criteria for sharing a hdr's data are:
		 * 1. the hdr's compression matches the buf's compression
		 * 2. the hdr doesn't need to be byteswapped
		 * 3. the hdr isn't already being shared
		 * 4. the buf is either compressed or it is the last buf in the hdr list
		 *
		 * Criterion #4 maintains the invariant that shared uncompressed
		 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
		 * might ask, "if a compressed buf is allocated first, won't that be the
		 * last thing in the list?", but in that case it's impossible to create
		 * a shared uncompressed buf anyway (because the hdr must be compressed
		 * to have the compressed buf). You might also think that #3 is
		 * sufficient to make this guarantee, however it's possible
		 * (specifically in the rare L2ARC write race mentioned in
		 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
		 * is sharable, but wasn't at the time of its allocation. Rather than
		 * allow a new shared uncompressed buf to be created and then shuffle
		 * the list around to make it the last element, this simply disallows
		 * sharing if the new buf isn't the first to be added.
		 */
		ASSERT3P(buf->b_hdr, ==, hdr);
		boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
		boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
		return (buf_compressed == hdr_compressed &&
		    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
		    !HDR_SHARED_DATA(hdr) &&
		    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
	}

	/*
	 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
	 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
	 * copy was made successfully, or an error code otherwise.
	 *
	 * hdr        - header (with L1 hdr) the new buf is attached to
	 * tag        - refcount tag for the reference taken on the hdr
	 * compressed - request a compressed buf; only honored if the hdr is
	 *              actually compressed
	 * fill       - if set, populate the buf from the hdr via arc_buf_fill()
	 * ret        - out parameter; must point at NULL on entry. It is set to the
	 *              new buf before filling, so it is valid even when the fill
	 *              returns an error.
	 */
	static int
	arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
	    boolean_t fill, arc_buf_t **ret)
	{
		arc_buf_t *buf;

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
		VERIFY(hdr->b_type == ARC_BUFC_DATA ||
		    hdr->b_type == ARC_BUFC_METADATA);
		ASSERT3P(ret, !=, NULL);
		ASSERT3P(*ret, ==, NULL);

		buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
		buf->b_hdr = hdr;
		buf->b_data = NULL;
		/* b_next is set now; the hdr's list head is updated further down. */
		buf->b_next = hdr->b_l1hdr.b_buf;
		buf->b_flags = 0;

		add_reference(hdr, tag);

		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		/*
		 * Only honor requests for compressed bufs if the hdr is actually
		 * compressed.
		 */
		if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
			buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;

		/*
		 * If the hdr's data can be shared then we share the data buffer and
		 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
		 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
		 * buffer to store the buf's data.
		 *
		 * There are two additional restrictions here because we're sharing
		 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
		 * actively involved in an L2ARC write, because if this buf is used by
		 * an arc_write() then the hdr's data buffer will be released when the
		 * write completes, even though the L2ARC write might still be using it.
		 * Second, the hdr's ABD must be linear so that the buf's user doesn't
		 * need to be ABD-aware.
		 */
		boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
		    abd_is_linear(hdr->b_l1hdr.b_pabd);

		/* Set up b_data and sharing */
		if (can_share) {
			buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
			buf->b_flags |= ARC_BUF_FLAG_SHARED;
			arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			buf->b_data =
			    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
			ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
		}
		VERIFY3P(buf->b_data, !=, NULL);

		/* Link the new buf at the head of the hdr's buf list. */
		hdr->b_l1hdr.b_buf = buf;
		hdr->b_l1hdr.b_bufcnt += 1;

		/*
		 * If the user wants the data from the hdr, we need to either copy or
		 * decompress the data.
		 */
		if (fill) {
			return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
		}

		return (0);
	}

	/*
	* Refcount tag used for the hdr reference held while a buffer is on loan
	* (see arc_loan_buf(), arc_return_buf() and arc_loan_inuse_buf()).
	*/
	static char *arc_onloan_tag = "onloan";

	/*
	* Atomically adjust the arc_loaned_bytes counter by 'delta' (which may be
	* negative) and, in assertion-enabled builds, check that the resulting
	* value did not go negative.
	*/
	static inline void
	arc_loaned_bytes_update(int64_t delta)
	{
	atomic_add_64(&arc_loaned_bytes, delta);

	/* assert that it did not wrap around */
	/* (adding 0 with the _nv variant is used as an atomic read) */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
	}

	/*
	 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
	 * flight data by arc_tempreserve_space() until they are "returned". Loaned
	 * buffers must be returned to the arc before they can be used by the DMU or
	 * freed.
	 *
	 * The buffer is allocated with the on-loan tag and 'size' bytes are added
	 * to the loaned-bytes counter.
	 */
	arc_buf_t *
	arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
	{
		arc_buf_contents_t type;
		arc_buf_t *buf;

		if (is_metadata)
			type = ARC_BUFC_METADATA;
		else
			type = ARC_BUFC_DATA;

		buf = arc_alloc_buf(spa, arc_onloan_tag, type, size);
		arc_loaned_bytes_update(size);

		return (buf);
	}

	/*
	 * Compressed counterpart of arc_loan_buf(): allocate a compressed buffer
	 * held with the on-loan tag and add its physical size to the loaned-bytes
	 * counter.
	 */
	arc_buf_t *
	arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
	    enum zio_compress compression_type)
	{
		arc_buf_t *buf;

		buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize,
		    compression_type);
		arc_loaned_bytes_update(psize);

		return (buf);
	}


	/*
	 * Return a loaned arc buffer to the arc.
	 *
	 * The hdr reference held under the on-loan tag is replaced by one held
	 * under 'tag', and the buf's size is subtracted from the loaned-bytes
	 * counter.
	 */
	void
	arc_return_buf(arc_buf_t *buf, void *tag)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;

		ASSERT3P(buf->b_data, !=, NULL);
		ASSERT(HDR_HAS_L1HDR(hdr));
		/* Add the new hold before dropping the old one. */
		(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
		(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);

		arc_loaned_bytes_update(-arc_buf_size(buf));
	}

	/*
	 * Detach an arc_buf from a dbuf (tag).
	 *
	 * The hdr reference held under 'tag' is replaced by one held under the
	 * on-loan tag, and the buf's size is added to the loaned-bytes counter.
	 */
	void
	arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;

		ASSERT3P(buf->b_data, !=, NULL);
		ASSERT(HDR_HAS_L1HDR(hdr));
		/* Add the new hold before dropping the old one. */
		(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
		(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);

		arc_loaned_bytes_update(arc_buf_size(buf));
	}

	/*
	 * Queue an ABD on the l2arc_free_on_write list instead of freeing it now.
	 *
	 * abd  - the ABD whose free is being deferred
	 * size - size recorded for the ABD
	 * type - buffer contents type recorded for the ABD
	 */
	static void
	l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
	{
		/* Size the allocation from the pointee, not the pointer. */
		l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);

		df->l2df_abd = abd;
		df->l2df_size = size;
		df->l2df_type = type;
		mutex_enter(&l2arc_free_on_write_mtx);
		list_insert_head(l2arc_free_on_write, df);
		mutex_exit(&l2arc_free_on_write_mtx);
	}

	/*
	 * Drop the ARC accounting for the hdr's b_pabd right away, then hand the
	 * ABD itself to l2arc_free_abd_on_write() so that the actual free is
	 * deferred.
	 */
	static void
	arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
	{
		arc_state_t *state = hdr->b_l1hdr.b_state;
		arc_buf_contents_t type = arc_buf_type(hdr);
		uint64_t size = arc_hdr_size(hdr);
		boolean_t on_list = multilist_link_active(&hdr->b_l1hdr.b_arc_node);

		/* protected by hash lock, if in the hash table */
		if (on_list) {
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT(state != arc_anon && state != arc_l2c_only);

			(void) refcount_remove_many(&state->arcs_esize[type],
			    size, hdr);
		}
		(void) refcount_remove_many(&state->arcs_size, size, hdr);

		switch (type) {
		case ARC_BUFC_METADATA:
			arc_space_return(size, ARC_SPACE_META);
			break;
		default:
			ASSERT(type == ARC_BUFC_DATA);
			arc_space_return(size, ARC_SPACE_DATA);
			break;
		}

		l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
	}

	/*
	 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
	 * data buffer, we transfer the refcount ownership to the hdr and update
	 * the appropriate kstats.
	 *
	 * hdr - header that will start sharing; must currently have no b_pabd
	 * buf - buf whose b_data becomes the hdr's b_pabd; must satisfy
	 *       arc_can_share()
	 */
	static void
	arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
	{
		arc_state_t *state = hdr->b_l1hdr.b_state;

		ASSERT(arc_can_share(hdr, buf));
		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		/*
		 * Start sharing the data buffer. We transfer the
		 * refcount ownership to the hdr since it always owns
		 * the refcount whenever an arc_buf_t is shared.
		 */
		refcount_transfer_ownership(&state->arcs_size, buf, hdr);
		hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
		abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
		    HDR_ISTYPE_METADATA(hdr));
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;

		/*
		 * Since we've transferred ownership to the hdr we need
		 * to increment its compressed and uncompressed kstats and
		 * decrement the overhead size.
		 */
		ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
		ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
		ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
	}

	/*
	 * Stop sharing the data buffer between the hdr and the buf: refcount
	 * ownership goes back to the buf, the hdr's b_pabd wrapper is released
	 * and cleared, and the kstats are adjusted in the opposite direction to
	 * arc_share_buf().
	 *
	 * hdr - header currently sharing its b_pabd
	 * buf - the buf that is shared with the hdr
	 */
	static void
	arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
	{
		arc_state_t *state = hdr->b_l1hdr.b_state;

		ASSERT(arc_buf_is_shared(buf));
		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		/*
		 * We are no longer sharing this buffer so we need
		 * to transfer its ownership to the rightful owner.
		 */
		refcount_transfer_ownership(&state->arcs_size, hdr, buf);
		arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
		abd_put(hdr->b_l1hdr.b_pabd);
		hdr->b_l1hdr.b_pabd = NULL;
		buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

		/*
		 * Since the buffer is no longer shared between
		 * the arc buf and the hdr, count it as overhead.
		 */
		ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
		ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}

	/*
	 * Remove an arc_buf_t from the hdr's buf list and return the last
	 * arc_buf_t on the list. If no buffers remain on the list then return
	 * NULL.
	 *
	 * hdr - header (with L1 hdr) whose b_buf list is edited
	 * buf - buf to unlink; its b_next is cleared
	 */
	static arc_buf_t *
	arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
	{
		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
		arc_buf_t *lastbuf = NULL;

		/*
		 * Remove the buf from the hdr list and locate the last
		 * remaining buffer on the list.
		 */
		while (*bufp != NULL) {
			if (*bufp == buf)
				*bufp = buf->b_next;

			/*
			 * If we've removed a buffer in the middle of
			 * the list then update the lastbuf and update
			 * bufp.
			 */
			if (*bufp != NULL) {
				lastbuf = *bufp;
				bufp = &(*bufp)->b_next;
			}
		}
		buf->b_next = NULL;
		ASSERT3P(lastbuf, !=, buf);
		IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
		IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
		IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));

		return (lastbuf);
	}

	/*
	 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
	 * list and free it.
	 *
	 * If the destroyed buf was an uncompressed buf sharing its data with the
	 * hdr and other bufs remain, the hdr's b_pabd is freed and sharing is set
	 * up again with the last remaining buf.
	 */
	static void
	arc_buf_destroy_impl(arc_buf_t *buf)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;

		/*
		 * Free up the data associated with the buf but only if we're not
		 * sharing this with the hdr. If we are sharing it with the hdr, the
		 * hdr is responsible for doing the free.
		 */
		if (buf->b_data != NULL) {
			/*
			 * We're about to change the hdr's b_flags. We must either
			 * hold the hash_lock or be undiscoverable.
			 */
			ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

			arc_cksum_verify(buf);
	#ifdef illumos
			arc_buf_unwatch(buf);
	#endif

			if (arc_buf_is_shared(buf)) {
				arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
			} else {
				uint64_t size = arc_buf_size(buf);
				arc_free_data_buf(hdr, buf->b_data, size, buf);
				ARCSTAT_INCR(arcstat_overhead_size, -size);
			}
			buf->b_data = NULL;

			ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
			hdr->b_l1hdr.b_bufcnt -= 1;
		}

		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

		/*
		 * Note: this tests the buf's own SHARED flag, which is not cleared
		 * above (only the hdr's ARC_FLAG_SHARED_DATA is).
		 */
		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
			/*
			 * If the current arc_buf_t is sharing its data buffer with the
			 * hdr, then reassign the hdr's b_pabd to share it with the new
			 * buffer at the end of the list. The shared buffer is always
			 * the last one on the hdr's buffer list.
			 *
			 * There is an equivalent case for compressed bufs, but since
			 * they aren't guaranteed to be the last buf in the list and
			 * that is an exceedingly rare case, we just allow that space be
			 * wasted temporarily.
			 */
			if (lastbuf != NULL) {
				/* Only one buf can be shared at once */
				VERIFY(!arc_buf_is_shared(lastbuf));
				/* hdr is uncompressed so can't have compressed buf */
				VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

				ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
				arc_hdr_free_pabd(hdr);

				/*
				 * We must setup a new shared block between the
				 * last buffer and the hdr. The data would have
				 * been allocated by the arc buf so we need to transfer
				 * ownership to the hdr since it's now being shared.
				 */
				arc_share_buf(hdr, lastbuf);
			}
		} else if (HDR_SHARED_DATA(hdr)) {
			/*
			 * Uncompressed shared buffers are always at the end
			 * of the list. Compressed buffers don't have the
			 * same requirements. This makes it hard to
			 * simply assert that the lastbuf is shared so
			 * we rely on the hdr's compression flags to determine
			 * if we have a compressed, shared buffer.
			 */
			ASSERT3P(lastbuf, !=, NULL);
			ASSERT(arc_buf_is_shared(lastbuf) ||
			    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
		}

		/*
		 * Free the checksum if we're removing the last uncompressed buf from
		 * this hdr.
		 */
		if (!arc_hdr_has_uncompressed_buf(hdr)) {
			arc_cksum_free(hdr);
		}

		/* clean up the buf */
		buf->b_hdr = NULL;
		kmem_cache_free(buf_cache, buf);
	}

	/*
	 * Allocate the hdr's b_pabd (sized by arc_hdr_size()), reset its byteswap
	 * setting to DMU_BSWAP_NUMFUNCS, and add the new data to the
	 * compressed/uncompressed size kstats.
	 */
	static void
	arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
	{
		abd_t *pabd;

		ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT(!HDR_SHARED_DATA(hdr));
		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);

		pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
		hdr->b_l1hdr.b_pabd = pabd;
		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

		/* Account for the newly allocated data in the kstats. */
		ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
		ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
	}

	/*
	 * Release the hdr's b_pabd, either immediately or via the free-on-write
	 * path, then clear the pointer, reset the byteswap setting and subtract
	 * the data from the compressed/uncompressed size kstats.
	 */
	static void
	arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
	{
		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

		/*
		 * A hdr that is in the middle of an l2arc write cannot have its
		 * data freed yet, so the free is deferred by adding it to the
		 * l2arc_free_on_write list; the l2arc frees the data once it has
		 * finished writing it to the l2arc device.
		 */
		if (!HDR_L2_WRITING(hdr)) {
			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
			    arc_hdr_size(hdr), hdr);
		} else {
			arc_hdr_free_on_write(hdr);
			ARCSTAT_BUMP(arcstat_l2_free_on_write);
		}

		hdr->b_l1hdr.b_pabd = NULL;
		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;

		/* Remove the freed data from the kstats. */
		ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
		ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
	}

	/*
	 * Allocate and initialize a full (L1) arc_buf_hdr_t in the arc_anon state,
	 * including its b_pabd data buffer.
	 *
	 * spa              - value stored in the hdr's b_spa
	 * psize, lsize     - physical and logical sizes recorded in the hdr
	 * compression_type - compression to record (subject to the rules in
	 *                    arc_hdr_set_compress())
	 * type             - ARC_BUFC_DATA or ARC_BUFC_METADATA (verified)
	 *
	 * Returns the new hdr, which has no bufs and a zero refcount.
	 */
	static arc_buf_hdr_t *
	arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
	    enum zio_compress compression_type, arc_buf_contents_t type)
	{
		arc_buf_hdr_t *hdr;

		VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);

		hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
		ASSERT(HDR_EMPTY(hdr));
		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
		ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
		HDR_SET_PSIZE(hdr, psize);
		HDR_SET_LSIZE(hdr, lsize);
		hdr->b_spa = spa;
		hdr->b_type = type;
		hdr->b_flags = 0;
		arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
		arc_hdr_set_compress(hdr, compression_type);

		hdr->b_l1hdr.b_state = arc_anon;
		hdr->b_l1hdr.b_arc_access = 0;
		hdr->b_l1hdr.b_bufcnt = 0;
		hdr->b_l1hdr.b_buf = NULL;

		/*
		 * Allocate the hdr's buffer. This will contain either
		 * the compressed or uncompressed data depending on the block
		 * it references and compressed arc enablement.
		 */
		arc_hdr_alloc_pabd(hdr);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));

		return (hdr);
	}

	/*
	 * Transition between the two allocation states for the arc_buf_hdr struct.
	 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
	 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
	 * version is used when a cache buffer is only in the L2ARC in order to reduce
	 * memory usage.
	 *
	 * hdr - header to convert; must have an L2 hdr and its hash lock held
	 * old - kmem cache 'hdr' was allocated from
	 * new - kmem cache to allocate the replacement from; exactly one of
	 *       old/new is hdr_full_cache and the other hdr_l2only_cache
	 *
	 * Returns the replacement hdr. The old hdr is freed back to 'old' and
	 * must not be used afterwards.
	 */
	static arc_buf_hdr_t *
	arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
	{
		ASSERT(HDR_HAS_L2HDR(hdr));

		arc_buf_hdr_t *nhdr;
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

		ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
		    (old == hdr_l2only_cache && new == hdr_full_cache));

		nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
		buf_hash_remove(hdr);

		/* Only the first HDR_L2ONLY_SIZE bytes are carried over. */
		bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

		if (new == hdr_full_cache) {
			arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
			/*
			 * arc_access and arc_change_state need to be aware that a
			 * header has just come out of L2ARC, so we set its state to
			 * l2c_only even though it's about to change.
			 */
			nhdr->b_l1hdr.b_state = arc_l2c_only;

			/* Verify previous threads set to NULL before freeing */
			ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
		} else {
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
			ASSERT0(hdr->b_l1hdr.b_bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

			/*
			 * If we've reached here, we must have been called from
			 * arc_evict_hdr(), as such we should have already been
			 * removed from any ghost list we were previously on
			 * (which protects us from racing with arc_evict_state),
			 * thus no locking is needed during this check.
			 */
			ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

			/*
			 * A buffer must not be moved into the arc_l2c_only
			 * state if it's not finished being written out to the
			 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
			 * might try to be accessed, even though it was removed.
			 */
			VERIFY(!HDR_L2_WRITING(hdr));
			VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);

	#ifdef ZFS_DEBUG
			if (hdr->b_l1hdr.b_thawed != NULL) {
				kmem_free(hdr->b_l1hdr.b_thawed, 1);
				hdr->b_l1hdr.b_thawed = NULL;
			}
	#endif

			arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		}
		/*
		 * The header has been reallocated so we need to re-insert it into any
		 * lists it was on.
		 */
		(void) buf_hash_insert(nhdr, NULL);

		ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

		mutex_enter(&dev->l2ad_mtx);

		/*
		 * We must place the realloc'ed header back into the list at
		 * the same spot. Otherwise, if it's placed earlier in the list,
		 * l2arc_write_buffers() could find it during the function's
		 * write phase, and try to write it out to the l2arc.
		 */
		list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
		list_remove(&dev->l2ad_buflist, hdr);

		mutex_exit(&dev->l2ad_mtx);

		/*
		 * Since we're using the pointer address as the tag when
		 * incrementing and decrementing the l2ad_alloc refcount, we
		 * must remove the old pointer (that we're about to destroy) and
		 * add the new pointer to the refcount. Otherwise we'd remove
		 * the wrong pointer address when calling arc_hdr_destroy() later.
		 */

		(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
		(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);

		buf_discard_identity(hdr);
		kmem_cache_free(old, hdr);

		return (nhdr);
	}

	/*
	 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
	 * The buf is returned thawed since we expect the consumer to modify it.
	 *
	 * The header is allocated uncompressed (ZIO_COMPRESS_OFF) with psize and
	 * lsize both equal to size, keyed by spa's load guid; tag is handed to
	 * arc_buf_alloc_impl().  Failure of arc_buf_alloc_impl() is fatal (VERIFY0).
	 */
	arc_buf_t *
	arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
	{
		arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
		    ZIO_COMPRESS_OFF, type);
		ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

		arc_buf_t *buf = NULL;
		VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
		arc_buf_thaw(buf);

		return (buf);
	}

	/*
	 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
	 * for bufs containing metadata.
	 *
	 * psize and lsize are passed to arc_hdr_alloc() as the header's physical
	 * and logical sizes; lsize must be non-zero and at least psize, and
	 * compression_type must lie strictly between ZIO_COMPRESS_OFF and
	 * ZIO_COMPRESS_FUNCTIONS (all asserted).  The header is always allocated
	 * as ARC_BUFC_DATA.  Returns the new, thawed buf.
	 */
	arc_buf_t *
	arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
	    enum zio_compress compression_type)
	{
		ASSERT3U(lsize, >, 0);
		ASSERT3U(lsize, >=, psize);
		ASSERT(compression_type > ZIO_COMPRESS_OFF);
		ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);

		arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
		    compression_type, ARC_BUFC_DATA);
		ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));

		arc_buf_t *buf = NULL;
		VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
		arc_buf_thaw(buf);
		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

		if (!arc_buf_is_shared(buf)) {
			/*
			 * To ensure that the hdr has the correct data in it if we call
			 * arc_decompress() on this buf before it's been written to
			 * disk, it's easiest if we just set up sharing between the
			 * buf and the hdr.
			 */
			ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
			arc_hdr_free_pabd(hdr);
			arc_share_buf(hdr, buf);
		}

		return (buf);
	}

	/*
	 * Detach hdr from its L2ARC device: unlink it from the device's buffer
	 * list, back its sizes out of the L2 statistics and the vdev space
	 * accounting, drop its l2ad_alloc reference and clear ARC_FLAG_HAS_L2HDR.
	 * The device's l2ad_mtx must be held by the caller.
	 */
	static void
	arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
	{
		l2arc_dev_t *l2dev = hdr->b_l2hdr.b_dev;
		uint64_t hdr_psize = arc_hdr_size(hdr);

		ASSERT(MUTEX_HELD(&l2dev->l2ad_mtx));
		ASSERT(HDR_HAS_L2HDR(hdr));

		/* Unlink from the device's list of cached headers. */
		list_remove(&l2dev->l2ad_buflist, hdr);

		/* Undo the statistics and space accounting for this header. */
		ARCSTAT_INCR(arcstat_l2_psize, -hdr_psize);
		ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
		vdev_space_update(l2dev->l2ad_vdev, -hdr_psize, 0, 0);

		/* The header's address is the tag used for the l2ad_alloc hold. */
		(void) refcount_remove_many(&l2dev->l2ad_alloc, hdr_psize, hdr);

		arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
	}

	/*
	 * Tear down and free hdr.  The header must not be in the hash table and
	 * must have no I/O in progress; if it has an L1 portion it must be in the
	 * arc_anon state with a zero reference count (all asserted).  Any L2
	 * portion, checksum, attached bufs and b_pabd are released before the
	 * header is returned to the kmem cache matching its layout.
	 */
	static void
	arc_hdr_destroy(arc_buf_hdr_t *hdr)
	{
		if (HDR_HAS_L1HDR(hdr)) {
			ASSERT(hdr->b_l1hdr.b_buf == NULL ||
			    hdr->b_l1hdr.b_bufcnt > 0);
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		}
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT(!HDR_IN_HASH_TABLE(hdr));

		if (!HDR_EMPTY(hdr))
			buf_discard_identity(hdr);

		if (HDR_HAS_L2HDR(hdr)) {
			l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
			/* Remember whether the caller already owns l2ad_mtx. */
			boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

			if (!buflist_held)
				mutex_enter(&dev->l2ad_mtx);

			/*
			 * Even though we checked this conditional above, we
			 * need to check this again now that we have the
			 * l2ad_mtx. This is because we could be racing with
			 * another thread calling l2arc_evict() which might have
			 * destroyed this header's L2 portion as we were waiting
			 * to acquire the l2ad_mtx. If that happens, we don't
			 * want to re-destroy the header's L2 portion.
			 */
			if (HDR_HAS_L2HDR(hdr)) {
				l2arc_trim(hdr);
				arc_hdr_l2hdr_destroy(hdr);
			}

			if (!buflist_held)
				mutex_exit(&dev->l2ad_mtx);
		}

		if (HDR_HAS_L1HDR(hdr)) {
			arc_cksum_free(hdr);

			/* Destroy every buf still attached to the header. */
			while (hdr->b_l1hdr.b_buf != NULL)
				arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);

	#ifdef ZFS_DEBUG
			if (hdr->b_l1hdr.b_thawed != NULL) {
				kmem_free(hdr->b_l1hdr.b_thawed, 1);
				hdr->b_l1hdr.b_thawed = NULL;
			}
	#endif

			if (hdr->b_l1hdr.b_pabd != NULL) {
				arc_hdr_free_pabd(hdr);
			}
		}

		ASSERT3P(hdr->b_hash_next, ==, NULL);
		if (HDR_HAS_L1HDR(hdr)) {
			ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
			ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
			kmem_cache_free(hdr_full_cache, hdr);
		} else {
			kmem_cache_free(hdr_l2only_cache, hdr);
		}
	}

	/*
	 * Drop the reference identified by tag and destroy buf.
	 *
	 * If buf's header is in the arc_anon state, the header must have exactly
	 * one buf and no I/O in progress; the reference is removed without a hash
	 * lock, it must have been the last one (VERIFY0), and the whole header is
	 * destroyed.  Otherwise the header's hash lock is taken, the reference is
	 * removed and only buf itself is destroyed.
	 */
	void
	arc_buf_destroy(arc_buf_t *buf, void *tag)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		if (hdr->b_l1hdr.b_state == arc_anon) {
			ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			VERIFY0(remove_reference(hdr, NULL, tag));
			arc_hdr_destroy(hdr);
			return;
		}

		mutex_enter(hash_lock);
		/* Re-validate what was read before the lock was acquired. */
		ASSERT3P(hdr, ==, buf->b_hdr);
		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
		ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
		ASSERT3P(buf->b_data, !=, NULL);

		(void) remove_reference(hdr, hash_lock, tag);
		arc_buf_destroy_impl(buf);
		mutex_exit(hash_lock);
	}

	/*
	 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
	 * state of the header is dependent on its state prior to entering this
	 * function. The following transitions are possible:
	 *
	 *    - arc_mru -> arc_mru_ghost
	 *    - arc_mfu -> arc_mfu_ghost
	 *    - arc_mru_ghost -> arc_l2c_only
	 *    - arc_mru_ghost -> deleted
	 *    - arc_mfu_ghost -> arc_l2c_only
	 *    - arc_mfu_ghost -> deleted
	 *
	 * The caller must hold hash_lock, and hdr must have an L1 header.
	 * Returns the number of bytes evicted; zero means the header was skipped
	 * (it is still being written to the L2ARC, has I/O in progress, or is a
	 * prefetch/indirect header younger than arc_min_prefetch_lifespan).
	 * When a ghost header is evicted, hdr is either destroyed or reallocated,
	 * so the caller must not touch it after this function returns.
	 */
	static int64_t
	arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
	{
		arc_state_t *evicted_state, *state;
		int64_t bytes_evicted = 0;

		ASSERT(MUTEX_HELD(hash_lock));
		ASSERT(HDR_HAS_L1HDR(hdr));

		state = hdr->b_l1hdr.b_state;
		if (GHOST_STATE(state)) {
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

			/*
			 * l2arc_write_buffers() relies on a header's L1 portion
			 * (i.e. its b_pabd field) during its write phase.
			 * Thus, we cannot push a header onto the arc_l2c_only
			 * state (removing its L1 piece) until the header is
			 * done being written to the l2arc.
			 */
			if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
				ARCSTAT_BUMP(arcstat_evict_l2_skip);
				return (bytes_evicted);
			}

			ARCSTAT_BUMP(arcstat_deleted);
			bytes_evicted += HDR_GET_LSIZE(hdr);

			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			if (HDR_HAS_L2HDR(hdr)) {
				/*
				 * This buffer is cached on the 2nd Level ARC;
				 * don't destroy the header.
				 */
				arc_change_state(arc_l2c_only, hdr, hash_lock);
				/*
				 * dropping from L1+L2 cached to L2-only,
				 * realloc to remove the L1 header.
				 */
				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
				    hdr_l2only_cache);
			} else {
				arc_change_state(arc_anon, hdr, hash_lock);
				arc_hdr_destroy(hdr);
			}
			return (bytes_evicted);
		}

		ASSERT(state == arc_mru || state == arc_mfu);
		evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(hdr) ||
		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
		    arc_min_prefetch_lifespan)) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			return (bytes_evicted);
		}

		ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
		/*
		 * Destroy the header's bufs one at a time; stop at the first buf
		 * whose b_evict_lock cannot be taken without blocking.
		 */
		while (hdr->b_l1hdr.b_buf) {
			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
			if (!mutex_tryenter(&buf->b_evict_lock)) {
				ARCSTAT_BUMP(arcstat_mutex_miss);
				break;
			}
			if (buf->b_data != NULL)
				bytes_evicted += HDR_GET_LSIZE(hdr);
			mutex_exit(&buf->b_evict_lock);
			arc_buf_destroy_impl(buf);
		}

		if (HDR_HAS_L2HDR(hdr)) {
			ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
		} else {
			if (l2arc_write_eligible(hdr->b_spa, hdr)) {
				ARCSTAT_INCR(arcstat_evict_l2_eligible,
				    HDR_GET_LSIZE(hdr));
			} else {
				ARCSTAT_INCR(arcstat_evict_l2_ineligible,
				    HDR_GET_LSIZE(hdr));
			}
		}

		/* Only move to the ghost state once every buf is gone. */
		if (hdr->b_l1hdr.b_bufcnt == 0) {
			arc_cksum_free(hdr);

			bytes_evicted += arc_hdr_size(hdr);

			/*
			 * If this hdr is being evicted and has a compressed
			 * buffer then we discard it here before we change states.
			 * This ensures that the accounting is updated correctly
			 * in arc_free_data_impl().
			 */
			arc_hdr_free_pabd(hdr);

			arc_change_state(evicted_state, hdr, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(hdr));
			arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
		}

		return (bytes_evicted);
	}

	/*
	 * Evict headers from sublist idx of ml, walking backwards from marker
	 * with multilist_sublist_prev().  The walk stops when the sublist is
	 * exhausted, when at least 'bytes' bytes have been evicted (unless bytes
	 * is ARC_EVICT_ALL), or when zfs_arc_evict_batch_limit headers have been
	 * evicted.  If spa is non-zero, only headers whose b_spa matches are
	 * considered.  Hash locks are only ever acquired with mutex_tryenter();
	 * headers whose lock is busy are skipped.
	 *
	 * Returns the number of bytes evicted.
	 */
	static uint64_t
	arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
	    uint64_t spa, int64_t bytes)
	{
		multilist_sublist_t *mls;
		uint64_t bytes_evicted = 0;
		arc_buf_hdr_t *hdr;
		kmutex_t *hash_lock;
		int evict_count = 0;

		ASSERT3P(marker, !=, NULL);
		IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

		mls = multilist_sublist_lock(ml, idx);

		for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
		    hdr = multilist_sublist_prev(mls, marker)) {
			if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
			    (evict_count >= zfs_arc_evict_batch_limit))
				break;

			/*
			 * To keep our iteration location, move the marker
			 * forward. Since we're not holding hdr's hash lock, we
			 * must be very careful and not remove 'hdr' from the
			 * sublist. Otherwise, other consumers might mistake the
			 * 'hdr' as not being on a sublist when they call the
			 * multilist_link_active() function (they all rely on
			 * the hash lock protecting concurrent insertions and
			 * removals). multilist_sublist_move_forward() was
			 * specifically implemented to ensure this is the case
			 * (only 'marker' will be removed and re-inserted).
			 */
			multilist_sublist_move_forward(mls, marker);

			/*
			 * The only case where the b_spa field should ever be
			 * zero, is the marker headers inserted by
			 * arc_evict_state(). It's possible for multiple threads
			 * to be calling arc_evict_state() concurrently (e.g.
			 * dsl_pool_close() and zio_inject_fault()), so we must
			 * skip any markers we see from these other threads.
			 */
			if (hdr->b_spa == 0)
				continue;

			/* we're only interested in evicting buffers of a certain spa */
			if (spa != 0 && hdr->b_spa != spa) {
				ARCSTAT_BUMP(arcstat_evict_skip);
				continue;
			}

			hash_lock = HDR_LOCK(hdr);

			/*
			 * We aren't calling this function from any code path
			 * that would already be holding a hash lock, so we're
			 * asserting on this assumption to be defensive in case
			 * this ever changes. Without this check, it would be
			 * possible to incorrectly increment arcstat_mutex_miss
			 * below (e.g. if the code changed such that we called
			 * this function with a hash lock held).
			 */
			ASSERT(!MUTEX_HELD(hash_lock));

			if (mutex_tryenter(hash_lock)) {
				uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
				mutex_exit(hash_lock);

				bytes_evicted += evicted;

				/*
				 * If evicted is zero, arc_evict_hdr() must have
				 * decided to skip this header, don't increment
				 * evict_count in this case.
				 */
				if (evicted != 0)
					evict_count++;

				/*
				 * If arc_size isn't overflowing, signal any
				 * threads that might happen to be waiting.
				 *
				 * For each header evicted, we wake up a single
				 * thread. If we used cv_broadcast, we could
				 * wake up "too many" threads causing arc_size
				 * to significantly overflow arc_c; since
				 * arc_get_data_impl() doesn't check for overflow
				 * when it's woken up (it doesn't because it's
				 * possible for the ARC to be overflowing while
				 * full of un-evictable buffers, and the
				 * function should proceed in this case).
				 *
				 * If threads are left sleeping, due to not
				 * using cv_broadcast, they will be woken up
				 * just before arc_reclaim_thread() sleeps.
				 */
				mutex_enter(&arc_reclaim_lock);
				if (!arc_is_overflowing())
					cv_signal(&arc_reclaim_waiters_cv);
				mutex_exit(&arc_reclaim_lock);
			} else {
				ARCSTAT_BUMP(arcstat_mutex_miss);
			}
		}

		multilist_sublist_unlock(mls);

		return (bytes_evicted);
	}

	/*
	 * Evict buffers from the given arc state, until we've removed the
	 * specified number of bytes. Move the removed buffers to the
	 * appropriate evict state.
	 *
	 * This function makes a "best effort". It skips over any buffers
	 * it can't get a hash_lock on, and so, may not catch all candidates.
	 * It may also return without evicting as much space as requested.
	 *
	 * If bytes is specified using the special value ARC_EVICT_ALL, this
	 * will evict all available (i.e. unlocked and evictable) buffers from
	 * the given arc state; which is used by arc_flush().
	 *
	 * A non-zero spa restricts eviction to headers of that spa; type selects
	 * which of the state's lists is scanned.  Returns the number of bytes
	 * evicted.
	 */
	static uint64_t
	arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
	    arc_buf_contents_t type)
	{
		uint64_t total_evicted = 0;
		multilist_t *ml = state->arcs_list[type];
		int num_sublists;
		arc_buf_hdr_t **markers;

		IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

		num_sublists = multilist_get_num_sublists(ml);

		/*
		 * If we've tried to evict from each sublist, made some
		 * progress, but still have not hit the target number of bytes
		 * to evict, we want to keep trying. The markers allow us to
		 * pick up where we left off for each individual sublist, rather
		 * than starting from the tail each time.
		 */
		markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
		for (int i = 0; i < num_sublists; i++) {
			markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

			/*
			 * A b_spa of 0 is used to indicate that this header is
			 * a marker. This fact is used in arc_adjust_type() and
			 * arc_evict_state_impl().
			 */
			markers[i]->b_spa = 0;

			multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
			multilist_sublist_insert_tail(mls, markers[i]);
			multilist_sublist_unlock(mls);
		}

		/*
		 * While we haven't hit our target number of bytes to evict, or
		 * we're evicting all available buffers.
		 */
		while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
			/*
			 * Start eviction using a randomly selected sublist,
			 * this is to try and evenly balance eviction across all
			 * sublists. Always starting at the same sublist
			 * (e.g. index 0) would cause evictions to favor certain
			 * sublists over others.
			 */
			int sublist_idx = multilist_get_random_index(ml);
			uint64_t scan_evicted = 0;

			for (int i = 0; i < num_sublists; i++) {
				uint64_t bytes_remaining;
				uint64_t bytes_evicted;

				if (bytes == ARC_EVICT_ALL)
					bytes_remaining = ARC_EVICT_ALL;
				else if (total_evicted < bytes)
					bytes_remaining = bytes - total_evicted;
				else
					break;

				bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
				    markers[sublist_idx], spa, bytes_remaining);

				scan_evicted += bytes_evicted;
				total_evicted += bytes_evicted;

				/* we've reached the end, wrap to the beginning */
				if (++sublist_idx >= num_sublists)
					sublist_idx = 0;
			}

			/*
			 * If we didn't evict anything during this scan, we have
			 * no reason to believe we'll evict more during another
			 * scan, so break the loop.
			 */
			if (scan_evicted == 0) {
				/* This isn't possible, let's make that obvious */
				ASSERT3S(bytes, !=, 0);

				/*
				 * When bytes is ARC_EVICT_ALL, the only way to
				 * break the loop is when scan_evicted is zero.
				 * In that case, we actually have evicted enough,
				 * so we don't want to increment the kstat.
				 */
				if (bytes != ARC_EVICT_ALL) {
					ASSERT3S(total_evicted, <, bytes);
					ARCSTAT_BUMP(arcstat_evict_not_enough);
				}

				break;
			}
		}

		/* Unlink and release the per-sublist markers. */
		for (int i = 0; i < num_sublists; i++) {
			multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
			multilist_sublist_remove(mls, markers[i]);
			multilist_sublist_unlock(mls);

			kmem_cache_free(hdr_full_cache, markers[i]);
		}
		kmem_free(markers, sizeof (*markers) * num_sublists);

		return (total_evicted);
	}

	/*
	 * Evict every "evictable" buffer of the given type from the given arc
	 * state.  Buffers that are still "active" (i.e. referenced) are left
	 * alone.  Returns the total number of bytes evicted.
	 *
	 * With 'retry' set to B_FALSE a single eviction pass is made; buffers
	 * that were skipped because of lock misses may therefore remain in the
	 * ARC afterwards.
	 *
	 * With 'retry' set to B_TRUE, passes are repeated until the state's
	 * evictable size for this type drops to zero.  If other threads can
	 * still insert into the state (e.g. the ARC isn't shutting down), this
	 * may never terminate.
	 */
	static uint64_t
	arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
	    boolean_t retry)
	{
		uint64_t flushed = 0;

		for (;;) {
			/* Nothing evictable left in this state for this type. */
			if (refcount_count(&state->arcs_esize[type]) == 0)
				break;

			flushed += arc_evict_state(state, spa, ARC_EVICT_ALL, type);

			/* Single-pass mode: one attempt is all the caller wants. */
			if (!retry)
				break;
		}

		return (flushed);
	}

	/*
	 * Evict up to 'bytes' bytes from 'state', limited to buffers of the given
	 * spa and type.  The request is clamped to the amount the state reports
	 * as evictable for that type, so we never ask arc_evict_state() for more
	 * than it could deliver.  A zero or negative 'bytes' evicts nothing here,
	 * whereas arc_evict_state() itself treats a negative value as "evict
	 * everything possible".  Returns the number of bytes evicted.
	 */
	static uint64_t
	arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
	    arc_buf_contents_t type)
	{
		int64_t to_evict;

		/* Nothing requested, or nothing evictable: no work to do. */
		if (bytes <= 0 || refcount_count(&state->arcs_esize[type]) <= 0)
			return (0);

		to_evict = MIN(refcount_count(&state->arcs_esize[type]), bytes);

		return (arc_evict_state(state, spa, to_evict, type));
	}

	/*
	 * Trim metadata buffers so that arc_meta_used is brought back under the
	 * arc_meta_limit tunable.  Returns the number of bytes evicted.
	 */
	static uint64_t
	arc_adjust_meta(void)
	{
		uint64_t evicted;
		int64_t mru_target;
		int64_t mfu_target;

		/*
		 * First pass, MRU: evict enough metadata to get back under the
		 * meta limit, but no more than the amount by which anon + MRU
		 * exceed arc_p.  Whatever is still over the limit after this is
		 * taken from the MFU below.
		 */
		mru_target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
		    (int64_t)(refcount_count(&arc_anon->arcs_size) +
		    refcount_count(&arc_mru->arcs_size) - arc_p));

		evicted = arc_adjust_impl(arc_mru, 0, mru_target, ARC_BUFC_METADATA);

		/*
		 * Second pass, MFU: same idea, but bounded by how far the MFU
		 * is above the space allotted to it (arc_c - arc_p).  The
		 * globals are re-read here since the first pass may have changed
		 * them.
		 */
		mfu_target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
		    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));

		evicted += arc_adjust_impl(arc_mfu, 0, mfu_target, ARC_BUFC_METADATA);

		return (evicted);
	}

	/*
	 * Report which buffer type holds the oldest buffer in the given arc state.
	 *
	 * One random sublist is picked from the ARC_BUFC_DATA multilist and one
	 * from the ARC_BUFC_METADATA multilist.  The last non-marker header of
	 * each is compared by b_arc_access and the type owning the smaller
	 * (older) value is returned.  If only one of the two sublists has a real
	 * header, that type is returned; if neither does, ARC_BUFC_DATA is
	 * returned.
	 */
	static arc_buf_contents_t
	arc_adjust_type(arc_state_t *state)
	{
		multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
		multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
		int data_idx = multilist_get_random_index(data_ml);
		int meta_idx = multilist_get_random_index(meta_ml);
		multilist_sublist_t *data_mls;
		multilist_sublist_t *meta_mls;
		arc_buf_hdr_t *oldest_data;
		arc_buf_hdr_t *oldest_meta;
		arc_buf_contents_t type;

		/*
		 * Both sublist locks stay held until the comparison is done so
		 * that arc_evict_state() cannot destroy the headers under us.
		 */
		data_mls = multilist_sublist_lock(data_ml, data_idx);
		meta_mls = multilist_sublist_lock(meta_ml, meta_idx);

		/*
		 * Walk back from each tail past any markers (b_spa == 0) that
		 * arc_evict_state() may have left there.
		 */
		oldest_data = multilist_sublist_tail(data_mls);
		while (oldest_data != NULL && oldest_data->b_spa == 0)
			oldest_data = multilist_sublist_prev(data_mls, oldest_data);

		oldest_meta = multilist_sublist_tail(meta_mls);
		while (oldest_meta != NULL && oldest_meta->b_spa == 0)
			oldest_meta = multilist_sublist_prev(meta_mls, oldest_meta);

		if (oldest_meta == NULL) {
			/* No metadata candidate (data may or may not exist). */
			type = ARC_BUFC_DATA;
		} else if (oldest_data == NULL) {
			/* Only a metadata candidate exists. */
			type = ARC_BUFC_METADATA;
		} else {
			/* The headers can't be on the sublist without an L1 header */
			ASSERT(HDR_HAS_L1HDR(oldest_data));
			ASSERT(HDR_HAS_L1HDR(oldest_meta));

			type = (oldest_data->b_l1hdr.b_arc_access <
			    oldest_meta->b_l1hdr.b_arc_access) ?
			    ARC_BUFC_DATA : ARC_BUFC_METADATA;
		}

		multilist_sublist_unlock(meta_mls);
		multilist_sublist_unlock(data_mls);

		return (type);
	}

	/*
	* Evict buffers from the cache, such that arc_size is capped by arc_c.
	*
	* Metadata is trimmed first via arc_adjust_meta(), then the MRU and MFU
	* lists, and finally the two ghost lists. Returns the sum of the byte
	* counts reported by every eviction call made along the way.
	*/
	static uint64_t
	arc_adjust(void)
	{
	uint64_t total_evicted = 0;
	uint64_t bytes;
	int64_t target;

	/*
	* If we're over arc_meta_limit, we want to correct that before
	* potentially evicting data buffers below.
	*/
	total_evicted += arc_adjust_meta();

	/*
	* Adjust MRU size
	*
	* If we're over the target cache size, we want to evict enough
	* from the list to get back to our target size. We don't want
	* to evict too much from the MRU, such that it drops below
	* arc_p. So, if we're over our target cache size more than
	* the MRU is over arc_p, we'll evict enough to get back to
	* arc_p here, and then evict more from the MFU below.
	*/
	target = MIN((int64_t)(arc_size - arc_c),
	(int64_t)(refcount_count(&arc_anon->arcs_size) +
	refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));

	/*
	* If we're below arc_meta_min, always prefer to evict data.
	* Otherwise, try to satisfy the requested number of bytes to
	* evict from the type which contains older buffers; in an
	* effort to keep newer buffers in the cache regardless of their
	* type. If we cannot satisfy the number of bytes from this
	* type, spill over into the next type.
	*/
	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
	arc_meta_used > arc_meta_min) {
	bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
	total_evicted += bytes;

	/*
	* If we couldn't evict our target number of bytes from
	* metadata, we try to get the rest from data.
	*/
	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
	} else {
	bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	/*
	* If we couldn't evict our target number of bytes from
	* data, we try to get the rest from metadata.
	*/
	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
	}

	/*
	* Adjust MFU size
	*
	* Now that we've tried to evict enough from the MRU to get its
	* size back to arc_p, if we're still above the target cache
	* size, we evict the rest from the MFU.
	*/
	target = arc_size - arc_c;

	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
	arc_meta_used > arc_meta_min) {
	bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
	total_evicted += bytes;

	/*
	* If we couldn't evict our target number of bytes from
	* metadata, we try to get the rest from data.
	*/
	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
	} else {
	bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	/*
	* If we couldn't evict our target number of bytes from
	* data, we try to get the rest from metadata.
	*/
	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
	}

	/*
	* Adjust ghost lists
	*
	* In addition to the above, the ARC also defines target values
	* for the ghost lists. The sum of the mru list and mru ghost
	* list should never exceed the target size of the cache, and
	* the sum of the mru list, mfu list, mru ghost list, and mfu
	* ghost list should never exceed twice the target size of the
	* cache. The following logic enforces these limits on the ghost
	* caches, and evicts from them as needed.
	*/
	target = refcount_count(&arc_mru->arcs_size) +
	refcount_count(&arc_mru_ghost->arcs_size) - arc_c;

	/* Ghost lists: data is tried first, metadata takes the remainder. */
	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);

	/*
	* We assume the sum of the mru list and mfu list is less than
	* or equal to arc_c (we enforced this above), which means we
	* can use the simpler of the two equations below:
	*
	* mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
	* mru ghost + mfu ghost <= arc_c
	*/
	target = refcount_count(&arc_mru_ghost->arcs_size) +
	refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);

	return (total_evicted);
	}

	/*
	 * Flush evictable data and metadata from the MRU, MFU and both ghost
	 * states.  If spa is non-NULL only buffers belonging to that spa (by
	 * load guid) are flushed; a NULL spa flushes buffers of every spa.
	 * 'retry' is passed through to arc_flush_state() and may only be B_TRUE
	 * when spa is NULL.
	 */
	void
	arc_flush(spa_t *spa, boolean_t retry)
	{
		uint64_t guid = 0;

		/*
		 * If retry is B_TRUE, a spa must not be specified since we have
		 * no good way to determine if all of a spa's buffers have been
		 * evicted from an arc state.
		 */
		ASSERT(!retry || spa == NULL);

		if (spa != NULL)
			guid = spa_load_guid(spa);

		(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
		(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);

		(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
		(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);

		(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
		(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);

		(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
		(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
	}

	/*
	* Lower the ARC target size (arc_c) by to_free bytes, never going below
	* arc_c_min, and scale arc_p down by arc_p >> arc_shrink_shift. If
	* arc_size ends up above arc_c afterwards, arc_adjust() is called to
	* evict the excess.
	*/
	void
	arc_shrink(int64_t to_free)
	{
	if (arc_c > arc_c_min) {
	DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
	arc_c_min, uint64_t, arc_p, uint64_t, to_free);
	/* Subtract to_free only if the result stays above arc_c_min. */
	if (arc_c > arc_c_min + to_free)
	atomic_add_64(&arc_c, -to_free);
	else
	arc_c = arc_c_min;

	atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
	/* Don't leave the target above what is actually cached. */
	if (arc_c > arc_size)
	arc_c = MAX(arc_size, arc_c_min);
	/* Keep arc_p within arc_c; reset it to half of arc_c if not. */
	if (arc_p > arc_c)
	arc_p = (arc_c >> 1);

	DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
	arc_p);

	ASSERT(arc_c >= arc_c_min);
	ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c) {
	DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
	uint64_t, arc_c);
	(void) arc_adjust();
	}
	}

	/*
	* Identifies which check in arc_available_memory() produced the lowest
	* (most constraining) amount of available memory.
	*/
	typedef enum free_memory_reason_t {
	FMR_UNKNOWN,		/* initial value; no check lowered the result */
	FMR_NEEDFREE,		/* needfree was positive */
	FMR_LOTSFREE,		/* freemem relative to the pageout thresholds */
	FMR_SWAPFS_MINFREE,	/* availrmem relative to the swapfs reserves */
	FMR_PAGES_PP_MAXIMUM,	/* availrmem relative to pages_pp_maximum */
	FMR_HEAP_ARENA,		/* free space in heap_arena */
	FMR_ZIO_ARENA,		/* free space in zio_arena */
	FMR_ZIO_FRAG,		/* largest free heap_arena span too small */
	} free_memory_reason_t;

	/* Result and reason recorded by the most recent arc_available_memory(). */
	int64_t last_free_memory;
	free_memory_reason_t last_free_reason;

	/*
	* Additional reserve of pages for pp_reserve.
	*/
	int64_t arc_pages_pp_reserve = 64;

	/*
	* Additional reserve of pages for swapfs.
	*/
	int64_t arc_swapfs_reserve = 64;

	/*
	 * Return the amount of memory that can be consumed before reclaim will be
	 * needed.  Positive if there is sufficient free memory, negative indicates
	 * the amount of memory that needs to be freed up.
	 *
	 * The result is the minimum over a series of platform-specific checks.
	 * It is also stored in last_free_memory, and the check that produced it
	 * is stored in last_free_reason.
	 */
	static int64_t
	arc_available_memory(void)
	{
		int64_t lowest = INT64_MAX;
		int64_t n;
		free_memory_reason_t r = FMR_UNKNOWN;

	#ifdef _KERNEL
	#ifdef __FreeBSD__
		/*
		 * Cooperate with pagedaemon when it's time for it to scan
		 * and reclaim some pages.
		 */
		n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
		if (n < lowest) {
			lowest = n;
			r = FMR_LOTSFREE;
		}

	#else
		if (needfree > 0) {
			n = PAGESIZE * (-needfree);
			if (n < lowest) {
				lowest = n;
				r = FMR_NEEDFREE;
			}
		}

		/*
		 * check that we're out of range of the pageout scanner.  It starts to
		 * schedule paging if freemem is less than lotsfree and needfree.
		 * lotsfree is the high-water mark for pageout, and needfree is the
		 * number of needed free pages.  We add extra pages here to make sure
		 * the scanner doesn't start up while we're freeing memory.
		 */
		n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_LOTSFREE;
		}

		/*
		 * check to make sure that swapfs has enough space so that anon
		 * reservations can still succeed. anon_resvmem() checks that the
		 * availrmem is greater than swapfs_minfree, and the number of reserved
		 * swap pages.  We also add a bit of extra here just to prevent
		 * circumstances from getting really dire.
		 */
		n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
		    desfree - arc_swapfs_reserve);
		if (n < lowest) {
			lowest = n;
			r = FMR_SWAPFS_MINFREE;
		}


		/*
		 * Check that we have enough availrmem that memory locking (e.g., via
		 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
		 * stores the number of pages that cannot be locked; when availrmem
		 * drops below pages_pp_maximum, page locking mechanisms such as
		 * page_pp_lock() will fail.)
		 */
		n = PAGESIZE * (availrmem - pages_pp_maximum -
		    arc_pages_pp_reserve);
		if (n < lowest) {
			lowest = n;
			r = FMR_PAGES_PP_MAXIMUM;
		}

	#endif	/* __FreeBSD__ */
	#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
		/*
		 * If we're on an i386 platform, it's possible that we'll exhaust the
		 * kernel heap space before we ever run out of available physical
		 * memory.  Most checks of the size of the heap_area compare against
		 * tune.t_minarmem, which is the minimum available real memory that we
		 * can have in the system.  However, this is generally fixed at 25 pages
		 * which is so low that it's useless.  In this comparison, we seek to
		 * calculate the total heap-size, and reclaim if more than 3/4ths of the
		 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
		 * free)
		 */
		n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
		if (n < lowest) {
			lowest = n;
			r = FMR_HEAP_ARENA;
		}
	#define	zio_arena	NULL
	#else
	#define	zio_arena	heap_arena
	#endif

		/*
		 * If zio data pages are being allocated out of a separate heap segment,
		 * then enforce that the size of available vmem for this arena remains
		 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
		 *
		 * Note that reducing the arc_zio_arena_free_shift keeps more virtual
		 * memory (in the zio_arena) free, which can avoid memory
		 * fragmentation issues.
		 */
		if (zio_arena != NULL) {
			n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
			    (vmem_size(zio_arena, VMEM_ALLOC) >>
			    arc_zio_arena_free_shift);
			if (n < lowest) {
				lowest = n;
				r = FMR_ZIO_ARENA;
			}
		}

		/*
		 * Above limits know nothing about real level of KVA fragmentation.
		 * Start aggressive reclamation if too little sequential KVA left.
		 */
		if (lowest > 0) {
			n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ?
			    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
			    INT64_MAX;
			if (n < lowest) {
				lowest = n;
				r = FMR_ZIO_FRAG;
			}
		}

	#else	/* _KERNEL */
		/* On average one call in 100 reports a small shortfall. */
		if (spa_get_random(100) == 0)
			lowest = -1024;
	#endif	/* _KERNEL */

		/* Record the outcome for later inspection. */
		last_free_memory = lowest;
		last_free_reason = r;
		DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
		return (lowest);
	}


	/*
	* Determine if the system is under memory pressure and is asking
	* to reclaim memory. A return value of B_TRUE indicates that the system
	* is under memory pressure and that the arc should adjust accordingly.
	*/
	static boolean_t
	arc_reclaim_needed(void)
	{
	return (arc_available_memory() < 0);
	}

	extern kmem_cache_t *zio_buf_cache[];
	extern kmem_cache_t *zio_data_buf_cache[];
	extern kmem_cache_t *range_seg_cache;
	extern kmem_cache_t *abd_chunk_cache;

	static __noinline void
	arc_kmem_reap_now(void)
	{
	size_t i;
	kmem_cache_t *prev_cache = NULL;
	kmem_cache_t *prev_data_cache = NULL;

	DTRACE_PROBE(arc__kmem_reap_start);
	#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
	/*
	* We are exceeding our meta-data cache limit.
	* Purge some DNLC entries to release holds on meta-data.
	*/
	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
	#if defined(__i386)
	/*
	* Reclaim unused memory from all kmem caches.
	*/
	kmem_reap();
	#endif
	#endif

	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
	if (zio_buf_cache[i] != prev_cache) {
	prev_cache = zio_buf_cache[i];
	kmem_cache_reap_now(zio_buf_cache[i]);
	}
	if (zio_data_buf_cache[i] != prev_data_cache) {
	prev_data_cache = zio_data_buf_cache[i];
	kmem_cache_reap_now(zio_data_buf_cache[i]);
	}
	}
	kmem_cache_reap_now(abd_chunk_cache);
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_full_cache);
	kmem_cache_reap_now(hdr_l2only_cache);
	kmem_cache_reap_now(range_seg_cache);

	#ifdef illumos
	if (zio_arena != NULL) {
	/*
	* Ask the vmem arena to reclaim unused memory from its
	* quantum caches.
	*/
	vmem_qcache_reap(zio_arena);
	}
	#endif
	DTRACE_PROBE(arc__kmem_reap_end);
	}

	/*
	 * Main loop of the ARC reclaim thread.  Until arc_reclaim_thread_exit is
	 * set, each pass evicts via arc_adjust(), reaps and shrinks the ARC when
	 * arc_available_memory() is negative, updates arc_no_grow, wakes waiters
	 * and then sleeps for up to one second.
	 *
	 * Threads can block in arc_get_data_impl() waiting for this thread to evict
	 * enough data and signal them to proceed. When this happens, the threads in
	 * arc_get_data_impl() are sleeping while holding the hash lock for their
	 * particular arc header. Thus, we must be careful to never sleep on a
	 * hash lock in this thread. This is to prevent the following deadlock:
	 *
	 *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
	 *    waiting for the reclaim thread to signal it.
	 *
	 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
	 *    fails, and goes to sleep forever.
	 *
	 * This possible deadlock is avoided by always acquiring a hash lock
	 * using mutex_tryenter() from arc_reclaim_thread().
	 */
	/* ARGSUSED */
	static void
	arc_reclaim_thread(void *unused __unused)
	{
		/* Earliest time at which arc_no_grow may be cleared again. */
		hrtime_t growtime = 0;
		callb_cpr_t cpr;

		CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

		mutex_enter(&arc_reclaim_lock);
		while (!arc_reclaim_thread_exit) {
			uint64_t evicted = 0;

			/*
			 * This is necessary in order for the mdb ::arc dcmd to
			 * show up to date information. Since the ::arc command
			 * does not call the kstat's update function, without
			 * this call, the command may show stale stats for the
			 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
			 * with this change, the data might be up to 1 second
			 * out of date; but that should suffice. The arc_state_t
			 * structures can be queried directly if more accurate
			 * information is needed.
			 */
			if (arc_ksp != NULL)
				arc_ksp->ks_update(arc_ksp, KSTAT_READ);

			/* The heavy lifting below runs without the reclaim lock. */
			mutex_exit(&arc_reclaim_lock);

			/*
			 * We call arc_adjust() before (possibly) calling
			 * arc_kmem_reap_now(), so that we can wake up
			 * arc_get_data_impl() sooner.
			 */
			evicted = arc_adjust();

			int64_t free_memory = arc_available_memory();
			if (free_memory < 0) {

				arc_no_grow = B_TRUE;
				arc_warm = B_TRUE;

				/*
				 * Wait at least arc_grow_retry (default 60)
				 * seconds before considering growing.
				 */
				growtime = gethrtime() + SEC2NSEC(arc_grow_retry);

				arc_kmem_reap_now();

				/*
				 * If we are still low on memory, shrink the ARC
				 * by enough to leave (arc_c >> arc_shrink_shift)
				 * bytes free.
				 */
				free_memory = arc_available_memory();

				int64_t to_free =
				    (arc_c >> arc_shrink_shift) - free_memory;
				if (to_free > 0) {
	#ifdef _KERNEL
	#ifdef illumos
					to_free = MAX(to_free, ptob(needfree));
	#endif
	#endif
					arc_shrink(to_free);
				}
			} else if (free_memory < arc_c >> arc_no_grow_shift) {
				/* Not in deficit, but too tight to grow. */
				arc_no_grow = B_TRUE;
			} else if (gethrtime() >= growtime) {
				/* Plenty free and the retry interval has elapsed. */
				arc_no_grow = B_FALSE;
			}

			mutex_enter(&arc_reclaim_lock);

			/*
			 * If evicted is zero, we couldn't evict anything via
			 * arc_adjust(). This could be due to hash lock
			 * collisions, but more likely due to the majority of
			 * arc buffers being unevictable. Therefore, even if
			 * arc_size is above arc_c, another pass is unlikely to
			 * be helpful and could potentially cause us to enter an
			 * infinite loop.
			 */
			if (arc_size <= arc_c || evicted == 0) {
				/*
				 * We're either no longer overflowing, or we
				 * can't evict anything more, so we should wake
				 * up any threads before we go to sleep.
				 */
				cv_broadcast(&arc_reclaim_waiters_cv);

				/*
				 * Block until signaled, or after one second (we
				 * might need to perform arc_kmem_reap_now()
				 * even if we aren't being signalled)
				 */
				CALLB_CPR_SAFE_BEGIN(&cpr);
				(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
				    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
				CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
			}
		}

		/*
		 * Acknowledge the exit request: clear the flag and wake whoever
		 * is waiting on arc_reclaim_thread_cv.
		 */
		arc_reclaim_thread_exit = B_FALSE;
		cv_broadcast(&arc_reclaim_thread_cv);
		CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
		thread_exit();
	}

	/*
	 * Pending request for arc_dnlc_evicts_thread(): the percentage passed to
	 * dnlc_reduce_cache(), or 0 when no request is outstanding.  Protected by
	 * arc_dnlc_evicts_lock.
	 */
	static u_int arc_dnlc_evicts_arg;
	extern struct vfsops zfs_vfsops;

	/*
	 * Worker thread that services requests posted by dnlc_reduce_cache().
	 *
	 * For each request it calls vnlru_free() (kernel only) for
	 * "desiredvnodes * percent / 100" vnodes belonging to zfs_vfsops, with
	 * arc_dnlc_evicts_lock dropped for the duration of that call.  The loop
	 * ends once arc_dnlc_evicts_thread_exit is set; the thread then clears
	 * the flag and broadcasts on arc_dnlc_evicts_cv before exiting.
	 */
	static void
	arc_dnlc_evicts_thread(void *dummy __unused)
	{
		callb_cpr_t cpr;
		u_int percent;

		CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);

		mutex_enter(&arc_dnlc_evicts_lock);
		while (!arc_dnlc_evicts_thread_exit) {
			/*
			 * Only sleep when there is nothing to do.  Waiting
			 * unconditionally would lose a request posted before
			 * we first reach cv_wait(): its broadcast has no
			 * waiter, and because dnlc_reduce_cache() refuses new
			 * requests while the token is non-zero, nothing would
			 * ever wake us for it again.
			 */
			if (arc_dnlc_evicts_arg == 0) {
				CALLB_CPR_SAFE_BEGIN(&cpr);
				(void) cv_wait(&arc_dnlc_evicts_cv,
				    &arc_dnlc_evicts_lock);
				CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
				/* Re-check both the exit flag and the token. */
				continue;
			}

			percent = arc_dnlc_evicts_arg;
			mutex_exit(&arc_dnlc_evicts_lock);
	#ifdef _KERNEL
			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
	#endif
			mutex_enter(&arc_dnlc_evicts_lock);
			/*
			 * Clear our token only after vnlru_free()
			 * pass is done, to avoid false queueing of
			 * the requests.
			 */
			arc_dnlc_evicts_arg = 0;
		}
		/* Acknowledge the exit request to whoever is waiting on the cv. */
		arc_dnlc_evicts_thread_exit = B_FALSE;
		cv_broadcast(&arc_dnlc_evicts_cv);
		CALLB_CPR_EXIT(&cpr);
		thread_exit();
	}

	void
	dnlc_reduce_cache(void *arg)
	{
	u_int percent;

	percent = (u_int)(uintptr_t)arg;
	mutex_enter(&arc_dnlc_evicts_lock);
	if (arc_dnlc_evicts_arg == 0) {
	arc_dnlc_evicts_arg = percent;
	cv_broadcast(&arc_dnlc_evicts_cv);
	}
	mutex_exit(&arc_dnlc_evicts_lock);
	}

	/*
	 * Adapt the ARC targets (arc_p and, possibly, arc_c) given the number of
	 * bytes we are trying to add and the state that we are coming from.
	 * This function is only called when we are adding new content to the
	 * cache.
	 *
	 * bytes - size of the content being added; asserted to be positive.
	 * state - ARC state of the header the content belongs to.  Nothing is
	 *         done for arc_l2c_only.
	 */
	static void
	arc_adapt(int bytes, arc_state_t *state)
	{
		int scale;
		uint64_t p_floor = (arc_c >> arc_p_min_shift);
		int64_t mru_ghost_bytes = refcount_count(&arc_mru_ghost->arcs_size);
		int64_t mfu_ghost_bytes = refcount_count(&arc_mfu_ghost->arcs_size);

		if (state == arc_l2c_only)
			return;

		ASSERT(bytes > 0);

		/*
		 * Move the MRU target size (arc_p):
		 *  - a hit in the MRU ghost list grows it;
		 *  - a hit in the MFU ghost list shrinks it, which grows the
		 *    share left to the MFU list.
		 * The step is scaled by the ratio of the ghost list sizes,
		 * capped at 10 to avoid wild arc_p adjustment.
		 */
		if (state == arc_mru_ghost) {
			if (mru_ghost_bytes >= mfu_ghost_bytes)
				scale = 1;
			else
				scale = mfu_ghost_bytes / mru_ghost_bytes;
			scale = MIN(scale, 10);

			arc_p = MIN(arc_c - p_floor, arc_p + bytes * scale);
		} else if (state == arc_mfu_ghost) {
			uint64_t shrink;

			if (mfu_ghost_bytes >= mru_ghost_bytes)
				scale = 1;
			else
				scale = mru_ghost_bytes / mfu_ghost_bytes;
			scale = MIN(scale, 10);

			/* Never subtract more than arc_p currently holds. */
			shrink = MIN(bytes * scale, arc_p);
			arc_p = MAX(p_floor, arc_p - shrink);
		}
		ASSERT((int64_t)arc_p >= 0);

		/* Under memory pressure: kick the reclaim thread, don't grow. */
		if (arc_reclaim_needed()) {
			cv_signal(&arc_reclaim_thread_cv);
			return;
		}

		/* Growth is disabled, or the target is already at its maximum. */
		if (arc_no_grow || arc_c >= arc_c_max)
			return;

		/*
		 * If we're within (2 * maxblocksize) bytes of the target
		 * cache size, increment the target cache size
		 */
		if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
			DTRACE_PROBE1(arc__inc_adapt, int, bytes);
			atomic_add_64(&arc_c, (int64_t)bytes);
			if (arc_c > arc_c_max)
				arc_c = arc_c_max;
			else if (state == arc_anon)
				atomic_add_64(&arc_p, (int64_t)bytes);
			if (arc_p > arc_c)
				arc_p = arc_c;
		}
		ASSERT((int64_t)arc_p >= 0);
	}

	/*
	 * Check whether arc_size has grown past the upper threshold, i.e. past
	 * arc_c plus an allowance derived from zfs_arc_overflow_shift.
	 *
	 * Returns B_TRUE-equivalent (non-zero) when arc_size is at or beyond
	 * that threshold.
	 */
	static boolean_t
	arc_is_overflowing(void)
	{
		uint64_t slack = arc_c >> zfs_arc_overflow_shift;

		/* Always allow at least one block of overflow */
		if (slack < SPA_MAXBLOCKSIZE)
			slack = SPA_MAXBLOCKSIZE;

		return (arc_size >= arc_c + slack);
	}

	/*
	 * Account for and allocate an ABD of `size' bytes for `hdr'.
	 *
	 * The accounting (and any throttling) is done by arc_get_data_impl();
	 * the ABD is then allocated as metadata or data according to the header's
	 * buffer type.  `tag' is passed through to arc_get_data_impl().
	 *
	 * Returns the ABD produced by abd_alloc().
	 */
	static abd_t *
	arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
	{
		arc_buf_contents_t type = arc_buf_type(hdr);

		arc_get_data_impl(hdr, size, tag);
		if (type == ARC_BUFC_METADATA) {
			return (abd_alloc(size, B_TRUE));
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			return (abd_alloc(size, B_FALSE));
		}
	}

	/*
	 * Account for and allocate a linear buffer of `size' bytes for `hdr'.
	 *
	 * The accounting (and any throttling) is done by arc_get_data_impl();
	 * the buffer then comes from zio_buf_alloc() for metadata headers and
	 * from zio_data_buf_alloc() for data headers.  `tag' is passed through
	 * to arc_get_data_impl().
	 *
	 * Returns the allocated buffer.
	 */
	static void *
	arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
	{
		arc_buf_contents_t type = arc_buf_type(hdr);

		arc_get_data_impl(hdr, size, tag);
		if (type == ARC_BUFC_METADATA) {
			return (zio_buf_alloc(size));
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			return (zio_data_buf_alloc(size));
		}
	}

	/*
	 * Do the ARC accounting for a block of `size' bytes that is about to be
	 * allocated for `hdr'; the allocation itself is done by the callers
	 * arc_get_data_abd() and arc_get_data_buf().  If we are hitting the
	 * hard limit for the cache size, we must sleep, waiting for the eviction
	 * thread to catch up. If we're past the target size but below the hard
	 * limit, we'll only signal the reclaim thread and continue on.
	 *
	 * `tag' is the holder recorded with the refcount additions made here.
	 */
	static void
	arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
	{
		arc_state_t *state = hdr->b_l1hdr.b_state;
		arc_buf_contents_t type = arc_buf_type(hdr);

		arc_adapt(size, state);

		/*
		 * If arc_size is currently overflowing, and has grown past our
		 * upper limit, we must be adding data faster than the evict
		 * thread can evict. Thus, to ensure we don't compound the
		 * problem by adding more data and forcing arc_size to grow even
		 * further past its target size, we halt and wait for the
		 * eviction thread to catch up.
		 *
		 * It's also possible that the reclaim thread is unable to evict
		 * enough buffers to get arc_size below the overflow limit (e.g.
		 * due to buffers being un-evictable, or hash lock collisions).
		 * In this case, we want to proceed regardless if we're
		 * overflowing; thus we don't use a while loop here.
		 */
		if (arc_is_overflowing()) {
			mutex_enter(&arc_reclaim_lock);

			/*
			 * Now that we've acquired the lock, we may no longer be
			 * over the overflow limit, lets check.
			 *
			 * We're ignoring the case of spurious wake ups. If that
			 * were to happen, it'd let this thread consume an ARC
			 * buffer before it should have (i.e. before we're under
			 * the overflow limit and were signalled by the reclaim
			 * thread). As long as that is a rare occurrence, it
			 * shouldn't cause any harm.
			 */
			if (arc_is_overflowing()) {
				cv_signal(&arc_reclaim_thread_cv);
				cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
			}

			mutex_exit(&arc_reclaim_lock);
		}

		/* Charge the space to the metadata or data pool. */
		VERIFY3U(hdr->b_type, ==, type);
		if (type == ARC_BUFC_METADATA) {
			arc_space_consume(size, ARC_SPACE_META);
		} else {
			arc_space_consume(size, ARC_SPACE_DATA);
		}

		/*
		 * Update the state size. Note that ghost states have a
		 * "ghost size" and so don't need to be updated.
		 */
		if (!GHOST_STATE(state)) {

			(void) refcount_add_many(&state->arcs_size, size, tag);

			/*
			 * If this is reached via arc_read, the link is
			 * protected by the hash lock. If reached via
			 * arc_buf_alloc, the header should not be accessed by
			 * any other thread. And, if reached via arc_read_done,
			 * the hash lock will protect it if it's found in the
			 * hash table; otherwise no other thread should be
			 * trying to [add|remove]_reference it.
			 */
			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
				ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
				(void) refcount_add_many(&state->arcs_esize[type],
				    size, tag);
			}

			/*
			 * If we are growing the cache, and we are adding anonymous
			 * data, and we have outgrown arc_p, update arc_p
			 */
			if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
			    (refcount_count(&arc_anon->arcs_size) +
			    refcount_count(&arc_mru->arcs_size) > arc_p))
				arc_p = MIN(arc_c, arc_p + size);
		}
		ARCSTAT_BUMP(arcstat_allocated);
	}

	/*
	 * Undo the ARC accounting for `size' bytes of `hdr' via
	 * arc_free_data_impl() (with holder `tag'), then free the ABD `abd'.
	 */
	static void
	arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
	{
		arc_free_data_impl(hdr, size, tag);
		abd_free(abd);
	}

	/*
	 * Undo the ARC accounting for `size' bytes of `hdr' via
	 * arc_free_data_impl() (with holder `tag'), then return the linear
	 * buffer `buf' to the allocator matching the header's buffer type:
	 * zio_buf_free() for metadata, zio_data_buf_free() for data.
	 */
	static void
	arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
	{
		arc_buf_contents_t type = arc_buf_type(hdr);

		arc_free_data_impl(hdr, size, tag);
		if (type == ARC_BUFC_METADATA) {
			zio_buf_free(buf, size);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			zio_data_buf_free(buf, size);
		}
	}

	/*
	 * Undo the ARC accounting for a data block of `size' bytes belonging to
	 * `hdr'.  The memory itself is released by the callers
	 * arc_free_data_abd() and arc_free_data_buf().
	 *
	 * `tag' is the holder passed to the refcount removals.
	 */
	static void
	arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
	{
		arc_state_t *state = hdr->b_l1hdr.b_state;
		arc_buf_contents_t type = arc_buf_type(hdr);

		/* protected by hash lock, if in the hash table */
		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT(state != arc_anon && state != arc_l2c_only);

			/* On a list: the bytes were also counted in arcs_esize. */
			(void) refcount_remove_many(&state->arcs_esize[type],
			    size, tag);
		}
		(void) refcount_remove_many(&state->arcs_size, size, tag);

		/* Return the space to the metadata or data pool. */
		VERIFY3U(hdr->b_type, ==, type);
		if (type == ARC_BUFC_METADATA) {
			arc_space_return(size, ARC_SPACE_META);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			arc_space_return(size, ARC_SPACE_DATA);
		}
	}

	/*
	 * This routine is called whenever a buffer is accessed.  It refreshes
	 * the header's access time and, depending on its current state, moves
	 * it between ARC states and bumps the matching hit counters.
	 *
	 * The caller must hold hash_lock (asserted below).  No unlock is done in
	 * this function; arc_read() releases the lock itself after calling it.
	 */
	static void
	arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
	{
		clock_t now;

		ASSERT(MUTEX_HELD(hash_lock));
		ASSERT(HDR_HAS_L1HDR(hdr));

		if (hdr->b_l1hdr.b_state == arc_anon) {
			/*
			 * This buffer is not in the cache, and does not
			 * appear in our "ghost" list.  Add the new buffer
			 * to the MRU state.
			 */

			ASSERT0(hdr->b_l1hdr.b_arc_access);
			hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
			arc_change_state(arc_mru, hdr, hash_lock);

		} else if (hdr->b_l1hdr.b_state == arc_mru) {
			now = ddi_get_lbolt();

			/*
			 * If this buffer is here because of a prefetch, then either:
			 * - clear the flag if this is a "referencing" read
			 *   (any subsequent access will bump this into the MFU state).
			 * or
			 * - if it has no references (another prefetch), just
			 *   check that it is still linked on its list.
			 * Either way only the access time is refreshed.
			 */
			if (HDR_PREFETCH(hdr)) {
				if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
					/* link protected by hash lock */
					ASSERT(multilist_link_active(
					    &hdr->b_l1hdr.b_arc_node));
				} else {
					arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
					ARCSTAT_BUMP(arcstat_mru_hits);
				}
				hdr->b_l1hdr.b_arc_access = now;
				return;
			}

			/*
			 * This buffer has been "accessed" only once so far,
			 * but it is still in the cache. Move it to the MFU
			 * state.
			 */
			if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
				/*
				 * More than ARC_MINTIME has passed since we
				 * instantiated this buffer.  Move it to the
				 * most frequently used state.
				 */
				hdr->b_l1hdr.b_arc_access = now;
				DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
				arc_change_state(arc_mfu, hdr, hash_lock);
			}
			ARCSTAT_BUMP(arcstat_mru_hits);
		} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
			arc_state_t *new_state;
			/*
			 * This buffer has been "accessed" recently, but
			 * was evicted from the cache.  Move it to the
			 * MFU state, unless this is a prefetch, in which
			 * case it goes back to the MRU state.
			 */

			if (HDR_PREFETCH(hdr)) {
				new_state = arc_mru;
				if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
					arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
				DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
			} else {
				new_state = arc_mfu;
				DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			}

			hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
			arc_change_state(new_state, hdr, hash_lock);

			ARCSTAT_BUMP(arcstat_mru_ghost_hits);
		} else if (hdr->b_l1hdr.b_state == arc_mfu) {
			/*
			 * This buffer has been accessed more than once and is
			 * still in the cache.  Keep it in the MFU state.
			 *
			 * NOTE: an add_reference() that occurred when we did
			 * the arc_read() will have kicked this off the list.
			 * If it was a prefetch, it has no references and is
			 * expected to still be linked on its list.
			 */
			if ((HDR_PREFETCH(hdr)) != 0) {
				ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
				/* link protected by hash_lock */
				ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
			}
			ARCSTAT_BUMP(arcstat_mfu_hits);
			hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
			arc_state_t *new_state = arc_mfu;
			/*
			 * This buffer has been accessed more than once but has
			 * been evicted from the cache.  Move it back to the
			 * MFU state.
			 */

			if (HDR_PREFETCH(hdr)) {
				/*
				 * This is a prefetch access...
				 * move this block back to the MRU state.
				 */
				ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
				new_state = arc_mru;
			}

			hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
			/* NOTE(review): this probe fires even when new_state is arc_mru. */
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			arc_change_state(new_state, hdr, hash_lock);

			ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
		} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
			/*
			 * This buffer is on the 2nd Level ARC.
			 */

			hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			arc_change_state(arc_mfu, hdr, hash_lock);
		} else {
			ASSERT(!"invalid arc state");
		}
	}

	/*
	 * A generic arc_done_func_t which you can use: copies the contents of
	 * `buf' into the memory at `arg' and then destroys the buf.
	 *
	 * The copy is made when there was no zio (zio == NULL) or the zio
	 * finished without error; the buf is destroyed in every case, with `arg'
	 * as its tag.
	 */
	/* ARGSUSED */
	void
	arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
	{
		if (zio == NULL || zio->io_error == 0)
			bcopy(buf->b_data, arg, arc_buf_size(buf));
		arc_buf_destroy(buf, arg);
	}

	/*
	 * A generic arc_done_func_t: hands the buf back through `arg', which is
	 * treated as an (arc_buf_t **).
	 *
	 * On a zio error the buf is destroyed (with `arg' as its tag) and NULL is
	 * stored; otherwise the buf pointer itself is stored.
	 */
	void
	arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
	{
		arc_buf_t **bufp = arg;
		if (zio && zio->io_error) {
			arc_buf_destroy(buf, arg);
			*bufp = NULL;
		} else {
			*bufp = buf;
			ASSERT(buf->b_data);
		}
	}

	/*
	 * Debug-only consistency check between an ARC header and a block pointer.
	 *
	 * For hole or embedded BPs the header is expected to have zero physical
	 * size and no compression.  Otherwise the header's logical and physical
	 * sizes must match the BP's, and so must the compression type when
	 * compression is enabled on the header.  All checks are ASSERTs.
	 */
	static void
	arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
	{
		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
			ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
			ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
		} else {
			if (HDR_COMPRESSION_ENABLED(hdr)) {
				ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
				    BP_GET_COMPRESS(bp));
			}
			ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
			ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
		}
	}

	/*
	 * Completion callback for the read zio issued by arc_read().
	 *
	 * zio->io_private is the ARC header the data was read into.  The function
	 * records the byteswap mode, builds a buf for every queued callback that
	 * has a done function, clears ARC_FLAG_IO_IN_PROGRESS, wakes threads
	 * sleeping on the header's cv, runs and frees the callbacks, and finally
	 * destroys the header if it ended up anonymous and unreferenced.
	 */
	static void
	arc_read_done(zio_t *zio)
	{
		arc_buf_hdr_t *hdr = zio->io_private;
		kmutex_t *hash_lock = NULL;
		arc_callback_t *callback_list;
		arc_callback_t *acb;
		boolean_t freeable = B_FALSE;
		boolean_t no_zio_error = (zio->io_error == 0);

		/*
		 * The hdr was inserted into hash-table and removed from lists
		 * prior to starting I/O.  We should find this header, since
		 * it's in the hash table, and it should be legit since it's
		 * not possible to evict it during the I/O.  The only possible
		 * reason for it not to be found is if we were freed during the
		 * read.
		 */
		if (HDR_IN_HASH_TABLE(hdr)) {
			ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
			ASSERT3U(hdr->b_dva.dva_word[0], ==,
			    BP_IDENTITY(zio->io_bp)->dva_word[0]);
			ASSERT3U(hdr->b_dva.dva_word[1], ==,
			    BP_IDENTITY(zio->io_bp)->dva_word[1]);

			/* On success this also hands back the hash lock, held. */
			arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
			    &hash_lock);

			ASSERT((found == hdr &&
			    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
			    (found == hdr && HDR_L2_READING(hdr)));
			ASSERT3P(hash_lock, !=, NULL);
		}

		if (no_zio_error) {
			/* byteswap if necessary */
			if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
				if (BP_GET_LEVEL(zio->io_bp) > 0) {
					hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
				} else {
					hdr->b_l1hdr.b_byteswap =
					    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
				}
			} else {
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
			}
		}

		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
		if (l2arc_noprefetch && HDR_PREFETCH(hdr))
			arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);

		callback_list = hdr->b_l1hdr.b_acb;
		ASSERT3P(callback_list, !=, NULL);

		if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
			/*
			 * Only call arc_access on anonymous buffers.  This is because
			 * if we've issued an I/O for an evicted buffer, we've already
			 * called arc_access (to prevent any simultaneous readers from
			 * getting confused).
			 */
			arc_access(hdr, hash_lock);
		}

		/*
		 * If a read request has a callback (i.e. acb_done is not NULL), then we
		 * make a buf containing the data according to the parameters which were
		 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
		 * aren't needlessly decompressing the data multiple times.
		 */
		int callback_cnt = 0;
		for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
			if (!acb->acb_done)
				continue;

			/* This is a demand read since prefetches don't use callbacks */
			callback_cnt++;

			int error = arc_buf_alloc_impl(hdr, acb->acb_private,
			    acb->acb_compressed, no_zio_error, &acb->acb_buf);
			/*
			 * NOTE(review): io_error is overwritten on every
			 * iteration, so an error from an earlier callback is
			 * replaced by a later result -- confirm this is intended.
			 */
			if (no_zio_error) {
				zio->io_error = error;
			}
		}
		hdr->b_l1hdr.b_acb = NULL;
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		if (callback_cnt == 0) {
			/* No done callbacks: this must have been a prefetch. */
			ASSERT(HDR_PREFETCH(hdr));
			ASSERT0(hdr->b_l1hdr.b_bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
		}

		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
		    callback_list != NULL);

		if (no_zio_error) {
			arc_hdr_verify(hdr, zio->io_bp);
		} else {
			/*
			 * The read failed: flag the header, make it anonymous
			 * and take it out of the hash table so that it is no
			 * longer discoverable.
			 */
			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
			if (hdr->b_l1hdr.b_state != arc_anon)
				arc_change_state(arc_anon, hdr, hash_lock);
			if (HDR_IN_HASH_TABLE(hdr))
				buf_hash_remove(hdr);
			freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
		}

		/*
		 * Broadcast before we drop the hash_lock to avoid the possibility
		 * that the hdr (and hence the cv) might be freed before we get to
		 * the cv_broadcast().
		 */
		cv_broadcast(&hdr->b_l1hdr.b_cv);

		if (hash_lock != NULL) {
			mutex_exit(hash_lock);
		} else {
			/*
			 * This block was freed while we waited for the read to
			 * complete.  It has been removed from the hash table and
			 * moved to the anonymous state (so that it won't show up
			 * in the cache).
			 */
			ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
			freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
		}

		/* execute each callback and free its structure */
		while ((acb = callback_list) != NULL) {
			if (acb->acb_done)
				acb->acb_done(zio, acb->acb_buf, acb->acb_private);

			if (acb->acb_zio_dummy != NULL) {
				/* Propagate our result to the waiter's dummy zio. */
				acb->acb_zio_dummy->io_error = zio->io_error;
				zio_nowait(acb->acb_zio_dummy);
			}

			callback_list = acb->acb_next;
			kmem_free(acb, sizeof (arc_callback_t));
		}

		if (freeable)
			arc_hdr_destroy(hdr);
	}

	/*
	 * "Read" the block at the specified DVA (in bp) via the
	 * cache.  If the block is found in the cache, invoke the provided
	 * callback immediately and return.  Note that the `zio' parameter
	 * in the callback will be NULL in this case, since no IO was
	 * required.  If the block is not in the cache pass the read request
	 * on to the spa with a substitute callback function, so that the
	 * requested block will be added to the cache.
	 *
	 * If a read request arrives for a block that has a read in-progress,
	 * either wait for the in-progress read to complete (and return the
	 * results); or, if this is a read with a "done" func, add a record
	 * to the read to invoke the "done" func when the read completes,
	 * and return; or just return.
	 *
	 * arc_read_done() will invoke all the requested "done" functions
	 * for readers of this block.
	 *
	 * pio       - parent zio for the I/O that may be issued (may be NULL).
	 * done      - completion callback, or NULL (e.g. for a prefetch).
	 * private   - argument handed to `done' and used as the buf's tag.
	 * arc_flags - in: ARC_FLAG_WAIT or ARC_FLAG_NOWAIT plus prefetch and
	 *             L2CACHE hints; out: ARC_FLAG_CACHED is or'ed in on a hit.
	 *
	 * Returns 0, except on a miss with ARC_FLAG_WAIT where the result of
	 * zio_wait() on the pool read is returned.
	 */
	int
	arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
	    void *private, zio_priority_t priority, int zio_flags,
	    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
	{
		arc_buf_hdr_t *hdr = NULL;
		kmutex_t *hash_lock = NULL;
		zio_t *rzio;
		uint64_t guid = spa_load_guid(spa);
		boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;

		ASSERT(!BP_IS_EMBEDDED(bp) ||
		    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

	top:
		if (!BP_IS_EMBEDDED(bp)) {
			/*
			 * Only non-embedded BPs are looked up in the hash table.
			 * Embedded BP's have no DVA and require no I/O to "read";
			 * hdr stays NULL for them and an anonymous header is
			 * created to back the block further down.
			 */
			hdr = buf_hash_find(guid, bp, &hash_lock);
		}

		if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
			/* Cache hit: the header already has (or is getting) data. */
			arc_buf_t *buf = NULL;
			*arc_flags |= ARC_FLAG_CACHED;

			if (HDR_IO_IN_PROGRESS(hdr)) {

				if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
				    priority == ZIO_PRIORITY_SYNC_READ) {
					/*
					 * This sync read must wait for an
					 * in-progress async read (e.g. a predictive
					 * prefetch).  Async reads are queued
					 * separately at the vdev_queue layer, so
					 * this is a form of priority inversion.
					 * Ideally, we would "inherit" the demand
					 * i/o's priority by moving the i/o from
					 * the async queue to the synchronous queue,
					 * but there is currently no mechanism to do
					 * so.  Track this so that we can evaluate
					 * the magnitude of this potential performance
					 * problem.
					 *
					 * Note that if the prefetch i/o is already
					 * active (has been issued to the device),
					 * the prefetch improved performance, because
					 * we issued it sooner than we would have
					 * without the prefetch.
					 */
					DTRACE_PROBE1(arc__sync__wait__for__async,
					    arc_buf_hdr_t *, hdr);
					ARCSTAT_BUMP(arcstat_sync_wait_for_async);
				}
				if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
					arc_hdr_clear_flags(hdr,
					    ARC_FLAG_PREDICTIVE_PREFETCH);
				}

				if (*arc_flags & ARC_FLAG_WAIT) {
					/*
					 * Sleep until arc_read_done() broadcasts,
					 * then start over from the hash lookup.
					 */
					cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
					mutex_exit(hash_lock);
					goto top;
				}
				ASSERT(*arc_flags & ARC_FLAG_NOWAIT);

				if (done) {
					/*
					 * Queue our callback on the in-flight
					 * read; arc_read_done() will run it.
					 */
					arc_callback_t *acb = NULL;

					acb = kmem_zalloc(sizeof (arc_callback_t),
					    KM_SLEEP);
					acb->acb_done = done;
					acb->acb_private = private;
					acb->acb_compressed = compressed_read;
					if (pio != NULL)
						acb->acb_zio_dummy = zio_null(pio,
						    spa, NULL, NULL, NULL, zio_flags);

					ASSERT3P(acb->acb_done, !=, NULL);
					acb->acb_next = hdr->b_l1hdr.b_acb;
					hdr->b_l1hdr.b_acb = acb;
					mutex_exit(hash_lock);
					return (0);
				}
				mutex_exit(hash_lock);
				return (0);
			}

			ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
			    hdr->b_l1hdr.b_state == arc_mfu);

			if (done) {
				if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
					/*
					 * This is a demand read which does not have to
					 * wait for i/o because we did a predictive
					 * prefetch i/o for it, which has completed.
					 */
					DTRACE_PROBE1(
					    arc__demand__hit__predictive__prefetch,
					    arc_buf_hdr_t *, hdr);
					ARCSTAT_BUMP(
					    arcstat_demand_hit_predictive_prefetch);
					arc_hdr_clear_flags(hdr,
					    ARC_FLAG_PREDICTIVE_PREFETCH);
				}
				ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));

				/* Get a buf with the desired data in it. */
				VERIFY0(arc_buf_alloc_impl(hdr, private,
				    compressed_read, B_TRUE, &buf));
			} else if (*arc_flags & ARC_FLAG_PREFETCH &&
			    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
				arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
			}
			DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
			arc_access(hdr, hash_lock);
			if (*arc_flags & ARC_FLAG_L2CACHE)
				arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
			mutex_exit(hash_lock);
			ARCSTAT_BUMP(arcstat_hits);
			ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
			    data, metadata, hits);

			/* No I/O was needed, so the callback gets a NULL zio. */
			if (done)
				done(NULL, buf, private);
		} else {
			/* Cache miss: set up a header and issue the read. */
			uint64_t lsize = BP_GET_LSIZE(bp);
			uint64_t psize = BP_GET_PSIZE(bp);
			arc_callback_t *acb;
			vdev_t *vd = NULL;
			uint64_t addr = 0;
			boolean_t devw = B_FALSE;
			uint64_t size;

			if (hdr == NULL) {
				/* this block is not in the cache */
				arc_buf_hdr_t *exists = NULL;
				arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
				hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
				    BP_GET_COMPRESS(bp), type);

				if (!BP_IS_EMBEDDED(bp)) {
					hdr->b_dva = *BP_IDENTITY(bp);
					hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
					exists = buf_hash_insert(hdr, &hash_lock);
				}
				if (exists != NULL) {
					/* somebody beat us to the hash insert */
					mutex_exit(hash_lock);
					buf_discard_identity(hdr);
					arc_hdr_destroy(hdr);
					goto top; /* restart the IO request */
				}
			} else {
				/*
				 * This block is in the ghost cache. If it was L2-only
				 * (and thus didn't have an L1 hdr), we realloc the
				 * header to add an L1 hdr.
				 */
				if (!HDR_HAS_L1HDR(hdr)) {
					hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
					    hdr_full_cache);
				}
				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
				ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
				ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

				/*
				 * This is a delicate dance that we play here.
				 * This hdr is in the ghost list so we access it
				 * to move it out of the ghost list before we
				 * initiate the read, and only then allocate the
				 * data block that the read will fill.
				 */
				arc_access(hdr, hash_lock);
				arc_hdr_alloc_pabd(hdr);
			}
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			size = arc_hdr_size(hdr);

			/*
			 * If compression is enabled on the hdr, then will do
			 * RAW I/O and will store the compressed data in the hdr's
			 * data block. Otherwise, the hdr's data block will contain
			 * the uncompressed data.
			 */
			if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
				zio_flags |= ZIO_FLAG_RAW;
			}

			/* Carry the caller's hints and the BP level over to the hdr. */
			if (*arc_flags & ARC_FLAG_PREFETCH)
				arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
			if (*arc_flags & ARC_FLAG_L2CACHE)
				arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
			if (BP_GET_LEVEL(bp) > 0)
				arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
			if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
				arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
			ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));

			/*
			 * The first callback record is always allocated, even when
			 * `done' is NULL; arc_read_done() skips records without a
			 * done function.
			 */
			acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
			acb->acb_done = done;
			acb->acb_private = private;
			acb->acb_compressed = compressed_read;

			ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
			hdr->b_l1hdr.b_acb = acb;
			arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);

			if (HDR_HAS_L2HDR(hdr) &&
			    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
				devw = hdr->b_l2hdr.b_dev->l2ad_writing;
				addr = hdr->b_l2hdr.b_daddr;
				/*
	- * Lock out device removal.
	+ * Lock out L2ARC device removal.
				 */
				if (vdev_is_dead(vd) ||
				    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
					vd = NULL;
			}

			/* Remember the priority so later sync readers can spot it. */
			if (priority == ZIO_PRIORITY_ASYNC_READ)
				arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
			else
				arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);

			/* hash_lock is NULL for embedded BPs (never hashed). */
			if (hash_lock != NULL)
				mutex_exit(hash_lock);

			/*
			 * At this point, we have a level 1 cache miss.  Try again in
			 * L2ARC if possible.
			 */
			ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);

			DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
			    uint64_t, lsize, zbookmark_phys_t *, zb);
			ARCSTAT_BUMP(arcstat_misses);
			ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
			    data, metadata, misses);
	#ifdef _KERNEL
	#ifdef RACCT
			/* Charge the read to the current process' resource limits. */
			if (racct_enable) {
				PROC_LOCK(curproc);
				racct_add_force(curproc, RACCT_READBPS, size);
				racct_add_force(curproc, RACCT_READIOPS, 1);
				PROC_UNLOCK(curproc);
			}
	#endif /* RACCT */
			curthread->td_ru.ru_inblock++;
	#endif

			if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
				/*
				 * Read from the L2ARC if the following are true:
				 * 1. The L2ARC vdev was previously cached.
				 * 2. This buffer still has L2ARC metadata.
				 * 3. This buffer isn't currently writing to the L2ARC.
				 * 4. The L2ARC entry wasn't evicted, which may
				 *    also have invalidated the vdev.
				 * 5. This isn't a prefetch while l2arc_noprefetch
				 *    is set.
				 */
				if (HDR_HAS_L2HDR(hdr) &&
				    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
				    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
					l2arc_read_callback_t *cb;
					abd_t *abd;
					uint64_t asize;

					DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
					ARCSTAT_BUMP(arcstat_l2_hits);

					cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
					    KM_SLEEP);
					cb->l2rcb_hdr = hdr;
					cb->l2rcb_bp = *bp;
					cb->l2rcb_zb = *zb;
					cb->l2rcb_flags = zio_flags;

					/*
					 * Read straight into the hdr's data block
					 * unless the device's allocation size
					 * differs, in which case a separate I/O
					 * buffer of that size is used.
					 */
					asize = vdev_psize_to_asize(vd, size);
					if (asize != size) {
						abd = abd_alloc_for_io(asize,
						    HDR_ISTYPE_METADATA(hdr));
						cb->l2rcb_abd = abd;
					} else {
						abd = hdr->b_l1hdr.b_pabd;
					}

					ASSERT(addr >= VDEV_LABEL_START_SIZE &&
					    addr + asize <= vd->vdev_psize -
					    VDEV_LABEL_END_SIZE);

					/*
					 * l2arc read.  The SCL_L2ARC lock will be
					 * released by l2arc_read_done().
					 * A header compressed to ZIO_COMPRESS_EMPTY
					 * is not expected here (asserted below).
					 */
					ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
					    ZIO_COMPRESS_EMPTY);
					rzio = zio_read_phys(pio, vd, addr,
					    asize, abd,
					    ZIO_CHECKSUM_OFF,
					    l2arc_read_done, cb, priority,
					    zio_flags | ZIO_FLAG_DONT_CACHE |
					    ZIO_FLAG_CANFAIL |
					    ZIO_FLAG_DONT_PROPAGATE |
					    ZIO_FLAG_DONT_RETRY, B_FALSE);
					DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
					    zio_t *, rzio);
					ARCSTAT_INCR(arcstat_l2_read_bytes, size);

					if (*arc_flags & ARC_FLAG_NOWAIT) {
						zio_nowait(rzio);
						return (0);
					}

					ASSERT(*arc_flags & ARC_FLAG_WAIT);
					if (zio_wait(rzio) == 0)
						return (0);

					/* l2arc read error; goto zio_read() */
				} else {
					DTRACE_PROBE1(l2arc__miss,
					    arc_buf_hdr_t *, hdr);
					ARCSTAT_BUMP(arcstat_l2_misses);
					if (HDR_L2_WRITING(hdr))
						ARCSTAT_BUMP(arcstat_l2_rw_clash);
					spa_config_exit(spa, SCL_L2ARC, vd);
				}
			} else {
				/* Not reading from L2ARC: drop the lock if we took it. */
				if (vd != NULL)
					spa_config_exit(spa, SCL_L2ARC, vd);
				if (l2arc_ndev != 0) {
					DTRACE_PROBE1(l2arc__miss,
					    arc_buf_hdr_t *, hdr);
					ARCSTAT_BUMP(arcstat_l2_misses);
				}
			}

			/* Read from the main pool; arc_read_done() finishes up. */
			rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
			    arc_read_done, hdr, priority, zio_flags, zb);

			if (*arc_flags & ARC_FLAG_WAIT)
				return (zio_wait(rzio));

			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
			zio_nowait(rzio);
		}
		return (0);
	}

	/*
	 * Notify the arc that a block was freed, and thus will never be used again.
	 *
	 *	spa	pool the block lived in; its load guid is part of the hash key
	 *	bp	block pointer of the freed block; must not be an embedded bp
	 *
	 * The header for the block is looked up in the buffer hash table.  If no
	 * header is found there is nothing to do.  Otherwise the header is moved
	 * to arc_anon and destroyed, unless it still has an I/O in progress or an
	 * outstanding reference.  The hash lock handed back by buf_hash_find() is
	 * dropped on every path.
	 */
	void
	arc_freed(spa_t *spa, const blkptr_t *bp)
	{
		arc_buf_hdr_t *hdr;
		kmutex_t *hash_lock;
		uint64_t guid = spa_load_guid(spa);

		ASSERT(!BP_IS_EMBEDDED(bp));

		hdr = buf_hash_find(guid, bp, &hash_lock);
		if (hdr == NULL)
			return;

		/*
		 * We might be trying to free a block that is still doing I/O
		 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
		 * dmu_sync-ed block). If this block is being prefetched, then it
		 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
		 * until the I/O completes. A block may also have a reference if it is
		 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
		 * have written the new block to its final resting place on disk but
		 * without the dedup flag set. This would have left the hdr in the MRU
		 * state and discoverable. When the txg finally syncs it detects that
		 * the block was overridden in open context and issues an override I/O.
		 * Since this is a dedup block, the override I/O will determine if the
		 * block is already in the DDT. If so, then it will replace the io_bp
		 * with the bp from the DDT and allow the I/O to finish. When the I/O
		 * reaches the done callback, dbuf_write_override_done, it will
		 * check to see if the io_bp and io_bp_override are identical.
		 * If they are not, then it indicates that the bp was replaced with
		 * the bp in the DDT and the override bp is freed. This allows
		 * us to arrive here with a reference on a block that is being
		 * freed. So if we have an I/O in progress, or a reference to
		 * this hdr, then we don't destroy the hdr.
		 */
		if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
		    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}

	/*
	 * Release this buffer from the cache, making it an anonymous buffer.  This
	 * must be done after a read and prior to modifying the buffer contents.
	 * If the buffer has more than one reference, we must make
	 * a new hdr for the buffer.
	 *
	 *	buf	the buffer to detach from its cached identity
	 *	tag	holder tag; it is removed from the old hdr's refcount and
	 *		added to the new hdr's refcount when a new hdr is created
	 *
	 * Three cases are handled:
	 *   1. The hdr is already anonymous: only its identity and access time
	 *	are reset and the buf is thawed.
	 *   2. The hdr has more than one buf: this buf is unlinked from the hdr
	 *	and attached to a freshly allocated anonymous hdr.
	 *   3. The hdr has exactly one buf: the hdr itself is moved to arc_anon.
	 */
	void
	arc_release(arc_buf_t *buf, void *tag)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;

		/*
		 * It would be nice to assert that if it's DMU metadata (level >
		 * 0 || it's the dnode file), then it must be syncing context.
		 * But we don't know that information at this level.
		 */

		mutex_enter(&buf->b_evict_lock);

		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * We don't grab the hash lock prior to this check, because if
		 * the buffer's header is in the arc_anon state, it won't be
		 * linked into the hash table.
		 */
		if (hdr->b_l1hdr.b_state == arc_anon) {
			mutex_exit(&buf->b_evict_lock);
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT(!HDR_IN_HASH_TABLE(hdr));
			ASSERT(!HDR_HAS_L2HDR(hdr));
			ASSERT(HDR_EMPTY(hdr));
			ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
			ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
			ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));

			hdr->b_l1hdr.b_arc_access = 0;

			/*
			 * If the buf is being overridden then it may already
			 * have a hdr that is not empty.
			 */
			buf_discard_identity(hdr);
			arc_buf_thaw(buf);

			return;
		}

		kmutex_t *hash_lock = HDR_LOCK(hdr);
		mutex_enter(hash_lock);

		/*
		 * This assignment is only valid as long as the hash_lock is
		 * held, we must be careful not to reference state or the
		 * b_state field after dropping the lock.
		 */
		arc_state_t *state = hdr->b_l1hdr.b_state;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
		ASSERT3P(state, !=, arc_anon);

		/* this buffer is not on any list */
		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);

		if (HDR_HAS_L2HDR(hdr)) {
			mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);

			/*
			 * We have to recheck this conditional again now that
			 * we're holding the l2ad_mtx to prevent a race with
			 * another thread which might be concurrently calling
			 * l2arc_evict(). In that case, l2arc_evict() might have
			 * destroyed the header's L2 portion as we were waiting
			 * to acquire the l2ad_mtx.
			 */
			if (HDR_HAS_L2HDR(hdr)) {
				l2arc_trim(hdr);
				arc_hdr_l2hdr_destroy(hdr);
			}

			mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
		}

		/*
		 * Do we have more than one buf?
		 */
		if (hdr->b_l1hdr.b_bufcnt > 1) {
			arc_buf_hdr_t *nhdr;
			/* Snapshot what the new hdr needs while hash_lock is held. */
			uint64_t spa = hdr->b_spa;
			uint64_t psize = HDR_GET_PSIZE(hdr);
			uint64_t lsize = HDR_GET_LSIZE(hdr);
			enum zio_compress compress = HDR_GET_COMPRESS(hdr);
			arc_buf_contents_t type = arc_buf_type(hdr);
			VERIFY3U(hdr->b_type, ==, type);

			ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
			(void) remove_reference(hdr, hash_lock, tag);

			if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
				ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
				ASSERT(ARC_BUF_LAST(buf));
			}

			/*
			 * Pull the data off of this hdr and attach it to
			 * a new anonymous hdr. Also find the last buffer
			 * in the hdr's buffer list.
			 */
			arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
			ASSERT3P(lastbuf, !=, NULL);

			/*
			 * If the current arc_buf_t and the hdr are sharing their data
			 * buffer, then we must stop sharing that block.
			 */
			if (arc_buf_is_shared(buf)) {
				VERIFY(!arc_buf_is_shared(lastbuf));

				/*
				 * First, sever the block sharing relationship between
				 * buf and the arc_buf_hdr_t.
				 */
				arc_unshare_buf(hdr, buf);

				/*
				 * Now we need to recreate the hdr's b_pabd. Since we
				 * have lastbuf handy, we try to share with it, but if
				 * we can't then we allocate a new b_pabd and copy the
				 * data from buf into it.
				 */
				if (arc_can_share(hdr, lastbuf)) {
					arc_share_buf(hdr, lastbuf);
				} else {
					arc_hdr_alloc_pabd(hdr);
					abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
					    buf->b_data, psize);
				}
				VERIFY3P(lastbuf->b_data, !=, NULL);
			} else if (HDR_SHARED_DATA(hdr)) {
				/*
				 * Uncompressed shared buffers are always at the end
				 * of the list. Compressed buffers don't have the
				 * same requirements. This makes it hard to
				 * simply assert that the lastbuf is shared so
				 * we rely on the hdr's compression flags to determine
				 * if we have a compressed, shared buffer.
				 */
				ASSERT(arc_buf_is_shared(lastbuf) ||
				    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
				ASSERT(!ARC_BUF_SHARED(buf));
			}
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			ASSERT3P(state, !=, arc_l2c_only);

			/* The buf's bytes no longer count against the old state. */
			(void) refcount_remove_many(&state->arcs_size,
			    arc_buf_size(buf), buf);

			if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
				ASSERT3P(state, !=, arc_l2c_only);
				(void) refcount_remove_many(&state->arcs_esize[type],
				    arc_buf_size(buf), buf);
			}

			hdr->b_l1hdr.b_bufcnt -= 1;
			arc_cksum_verify(buf);
	#ifdef illumos
			arc_buf_unwatch(buf);
	#endif

			mutex_exit(hash_lock);

			/*
			 * Allocate a new hdr. The new hdr will contain a b_pabd
			 * buffer which will be freed in arc_write().
			 */
			nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
			ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
			ASSERT0(nhdr->b_l1hdr.b_bufcnt);
			ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
			VERIFY3U(nhdr->b_type, ==, type);
			ASSERT(!HDR_SHARED_DATA(nhdr));

			nhdr->b_l1hdr.b_buf = buf;
			nhdr->b_l1hdr.b_bufcnt = 1;
			(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
			buf->b_hdr = nhdr;

			mutex_exit(&buf->b_evict_lock);
			(void) refcount_add_many(&arc_anon->arcs_size,
			    arc_buf_size(buf), buf);
		} else {
			mutex_exit(&buf->b_evict_lock);
			ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
			/* protected by hash lock, or hdr is on arc_anon */
			ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			arc_change_state(arc_anon, hdr, hash_lock);
			hdr->b_l1hdr.b_arc_access = 0;
			mutex_exit(hash_lock);

			buf_discard_identity(hdr);
			arc_buf_thaw(buf);
		}
	}

	/*
	 * Report whether a buffer has been released: it must carry data and its
	 * header must be in the arc_anon state.  The check is made while holding
	 * the buffer's b_evict_lock.  Returns 1 when released, 0 otherwise.
	 */
	int
	arc_released(arc_buf_t *buf)
	{
		int is_anon_with_data = 0;

		mutex_enter(&buf->b_evict_lock);
		if (buf->b_data != NULL &&
		    buf->b_hdr->b_l1hdr.b_state == arc_anon) {
			is_anon_with_data = 1;
		}
		mutex_exit(&buf->b_evict_lock);

		return (is_anon_with_data);
	}

	#ifdef ZFS_DEBUG
	/*
	 * Debug-only helper: return the current reference count of the header
	 * behind buf, sampled while holding the buffer's b_evict_lock.  The
	 * count is narrowed to int on return.
	 */
	int
	arc_referenced(arc_buf_t *buf)
	{
		int holds;

		mutex_enter(&buf->b_evict_lock);
		holds = refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
		mutex_exit(&buf->b_evict_lock);

		return (holds);
	}
	#endif

	/*
	 * "Ready" callback that arc_write() hands to zio_write().
	 *
	 * It invokes the caller's awcb_ready callback, computes the buffer
	 * checksum, marks the hdr as having an I/O in progress, records the
	 * physical size and compression taken from the zio's block pointer, and
	 * then fills the hdr's b_pabd, either with a private copy of the data or
	 * by sharing the buf's data.
	 *
	 * If the zio is being reexecuted, the state left behind by the previous
	 * invocation (checksum, b_pabd) is torn down first.
	 */
	static void
	arc_write_ready(zio_t *zio)
	{
		arc_write_callback_t *callback = zio->io_private;
		arc_buf_t *buf = callback->awcb_buf;
		arc_buf_hdr_t *hdr = buf->b_hdr;
		/* A hole has no physical size. */
		uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);

		/*
		 * If we're reexecuting this zio because the pool suspended, then
		 * cleanup any state that was previously set the first time the
		 * callback was invoked.
		 */
		if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
			arc_cksum_free(hdr);
	#ifdef illumos
			arc_buf_unwatch(buf);
	#endif
			if (hdr->b_l1hdr.b_pabd != NULL) {
				if (arc_buf_is_shared(buf)) {
					arc_unshare_buf(hdr, buf);
				} else {
					arc_hdr_free_pabd(hdr);
				}
			}
		}
		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
		ASSERT(!HDR_SHARED_DATA(hdr));
		ASSERT(!arc_buf_is_shared(buf));

		callback->awcb_ready(zio, buf, callback->awcb_private);

		/* The flag can only already be set if this is a reexecution. */
		if (HDR_IO_IN_PROGRESS(hdr))
			ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);

		arc_cksum_compute(buf);
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);

		enum zio_compress compress;
		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
			compress = ZIO_COMPRESS_OFF;
		} else {
			ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
			compress = BP_GET_COMPRESS(zio->io_bp);
		}
		HDR_SET_PSIZE(hdr, psize);
		arc_hdr_set_compress(hdr, compress);

		/*
		 * Fill the hdr with data. If the hdr is compressed, the data we want
		 * is available from the zio, otherwise we can take it from the buf.
		 *
		 * We might be able to share the buf's data with the hdr here. However,
		 * doing so would cause the ARC to be full of linear ABDs if we write a
		 * lot of shareable data. As a compromise, we check whether scattered
		 * ABDs are allowed, and assume that if they are then the user wants
		 * the ARC to be primarily filled with them regardless of the data being
		 * written. Therefore, if they're allowed then we allocate one and copy
		 * the data into it; otherwise, we share the data directly if we can.
		 */
		if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
			arc_hdr_alloc_pabd(hdr);

			/*
			 * Ideally, we would always copy the io_abd into b_pabd, but the
			 * user may have disabled compressed ARC, thus we must check the
			 * hdr's compression setting rather than the io_bp's.
			 */
			if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
				ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
				    ZIO_COMPRESS_OFF);
				ASSERT3U(psize, >, 0);

				abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
			} else {
				ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));

				abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
				    arc_buf_size(buf));
			}
		} else {
			ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
			ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
			ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);

			arc_share_buf(hdr, buf);
		}

		arc_hdr_verify(hdr, zio->io_bp);
	}

	/*
	 * "Children ready" callback handed to zio_write() by arc_write(): it
	 * forwards to the caller-supplied awcb_children_ready callback, passing
	 * along the buffer and private argument stored in the write callback.
	 */
	static void
	arc_write_children_ready(zio_t *zio)
	{
		arc_write_callback_t *awcb = zio->io_private;

		awcb->awcb_children_ready(zio, awcb->awcb_buf, awcb->awcb_private);
	}

	/*
	 * The SPA calls this callback for each physical write that happens on behalf
	 * of a logical write.  See the comment in dbuf_write_physdone() for details.
	 *
	 * The caller's physdone callback is optional; nothing happens when it was
	 * not supplied.
	 */
	static void
	arc_write_physdone(zio_t *zio)
	{
		arc_write_callback_t *awcb = zio->io_private;

		if (awcb->awcb_physdone == NULL)
			return;

		awcb->awcb_physdone(zio, awcb->awcb_buf, awcb->awcb_private);
	}

	/*
	 * "Done" callback handed to zio_write() by arc_write().
	 *
	 * On success the hdr takes on the identity (dva/birth) of the block that
	 * was written, unless the bp is a hole or embedded, in which case the hdr
	 * stays without an identity.  A hdr that has an identity is inserted into
	 * the buffer hash table; a pre-existing entry for the same block is only
	 * tolerated for rewrites, nopwrites and dedup.  Finally the caller's done
	 * callback is invoked, the zio's abd wrapper is released and the callback
	 * structure is freed.
	 */
	static void
	arc_write_done(zio_t *zio)
	{
		arc_write_callback_t *callback = zio->io_private;
		arc_buf_t *buf = callback->awcb_buf;
		arc_buf_hdr_t *hdr = buf->b_hdr;

		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);

		if (zio->io_error == 0) {
			arc_hdr_verify(hdr, zio->io_bp);

			if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
				buf_discard_identity(hdr);
			} else {
				hdr->b_dva = *BP_IDENTITY(zio->io_bp);
				hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
			}
		} else {
			ASSERT(HDR_EMPTY(hdr));
		}

		/*
		 * If the block to be written was all-zero or compressed enough to be
		 * embedded in the BP, no write was performed so there will be no
		 * dva/birth/checksum.  The buffer must therefore remain anonymous
		 * (and uncached).
		 */
		if (!HDR_EMPTY(hdr)) {
			arc_buf_hdr_t *exists;
			kmutex_t *hash_lock;

			ASSERT3U(zio->io_error, ==, 0);

			arc_cksum_verify(buf);

			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists != NULL) {
				/*
				 * This can only happen if we overwrite for
				 * sync-to-convergence, because we remove
				 * buffers from the hash table when we arc_free().
				 */
				if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
					if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
						panic("bad overwrite, hdr=%p exists=%p",
						    (void *)hdr, (void *)exists);
					ASSERT(refcount_is_zero(
					    &exists->b_l1hdr.b_refcnt));
					/* Evict the stale entry, then retry the insert. */
					arc_change_state(arc_anon, exists, hash_lock);
					mutex_exit(hash_lock);
					arc_hdr_destroy(exists);
					exists = buf_hash_insert(hdr, &hash_lock);
					ASSERT3P(exists, ==, NULL);
				} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
					/* nopwrite */
					ASSERT(zio->io_prop.zp_nopwrite);
					if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
						panic("bad nopwrite, hdr=%p exists=%p",
						    (void *)hdr, (void *)exists);
				} else {
					/* Dedup */
					ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
					ASSERT(hdr->b_l1hdr.b_state == arc_anon);
					ASSERT(BP_GET_DEDUP(zio->io_bp));
					ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
				}
			}
			arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
			/* if it's not anon, we are doing a scrub */
			if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
				arc_access(hdr, hash_lock);
			mutex_exit(hash_lock);
		} else {
			arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		}

		ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		callback->awcb_done(zio, buf, callback->awcb_private);

		/* Release the abd wrapper that arc_write() created around b_data. */
		abd_put(zio->io_abd);
		kmem_free(callback, sizeof (arc_write_callback_t));
	}

	/*
	 * Create (but do not issue) a write zio for an ARC buffer.
	 *
	 *	pio		parent zio
	 *	spa, txg, bp	passed through to zio_write()
	 *	buf		buffer whose b_data is written
	 *	l2arc		when set, the hdr is flagged ARC_FLAG_L2CACHE
	 *	zp		write properties; a local copy is used so that the
	 *			compression can be overridden for pre-compressed bufs
	 *	ready, done	mandatory callbacks
	 *	children_ready,
	 *	physdone	optional callbacks (may be NULL)
	 *	private		opaque argument handed to every callback
	 *	priority,
	 *	zio_flags, zb	passed through to zio_write()
	 *
	 * The callbacks are stashed in an arc_write_callback_t that is freed by
	 * arc_write_done().  Returns the zio created by zio_write().
	 */
	zio_t *
	arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
	    boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
	    arc_done_func_t *children_ready, arc_done_func_t *physdone,
	    arc_done_func_t *done, void *private, zio_priority_t priority,
	    int zio_flags, const zbookmark_phys_t *zb)
	{
		arc_buf_hdr_t *hdr = buf->b_hdr;
		arc_write_callback_t *callback;
		zio_t *zio;
		zio_prop_t localprop = *zp;

		ASSERT3P(ready, !=, NULL);
		ASSERT3P(done, !=, NULL);
		ASSERT(!HDR_IO_ERROR(hdr));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
		if (l2arc)
			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
		if (ARC_BUF_COMPRESSED(buf)) {
			/*
			 * We're writing a pre-compressed buffer.  Make the
			 * compression algorithm requested by the zio_prop_t match
			 * the pre-compressed buffer's compression algorithm.
			 */
			localprop.zp_compress = HDR_GET_COMPRESS(hdr);

			ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
			zio_flags |= ZIO_FLAG_RAW;
		}
		callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
		callback->awcb_ready = ready;
		callback->awcb_children_ready = children_ready;
		callback->awcb_physdone = physdone;
		callback->awcb_done = done;
		callback->awcb_private = private;
		callback->awcb_buf = buf;

		/*
		 * The hdr's b_pabd is now stale, free it now. A new data block
		 * will be allocated when the zio pipeline calls arc_write_ready().
		 */
		if (hdr->b_l1hdr.b_pabd != NULL) {
			/*
			 * If the buf is currently sharing the data block with
			 * the hdr then we need to break that relationship here.
			 * The hdr will remain with a NULL data pointer and the
			 * buf will take sole ownership of the block.
			 */
			if (arc_buf_is_shared(buf)) {
				arc_unshare_buf(hdr, buf);
			} else {
				arc_hdr_free_pabd(hdr);
			}
			VERIFY3P(buf->b_data, !=, NULL);
			arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
		}
		ASSERT(!arc_buf_is_shared(buf));
		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);

		/* children_ready is only wired up when the caller supplied one. */
		zio = zio_write(pio, spa, txg, bp,
		    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
		    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
		    (children_ready != NULL) ? arc_write_children_ready : NULL,
		    arc_write_physdone, arc_write_done, callback,
		    priority, zio_flags, zb);

		return (zio);
	}

	/*
	 * Decide whether a write reservation must be throttled because memory is
	 * tight.
	 *
	 *	reserve	size of the reservation being requested
	 *	txg	transaction group of the request; page_load is reset
	 *		whenever a newer txg is seen
	 *
	 * Returns 0 when the caller may proceed, ERESTART when the pageout
	 * process has already accumulated too much load, and EAGAIN when memory
	 * is low and the caller should delay.  Outside the kernel this always
	 * returns 0.
	 *
	 * NOTE: page_load and last_txg are function-local statics updated
	 * without any locking visible here.
	 */
	static int
	arc_memory_throttle(uint64_t reserve, uint64_t txg)
	{
	#ifdef _KERNEL
		uint64_t available_memory = ptob(freemem);
		static uint64_t page_load = 0;
		static uint64_t last_txg = 0;

	#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
		available_memory =
		    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
	#endif

		/* Plenty of free memory: no throttling. */
		if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
			return (0);

		if (txg > last_txg) {
			last_txg = txg;
			page_load = 0;
		}
		/*
		 * If we are in pageout, we know that memory is already tight,
		 * the arc is already going to be evicting, so we just want to
		 * continue to let page writes occur as quickly as possible.
		 */
		if (curproc == pageproc) {
			if (page_load > MAX(ptob(minfree), available_memory) / 4)
				return (SET_ERROR(ERESTART));
			/* Note: reserve is inflated, so we deflate */
			page_load += reserve / 8;
			return (0);
		} else if (page_load > 0 && arc_reclaim_needed()) {
			/* memory is low, delay before restarting */
			ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
			return (SET_ERROR(EAGAIN));
		}
		page_load = 0;
	#endif
		return (0);
	}

	/*
	 * Give back a temporary reservation: subtract reserve from the global
	 * arc_tempreserve counter and check that the counter did not go negative.
	 */
	void
	arc_tempreserve_clear(uint64_t reserve)
	{
		/* Adding the negated amount performs the subtraction atomically. */
		atomic_add_64(&arc_tempreserve, -reserve);

		ASSERT((int64_t)arc_tempreserve >= 0);
	}

	/*
	 * Try to reserve `reserve' bytes of ARC space for dirty data in `txg'.
	 *
	 * Returns 0 on success (arc_tempreserve has been increased by reserve),
	 * ENOMEM when the request exceeds arc_c, ERESTART when too much dirty
	 * data is already cached, or whatever error arc_memory_throttle() reports.
	 */
	int
	arc_tempreserve_space(uint64_t reserve, uint64_t txg)
	{
		uint64_t anon_bytes;
		int err;

		/* Grow the target size for large requests, if growth is allowed. */
		if (reserve > arc_c/4 && !arc_no_grow) {
			arc_c = MIN(arc_c_max, reserve * 4);
			DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
		}
		if (reserve > arc_c) {
			return (SET_ERROR(ENOMEM));
		}

		/*
		 * Don't count loaned bufs as in flight dirty data to prevent long
		 * network delays from blocking transactions that are ready to be
		 * assigned to a txg.
		 */

		/* assert that it has not wrapped around */
		ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);

		anon_bytes = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
		    arc_loaned_bytes), 0);

		/*
		 * Writes will, almost always, require additional memory allocations
		 * in order to compress/encrypt/etc the data.  We therefore need to
		 * make sure that there is sufficient available memory for this.
		 */
		err = arc_memory_throttle(reserve, txg);
		if (err != 0) {
			return (err);
		}

		/*
		 * Throttle writes when the amount of dirty data in the cache
		 * gets too large.  We try to keep the cache less than half full
		 * of dirty blocks so that our sync times don't grow too large.
		 * Note: if two requests come in concurrently, we might let them
		 * both succeed, when one of them should fail.  Not a huge deal.
		 */
		if (!(reserve + arc_tempreserve + anon_bytes > arc_c / 2 &&
		    anon_bytes > arc_c / 4)) {
			atomic_add_64(&arc_tempreserve, reserve);
			return (0);
		}

		uint64_t meta_esize =
		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
		uint64_t data_esize =
		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve >> 10, meta_esize >> 10,
		    data_esize >> 10, reserve >> 10, arc_c >> 10);
		return (SET_ERROR(ERESTART));
	}

	/*
	 * Copy the accounting of one ARC state into three named kstats.
	 *
	 *	state		ARC state to sample
	 *	size		receives the state's total size (arcs_size)
	 *	evict_data	receives the evictable size for ARC_BUFC_DATA
	 *	evict_metadata	receives the evictable size for ARC_BUFC_METADATA
	 */
	static void
	arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
	    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
	{
		size->value.ui64 = refcount_count(&state->arcs_size);
		evict_data->value.ui64 =
		    refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
		evict_metadata->value.ui64 =
		    refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
	}

	/*
	 * kstat update callback for the ARC statistics (installed as ks_update by
	 * arc_init()).  Writes are refused with EACCES; on a read the size and
	 * evictable counters of the anon, mru, mru_ghost, mfu and mfu_ghost states
	 * are refreshed.  Returns 0 on a read.
	 */
	static int
	arc_kstat_update(kstat_t *ksp, int rw)
	{
		arc_stats_t *stats = ksp->ks_data;

		if (rw == KSTAT_WRITE)
			return (EACCES);

		arc_kstat_update_state(arc_anon, &stats->arcstat_anon_size,
		    &stats->arcstat_anon_evictable_data,
		    &stats->arcstat_anon_evictable_metadata);
		arc_kstat_update_state(arc_mru, &stats->arcstat_mru_size,
		    &stats->arcstat_mru_evictable_data,
		    &stats->arcstat_mru_evictable_metadata);
		arc_kstat_update_state(arc_mru_ghost, &stats->arcstat_mru_ghost_size,
		    &stats->arcstat_mru_ghost_evictable_data,
		    &stats->arcstat_mru_ghost_evictable_metadata);
		arc_kstat_update_state(arc_mfu, &stats->arcstat_mfu_size,
		    &stats->arcstat_mfu_evictable_data,
		    &stats->arcstat_mfu_evictable_metadata);
		arc_kstat_update_state(arc_mfu_ghost, &stats->arcstat_mfu_ghost_size,
		    &stats->arcstat_mfu_ghost_evictable_data,
		    &stats->arcstat_mfu_ghost_evictable_metadata);

		return (0);
	}

	/*
	 * This function must return indices evenly distributed between all
	 * sublists of the multilist. This is needed due to how the ARC eviction
	 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
	 * distributed between all sublists and uses this assumption when
	 * deciding which sublist to evict from and how much to evict from it.
	 *
	 *	ml	multilist the header is (or will be) linked on
	 *	obj	the arc_buf_hdr_t, passed as an opaque pointer
	 *
	 * Returns the header's hash value modulo the multilist's sublist count.
	 */
	unsigned int
	arc_state_multilist_index_func(multilist_t *ml, void *obj)
	{
		arc_buf_hdr_t *hdr = obj;

		/*
		 * We rely on b_dva to generate evenly distributed index
		 * numbers using buf_hash below. So, as an added precaution,
		 * let's make sure we never add empty buffers to the arc lists.
		 */
		ASSERT(!HDR_EMPTY(hdr));

		/*
		 * The assumption here is that the hash value for a given
		 * arc_buf_hdr_t will remain constant throughout its lifetime
		 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
		 * Thus, we don't need to store the header's sublist index
		 * on insertion, as this index can be recalculated on removal.
		 *
		 * Also, the low order bits of the hash value are thought to be
		 * distributed evenly. Otherwise, in the case that the multilist
		 * has a power of two number of sublists, each sublist's usage
		 * would not be evenly distributed.
		 */
		return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
		    multilist_get_num_sublists(ml));
	}

	#ifdef _KERNEL
	/* Tag of the vm_lowmem handler registered by arc_init(); NULL if unset. */
	static eventhandler_tag arc_event_lowmem = NULL;

	/*
	 * vm_lowmem event handler: wake the ARC reclaim thread.  Both arguments
	 * are unused.  Only the pageout process waits for the reclaim to make
	 * progress; every other caller returns right after signalling.
	 */
	static void
	arc_lowmem(void *arg __unused, int howto __unused)
	{

		mutex_enter(&arc_reclaim_lock);
		DTRACE_PROBE1(arc__needfree, int64_t,
		    ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE);
		cv_signal(&arc_reclaim_thread_cv);

		/*
		 * It is unsafe to block here in arbitrary threads, because we can come
		 * here from ARC itself and may hold ARC locks and thus risk a deadlock
		 * with ARC reclaim thread.
		 */
		if (curproc == pageproc) {
			(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
		}
		mutex_exit(&arc_reclaim_lock);
	}
	#endif

	/*
	 * Set up the global ARC state objects: point the arc_* state pointers at
	 * their backing storage, create the metadata and data multilists for
	 * every state except arc_anon, and create the per-type evictable-size and
	 * total-size refcounts for all six states.
	 */
	static void
	arc_state_init(void)
	{
	/* Create the metadata list, then the data list, of one state. */
	#define	ARC_STATE_CREATE_LISTS(st)					\
		do {								\
			(st)->arcs_list[ARC_BUFC_METADATA] =			\
			    multilist_create(sizeof (arc_buf_hdr_t),		\
			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),	\
			    arc_state_multilist_index_func);			\
			(st)->arcs_list[ARC_BUFC_DATA] =			\
			    multilist_create(sizeof (arc_buf_hdr_t),		\
			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),	\
			    arc_state_multilist_index_func);			\
		} while (0)

	/* Create the metadata, then the data, evictable-size refcount. */
	#define	ARC_STATE_CREATE_ESIZE(st)					\
		do {								\
			refcount_create(&(st)->arcs_esize[ARC_BUFC_METADATA]);	\
			refcount_create(&(st)->arcs_esize[ARC_BUFC_DATA]);	\
		} while (0)

		arc_anon = &ARC_anon;
		arc_mru = &ARC_mru;
		arc_mru_ghost = &ARC_mru_ghost;
		arc_mfu = &ARC_mfu;
		arc_mfu_ghost = &ARC_mfu_ghost;
		arc_l2c_only = &ARC_l2c_only;

		/* arc_anon gets no multilists. */
		ARC_STATE_CREATE_LISTS(arc_mru);
		ARC_STATE_CREATE_LISTS(arc_mru_ghost);
		ARC_STATE_CREATE_LISTS(arc_mfu);
		ARC_STATE_CREATE_LISTS(arc_mfu_ghost);
		ARC_STATE_CREATE_LISTS(arc_l2c_only);

		ARC_STATE_CREATE_ESIZE(arc_anon);
		ARC_STATE_CREATE_ESIZE(arc_mru);
		ARC_STATE_CREATE_ESIZE(arc_mru_ghost);
		ARC_STATE_CREATE_ESIZE(arc_mfu);
		ARC_STATE_CREATE_ESIZE(arc_mfu_ghost);
		ARC_STATE_CREATE_ESIZE(arc_l2c_only);

		refcount_create(&arc_anon->arcs_size);
		refcount_create(&arc_mru->arcs_size);
		refcount_create(&arc_mru_ghost->arcs_size);
		refcount_create(&arc_mfu->arcs_size);
		refcount_create(&arc_mfu_ghost->arcs_size);
		refcount_create(&arc_l2c_only->arcs_size);

	#undef	ARC_STATE_CREATE_ESIZE
	#undef	ARC_STATE_CREATE_LISTS
	}

	/*
	 * Tear down everything arc_state_init() created: the evictable-size and
	 * total-size refcounts of all six states, and the metadata and data
	 * multilists of every state that has them (all but arc_anon).
	 */
	static void
	arc_state_fini(void)
	{
		refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
		refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
		refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
		refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
		refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
		refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
		refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);

		refcount_destroy(&arc_anon->arcs_size);
		refcount_destroy(&arc_mru->arcs_size);
		refcount_destroy(&arc_mru_ghost->arcs_size);
		refcount_destroy(&arc_mfu->arcs_size);
		refcount_destroy(&arc_mfu_ghost->arcs_size);
		refcount_destroy(&arc_l2c_only->arcs_size);

		multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
		multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
		multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
		multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
		multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
		multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
		multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
		multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
		/*
		 * arc_state_init() also creates multilists for arc_l2c_only;
		 * destroy them too so they are not leaked.
		 */
		multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
		multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
	}

	/*
	 * Accessor for the ARC's maximum target size: returns the current value
	 * of arc_c_max.
	 */
	uint64_t
	arc_max_bytes(void)
	{
		uint64_t max_bytes = arc_c_max;

		return (max_bytes);
	}

	void
	arc_init(void)
	{
	int i, prefetch_tunable_set = 0;

	/*
	* allmem is "all memory that we could possibly use".
	*/
	#ifdef illumos
	#ifdef _KERNEL
	uint64_t allmem = ptob(physmem - swapfs_minfree);
	#else
	uint64_t allmem = (physmem * PAGESIZE) / 2;
	#endif
	#else
	uint64_t allmem = kmem_size();
	#endif


	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
	arc_c_min = MAX(allmem / 32, arc_abs_min);
	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
	if (allmem >= 1 << 30)
	arc_c_max = allmem - (1 << 30);
	else
	arc_c_max = arc_c_min;
	arc_c_max = MAX(allmem * 5 / 8, arc_c_max);

	/*
	* In userland, there's only the memory pressure that we artificially
	* create (see arc_available_memory()). Don't let arc_c get too
	* small, because it can cause transactions to be larger than
	* arc_c, causing arc_tempreserve_space() to fail.
	*/
	#ifndef _KERNEL
	arc_c_min = arc_c_max / 2;
	#endif

	#ifdef _KERNEL
	/*
	* Allow the tunables to override our calculations if they are
	* reasonable.
	*/
	if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
	arc_c_max = zfs_arc_max;
	arc_c_min = MIN(arc_c_min, arc_c_max);
	}
	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
	arc_c_min = zfs_arc_min;
	#endif

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);
	arc_size = 0;

	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;

	#ifdef _KERNEL
	/*
	* Metadata is stored in the kernel's heap. Don't let us
	* use more than half the heap for the ARC.
	*/
	arc_meta_limit = MIN(arc_meta_limit,
	vmem_size(heap_arena, VMEM_ALLOC \| VMEM_FREE) / 2);
	#endif

	/* Allow the tunable to override if it is reasonable */
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
	arc_meta_limit = zfs_arc_meta_limit;

	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
	arc_c_min = arc_meta_limit / 2;

	if (zfs_arc_meta_min > 0) {
	arc_meta_min = zfs_arc_meta_min;
	} else {
	arc_meta_min = arc_c_min / 2;
	}

	if (zfs_arc_grow_retry > 0)
	arc_grow_retry = zfs_arc_grow_retry;

	if (zfs_arc_shrink_shift > 0)
	arc_shrink_shift = zfs_arc_shrink_shift;

	if (zfs_arc_no_grow_shift > 0)
	arc_no_grow_shift = zfs_arc_no_grow_shift;
	/*
	* Ensure that arc_no_grow_shift is less than arc_shrink_shift.
	*/
	if (arc_no_grow_shift >= arc_shrink_shift)
	arc_no_grow_shift = arc_shrink_shift - 1;

	if (zfs_arc_p_min_shift > 0)
	arc_p_min_shift = zfs_arc_p_min_shift;

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
	arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
	arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;
	zfs_arc_max = arc_c_max;

	arc_state_init();
	buf_init();

	arc_reclaim_thread_exit = B_FALSE;
	arc_dnlc_evicts_thread_exit = FALSE;

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	if (arc_ksp != NULL) {
	arc_ksp->ks_data = &arc_stats;
	arc_ksp->ks_update = arc_kstat_update;
	kstat_install(arc_ksp);
	}

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	TS_RUN, minclsyspri);

	#ifdef _KERNEL
	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
	EVENTHANDLER_PRI_FIRST);
	#endif

	(void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
	TS_RUN, minclsyspri);

	arc_dead = B_FALSE;
	arc_warm = B_FALSE;

	/*
	* Calculate maximum amount of dirty data per pool.
	*
	* If it has been set by /etc/system, take that.
	* Otherwise, use a percentage of physical memory defined by
	* zfs_dirty_data_max_percent (default 10%) with a cap at
	* zfs_dirty_data_max_max (default 4GB).
	*/
	if (zfs_dirty_data_max == 0) {
	zfs_dirty_data_max = ptob(physmem) *
	zfs_dirty_data_max_percent / 100;
	zfs_dirty_data_max = MIN(zfs_dirty_data_max,
	zfs_dirty_data_max_max);
	}

	#ifdef _KERNEL
	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
	prefetch_tunable_set = 1;

	#ifdef __i386__
	if (prefetch_tunable_set == 0) {
	printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
	"-- to enable,\n");
	printf(" add \"vfs.zfs.prefetch_disable=0\" "
	"to /boot/loader.conf.\n");
	zfs_prefetch_disable = 1;
	}
	#else
	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
	prefetch_tunable_set == 0) {
	printf("ZFS NOTICE: Prefetch is disabled by default if less "
	"than 4GB of RAM is present;\n"
	" to enable, add \"vfs.zfs.prefetch_disable=0\" "
	"to /boot/loader.conf.\n");
	zfs_prefetch_disable = 1;
	}
	#endif
	/* Warn about ZFS memory and address space requirements. */
	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
	printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
	"expect unstable behavior.\n");
	}
	if (allmem < 512 * (1 << 20)) {
	printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
	"expect unstable behavior.\n");
	printf(" Consider tuning vm.kmem_size and "
	"vm.kmem_size_max\n");
	printf(" in /boot/loader.conf.\n");
	}
	#endif
	}

	/*
	* Shut down the ARC.
	*
	* In order: deregister the low-memory event handler (kernel builds only),
	* ask the reclaim thread to exit and wait for it, flush all buffers, ask
	* the DNLC evicts thread to exit and wait for it, mark the ARC dead, and
	* finally destroy the kstat, the locks and condition variables, and call
	* arc_state_fini() and buf_fini().
	*/
	void
	arc_fini(void)
	{
	#ifdef _KERNEL
	if (arc_event_lowmem != NULL)
	EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
	#endif

	mutex_enter(&arc_reclaim_lock);
	arc_reclaim_thread_exit = B_TRUE;
	/*
	* The reclaim thread will set arc_reclaim_thread_exit back to
	* B_FALSE when it is finished exiting; we're waiting for that.
	*/
	while (arc_reclaim_thread_exit) {
	cv_signal(&arc_reclaim_thread_cv);
	cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
	}
	mutex_exit(&arc_reclaim_lock);

	/* Use B_TRUE to ensure all buffers are evicted */
	arc_flush(NULL, B_TRUE);

	mutex_enter(&arc_dnlc_evicts_lock);
	arc_dnlc_evicts_thread_exit = TRUE;
	/*
	* The DNLC evicts thread will set arc_dnlc_evicts_thread_exit
	* to FALSE when it is finished exiting; we're waiting for that.
	*/
	while (arc_dnlc_evicts_thread_exit) {
	cv_signal(&arc_dnlc_evicts_cv);
	cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
	}
	mutex_exit(&arc_dnlc_evicts_lock);

	arc_dead = B_TRUE;

	/* arc_ksp is NULL if kstat_create() failed during initialization. */
	if (arc_ksp != NULL) {
	kstat_delete(arc_ksp);
	arc_ksp = NULL;
	}

	/* Both helper threads have exited, so their locks and cvs can go. */
	mutex_destroy(&arc_reclaim_lock);
	cv_destroy(&arc_reclaim_thread_cv);
	cv_destroy(&arc_reclaim_waiters_cv);

	mutex_destroy(&arc_dnlc_evicts_lock);
	cv_destroy(&arc_dnlc_evicts_cv);

	arc_state_fini();
	buf_fini();

	/* No loaned bytes may remain outstanding at this point. */
	ASSERT0(arc_loaned_bytes);
	}

	/*
	* Level 2 ARC
	*
	* The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
	* It uses dedicated storage devices to hold cached data, which are populated
	* using large infrequent writes. The main role of this cache is to boost
	* the performance of random read workloads. The intended L2ARC devices
	* include short-stroked disks, solid state disks, and other media with
	* substantially faster read latency than disk.
	*
	*             +-----------------------+
	*             |         ARC           |
	*             +-----------------------+
	*                |         ^     ^
	*                |         |     |
	*      l2arc_feed_thread() arc_read()
	*                |         |     |
	*                |  l2arc read   |
	*                V         |     |
	*               +---------------+ |
	*               |     L2ARC     | |
	*               +---------------+ |
	*                   |    ^        |
	*          l2arc_write() |        |
	*                   |    |        |
	*                   V    |        |
	*                 +-------+      +-------+
	*                 | vdev  |      | vdev  |
	*                 | cache |      | cache |
	*                 +-------+      +-------+
	*                 +=========+     .-----.
	*                 : L2ARC   :    |-_____-|
	*                 : devices :    | Disks |
	*                 +=========+    `-_____-'
	*
	* Read requests are satisfied from the following sources, in order:
	*
	* 1) ARC
	* 2) vdev cache of L2ARC devices
	* 3) L2ARC devices
	* 4) vdev cache of disks
	* 5) disks
	*
	* Some L2ARC device types exhibit extremely slow write performance.
	* To accommodate for this there are some significant differences between
	* the L2ARC and traditional cache design:
	*
	* 1. There is no eviction path from the ARC to the L2ARC. Evictions from
	* the ARC behave as usual, freeing buffers and placing headers on ghost
	* lists. The ARC does not send buffers to the L2ARC during eviction as
	* this would add inflated write latencies for all ARC memory pressure.
	*
	* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
	* It does this by periodically scanning buffers from the eviction-end of
	* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
	* not already there. It scans until a headroom of buffers is satisfied,
	* which itself is a buffer for ARC eviction. If a compressible buffer is
	* found during scanning and selected for writing to an L2ARC device, we
	* temporarily boost scanning headroom during the next scan cycle to make
	* sure we adapt to compression effects (which might significantly reduce
	* the data volume we write to L2ARC). The thread that does this is
	* l2arc_feed_thread(), illustrated below; example sizes are included to
	* provide a better sense of ratio than this diagram:
	*
	*            head -->                        tail
	*             +---------------------+----------+
	*     ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
	*             +---------------------+----------+   |   o L2ARC eligible
	*     ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
	*             +---------------------+----------+   |
	*                  15.9 Gbytes      ^ 32 Mbytes    |
	*                                headroom          |
	*                                           l2arc_feed_thread()
	*                                                  |
	*                      l2arc write hand <--[oooo]--'
	*                              |           8 Mbyte
	*                              |          write max
	*                              V
	*               +==============================+
	*     L2ARC dev |####|#|###|###|    |####| ... |
	*               +==============================+
	*                          32 Gbytes
	*
	* 3. If an ARC buffer is copied to the L2ARC but then hit instead of
	* evicted, then the L2ARC has cached a buffer much sooner than it probably
	* needed to, potentially wasting L2ARC device bandwidth and storage. It is
	* safe to say that this is an uncommon case, since buffers at the end of
	* the ARC lists have moved there due to inactivity.
	*
	* 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
	* then the L2ARC simply misses copying some buffers. This serves as a
	* pressure valve to prevent heavy read workloads from both stalling the ARC
	* with waits and clogging the L2ARC with writes. This also helps prevent
	* the potential for the L2ARC to churn if it attempts to cache content too
	* quickly, such as during backups of the entire pool.
	*
	* 5. After system boot and before the ARC has filled main memory, there are
	* no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
	* lists can remain mostly static. Instead of searching from tail of these
	* lists as pictured, the l2arc_feed_thread() will search from the list heads
	* for eligible buffers, greatly increasing its chance of finding them.
	*
	* The L2ARC device write speed is also boosted during this time so that
	* the L2ARC warms up faster. Since there have been no ARC evictions yet,
	* there are no L2ARC reads, and no fear of degrading read performance
	* through increased writes.
	*
	* 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
	* the vdev queue can aggregate them into larger and fewer writes. Each
	* device is written to in a rotor fashion, sweeping writes through
	* available space then repeating.
	*
	* 7. The L2ARC does not store dirty content. It never needs to flush
	* write buffers back to disk based storage.
	*
	* 8. If an ARC buffer is written (and dirtied) which also exists in the
	* L2ARC, the now stale L2ARC buffer is immediately dropped.
	*
	* The performance of the L2ARC can be tweaked by a number of tunables, which
	* may be necessary for different workloads:
	*
	* l2arc_write_max max write bytes per interval
	* l2arc_write_boost extra write bytes during device warmup
	* l2arc_noprefetch skip caching prefetched buffers
	* l2arc_headroom number of max device writes to precache
	* l2arc_headroom_boost when we find compressed buffers during ARC
	* scanning, we multiply headroom by this
	* percentage factor for the next scan cycle,
	* since more compressed buffers are likely to
	* be present
	* l2arc_feed_secs seconds between L2ARC writing
	*
	* Tunables may be removed or added as future performance improvements are
	* integrated, and also may become zpool properties.
	*
	* There are three key functions that control how the L2ARC warms up:
	*
	* l2arc_write_eligible() check if a buffer is eligible to cache
	* l2arc_write_size() calculate how much to write
	* l2arc_write_interval() calculate sleep delay between writes
	*
	* These three functions determine what to write, how much, and how quickly
	* to send writes.
	*/

	/*
	 * Decide whether a header may be written to the L2ARC for the spa
	 * identified by spa_guid.
	 *
	 * The checks run in a fixed order and only the first one that fails
	 * bumps its statistic.  A header is rejected when it:
	 *   1. belongs to a different spa,
	 *   2. is already cached on the L2ARC,
	 *   3. has an I/O in progress (it may be an incomplete read), or
	 *   4. is flagged not eligible (zfs property).
	 *
	 * Returns B_TRUE only when every check passes.
	 */
	static boolean_t
	l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
	{
		boolean_t eligible = B_FALSE;

		if (hdr->b_spa != spa_guid) {
			ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
		} else if (HDR_HAS_L2HDR(hdr)) {
			ARCSTAT_BUMP(arcstat_l2_write_in_l2);
		} else if (HDR_IO_IN_PROGRESS(hdr)) {
			ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
		} else if (!HDR_L2CACHE(hdr)) {
			ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
		} else {
			eligible = B_TRUE;
		}

		return (eligible);
	}

	/*
	 * Compute how many bytes the next L2ARC feed should try to write.
	 *
	 * The base amount is l2arc_write_max.  If that tunable has been set to
	 * zero it is reset to L2ARC_WRITE_SIZE (with a notice).  While the ARC
	 * is not yet warm, l2arc_write_boost is added on top.
	 */
	static uint64_t
	l2arc_write_size(void)
	{
		uint64_t bytes = l2arc_write_max;

		/* Guard against a user-altered tunable with no meaningful value. */
		if (bytes == 0) {
			cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
			    "be greater than zero, resetting it to the default (%d)",
			    L2ARC_WRITE_SIZE);
			l2arc_write_max = L2ARC_WRITE_SIZE;
			bytes = l2arc_write_max;
		}

		if (!arc_warm) {
			bytes += l2arc_write_boost;
		}

		return (bytes);
	}

	/*
	 * Compute the lbolt time at which the next L2ARC feed should run.
	 *
	 * began  - lbolt value sampled when the current feed cycle started
	 * wanted - number of bytes the cycle aimed to write
	 * wrote  - number of bytes the cycle actually wrote
	 *
	 * When l2arc_feed_again is set and more than half of the wanted bytes
	 * were written, the ARC lists are considered busy and the short
	 * l2arc_feed_min_ms interval is used; otherwise the regular
	 * l2arc_feed_secs interval applies.  The result is never earlier than
	 * the current time.
	 */
	static clock_t
	l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
	{
		clock_t delay, current, candidate;

		if (!l2arc_feed_again || wrote <= (wanted / 2)) {
			delay = hz * l2arc_feed_secs;
		} else {
			delay = (hz * l2arc_feed_min_ms) / 1000;
		}

		current = ddi_get_lbolt();
		candidate = MIN(current + delay, began + delay);

		return (MAX(current, candidate));
	}

	/*
	* Cycle through L2ARC devices. This is how L2ARC load balances.
	* If a device is returned, this also returns holding the spa config lock.
	*/
	static l2arc_dev_t *
	l2arc_dev_get_next(void)
	{
	l2arc_dev_t first, next = NULL;

	/*
	* Lock out the removal of spas (spa_namespace_lock), then removal
	* of cache devices (l2arc_dev_mtx). Once a device has been selected,
	* both locks will be dropped and a spa config lock held instead.
	*/
	mutex_enter(&spa_namespace_lock);
	mutex_enter(&l2arc_dev_mtx);

	/* if there are no vdevs, there is nothing to do */
	if (l2arc_ndev == 0)
	goto out;

	first = NULL;
	next = l2arc_dev_last;
	do {
	/* loop around the list looking for a non-faulted vdev */
	if (next == NULL) {
	next = list_head(l2arc_dev_list);
	} else {
	next = list_next(l2arc_dev_list, next);
	if (next == NULL)
	next = list_head(l2arc_dev_list);
	}

	/* if we have come back to the start, bail out */
	if (first == NULL)
	first = next;
	else if (next == first)
	break;

	} while (vdev_is_dead(next->l2ad_vdev));

	/* if we were unable to find any usable vdevs, return NULL */
	if (vdev_is_dead(next->l2ad_vdev))
	next = NULL;

	l2arc_dev_last = next;

	out:
	mutex_exit(&l2arc_dev_mtx);

	/*
	* Grab the config lock to prevent the 'next' device from being
	* removed while we are writing to it.
	*/
	if (next != NULL)
	spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
	mutex_exit(&spa_namespace_lock);

	return (next);
	}

	/*
	* Free buffers that were tagged for destruction.
	*/
	static void
	l2arc_do_free_on_write()
	{
	list_t *buflist;
	l2arc_data_free_t df, df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	for (df = list_tail(buflist); df; df = df_prev) {
	df_prev = list_prev(buflist, df);
	ASSERT3P(df->l2df_abd, !=, NULL);
	abd_free(df->l2df_abd);
	list_remove(buflist, df);
	kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
	}

	/*
	 * A write to a cache device has completed.  Update all headers to allow
	 * reads from these buffers to begin.
	 *
	 * zio->io_private carries the l2arc_write_callback_t set up by
	 * l2arc_write_buffers(); it names the device and the dummy "head"
	 * header that marks where this batch begins on the device's buflist.
	 * Every header before the head marker has ARC_FLAG_L2_WRITING cleared;
	 * if the write failed, its L2ARC entry is dropped as well.  The head
	 * marker and the callback structure are freed before returning.
	 */
	static void
	l2arc_write_done(zio_t *zio)
	{
		l2arc_write_callback_t *cb;
		l2arc_dev_t *dev;
		list_t *buflist;
		arc_buf_hdr_t *head, *hdr, *hdr_prev;
		kmutex_t *hash_lock;
		int64_t bytes_dropped = 0;

		cb = zio->io_private;
		ASSERT3P(cb, !=, NULL);
		dev = cb->l2wcb_dev;
		ASSERT3P(dev, !=, NULL);
		head = cb->l2wcb_head;
		ASSERT3P(head, !=, NULL);
		buflist = &dev->l2ad_buflist;
		ASSERT3P(buflist, !=, NULL);
		DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
		    l2arc_write_callback_t *, cb);

		if (zio->io_error != 0)
			ARCSTAT_BUMP(arcstat_l2_writes_error);

		/*
		 * All writes completed, or an error was hit.
		 */
	top:
		mutex_enter(&dev->l2ad_mtx);
		for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
			hdr_prev = list_prev(buflist, hdr);

			hash_lock = HDR_LOCK(hdr);

			/*
			 * We cannot use mutex_enter or else we can deadlock
			 * with l2arc_write_buffers (due to swapping the order
			 * the hash lock and l2ad_mtx are taken).
			 */
			if (!mutex_tryenter(hash_lock)) {
				/*
				 * Missed the hash lock. We must retry so we
				 * don't leave the ARC_FLAG_L2_WRITING bit set.
				 */
				ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

				/*
				 * We don't want to rescan the headers we've
				 * already marked as having been written out, so
				 * we reinsert the head node so we can pick up
				 * where we left off.
				 */
				list_remove(buflist, head);
				list_insert_after(buflist, hdr, head);

				mutex_exit(&dev->l2ad_mtx);

				/*
				 * We wait for the hash lock to become available
				 * to try and prevent busy waiting, and increase
				 * the chance we'll be able to acquire the lock
				 * the next time around.
				 */
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}

			/*
			 * We could not have been moved into the arc_l2c_only
			 * state while in-flight due to our ARC_FLAG_L2_WRITING
			 * bit being set. Let's just ensure that's being enforced.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));

			if (zio->io_error != 0) {
				/*
				 * Error - drop L2ARC entry.
				 */
				list_remove(buflist, hdr);
				l2arc_trim(hdr);
				arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);

				ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
				ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

				bytes_dropped += arc_hdr_size(hdr);
				(void) refcount_remove_many(&dev->l2ad_alloc,
				    arc_hdr_size(hdr), hdr);
			}

			/*
			 * Allow ARC to begin reads and ghost list evictions to
			 * this L2ARC entry.
			 */
			arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);

			mutex_exit(hash_lock);
		}

		/* The batch is fully processed: retire the head marker. */
		atomic_inc_64(&l2arc_writes_done);
		list_remove(buflist, head);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		mutex_exit(&dev->l2ad_mtx);

		/* Give back the space of any entries dropped due to the error. */
		vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);

		l2arc_do_free_on_write();

		kmem_free(cb, sizeof (l2arc_write_callback_t));
	}

	/*
	 * A read to a cache device completed.  Validate buffer contents before
	 * handing over to the regular ARC routines.
	 *
	 * zio->io_private carries the l2arc_read_callback_t for this read.  On
	 * success (no I/O error, checksum matches, header not flagged
	 * L2_EVICTED) the zio is passed on to arc_read_done().  Otherwise the
	 * failure is counted and, when nobody is waiting on the zio, the read is
	 * reissued asynchronously against the original block pointer.  The
	 * callback structure is freed in either case.
	 */
	static void
	l2arc_read_done(zio_t *zio)
	{
		l2arc_read_callback_t *cb;
		arc_buf_hdr_t *hdr;
		kmutex_t *hash_lock;
		boolean_t valid_cksum;

		ASSERT3P(zio->io_vd, !=, NULL);
		ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

		spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

		cb = zio->io_private;
		ASSERT3P(cb, !=, NULL);
		hdr = cb->l2rcb_hdr;
		ASSERT3P(hdr, !=, NULL);

		hash_lock = HDR_LOCK(hdr);
		mutex_enter(hash_lock);
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

		/*
		 * If the data was read into a temporary buffer,
		 * move it and free the buffer.
		 */
		if (cb->l2rcb_abd != NULL) {
			ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
			if (zio->io_error == 0) {
				abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
				    arc_hdr_size(hdr));
			}

			/*
			 * The following must be done regardless of whether
			 * there was an error:
			 * - free the temporary buffer
			 * - point zio to the real ARC buffer
			 * - set zio size accordingly
			 * These are required because zio is either re-used for
			 * an I/O of the block in the case of the error
			 * or the zio is passed to arc_read_done() and it
			 * needs real data.
			 */
			abd_free(cb->l2rcb_abd);
			zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
		}

		ASSERT3P(zio->io_abd, !=, NULL);

		/*
		 * Check this survived the L2ARC journey.
		 */
		ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/

		valid_cksum = arc_cksum_is_equal(hdr, zio);
		if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
			mutex_exit(hash_lock);
			zio->io_private = hdr;
			arc_read_done(zio);
		} else {
			mutex_exit(hash_lock);
			/*
			 * Buffer didn't survive caching.  Increment stats and
			 * reissue to the original storage device.
			 */
			if (zio->io_error != 0) {
				ARCSTAT_BUMP(arcstat_l2_io_error);
			} else {
				zio->io_error = SET_ERROR(EIO);
			}
			if (!valid_cksum)
				ARCSTAT_BUMP(arcstat_l2_cksum_bad);

			/*
			 * If there's no waiter, issue an async i/o to the primary
			 * storage now.  If there *is* a waiter, the caller must
			 * issue the i/o in a context where it's OK to block.
			 */
			if (zio->io_waiter == NULL) {
				zio_t *pio = zio_unique_parent(zio);

				/* A parent, if any, must be a logical zio. */
				ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

				zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
				    hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
				    hdr, zio->io_priority, cb->l2rcb_flags,
				    &cb->l2rcb_zb));
			}
		}

		kmem_free(cb, sizeof (l2arc_read_callback_t));
	}

	/*
	 * Select and lock one sublist of the ARC list that the L2ARC should
	 * scan for list_num (0..3).  Callers loop over 0..3 to visit the lists
	 * in priority order, and that order can have a significant effect on
	 * cache performance:
	 *
	 *   0 - MFU metadata
	 *   1 - MRU metadata
	 *   2 - MFU data
	 *   3 - MRU data
	 *
	 * Returns the locked sublist.
	 */
	static multilist_sublist_t *
	l2arc_sublist_lock(int list_num)
	{
		multilist_t *target = NULL;
		unsigned int sublist_idx;

		ASSERT(list_num >= 0 && list_num <= 3);

		if (list_num == 0) {
			target = arc_mfu->arcs_list[ARC_BUFC_METADATA];
		} else if (list_num == 1) {
			target = arc_mru->arcs_list[ARC_BUFC_METADATA];
		} else if (list_num == 2) {
			target = arc_mfu->arcs_list[ARC_BUFC_DATA];
		} else if (list_num == 3) {
			target = arc_mru->arcs_list[ARC_BUFC_DATA];
		}

		/*
		 * A randomly chosen sublist is good enough: each call feeds only
		 * a little bit of data (8MB), and later calls will land on
		 * different sublists.
		 */
		sublist_idx = multilist_get_random_index(target);

		return (multilist_sublist_lock(target, sublist_idx));
	}

	/*
	 * Evict buffers from the device write hand to the distance specified in
	 * bytes.  This distance may span populated buffers, it may span nothing.
	 * This is clearing a region on the L2ARC device ready for writing.
	 * If the 'all' boolean is set, every buffer is evicted.
	 *
	 * dev      - L2ARC device whose buflist is walked (tail to head)
	 * distance - number of bytes ahead of dev->l2ad_hand to clear
	 * all      - B_TRUE to evict every header regardless of address
	 */
	static void
	l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
	{
		list_t *buflist;
		arc_buf_hdr_t *hdr, *hdr_prev;
		kmutex_t *hash_lock;
		uint64_t taddr;

		buflist = &dev->l2ad_buflist;

		if (!all && dev->l2ad_first) {
			/*
			 * This is the first sweep through the device.  There is
			 * nothing to evict.
			 */
			return;
		}

		if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
			/*
			 * When nearing the end of the device, evict to the end
			 * before the device write hand jumps to the start.
			 */
			taddr = dev->l2ad_end;
		} else {
			taddr = dev->l2ad_hand + distance;
		}
		DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
		    uint64_t, taddr, boolean_t, all);

	top:
		mutex_enter(&dev->l2ad_mtx);
		for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
			/* Fetch the predecessor first: hdr may leave the list. */
			hdr_prev = list_prev(buflist, hdr);

			hash_lock = HDR_LOCK(hdr);

			/*
			 * We cannot use mutex_enter or else we can deadlock
			 * with l2arc_write_buffers (due to swapping the order
			 * the hash lock and l2ad_mtx are taken).
			 */
			if (!mutex_tryenter(hash_lock)) {
				/*
				 * Missed the hash lock.  Retry.
				 */
				ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
				mutex_exit(&dev->l2ad_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}

			/*
			 * A header can't be on this list if it doesn't have L2 header.
			 */
			ASSERT(HDR_HAS_L2HDR(hdr));

			/* Ensure this header has finished being written. */
			ASSERT(!HDR_L2_WRITING(hdr));
			ASSERT(!HDR_L2_WRITE_HEAD(hdr));

			if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
			    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
				/*
				 * We've evicted to the target address,
				 * or the end of the device.
				 */
				mutex_exit(hash_lock);
				break;
			}

			if (!HDR_HAS_L1HDR(hdr)) {
				ASSERT(!HDR_L2_READING(hdr));
				/*
				 * This doesn't exist in the ARC.  Destroy.
				 * arc_hdr_destroy() will call list_remove()
				 * and decrement arcstat_l2_lsize.
				 */
				arc_change_state(arc_anon, hdr, hash_lock);
				arc_hdr_destroy(hdr);
			} else {
				ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
				ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
				/*
				 * Invalidate issued or about to be issued
				 * reads, since we may be about to write
				 * over this location.
				 */
				if (HDR_L2_READING(hdr)) {
					ARCSTAT_BUMP(arcstat_l2_evict_reading);
					arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
				}

				arc_hdr_l2hdr_destroy(hdr);
			}
			mutex_exit(hash_lock);
		}
		mutex_exit(&dev->l2ad_mtx);
	}

	/*
	 * Find and write ARC buffers to the L2ARC device.
	 *
	 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
	 * for reading until they have completed writing.
	 *
	 * Each of the four ARC lists is scanned for at most "headroom" bytes,
	 * where headroom is target_sz * l2arc_headroom, further scaled by
	 * l2arc_headroom_boost percent when zfs_compressed_arc_enabled is set.
	 *
	 * spa       - pool the device belongs to; only its headers are written
	 * dev       - L2ARC device to write to, starting at dev->l2ad_hand
	 * target_sz - upper bound on the allocated bytes written in this call
	 *
	 * Returns the number of bytes actually written (which may be smaller than
	 * the delta by which the device hand has changed due to alignment).
	 */
	static uint64_t
	l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
	{
		arc_buf_hdr_t *hdr, *hdr_prev, *head;
		uint64_t write_asize, write_psize, write_lsize, headroom;
		boolean_t full;
		l2arc_write_callback_t *cb;
		zio_t *pio, *wzio;
		uint64_t guid = spa_load_guid(spa);
		int try;

		ASSERT3P(dev->l2ad_vdev, !=, NULL);

		pio = NULL;
		write_lsize = write_asize = write_psize = 0;
		full = B_FALSE;
		/* Dummy header marking the start of this batch on the buflist. */
		head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
		arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);

		ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
		/*
		 * Copy buffers for L2ARC writing.
		 */
		for (try = 0; try <= 3; try++) {
			multilist_sublist_t *mls = l2arc_sublist_lock(try);
			uint64_t passed_sz = 0;

			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

			/*
			 * L2ARC fast warmup.
			 *
			 * Until the ARC is warm and starts to evict, read from the
			 * head of the ARC lists rather than the tail.
			 */
			if (arc_warm == B_FALSE)
				hdr = multilist_sublist_head(mls);
			else
				hdr = multilist_sublist_tail(mls);
			if (hdr == NULL)
				ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

			headroom = target_sz * l2arc_headroom;
			if (zfs_compressed_arc_enabled)
				headroom = (headroom * l2arc_headroom_boost) / 100;

			for (; hdr; hdr = hdr_prev) {
				kmutex_t *hash_lock;

				/* Step in the same direction the scan started. */
				if (arc_warm == B_FALSE)
					hdr_prev = multilist_sublist_next(mls, hdr);
				else
					hdr_prev = multilist_sublist_prev(mls, hdr);
				ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
				    HDR_GET_LSIZE(hdr));

				hash_lock = HDR_LOCK(hdr);
				if (!mutex_tryenter(hash_lock)) {
					ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
					/*
					 * Skip this buffer rather than waiting.
					 */
					continue;
				}

				passed_sz += HDR_GET_LSIZE(hdr);
				if (passed_sz > headroom) {
					/*
					 * Searched too far.
					 */
					mutex_exit(hash_lock);
					ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
					break;
				}

				if (!l2arc_write_eligible(guid, hdr)) {
					mutex_exit(hash_lock);
					continue;
				}

				/*
				 * We rely on the L1 portion of the header below, so
				 * it's invalid for this header to have been evicted out
				 * of the ghost cache, prior to being written out. The
				 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
				 */
				ASSERT(HDR_HAS_L1HDR(hdr));

				ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
				ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
				ASSERT3U(arc_hdr_size(hdr), >, 0);
				uint64_t psize = arc_hdr_size(hdr);
				uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
				    psize);

				if ((write_asize + asize) > target_sz) {
					full = B_TRUE;
					mutex_exit(hash_lock);
					ARCSTAT_BUMP(arcstat_l2_write_full);
					break;
				}

				if (pio == NULL) {
					/*
					 * Insert a dummy header on the buflist so
					 * l2arc_write_done() can find where the
					 * write buffers begin without searching.
					 */
					mutex_enter(&dev->l2ad_mtx);
					list_insert_head(&dev->l2ad_buflist, head);
					mutex_exit(&dev->l2ad_mtx);

					cb = kmem_alloc(
					    sizeof (l2arc_write_callback_t), KM_SLEEP);
					cb->l2wcb_dev = dev;
					cb->l2wcb_head = head;
					pio = zio_root(spa, l2arc_write_done, cb,
					    ZIO_FLAG_CANFAIL);
					ARCSTAT_BUMP(arcstat_l2_write_pios);
				}

				hdr->b_l2hdr.b_dev = dev;
				hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
				arc_hdr_set_flags(hdr,
				    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);

				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, hdr);
				mutex_exit(&dev->l2ad_mtx);

				(void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);

				/*
				 * Normally the L2ARC can use the hdr's data, but if
				 * we're sharing data between the hdr and one of its
				 * bufs, L2ARC needs its own copy of the data so that
				 * the ZIO below can't race with the buf consumer.
				 * Another case where we need to create a copy of the
				 * data is when the buffer size is not device-aligned
				 * and we need to pad the block to make it such.
				 * That also keeps the clock hand suitably aligned.
				 *
				 * To ensure that the copy will be available for the
				 * lifetime of the ZIO and be cleaned up afterwards, we
				 * add it to the l2arc_free_on_write queue.
				 */
				abd_t *to_write;
				if (!HDR_SHARED_DATA(hdr) && psize == asize) {
					to_write = hdr->b_l1hdr.b_pabd;
				} else {
					to_write = abd_alloc_for_io(asize,
					    HDR_ISTYPE_METADATA(hdr));
					abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
					if (asize != psize) {
						abd_zero_off(to_write, psize,
						    asize - psize);
					}
					l2arc_free_abd_on_write(to_write, asize,
					    arc_buf_type(hdr));
				}
				wzio = zio_write_phys(pio, dev->l2ad_vdev,
				    hdr->b_l2hdr.b_daddr, asize, to_write,
				    ZIO_CHECKSUM_OFF, NULL, hdr,
				    ZIO_PRIORITY_ASYNC_WRITE,
				    ZIO_FLAG_CANFAIL, B_FALSE);

				write_lsize += HDR_GET_LSIZE(hdr);
				DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
				    zio_t *, wzio);

				write_psize += psize;
				write_asize += asize;
				dev->l2ad_hand += asize;

				mutex_exit(hash_lock);

				(void) zio_nowait(wzio);
			}

			multilist_sublist_unlock(mls);

			if (full == B_TRUE)
				break;
		}

		/* No buffers selected for writing? */
		if (pio == NULL) {
			ASSERT0(write_lsize);
			ASSERT(!HDR_HAS_L1HDR(head));
			kmem_cache_free(hdr_l2only_cache, head);
			return (0);
		}

		ASSERT3U(write_psize, <=, target_sz);
		ARCSTAT_BUMP(arcstat_l2_writes_sent);
		ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
		ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
		ARCSTAT_INCR(arcstat_l2_psize, write_psize);
		vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);

		/*
		 * Bump device hand to the device start if it is approaching the end.
		 * l2arc_evict() will already have evicted ahead for this case.
		 */
		if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
			dev->l2ad_hand = dev->l2ad_start;
			dev->l2ad_first = B_FALSE;
		}

		/* Block until every child write issued above has completed. */
		dev->l2ad_writing = B_TRUE;
		(void) zio_wait(pio);
		dev->l2ad_writing = B_FALSE;

		return (write_asize);
	}

	/*
	* This thread feeds the L2ARC at regular intervals. This is the beating
	* heart of the L2ARC.
	*
	* Each iteration sleeps until the scheduled time (or until signalled),
	* picks the next L2ARC device, evicts the region about to be overwritten,
	* writes eligible ARC buffers to the device, and schedules the next run.
	* The loop ends once l2arc_thread_exit becomes non-zero; the thread then
	* resets that flag to 0 and broadcasts on l2arc_feed_thr_cv, which is
	* what l2arc_stop() waits for.
	*/
	/* ARGSUSED */
	static void
	l2arc_feed_thread(void *unused __unused)
	{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
	CALLB_CPR_SAFE_BEGIN(&cpr);
	(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
	next - ddi_get_lbolt());
	CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
	/* Default wake-up: hz ticks from now, unless overridden below. */
	next = ddi_get_lbolt() + hz;

	/*
	* Quick check for L2ARC devices.
	*/
	mutex_enter(&l2arc_dev_mtx);
	if (l2arc_ndev == 0) {
	mutex_exit(&l2arc_dev_mtx);
	continue;
	}
	mutex_exit(&l2arc_dev_mtx);
	/* Start-of-cycle timestamp, used to compute the next interval. */
	begin = ddi_get_lbolt();

	/*
	* This selects the next l2arc device to write to, and in
	* doing so the next spa to feed from: dev->l2ad_spa. This
	* will return NULL if there are now no l2arc devices or if
	* they are all faulted.
	*
	* If a device is returned, its spa's config lock is also
	* held to prevent device removal. l2arc_dev_get_next()
	* will grab and release l2arc_dev_mtx.
	*/
	if ((dev = l2arc_dev_get_next()) == NULL)
	continue;

	spa = dev->l2ad_spa;
	ASSERT3P(spa, !=, NULL);

	/*
	* If the pool is read-only then force the feed thread to
	* sleep a little longer.
	*/
	if (!spa_writeable(spa)) {
	next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
	spa_config_exit(spa, SCL_L2ARC, dev);
	continue;
	}

	/*
	* Avoid contributing to memory pressure.
	*/
	if (arc_reclaim_needed()) {
	ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
	spa_config_exit(spa, SCL_L2ARC, dev);
	continue;
	}

	ARCSTAT_BUMP(arcstat_l2_feeds);

	size = l2arc_write_size();

	/*
	* Evict L2ARC buffers that will be overwritten.
	*/
	l2arc_evict(dev, size, B_FALSE);

	/*
	* Write ARC buffers.
	*/
	wrote = l2arc_write_buffers(spa, dev, size);

	/*
	* Calculate interval between writes.
	*/
	next = l2arc_write_interval(begin, size, wrote);
	/* Release the config lock taken by l2arc_dev_get_next(). */
	spa_config_exit(spa, SCL_L2ARC, dev);
	}

	/*
	* Acknowledge the exit request: l2arc_stop() waits on
	* l2arc_feed_thr_cv until l2arc_thread_exit is 0 again.
	*/
	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
	thread_exit();
	}

	/*
	 * Report whether vd is currently registered as an L2ARC device, i.e.
	 * whether some entry on l2arc_dev_list refers to it.  The list is
	 * searched under l2arc_dev_mtx.
	 */
	boolean_t
	l2arc_vdev_present(vdev_t *vd)
	{
		l2arc_dev_t *cursor;

		mutex_enter(&l2arc_dev_mtx);
		cursor = list_head(l2arc_dev_list);
		while (cursor != NULL && cursor->l2ad_vdev != vd) {
			cursor = list_next(l2arc_dev_list, cursor);
		}
		mutex_exit(&l2arc_dev_mtx);

		/* A non-NULL cursor means the walk stopped on a match. */
		return (cursor != NULL);
	}

	/*
	 * Add a vdev for use by the L2ARC.  By this point the spa has already
	 * validated the vdev and opened it.
	 *
	 * spa - pool that owns the cache device
	 * vd  - the cache vdev; it must not already be registered
	 *
	 * Allocates and initializes an l2arc_dev_t covering the region from
	 * VDEV_LABEL_START_SIZE to VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd)
	 * and inserts it at the head of l2arc_dev_list.
	 */
	void
	l2arc_add_vdev(spa_t *spa, vdev_t *vd)
	{
		l2arc_dev_t *adddev;

		ASSERT(!l2arc_vdev_present(vd));

		vdev_ashift_optimize(vd);

		/*
		 * Create a new l2arc device entry.
		 */
		adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
		adddev->l2ad_spa = spa;
		adddev->l2ad_vdev = vd;
		adddev->l2ad_start = VDEV_LABEL_START_SIZE;
		adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
		adddev->l2ad_hand = adddev->l2ad_start;
		adddev->l2ad_first = B_TRUE;
		adddev->l2ad_writing = B_FALSE;

		mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
		/*
		 * This is a list of all ARC buffers that are still valid on the
		 * device.
		 */
		list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
		    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

		vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
		refcount_create(&adddev->l2ad_alloc);

		/*
		 * Add device to global list
		 */
		mutex_enter(&l2arc_dev_mtx);
		list_insert_head(l2arc_dev_list, adddev);
		atomic_inc_64(&l2arc_ndev);
		mutex_exit(&l2arc_dev_mtx);
	}

	/*
	 * Remove a vdev from the L2ARC.
	 *
	 * Looks up the l2arc_dev_t that refers to vd, unlinks it from
	 * l2arc_dev_list, evicts every buffer recorded for it, and frees the
	 * device entry.  The vdev is expected to be registered (asserted).
	 */
	void
	l2arc_remove_vdev(vdev_t *vd)
	{
		l2arc_dev_t *dev, *nextdev, *remdev = NULL;

		/*
		 * Find the device by vdev
		 */
		mutex_enter(&l2arc_dev_mtx);
		for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
			nextdev = list_next(l2arc_dev_list, dev);
			if (vd == dev->l2ad_vdev) {
				remdev = dev;
				break;
			}
		}
		ASSERT3P(remdev, !=, NULL);

		/*
		 * Remove device from global list
		 */
		list_remove(l2arc_dev_list, remdev);
		l2arc_dev_last = NULL;		/* may have been invalidated */
		atomic_dec_64(&l2arc_ndev);
		mutex_exit(&l2arc_dev_mtx);

		/*
		 * Clear all buflists and ARC references.  L2ARC device flush.
		 */
		l2arc_evict(remdev, 0, B_TRUE);
		list_destroy(&remdev->l2ad_buflist);
		mutex_destroy(&remdev->l2ad_mtx);
		refcount_destroy(&remdev->l2ad_alloc);
		kmem_free(remdev, sizeof (l2arc_dev_t));
	}

	/*
	 * Initialize the L2ARC global state: reset the counters and the
	 * feed-thread exit flag, create the locks and the condition variable,
	 * and create the device list and the free-on-write list.  The feed
	 * thread itself is not started here.
	 */
	void
	l2arc_init(void)
	{
		/* Counters and control flag. */
		l2arc_writes_done = 0;
		l2arc_writes_sent = 0;
		l2arc_ndev = 0;
		l2arc_thread_exit = 0;

		/* Feed thread synchronization. */
		mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);

		/* Device list and its lock. */
		mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
		l2arc_dev_list = &L2ARC_dev_list;
		list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
		    offsetof(l2arc_dev_t, l2ad_node));

		/* Free-on-write list and its lock. */
		mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
		l2arc_free_on_write = &L2ARC_free_on_write;
		list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
		    offsetof(l2arc_data_free_t, l2df_list_node));
	}

	/*
	 * Tear down the L2ARC global state created by l2arc_init().
	 *
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * because of this, we can assume that all l2arc devices have already
	 * been removed when the pools themselves were removed.
	 */
	void
	l2arc_fini(void)
	{
		/* Release anything still queued; this takes the free-on-write lock. */
		l2arc_do_free_on_write();

		/* Feed thread synchronization. */
		mutex_destroy(&l2arc_feed_thr_lock);
		cv_destroy(&l2arc_feed_thr_cv);

		/* Device list and its lock. */
		mutex_destroy(&l2arc_dev_mtx);
		list_destroy(l2arc_dev_list);

		/* Free-on-write list and its lock. */
		mutex_destroy(&l2arc_free_on_write_mtx);
		list_destroy(l2arc_free_on_write);
	}

	/*
	 * Launch the L2ARC feed thread. Nothing is started unless the global
	 * spa mode includes FWRITE.
	 */
	void
	l2arc_start(void)
	{
		if ((spa_mode_global & FWRITE) != 0) {
			(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0,
			    &p0, TS_RUN, minclsyspri);
		}
	}

	/*
	* Ask the L2ARC feed thread to exit and wait until it has done so.
	* Mirrors l2arc_start(): a no-op unless the global spa mode includes
	* FWRITE.
	*/
	void
	l2arc_stop(void)
	{
	if (!(spa_mode_global & FWRITE))
	return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
	l2arc_thread_exit = 1;
	/*
	* Wait for l2arc_thread_exit to be reset to 0 -- presumably by the
	* feed thread on its way out (that code is not visible here).
	*/
	while (l2arc_thread_exit != 0)
	cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c (revision 332525)
	@@ -1,596 +1,606 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright (c) 2017 Datto Inc.
	*/

	#include <sys/bpobj.h>
	#include <sys/zfs_context.h>
	#include <sys/refcount.h>
	#include <sys/dsl_pool.h>
	#include <sys/zfeature.h>
	#include <sys/zap.h>

	/*
	* Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
	*
	* When SPA_FEATURE_EMPTY_BPOBJ is enabled, the pool-wide empty bpobj is
	* returned and the feature's count is incremented; the object is created
	* and recorded in the pool directory on first use (feature not yet
	* active). Otherwise a fresh bpobj with the requested blocksize is
	* allocated.
	*/
	uint64_t
	bpobj_alloc_empty(objset_t os, int blocksize, dmu_tx_t tx)
	{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
	if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
	/* First user: create the shared object and publish it. */
	ASSERT0(dp->dp_empty_bpobj);
	dp->dp_empty_bpobj =
	bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY(zap_add(os,
	DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
	&dp->dp_empty_bpobj, tx) == 0);
	}
	spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
	ASSERT(dp->dp_empty_bpobj != 0);
	return (dp->dp_empty_bpobj);
	} else {
	return (bpobj_alloc(os, blocksize, tx));
	}
	}

	/*
	* Drop one reference on the shared empty bpobj (counterpart of the
	* spa_feature_incr() in bpobj_alloc_empty()). When the feature is no
	* longer active, the pool directory entry is removed and the object is
	* freed.
	*/
	void
	bpobj_decr_empty(objset_t os, dmu_tx_t tx)
	{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	SPA_FEATURE_EMPTY_BPOBJ)) {
	VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_EMPTY_BPOBJ, tx));
	VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
	dp->dp_empty_bpobj = 0;
	}
	}

	/*
	* Allocate a new bpobj object in os and return its object number.
	* The bonus (header) size depends on the pool version: BPOBJ_SIZE_V0
	* before SPA_VERSION_BPOBJ_ACCOUNT, BPOBJ_SIZE_V1 before
	* SPA_VERSION_DEADLISTS, and the full bpobj_phys_t otherwise.
	*/
	uint64_t
	bpobj_alloc(objset_t os, int blocksize, dmu_tx_t tx)
	{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
	size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
	size = BPOBJ_SIZE_V1;
	else
	size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	DMU_OT_BPOBJ_HDR, size, tx));
	}

	/*
	* Free bpobj obj and, recursively, every sub-bpobj it references, as
	* well as the object holding the sub-bpobj array. obj must not be the
	* pool's shared empty bpobj. All failures are fatal (VERIFY).
	*/
	void
	bpobj_free(objset_t os, uint64_t obj, dmu_tx_t tx)
	{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	/* No sub-object array: only obj itself needs freeing. */
	if (!bpo.bpo_havesubobj \|\| bpo.bpo_phys->bpo_subobjs == 0)
	goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	/* Number of uint64_t object ids per data block of the array. */
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	/* Walk the array from the last entry to the first. */
	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
	uint64_t *objarray;
	uint64_t offset, blkoff;

	offset = i * sizeof (uint64_t);
	blkoff = P2PHASE(i, epb);

	/* Re-hold only when the walk moves into an earlier block. */
	if (dbuf == NULL \|\| dbuf->db_offset > offset) {
	if (dbuf)
	dmu_buf_rele(dbuf, FTAG);
	VERIFY3U(0, ==, dmu_buf_hold(os,
	bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
	}

	ASSERT3U(offset, >=, dbuf->db_offset);
	ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

	objarray = dbuf->db_data;
	bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
	dmu_buf_rele(dbuf, FTAG);
	dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

	out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
	}

	/*
	 * Open the bpobj stored as `object` in objset `os`, initializing *bpo.
	 *
	 * On success the bonus buffer is held (tagged with bpo), bpo_lock is
	 * initialized and 0 is returned; the caller must eventually call
	 * bpobj_close(). On failure an errno is returned and *bpo holds no
	 * resources: if the failure happened after *bpo was zeroed, bpo_object
	 * is 0, so a later bpobj_close() on it is a harmless no-op.
	 */
	int
	bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
	{
		dmu_object_info_t doi;
		int err;

		err = dmu_object_info(os, object, &doi);
		if (err)
			return (err);

		bzero(bpo, sizeof (*bpo));
		mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

		ASSERT(bpo->bpo_dbuf == NULL);
		ASSERT(bpo->bpo_phys == NULL);
		ASSERT(object != 0);
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

		err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
		if (err) {
			/*
			 * bpo_object is still 0 here, so bpobj_close() would
			 * return early and never destroy the lock. Tear it
			 * down now so a failed open leaves nothing behind.
			 */
			mutex_destroy(&bpo->bpo_lock);
			return (err);
		}

		bpo->bpo_os = os;
		bpo->bpo_object = object;
		/* Block pointers per data block. */
		bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
		/* The bonus size tells which on-disk header version this is. */
		bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
		bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
		bpo->bpo_phys = bpo->bpo_dbuf->db_data;
		return (0);
	}

	/*
	* Added by this revision: a bpobj counts as open while bpo_object is
	* nonzero (bpobj_open() sets it, bpobj_close() clears it).
	*/
	+boolean_t
	+bpobj_is_open(const bpobj_t *bpo)
	+{
	+ return (bpo->bpo_object != 0);
	+}
	+
	/*
	 * Release everything bpobj_open() (and any later bpobj_enqueue()
	 * caching) acquired for bpo and mark it closed. Calling this on a
	 * bpobj that was never opened (bpo_object == 0) does nothing.
	 */
	void
	bpobj_close(bpobj_t *bpo)
	{
		/* Lame workaround for closing a bpobj that was never opened. */
		if (bpo->bpo_object != 0) {
			dmu_buf_rele(bpo->bpo_dbuf, bpo);
			if (bpo->bpo_cached_dbuf != NULL)
				dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);

			/* Reset to the "not open" state before dropping the lock. */
			bpo->bpo_cached_dbuf = NULL;
			bpo->bpo_phys = NULL;
			bpo->bpo_dbuf = NULL;
			bpo->bpo_object = 0;

			mutex_destroy(&bpo->bpo_lock);
		}
	}

	/*
	* This hunk replaces the file-local bpobj_hasentries() with the
	* exported bpobj_is_empty(), which is its logical negation: true when
	* there are no block pointers and no (or zero) sub-objects.
	*/
	-static boolean_t
	-bpobj_hasentries(bpobj_t *bpo)
	+boolean_t
	+bpobj_is_empty(bpobj_t *bpo)
	{
	- return (bpo->bpo_phys->bpo_num_blkptrs != 0 \|\|
	- (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
	+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	+ (!bpo->bpo_havesubobj \|\| bpo->bpo_phys->bpo_num_subobjs == 0));
	}

	/*
	* Common worker for bpobj_iterate() and bpobj_iterate_nofree().
	*
	* Calls func(arg, bp, tx) on every block pointer stored directly in bpo,
	* last entry first, and then recurses into each sub-bpobj, again last
	* entry first. Iteration stops at the first nonzero return from func or
	* at the first error, and that value is returned.
	*
	* When free is true, every entry that was visited successfully is
	* removed: the space accounting in bpo_phys is decremented, the tail of
	* the backing object is freed, and fully processed sub-bpobjs are freed.
	*
	* bpo_lock is held for the duration of the call.
	*/
	static int
	bpobj_iterate_impl(bpobj_t bpo, bpobj_itor_t func, void arg, dmu_tx_t *tx,
	boolean_t free)
	{
	dmu_object_info_t doi;
	int epb;
	int64_t i;
	int err = 0;
	dmu_buf_t *dbuf = NULL;

	+ ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	- if (!bpobj_hasentries(bpo))
	- goto out;
	-
	if (free)
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	/* Pass 1: block pointers stored directly in this bpobj. */
	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
	blkptr_t *bparray;
	blkptr_t *bp;
	uint64_t offset, blkoff;

	offset = i * sizeof (blkptr_t);
	blkoff = P2PHASE(i, bpo->bpo_epb);

	/* Re-hold only when the walk moves into an earlier block. */
	if (dbuf == NULL \|\| dbuf->db_offset > offset) {
	if (dbuf)
	dmu_buf_rele(dbuf, FTAG);
	err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
	FTAG, &dbuf, 0);
	if (err)
	break;
	}

	ASSERT3U(offset, >=, dbuf->db_offset);
	ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

	bparray = dbuf->db_data;
	bp = &bparray[blkoff];
	err = func(arg, bp, tx);
	if (err)
	break;
	if (free) {
	bpo->bpo_phys->bpo_bytes -=
	bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
	if (bpo->bpo_havecomp) {
	bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
	bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
	}
	bpo->bpo_phys->bpo_num_blkptrs--;
	ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
	}
	}
	if (dbuf) {
	dmu_buf_rele(dbuf, FTAG);
	dbuf = NULL;
	}
	if (free) {
	/*
	* i is -1 if the loop ran to completion, otherwise the index of
	* the entry that was not consumed; free everything after it.
	*/
	VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
	(i + 1) * sizeof (blkptr_t), -1ULL, tx));
	}
	if (err \|\| !bpo->bpo_havesubobj \|\| bpo->bpo_phys->bpo_subobjs == 0)
	goto out;

	/* Pass 2: recurse into the sub-bpobjs. */
	ASSERT(bpo->bpo_havecomp);
	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
	if (err) {
	/* Note: this early return bypasses the checks under "out". */
	mutex_exit(&bpo->bpo_lock);
	return (err);
	}
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
	uint64_t *objarray;
	uint64_t offset, blkoff;
	bpobj_t sublist;
	uint64_t used_before, comp_before, uncomp_before;
	uint64_t used_after, comp_after, uncomp_after;

	offset = i * sizeof (uint64_t);
	blkoff = P2PHASE(i, epb);

	if (dbuf == NULL \|\| dbuf->db_offset > offset) {
	if (dbuf)
	dmu_buf_rele(dbuf, FTAG);
	err = dmu_buf_hold(bpo->bpo_os,
	bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
	if (err)
	break;
	}

	ASSERT3U(offset, >=, dbuf->db_offset);
	ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

	objarray = dbuf->db_data;
	err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
	if (err)
	break;
	if (free) {
	/* Snapshot the sublist's totals so the delta can be applied. */
	err = bpobj_space(&sublist,
	&used_before, &comp_before, &uncomp_before);
	if (err != 0) {
	bpobj_close(&sublist);
	break;
	}
	}
	err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
	if (free) {
	/*
	* Charge this bpobj for whatever the recursion removed, even
	* if it stopped early with an error.
	*/
	VERIFY3U(0, ==, bpobj_space(&sublist,
	&used_after, &comp_after, &uncomp_after));
	bpo->bpo_phys->bpo_bytes -= used_before - used_after;
	ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
	bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
	bpo->bpo_phys->bpo_uncomp -=
	uncomp_before - uncomp_after;
	}

	bpobj_close(&sublist);
	if (err)
	break;
	if (free) {
	err = dmu_object_free(bpo->bpo_os,
	objarray[blkoff], tx);
	if (err)
	break;
	bpo->bpo_phys->bpo_num_subobjs--;
	ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
	}
	}
	if (dbuf) {
	dmu_buf_rele(dbuf, FTAG);
	dbuf = NULL;
	}
	if (free) {
	/* Same tail-trim as in pass 1, for the sub-object array. */
	VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
	bpo->bpo_phys->bpo_subobjs,
	(i + 1) * sizeof (uint64_t), -1ULL, tx));
	}

	out:
	/* If there are no entries, there should be no bytes. */
	- if (!bpobj_hasentries(bpo)) {
	+ if (bpobj_is_empty(bpo)) {
	ASSERT0(bpo->bpo_phys->bpo_bytes);
	ASSERT0(bpo->bpo_phys->bpo_comp);
	ASSERT0(bpo->bpo_phys->bpo_uncomp);
	}

	mutex_exit(&bpo->bpo_lock);
	return (err);
	}

	/*
	* Iterate and remove the entries. If func returns nonzero, iteration
	* will stop and that entry will not be removed.
	*
	* Thin wrapper: bpobj_iterate_impl() with free == B_TRUE. Returns 0 or
	* the first nonzero value from func or from an internal failure.
	*/
	int
	bpobj_iterate(bpobj_t bpo, bpobj_itor_t func, void arg, dmu_tx_t *tx)
	{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
	}

	/*
	* Iterate the entries. If func returns nonzero, iteration will stop.
	*
	* Thin wrapper: bpobj_iterate_impl() with free == B_FALSE, so the bpobj
	* contents and accounting are left untouched.
	*/
	int
	bpobj_iterate_nofree(bpobj_t bpo, bpobj_itor_t func, void arg, dmu_tx_t *tx)
	{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
	}

	/*
	* Append the bpobj subobj to bpo's list of sub-bpobjs and add its space
	* totals to bpo's accounting.
	*
	* Special cases: if subobj is the pool's shared empty bpobj, only its
	* reference is dropped; if subobj turns out to be empty, it is freed
	* instead of being linked in. bpo itself must not be the shared empty
	* bpobj and must have the sub-object and compression fields.
	*/
	void
	bpobj_enqueue_subobj(bpobj_t bpo, uint64_t subobj, dmu_tx_t tx)
	{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;

	+ ASSERT(bpobj_is_open(bpo));
	+ ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
	bpobj_decr_empty(bpo->bpo_os, tx);
	return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	- if (!bpobj_hasentries(&subbpo)) {
	+ if (bpobj_is_empty(&subbpo)) {
	/* No point in having an empty subobj. */
	bpobj_close(&subbpo);
	bpobj_free(bpo->bpo_os, subobj, tx);
	return;
	}

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	/* Create the sub-object array lazily, on the first enqueue. */
	if (bpo->bpo_phys->bpo_subobjs == 0) {
	bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
	DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
	DMU_OT_NONE, 0, tx);
	}

	/*
	* NOTE(review): doi is filled in inside ASSERT0(); if assertions are
	* compiled out in non-debug builds it is never initialized, which is
	* harmless only because its sole reader is the ASSERT3U below.
	* A second doi declared further down shadows this one.
	*/
	dmu_object_info_t doi;
	ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);

	/* Append subobj's object number at the end of the array. */
	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
	bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
	sizeof (subobj), &subobj, tx);
	bpo->bpo_phys->bpo_num_subobjs++;

	/*
	* If subobj has only one block of subobjs, then move subobj's
	* subobjs to bpo's subobj list directly. This reduces
	* recursion in bpobj_iterate due to nested subobjs.
	*/
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
	dmu_object_info_t doi;

	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
	if (doi.doi_max_offset == doi.doi_data_block_size) {
	dmu_buf_t *subdb;
	uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

	VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
	0, FTAG, &subdb, 0));
	/*
	* Make sure that we are not asking dmu_write()
	* to write more data than we have in our buffer.
	*/
	VERIFY3U(subdb->db_size, >=,
	numsubsub * sizeof (subobj));
	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
	bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
	numsubsub * sizeof (subobj), subdb->db_data, tx);
	dmu_buf_rele(subdb, FTAG);
	bpo->bpo_phys->bpo_num_subobjs += numsubsub;

	/* subobj no longer owns a sub-object array. */
	dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
	subbpo.bpo_phys->bpo_subobjs = 0;
	VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
	subsubobjs, tx));
	}
	}
	/* Totals were sampled from subbpo before any of the moves above. */
	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

	bpobj_close(&subbpo);
	}

	/*
	* Append block pointer bp to bpo and add its sizes to bpo's accounting.
	*
	* A trimmed copy is stored (see below); the accounting uses the original
	* bp. The data block being written is kept held in bpo_cached_dbuf so
	* consecutive enqueues into the same block avoid a new hold. bp must not
	* be a hole and bpo must not be the pool's shared empty bpobj.
	*/
	void
	bpobj_enqueue(bpobj_t bpo, const blkptr_t bp, dmu_tx_t *tx)
	{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	+ ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
	/*
	* The bpobj will compress better without the payload.
	*
	* Note that we store EMBEDDED bp's because they have an
	* uncompressed size, which must be accounted for. An
	* alternative would be to add their size to bpo_uncomp
	* without storing the bp, but that would create additional
	* complications: bpo_uncomp would be inconsistent with the
	* set of BP's stored, and bpobj_iterate() wouldn't visit
	* all the space accounted for in the bpobj.
	*/
	bzero(&stored_bp, sizeof (stored_bp));
	stored_bp.blk_prop = bp->blk_prop;
	stored_bp.blk_birth = bp->blk_birth;
	} else if (!BP_GET_DEDUP(bp)) {
	/* The bpobj will compress better without the checksum */
	bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
	}

	/* We never need the fill count. */
	stored_bp.blk_fill = 0;

	mutex_enter(&bpo->bpo_lock);

	/* The new entry goes right after the current last one. */
	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	/* Swap the cached hold if offset falls outside the cached block. */
	if (bpo->bpo_cached_dbuf == NULL \|\|
	offset < bpo->bpo_cached_dbuf->db_offset \|\|
	offset >= bpo->bpo_cached_dbuf->db_offset +
	bpo->bpo_cached_dbuf->db_size) {
	if (bpo->bpo_cached_dbuf)
	dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
	offset, bpo, &bpo->bpo_cached_dbuf, 0));
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	/* Update the header (bonus buffer) counters. */
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	bpo->bpo_phys->bpo_bytes +=
	bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
	bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
	bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
	}
	mutex_exit(&bpo->bpo_lock);
	}

	/*
	* Callback state for space_range_cb(): the birth-txg window to match
	* and the running totals of the matching block pointers.
	*/
	struct space_range_arg {
	spa_t *spa; /* pool, used for the dsize lookups */
	uint64_t mintxg; /* exclusive lower bound on blk_birth */
	uint64_t maxtxg; /* inclusive upper bound on blk_birth */
	uint64_t used; /* accumulated bp_get_dsize*() */
	uint64_t comp; /* accumulated BP_GET_PSIZE() */
	uint64_t uncomp; /* accumulated BP_GET_UCSIZE() */
	};

	/*
	* bpobj iteration callback: if bp was born in (mintxg, maxtxg], add its
	* sizes to the totals in the space_range_arg passed as arg. Always
	* returns 0, so iteration never stops early. tx is unused.
	*/
	/* ARGSUSED */
	static int
	space_range_cb(void arg, const blkptr_t bp, dmu_tx_t *tx)
	{
	struct space_range_arg *sra = arg;

	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
	/* Pick the dsize variant matching the calling context. */
	if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
	sra->used += bp_get_dsize_sync(sra->spa, bp);
	else
	sra->used += bp_get_dsize(sra->spa, bp);
	sra->comp += BP_GET_PSIZE(bp);
	sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
	}

	/*
	* Report the total space referenced by bpo through usedp, compp and
	* uncompp. If the bpobj header carries the compressed/uncompressed
	* counters they are read directly; otherwise the values are computed by
	* iterating every entry via bpobj_space_range(). Returns 0 or the error
	* from that iteration.
	*/
	int
	bpobj_space(bpobj_t bpo, uint64_t usedp, uint64_t compp, uint64_t uncompp)
	{
	+ ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
	*compp = bpo->bpo_phys->bpo_comp;
	*uncompp = bpo->bpo_phys->bpo_uncomp;
	mutex_exit(&bpo->bpo_lock);
	return (0);
	} else {
	/*
	* The lock is dropped first because the iteration takes it
	* again. bpobj_space_range() only calls back into this function
	* when bpo_havecomp is set, so the two cannot recurse forever.
	*/
	mutex_exit(&bpo->bpo_lock);
	return (bpobj_space_range(bpo, 0, UINT64_MAX,
	usedp, compp, uncompp));
	}
	}

	/*
	* Return the amount of space in the bpobj which is:
	* mintxg < blk_birth <= maxtxg
	*
	* The totals are stored through usedp, compp and uncompp even when the
	* iteration fails; the return value is 0 or that error.
	*/
	int
	bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
	uint64_t usedp, uint64_t compp, uint64_t *uncompp)
	{
	struct space_range_arg sra = { 0 };
	int err;
	+
	+ ASSERT(bpobj_is_open(bpo));

	/*
	* As an optimization, if they want the whole txg range, just
	* get bpo_bytes rather than iterating over the bps.
	*/
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
	return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	/* Read-only walk; no transaction is needed. */
	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 332525)
	@@ -1,3623 +1,3761 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/dmu.h>
	#include <sys/dmu_send.h>
	#include <sys/dmu_impl.h>
	#include <sys/dbuf.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dmu_tx.h>
	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu_zfetch.h>
	#include <sys/sa.h>
	#include <sys/sa_impl.h>
	#include <sys/zfeature.h>
	#include <sys/blkptr.h>
	#include <sys/range_tree.h>
	#include <sys/callb.h>
	#include <sys/abd.h>
	+#include <sys/vdev.h>

	/*
	* Thread-specific-data key; dbuf_evict_one() sets it while the calling
	* thread is evicting from the dbuf cache and clears it afterwards.
	*/
	uint_t zfs_dbuf_evict_key;

	/* Forward declarations for helpers defined later in this file. */
	static boolean_t dbuf_undirty(dmu_buf_impl_t db, dmu_tx_t tx);
	static void dbuf_write(dbuf_dirty_record_t dr, arc_buf_t data, dmu_tx_t *tx);

	#ifndef __lint
	extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
	dmu_buf_evict_func_t *evict_func_sync,
	dmu_buf_evict_func_t *evict_func_async,
	dmu_buf_t **clear_on_evict_dbufp);
	#endif /* ! __lint */

	/*
	* Global data structures and functions for the dbuf cache.
	*/
	static kmem_cache_t *dbuf_kmem_cache;
	/* Taskq on which dbuf_evict_user() runs asynchronous evict callbacks. */
	static taskq_t *dbu_evict_taskq;

	/*
	* Eviction thread state: dbuf_evict_lock protects the wait on
	* dbuf_evict_cv, and dbuf_evict_thread_exit asks the thread to stop.
	*/
	static kthread_t *dbuf_cache_evict_thread;
	static kmutex_t dbuf_evict_lock;
	static kcondvar_t dbuf_evict_cv;
	static boolean_t dbuf_evict_thread_exit;

	/*
	* LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
	* are not currently held but have been recently released. These dbufs
	* are not eligible for arc eviction until they are aged out of the cache.
	* Dbufs are added to the dbuf cache once the last hold is released. If a
	* dbuf is later accessed and still exists in the dbuf cache, then it will
	* be removed from the cache and later re-added to the head of the cache.
	* Dbufs that are aged out of the cache will be immediately destroyed and
	* become eligible for arc eviction.
	*/
	static multilist_t *dbuf_cache;
	static refcount_t dbuf_cache_size;
	uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;

	/* Cap the size of the dbuf cache to log2 fraction of arc size. */
	int dbuf_cache_max_shift = 5;

	/*
	* The dbuf cache uses a three-stage eviction policy:
	* - A low water marker designates when the dbuf eviction thread
	* should stop evicting from the dbuf cache.
	* - When we reach the maximum size (aka mid water mark), we
	* signal the eviction thread to run.
	* - The high water mark indicates when the eviction thread
	* is unable to keep up with the incoming load and eviction must
	* happen in the context of the calling thread.
	*
	* The dbuf cache:
	*                                                 (max size)
	*                                      low water   mid water   hi water
	* +----------------------------------------+----------+----------+
	* |                                        |          |          |
	* |                                        |          |          |
	* |                                        |          |          |
	* |                                        |          |          |
	* +----------------------------------------+----------+----------+
	*                                        stop        signal     evict
	*                                        evicting    eviction   directly
	*                                                    thread
	*
	* The high and low water marks indicate the operating range for the eviction
	* thread. The low water mark is, by default, 90% of the total size of the
	* cache and the high water mark is at 110% (both of these percentages can be
	* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
	* respectively). The eviction thread will try to ensure that the cache remains
	* within this range by waking up every second and checking if the cache is
	* above the low water mark. The thread can also be woken up by callers adding
	* elements into the cache if the cache is larger than the mid water (i.e max
	* cache size). Once the eviction thread is woken up and eviction is required,
	* it will continue evicting buffers until it's able to reduce the cache size
	* to the low water mark. If the cache size continues to grow and hits the high
	* water mark, then callers adding elements to the cache will begin to evict
	* directly from the cache until the cache is no longer above the high water
	* mark.
	*/

	/*
	* The percentage above and below the maximum cache size.
	* With the defaults of 10, the high water mark is
	* dbuf_cache_max_bytes + 10% and the low water mark is
	* dbuf_cache_max_bytes - 10% (see dbuf_cache_above_hiwater() and
	* dbuf_cache_above_lowater()).
	*/
	uint_t dbuf_cache_hiwater_pct = 10;
	uint_t dbuf_cache_lowater_pct = 10;

	/* Export the dbuf cache tunables under the vfs.zfs sysctl tree. */
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
	&dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_max_shift, CTLFLAG_RDTUN,
	&dbuf_cache_max_shift, 0, "dbuf size as log2 fraction of ARC");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
	&dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
	&dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size");

	/*
	* Object constructor for dmu_buf_impl_t (presumably registered with
	* dbuf_kmem_cache; the registration is not visible here). Zeroes the
	* structure and initializes its mutex, condvar, cache link and hold
	* refcount. Always returns 0; unused and kmflag are ignored.
	*/
	/* ARGSUSED */
	static int
	dbuf_cons(void vdb, void unused, int kmflag)
	{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	multilist_link_init(&db->db_cache_link);
	refcount_create(&db->db_holds);

	return (0);
	}

	/*
	* Object destructor matching dbuf_cons(): destroys the mutex, condvar
	* and hold refcount. The dbuf must no longer be linked into the dbuf
	* cache multilist.
	*/
	/* ARGSUSED */
	static void
	dbuf_dest(void vdb, void unused)
	{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	ASSERT(!multilist_link_active(&db->db_cache_link));
	refcount_destroy(&db->db_holds);
	}

	/*
	* dbuf hash table routines
	*/
	static dbuf_hash_table_t dbuf_hash_table;

	static uint64_t dbuf_hash_count;

	/*
	 * Hash the (objset, object, level, blkid) tuple identifying a dbuf.
	 *
	 * Six bytes of the key are fed through the CRC64 table one at a time
	 * and the remaining high-order bits are XORed into the result.
	 */
	static uint64_t
	dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
	{
		uintptr_t osv = (uintptr_t)os;
		/* Values whose low byte is mixed into the CRC, in order. */
		const uint64_t feed[] = {
			lvl,
			osv >> 6,
			obj >> 0,
			obj >> 8,
			blkid >> 0,
			blkid >> 8
		};
		uint64_t crc = -1ULL;
		int k;

		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
		for (k = 0; k < (int)(sizeof (feed) / sizeof (feed[0])); k++)
			crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ feed[k]) & 0xFF];

		crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

		return (crc);
	}

	/*
	* True when dbuf is identified by exactly (os, obj, level, blkid).
	* Note that dbuf is expanded four times, so it must have no side
	* effects.
	*/
	#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
	((dbuf)->db.db_object == (obj) && \
	(dbuf)->db_objset == (os) && \
	(dbuf)->db_level == (level) && \
	(dbuf)->db_blkid == (blkid))

	/*
	* Look up the dbuf for (os, obj, level, blkid) in the hash table.
	*
	* Returns the dbuf with its db_mtx HELD (the caller must release it),
	* or NULL if there is no matching dbuf that is not being evicted.
	*/
	dmu_buf_impl_t *
	dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
	{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = dbuf_hash(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
	if (DBUF_EQUAL(db, os, obj, level, blkid)) {
	mutex_enter(&db->db_mtx);
	if (db->db_state != DB_EVICTING) {
	/* Found: drop the bucket lock but keep db_mtx. */
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (db);
	}
	/* Being evicted: skip it and keep scanning the chain. */
	mutex_exit(&db->db_mtx);
	}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
	}

	/*
	 * Return the bonus dbuf of `object` in `os` with its db_mtx held, or
	 * NULL if the dnode cannot be held or has no bonus dbuf.
	 */
	static dmu_buf_impl_t *
	dbuf_find_bonus(objset_t *os, uint64_t object)
	{
		dnode_t *dn;
		dmu_buf_impl_t *bonus;

		if (dnode_hold(os, object, FTAG, &dn) != 0)
			return (NULL);

		/* dn_bonus is sampled under the dnode's struct rwlock. */
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		bonus = dn->dn_bonus;
		if (bonus != NULL)
			mutex_enter(&bonus->db_mtx);
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);

		return (bonus);
	}

	/*
	* Insert an entry into the hash table. If there is already an element
	* equal to elem in the hash table, then the already existing element
	* will be returned and the new element will not be inserted.
	* Otherwise returns NULL.
	*
	* Locking: in both cases a db_mtx is left HELD on return -- the existing
	* dbuf's when one is returned, or db's own when db was inserted.
	*/
	static dmu_buf_impl_t *
	dbuf_hash_insert(dmu_buf_impl_t *db)
	{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = dbuf_hash(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
	if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
	mutex_enter(&dbf->db_mtx);
	if (dbf->db_state != DB_EVICTING) {
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (dbf);
	}
	/* An evicting duplicate does not block the insert. */
	mutex_exit(&dbf->db_mtx);
	}
	}

	/* Link db at the head of the bucket chain. */
	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
	}

	/*
	 * Remove an entry from the hash table. It must be in the EVICTING state.
	 *
	 * The caller must not hold db->db_mtx and db must have no holds. db is
	 * expected to be present in its bucket; walking off the end of the
	 * chain is a bug and is caught by an ASSERT.
	 */
	static void
	dbuf_hash_remove(dmu_buf_impl_t *db)
	{
		dbuf_hash_table_t *h = &dbuf_hash_table;
		uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
		    db->db_level, db->db_blkid);
		uint64_t idx = hv & h->hash_table_mask;
		dmu_buf_impl_t *dbf, **dbp;

		/*
		 * We mustn't hold db_mtx to maintain lock ordering:
		 * DBUF_HASH_MUTEX > db_mtx.
		 */
		ASSERT(refcount_is_zero(&db->db_holds));
		ASSERT(db->db_state == DB_EVICTING);
		ASSERT(!MUTEX_HELD(&db->db_mtx));

		mutex_enter(DBUF_HASH_MUTEX(h, idx));
		/* dbp always points at the link that leads to the current entry. */
		dbp = &h->hash_table[idx];
		while ((dbf = *dbp) != db) {
			/*
			 * Verify the chain has not ended before forming a
			 * pointer into dbf, rather than after.
			 */
			ASSERT(dbf != NULL);
			dbp = &dbf->db_hash_next;
		}
		*dbp = db->db_hash_next;
		db->db_hash_next = NULL;
		mutex_exit(DBUF_HASH_MUTEX(h, idx));
		atomic_dec_64(&dbuf_hash_count);
	}

	/*
	* Selects which hold-count invariant dbuf_verify_user() asserts:
	* the one valid while the user data is being evicted, or the one valid
	* at any other time.
	*/
	typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
	} dbvu_verify_type_t;

	/*
	* Debug-only consistency check for a dbuf that has user data attached.
	* Compiles to nothing unless ZFS_DEBUG is defined, and returns at once
	* when db has no user. verify_type selects the hold-count invariant.
	*/
	static void
	dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
	{
	#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
	return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
	/*
	* Immediate eviction occurs when holds == dirtycnt.
	* For normal eviction buffers, holds is zero on
	* eviction, except when dbuf_fix_old_data() calls
	* dbuf_clear_data(). However, the hold count can grow
	* during eviction even though db_mtx is held (see
	* dmu_bonus_hold() for an example), so we can only
	* test the generic invariant that holds >= dirtycnt.
	*/
	ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
	/* Outside eviction an ordinary user implies at least one hold. */
	if (db->db_user_immediate_evict == TRUE)
	ASSERT3U(holds, >=, db->db_dirtycnt);
	else
	ASSERT3U(holds, >, 0);
	}
	#endif
	}

	/*
	* Detach the user data from db and run its eviction callbacks.
	* The caller must hold db->db_mtx. Does nothing if no user is attached.
	*/
	static void
	dbuf_evict_user(dmu_buf_impl_t *db)
	{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
	return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

	#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
	*dbu->dbu_clear_on_evict_dbufp = NULL;
	#endif

	/*
	* There are two eviction callbacks - one that we call synchronously
	* and one that we invoke via a taskq. The async one is useful for
	* avoiding lock order reversals and limiting stack depth.
	*
	* Note that if we have a sync callback but no async callback,
	* it's likely that the sync callback will free the structure
	* containing the dbu. In that case we need to take care to not
	* dereference dbu after calling the sync evict func.
	*/
	/* Sampled up front so dbu is not read after a freeing sync callback. */
	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);

	if (dbu->dbu_evict_func_sync != NULL)
	dbu->dbu_evict_func_sync(dbu);

	if (has_async) {
	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
	dbu, 0, &dbu->dbu_tqent);
	}
	}

	/*
	 * Report whether db holds metadata: every indirect block (level > 0)
	 * does, and a level-0 block does when its dnode's object type is a
	 * metadata type.
	 */
	boolean_t
	dbuf_is_metadata(dmu_buf_impl_t *db)
	{
		boolean_t result = B_TRUE;

		if (db->db_level <= 0) {
			/* The dnode may only be inspected between ENTER and EXIT. */
			DB_DNODE_ENTER(db);
			result = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
			DB_DNODE_EXIT(db);
		}

		return (result);
	}

	/*
	* This function must return indices evenly distributed between all
	* sublists of the multilist. This is needed due to how the dbuf eviction
	* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
	* distributed between all sublists and uses this assumption when
	* deciding which sublist to evict from and how much to evict from it.
	*
	* Returns dbuf_hash() of the dbuf's identity modulo the number of
	* sublists in ml.
	*/
	unsigned int
	dbuf_cache_multilist_index_func(multilist_t ml, void obj)
	{
	dmu_buf_impl_t *db = obj;

	/*
	* The assumption here, is the hash value for a given
	* dmu_buf_impl_t will remain constant throughout its lifetime
	* (i.e. its objset, object, level and blkid fields don't change).
	* Thus, we don't need to store the dbuf's sublist index
	* on insertion, as this index can be recalculated on removal.
	*
	* Also, the low order bits of the hash value are thought to be
	* distributed evenly. Otherwise, in the case that the multilist
	* has a power of two number of sublists, each sublist's usage
	* would not be evenly distributed.
	*/
	return (dbuf_hash(db->db_objset, db->db.db_object,
	db->db_level, db->db_blkid) %
	multilist_get_num_sublists(ml));
	}

	static inline boolean_t
	dbuf_cache_above_hiwater(void)
	{
	uint64_t dbuf_cache_hiwater_bytes =
	(dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;

	return (refcount_count(&dbuf_cache_size) >
	dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
	}

	static inline boolean_t
	dbuf_cache_above_lowater(void)
	{
	uint64_t dbuf_cache_lowater_bytes =
	(dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;

	return (refcount_count(&dbuf_cache_size) >
	dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
	}

	/*
	* Evict the oldest eligible dbuf from the dbuf cache.
	*
	* A sublist is chosen at random and scanned from its tail for the first
	* dbuf whose db_mtx can be taken without blocking; that dbuf is removed
	* from the cache, its size is subtracted from dbuf_cache_size, and it is
	* destroyed. If no dbuf on the sublist can be locked, nothing is
	* evicted. Must be called without dbuf_evict_lock held.
	*/
	static void
	dbuf_evict_one(void)
	{
	int idx = multilist_get_random_index(dbuf_cache);
	multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);

	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

	/*
	* Set the thread's tsd to indicate that it's processing evictions.
	* Once a thread stops evicting from the dbuf cache it will
	* reset its tsd to NULL.
	*/
	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);

	/* Only try-lock: the walk never blocks on a busy dbuf. */
	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
	db = multilist_sublist_prev(mls, db);
	}

	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
	multilist_sublist_t *, mls);

	if (db != NULL) {
	/* The sublist lock is dropped before the dbuf is destroyed. */
	multilist_sublist_remove(mls, db);
	multilist_sublist_unlock(mls);
	(void) refcount_remove_many(&dbuf_cache_size,
	db->db.db_size, db);
	dbuf_destroy(db);
	} else {
	multilist_sublist_unlock(mls);
	}
	(void) tsd_set(zfs_dbuf_evict_key, NULL);
	}

	/*
	* The dbuf evict thread is responsible for aging out dbufs from the
	* cache. Once the cache has reached its maximum size, dbufs are removed
	* and destroyed. The eviction thread will continue running until the size
	* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
	* out of the cache it is destroyed and becomes eligible for arc eviction.
	*
	* The thread loops until dbuf_evict_thread_exit is set. On the way out it
	* clears that flag and broadcasts on dbuf_evict_cv so that a waiter
	* watching the flag can observe the thread's exit.
	*/
	/* ARGSUSED */
	static void
	dbuf_evict_thread(void *unused __unused)
	{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dbuf_evict_lock);
	while (!dbuf_evict_thread_exit) {
	/*
	* Sleep (in at most one-second intervals) while the cache is at
	* or below the low water mark and no exit has been requested.
	*/
	while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
	CALLB_CPR_SAFE_BEGIN(&cpr);
	(void) cv_timedwait_hires(&dbuf_evict_cv,
	&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
	CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
	}
	mutex_exit(&dbuf_evict_lock);

	/*
	* Keep evicting as long as we're above the low water mark
	* for the cache. We do this without holding the locks to
	* minimize lock contention.
	*/
	while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
	dbuf_evict_one();
	}

	/* Re-take the lock before re-testing the exit flag. */
	mutex_enter(&dbuf_evict_lock);
	}

	/* Acknowledge the exit request and wake anyone waiting on it. */
	dbuf_evict_thread_exit = B_FALSE;
	cv_broadcast(&dbuf_evict_cv);
	CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
	thread_exit();
	}

	/*
	 * Wake up the dbuf eviction thread if the dbuf cache is above its max
	 * size. If the dbuf cache is also above its high water mark, evict one
	 * dbuf from the cache in the caller's context first.
	 */
	static void
	dbuf_evict_notify(void)
	{
		/*
		 * We use thread specific data to track when a thread has
		 * started processing evictions. This allows us to avoid deeply
		 * nested stacks that would have a call flow similar to this:
		 *
		 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
		 *	^						|
		 *	|						|
		 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
		 *
		 * A thread sets its tsd only while it is inside the eviction
		 * path, so a non-NULL value here means we were reached from an
		 * eviction already in progress on this thread; bail out so that
		 * such a thread evicts only one dbuf.
		 */
		if (tsd_get(zfs_dbuf_evict_key) != NULL)
			return;

		/*
		 * The size check is made without holding dbuf_evict_lock:
		 * an occasional wrong decision is harmless, whereas taking the
		 * lock here results in massive lock contention.
		 */
		if (refcount_count(&dbuf_cache_size) <= dbuf_cache_max_bytes)
			return;

		if (dbuf_cache_above_hiwater())
			dbuf_evict_one();

		cv_signal(&dbuf_evict_cv);
	}

	/*
	* Set up the global dbuf state: the dbuf hash table and its mutexes, the
	* dmu_buf_impl_t kmem cache, the dbuf cache multilist and its size
	* refcount, the user-eviction taskq, and the dbuf eviction thread together
	* with its lock, condition variable and tsd key.
	*/
	void
	dbuf_init(void)
	{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	* The hash table is big enough to fill all of physical memory
	* with an average 4K block size. The table will take up
	* totalmem * sizeof (void *) / 4K (i.e. 2MB/GB with 8-byte pointers).
	*/
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
	hsize <<= 1;

	/*
	* The allocation is attempted with KM_NOSLEEP; on failure the table
	* size is halved and the allocation retried.
	*/
	retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
	/* XXX - we should really return an error instead of assert */
	ASSERT(hsize > (1ULL << 10));
	hsize >>= 1;
	goto retry;
	}

	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
	sizeof (dmu_buf_impl_t),
	0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
	mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	* Setup the parameters for the dbuf cache. We cap the size of the
	* dbuf cache to 1/32nd (default) of the size of the ARC.
	*/
	dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
	arc_max_bytes() >> dbuf_cache_max_shift);

	/*
	* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	* configuration is not required.
	*/
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);

	dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
	offsetof(dmu_buf_impl_t, db_cache_link),
	dbuf_cache_multilist_index_func);
	refcount_create(&dbuf_cache_size);

	/*
	* The tsd key, exit flag, lock and cv are all initialized before the
	* eviction thread that uses them is created.
	*/
	tsd_create(&zfs_dbuf_evict_key, NULL);
	dbuf_evict_thread_exit = B_FALSE;
	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
	NULL, 0, &p0, TS_RUN, minclsyspri);
	}

	/*
	* Tear down the global dbuf state: destroy the hash table and its
	* mutexes, the kmem cache and the eviction taskq, stop the eviction
	* thread, then destroy its synchronization objects and the dbuf cache.
	*/
	void
	dbuf_fini(void)
	{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
	mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_kmem_cache);
	taskq_destroy(dbu_evict_taskq);

	/*
	* Request that the eviction thread exit, then wait until the flag is
	* cleared again (dbuf_evict_thread() resets it to B_FALSE and
	* broadcasts on dbuf_evict_cv just before exiting).
	*/
	mutex_enter(&dbuf_evict_lock);
	dbuf_evict_thread_exit = B_TRUE;
	while (dbuf_evict_thread_exit) {
	cv_signal(&dbuf_evict_cv);
	cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
	}
	mutex_exit(&dbuf_evict_lock);
	tsd_destroy(&zfs_dbuf_evict_key);

	mutex_destroy(&dbuf_evict_lock);
	cv_destroy(&dbuf_evict_cv);

	refcount_destroy(&dbuf_cache_size);
	multilist_destroy(dbuf_cache);
	}

	/*
	* Other stuff.
	*/

	#ifdef ZFS_DEBUG
	/*
	* Debug-only consistency check of a dbuf against its dnode, parent and
	* dirty records. The caller must hold db->db_mtx. The checks run only
	* when ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags, and every check is an
	* ASSERT.
	*
	* NOTE(review): the "\|\|" sequences in this listing look like "||" that
	* was escaped by whatever rendered the file -- confirm against the real
	* source before compiling.
	*/
	static void
	dbuf_verify(dmu_buf_impl_t *db)
	{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
	return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* The dbuf's identity must agree with the dnode it belongs to. */
	if (dn == NULL) {
	ASSERT(db->db_parent == NULL);
	ASSERT(db->db_blkptr == NULL);
	} else {
	ASSERT3U(db->db.db_object, ==, dn->dn_object);
	ASSERT3P(db->db_objset, ==, dn->dn_objset);
	ASSERT3U(db->db_level, <, dn->dn_nlevels);
	ASSERT(db->db_blkid == DMU_BONUS_BLKID \|\|
	db->db_blkid == DMU_SPILL_BLKID \|\|
	!avl_is_empty(&dn->dn_dbufs));
	}
	/* Offset/size expectations differ for bonus, spill and regular blocks. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
	ASSERT(dn != NULL);
	ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
	ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
	ASSERT(dn != NULL);
	ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
	ASSERT0(db->db.db_offset);
	} else {
	ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record reachable from this dbuf must point back at it. */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
	ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
	ASSERT(dr->dr_dbuf == db);

	/*
	* We can't assert that db_size matches dn_datablksz because it
	* can be momentarily different when another thread is doing
	* dnode_set_blksz().
	*/
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
	dr = db->db_data_pending;
	/*
	* It should only be modified in syncing context, so
	* make sure we only have one copy of the data.
	*/
	ASSERT(dr == NULL \|\| dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
	if (db->db_parent == dn->dn_dbuf) {
	/* db is pointed to by the dnode */
	/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
	if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
	ASSERT(db->db_parent == NULL);
	else
	ASSERT(db->db_parent != NULL);
	if (db->db_blkid != DMU_SPILL_BLKID)
	ASSERT3P(db->db_blkptr, ==,
	&dn->dn_phys->dn_blkptr[db->db_blkid]);
	} else {
	/* db is pointed to by an indirect block */
	/* epb: number of block pointers held by the parent block */
	int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
	ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
	ASSERT3U(db->db_parent->db.db_object, ==,
	db->db.db_object);
	/*
	* dnode_grow_indblksz() can make this fail if we don't
	* have the struct_rwlock. XXX indblksz no longer
	* grows. safe to do this now?
	*/
	if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
	ASSERT3P(db->db_blkptr, ==,
	((blkptr_t *)db->db_parent->db.db_data +
	db->db_blkid % epb));
	}
	}
	}
	/*
	* NOTE(review): dn is dereferenced below (dn->dn_free_txg) even though
	* dn == NULL is tolerated near the top of this function -- confirm
	* that combination cannot reach this point.
	*/
	if ((db->db_blkptr == NULL \|\| BP_IS_HOLE(db->db_blkptr)) &&
	(db->db_buf == NULL \|\| db->db_buf->b_data) &&
	db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	db->db_state != DB_FILL && !dn->dn_free_txg) {
	/*
	* If the blkptr isn't set but they have nonzero data,
	* it had better be dirty, otherwise we'll lose that
	* data when we evict this buffer.
	*
	* There is an exception to this rule for indirect blocks; in
	* this case, if the indirect block is a hole, we fill in a few
	* fields on each of the child blocks (importantly, birth time)
	* to prevent hole birth times from being lost when you
	* partially fill in a hole.
	*/
	if (db->db_dirtycnt == 0) {
	if (db->db_level == 0) {
	/* A clean level-0 hole must be all zeroes (checked 8 bytes at a time). */
	uint64_t *buf = db->db.db_data;
	int i;

	for (i = 0; i < db->db.db_size >> 3; i++) {
	ASSERT(buf[i] == 0);
	}
	} else {
	blkptr_t *bps = db->db.db_data;
	ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
	db->db.db_size);
	/*
	* We want to verify that all the blkptrs in the
	* indirect block are holes, but we may have
	* automatically set up a few fields for them.
	* We iterate through each blkptr and verify
	* they only have those fields set.
	*/
	for (int i = 0;
	i < db->db.db_size / sizeof (blkptr_t);
	i++) {
	blkptr_t *bp = &bps[i];
	ASSERT(ZIO_CHECKSUM_IS_ZERO(
	&bp->blk_cksum));
	ASSERT(
	DVA_IS_EMPTY(&bp->blk_dva[0]) &&
	DVA_IS_EMPTY(&bp->blk_dva[1]) &&
	DVA_IS_EMPTY(&bp->blk_dva[2]));
	ASSERT0(bp->blk_fill);
	ASSERT0(bp->blk_pad[0]);
	ASSERT0(bp->blk_pad[1]);
	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(bp->blk_phys_birth);
	}
	}
	}
	}
	DB_DNODE_EXIT(db);
	}
	#endif

	/*
	 * Drop the dbuf's data pointer. The caller holds db_mtx and has already
	 * cleared db_buf. Any user attached to the dbuf is evicted first. The
	 * state becomes DB_UNCACHED unless the dbuf is DB_NOFILL, in which case
	 * the state is left alone.
	 */
	static void
	dbuf_clear_data(dmu_buf_impl_t *db)
	{
		ASSERT(MUTEX_HELD(&db->db_mtx));

		dbuf_evict_user(db);
		ASSERT3P(db->db_buf, ==, NULL);

		db->db.db_data = NULL;

		if (db->db_state == DB_NOFILL)
			return;
		db->db_state = DB_UNCACHED;
	}

	/*
	 * Attach an ARC buffer to a dbuf: db_buf is pointed at buf and
	 * db.db_data at the buffer's data. The caller must hold db_mtx; buf and
	 * its b_data must be non-NULL.
	 */
	static void
	dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
	{
		ASSERT(MUTEX_HELD(&db->db_mtx));
		ASSERT(buf != NULL);

		db->db_buf = buf;
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
	}

	/*
	 * Loan out an arc_buf for read. Return the loaned arc_buf.
	 *
	 * If the dbuf's buffer has been released or the dbuf has more than one
	 * hold, a fresh buffer is loaned from the ARC and the dbuf's data is
	 * copied into it. Otherwise the dbuf's own buffer is loaned out and
	 * detached from the dbuf. Not valid for bonus buffers.
	 */
	arc_buf_t *
	dbuf_loan_arcbuf(dmu_buf_impl_t *db)
	{
		arc_buf_t *abuf;

		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		mutex_enter(&db->db_mtx);
		if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
			int blksz = db->db.db_size;
			spa_t *spa = db->db_objset->os_spa;

			/* db_mtx is dropped before the loan and the copy. */
			mutex_exit(&db->db_mtx);
			abuf = arc_loan_buf(spa, B_FALSE, blksz);
			bcopy(db->db.db_data, abuf->b_data, blksz);
		} else {
			abuf = db->db_buf;
			arc_loan_inuse_buf(abuf, db);
			/* dbuf_clear_data() asserts that db_buf is already NULL. */
			db->db_buf = NULL;
			dbuf_clear_data(db);
			mutex_exit(&db->db_mtx);
		}
		return (abuf);
	}

	/*
	 * Calculate which level n block references the data at the level 0 offset
	 * provided.
	 *
	 * dn:     dnode whose block-shift geometry is used
	 * level:  indirection level of the block id wanted (0 = data block)
	 * offset: byte offset within the object
	 *
	 * Returns the block id at the requested level. When either shift in the
	 * dnode is zero, the offset must fall inside the single data block and 0
	 * is returned.
	 */
	uint64_t
	dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
	{
		if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
			/*
			 * The level n blkid is equal to the level 0 blkid divided by
			 * the number of level 0s in a level n block.
			 *
			 * The level 0 blkid is offset >> datablkshift =
			 * offset / 2^datablkshift.
			 *
			 * The number of level 0s in a level n is the number of block
			 * pointers in an indirect block, raised to the power of level.
			 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
			 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
			 *
			 * Thus, the level n blkid is: offset /
			 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))))
			 * = offset / 2^(datablkshift + level *
			 *   (indblkshift - SPA_BLKPTRSHIFT))
			 * = offset >> (datablkshift + level *
			 *   (indblkshift - SPA_BLKPTRSHIFT))
			 */
			const int64_t shift = dn->dn_datablkshift + level *
			    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);

			/*
			 * Shifting a 64-bit value by 64 or more bits is undefined
			 * behaviour in C. Mathematically the quotient is 0 for
			 * every representable offset, so return that directly.
			 */
			if (shift >= (int64_t)(8 * sizeof (offset)))
				return (0);

			return (offset >> shift);
		} else {
			ASSERT3U(offset, <, dn->dn_datablksz);
			return (0);
		}
	}

	/*
	 * Read-completion callback passed to arc_read() by dbuf_read_impl().
	 *
	 * zio: the completed read; may be NULL, which is treated as success
	 * buf: the ARC buffer that was read
	 * vdb: the dmu_buf_impl_t the read was issued for
	 *
	 * Moves the dbuf out of DB_READ (to DB_CACHED on success or when the
	 * block was freed in flight, to DB_UNCACHED on error), wakes waiters on
	 * db_changed, and drops a hold via dbuf_rele_and_unlock().
	 */
	static void
	dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
	{
		dmu_buf_impl_t *db = vdb;

		mutex_enter(&db->db_mtx);
		ASSERT3U(db->db_state, ==, DB_READ);
		/*
		 * All reads are synchronous, so we must have a hold on the dbuf
		 */
		ASSERT(refcount_count(&db->db_holds) > 0);
		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		if (db->db_level == 0 && db->db_freed_in_flight) {
			/* we were freed in flight; disregard any error */
			arc_release(buf, db);
			bzero(buf->b_data, db->db.db_size);
			arc_buf_freeze(buf);
			db->db_freed_in_flight = FALSE;
			dbuf_set_data(db, buf);
			db->db_state = DB_CACHED;
		} else if (zio == NULL || zio->io_error == 0) {
			dbuf_set_data(db, buf);
			db->db_state = DB_CACHED;
		} else {
			/* The read failed: discard the buffer and go uncached. */
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			ASSERT3P(db->db_buf, ==, NULL);
			arc_buf_destroy(buf, db);
			db->db_state = DB_UNCACHED;
		}
		cv_broadcast(&db->db_changed);
		dbuf_rele_and_unlock(db, NULL);
	}

	/*
	 * Start filling an uncached dbuf.
	 *
	 * db:    the dbuf; must be held, in DB_UNCACHED state and without a
	 *        db_buf
	 * zio:   parent zio handed to arc_read()
	 * flags: DB_RF_* flags; DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL,
	 *        otherwise ZIO_FLAG_MUSTSUCCEED is used
	 *
	 * Called with db_mtx held (and the dnode's struct_rwlock held); db_mtx is
	 * dropped on every return path. Bonus buffers and holes are filled in
	 * directly and left DB_CACHED; anything else is put in DB_READ and read
	 * through the ARC, with dbuf_read_done() as the completion callback.
	 */
	static void
	dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
	{
		dnode_t *dn;
		zbookmark_phys_t zb;
		arc_flags_t aflags = ARC_FLAG_NOWAIT;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(!refcount_is_zero(&db->db_holds));
		/* We need the struct_rwlock to prevent db_blkptr from changing. */
		ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
		ASSERT(MUTEX_HELD(&db->db_mtx));
		ASSERT(db->db_state == DB_UNCACHED);
		ASSERT(db->db_buf == NULL);

		if (db->db_blkid == DMU_BONUS_BLKID) {
			/* Bonus data is copied out of the dnode_phys. */
			int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

			ASSERT3U(bonuslen, <=, db->db.db_size);
			db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
			arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
			if (bonuslen < DN_MAX_BONUSLEN)
				bzero(db->db.db_data, DN_MAX_BONUSLEN);
			if (bonuslen)
				bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
			DB_DNODE_EXIT(db);
			db->db_state = DB_CACHED;
			mutex_exit(&db->db_mtx);
			return;
		}

		/*
		 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
		 * processes the delete record and clears the bp while we are waiting
		 * for the dn_mtx (resulting in a "no" from block_freed).
		 */
		if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
		    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
		    BP_IS_HOLE(db->db_blkptr)))) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			/* Nothing to read: hand back a zero-filled buffer. */
			dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
			    db->db.db_size));
			bzero(db->db.db_data, db->db.db_size);

			/*
			 * For an indirect hole with a nonzero birth time, stamp
			 * size, type, level and birth into each child blkptr.
			 */
			if (db->db_blkptr != NULL && db->db_level > 0 &&
			    BP_IS_HOLE(db->db_blkptr) &&
			    db->db_blkptr->blk_birth != 0) {
				blkptr_t *bps = db->db.db_data;
				for (int i = 0; i < ((1 <<
				    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
					    1 << dn->dn_indblkshift);
					BP_SET_LSIZE(bp,
					    BP_GET_LEVEL(db->db_blkptr) == 1 ?
					    dn->dn_datablksz :
					    BP_GET_LSIZE(db->db_blkptr));
					BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
					BP_SET_LEVEL(bp,
					    BP_GET_LEVEL(db->db_blkptr) - 1);
					BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
				}
			}
			DB_DNODE_EXIT(db);
			db->db_state = DB_CACHED;
			mutex_exit(&db->db_mtx);
			return;
		}

		DB_DNODE_EXIT(db);

		db->db_state = DB_READ;
		mutex_exit(&db->db_mtx);

		if (DBUF_IS_L2CACHEABLE(db))
			aflags |= ARC_FLAG_L2CACHE;

		SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
		    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
		    db->db.db_object, db->db_level, db->db_blkid);

		/* Extra reference taken before the read is issued. */
		dbuf_add_ref(db, NULL);

		(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
		    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
		    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
		    &aflags, &zb);
	}

	/*
	 * This is our just-in-time copy function. It makes a copy of buffers that
	 * have been modified in a previous transaction group before we access them in
	 * the current active group.
	 *
	 * This function is used in three places: when we are dirtying a buffer for the
	 * first time in a txg, when we are freeing a range in a dnode that includes
	 * this buffer, and when we are accessing a buffer which was received compressed
	 * and later referenced in a WRITE_BYREF record.
	 *
	 * Note that when we are called from dbuf_free_range() we do not put a hold on
	 * the buffer, we just traverse the active dbuf list for the dnode.
	 *
	 * The caller must hold db_mtx; db must be a level-0 dbuf with data that
	 * does not belong to the meta-dnode object.
	 */
	static void
	dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
	{
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(MUTEX_HELD(&db->db_mtx));
		ASSERT(db->db.db_data != NULL);
		ASSERT(db->db_level == 0);
		ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

		/*
		 * Nothing to do unless the newest dirty record still shares
		 * its data with the dbuf.
		 */
		if (dr == NULL ||
		    (dr->dt.dl.dr_data !=
		    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
			return;

		/*
		 * If the last dirty record for this dbuf has not yet synced
		 * and it's referencing the dbuf data, either:
		 *	reset the reference to point to a new copy,
		 * or (if there are no active holders)
		 *	just null out the current db_data pointer.
		 */
		ASSERT(dr->dr_txg >= txg - 2);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			/* Note that the data bufs here are zio_bufs */
			dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
			arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
			bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
		} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			int size = arc_buf_size(db->db_buf);
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			spa_t *spa = db->db_objset->os_spa;
			enum zio_compress compress_type =
			    arc_get_compression(db->db_buf);

			/* The copy keeps the compression of the source buffer. */
			if (compress_type == ZIO_COMPRESS_OFF) {
				dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
			} else {
				ASSERT3U(type, ==, ARC_BUFC_DATA);
				dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
				    size, arc_buf_lsize(db->db_buf), compress_type);
			}
			bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
		} else {
			/* The dirty record keeps the buffer; the dbuf lets go. */
			db->db_buf = NULL;
			dbuf_clear_data(db);
		}
	}

	/*
	 * Make the contents of a held dbuf available.
	 *
	 * db:    the dbuf; the caller must have a hold on it
	 * zio:   optional parent zio; when NULL and a block has to be read from
	 *        disk, a root zio is created here and waited on before returning
	 * flags: DB_RF_* flags. DB_RF_HAVESTRUCT means the caller already holds
	 *        dn_struct_rwlock, DB_RF_NOPREFETCH suppresses prefetch,
	 *        DB_RF_NEVERWAIT skips waiting for a fill or read already in
	 *        progress; the flags are also passed on to dbuf_read_impl().
	 *
	 * Returns 0 on success or an error: EIO for a DB_NOFILL dbuf or when an
	 * in-flight read or fill leaves the dbuf DB_UNCACHED, otherwise whatever
	 * zio_wait() or arc_decompress() returned.
	 */
	int
	dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
	{
		int err = 0;
		boolean_t prefetch;
		dnode_t *dn;

		/*
		 * We don't have to hold the mutex to check db_state because it
		 * can't be freed while we have a hold on the buffer.
		 */
		ASSERT(!refcount_is_zero(&db->db_holds));

		if (db->db_state == DB_NOFILL)
			return (SET_ERROR(EIO));

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_enter(&dn->dn_struct_rwlock, RW_READER);

		prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
		    DBUF_IS_CACHEABLE(db);

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_CACHED) {
			/*
			 * If the arc buf is compressed, we need to decompress it to
			 * read the data. This could happen during the "zfs receive" of
			 * a stream which is compressed and deduplicated.
			 */
			if (db->db_buf != NULL &&
			    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
				dbuf_fix_old_data(db,
				    spa_syncing_txg(dmu_objset_spa(db->db_objset)));
				err = arc_decompress(db->db_buf);
				dbuf_set_data(db, db->db_buf);
			}
			mutex_exit(&db->db_mtx);
			if (prefetch)
				dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
			if ((flags & DB_RF_HAVESTRUCT) == 0)
				rw_exit(&dn->dn_struct_rwlock);
			DB_DNODE_EXIT(db);
		} else if (db->db_state == DB_UNCACHED) {
			spa_t *spa = dn->dn_objset->os_spa;
			boolean_t need_wait = B_FALSE;

			/* No parent zio and real I/O is needed: create and wait. */
			if (zio == NULL &&
			    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
				zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
				need_wait = B_TRUE;
			}
			dbuf_read_impl(db, zio, flags);

			/* dbuf_read_impl has dropped db_mtx for us */

			if (prefetch)
				dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

			if ((flags & DB_RF_HAVESTRUCT) == 0)
				rw_exit(&dn->dn_struct_rwlock);
			DB_DNODE_EXIT(db);

			if (need_wait)
				err = zio_wait(zio);
		} else {
			/*
			 * Another reader came in while the dbuf was in flight
			 * between UNCACHED and CACHED. Either a writer will finish
			 * writing the buffer (sending the dbuf to CACHED) or the
			 * first reader's request will reach the read_done callback
			 * and send the dbuf to CACHED. Otherwise, a failure
			 * occurred and the dbuf went to UNCACHED.
			 */
			mutex_exit(&db->db_mtx);
			if (prefetch)
				dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
			if ((flags & DB_RF_HAVESTRUCT) == 0)
				rw_exit(&dn->dn_struct_rwlock);
			DB_DNODE_EXIT(db);

			/* Skip the wait per the caller's request. */
			mutex_enter(&db->db_mtx);
			if ((flags & DB_RF_NEVERWAIT) == 0) {
				while (db->db_state == DB_READ ||
				    db->db_state == DB_FILL) {
					ASSERT(db->db_state == DB_READ ||
					    (flags & DB_RF_HAVESTRUCT) == 0);
					DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
					    db, zio_t *, zio);
					cv_wait(&db->db_changed, &db->db_mtx);
				}
				if (db->db_state == DB_UNCACHED)
					err = SET_ERROR(EIO);
			}
			mutex_exit(&db->db_mtx);
		}

		return (err);
	}

	/*
	 * Prepare a held, non-bonus dbuf for use without reading its contents.
	 *
	 * Waits for any read or fill in progress to finish. An uncached dbuf
	 * then gets a freshly allocated buffer and moves to DB_FILL; a
	 * DB_NOFILL dbuf has its data cleared; otherwise the dbuf must already
	 * be DB_CACHED and is left alone.
	 */
	static void
	dbuf_noread(dmu_buf_impl_t *db)
	{
		ASSERT(!refcount_is_zero(&db->db_holds));
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		mutex_enter(&db->db_mtx);
		while (db->db_state == DB_READ || db->db_state == DB_FILL)
			cv_wait(&db->db_changed, &db->db_mtx);
		if (db->db_state == DB_UNCACHED) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			spa_t *spa = db->db_objset->os_spa;

			ASSERT(db->db_buf == NULL);
			ASSERT(db->db.db_data == NULL);
			dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
			db->db_state = DB_FILL;
		} else if (db->db_state == DB_NOFILL) {
			dbuf_clear_data(db);
		} else {
			ASSERT3U(db->db_state, ==, DB_CACHED);
		}
		mutex_exit(&db->db_mtx);
	}

	/*
	 * Undo an override on a level-0 dirty record: free the overriding
	 * block (unless it is a hole or a nopwrite), reset the override state,
	 * and release the record's data buffer. Does nothing for bonus buffers
	 * or records that are not overridden. The caller must hold the dbuf's
	 * db_mtx.
	 */
	void
	dbuf_unoverride(dbuf_dirty_record_t *dr)
	{
		dmu_buf_impl_t *db = dr->dr_dbuf;
		blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
		uint64_t txg = dr->dr_txg;

		ASSERT(MUTEX_HELD(&db->db_mtx));
		/*
		 * This assert is valid because dmu_sync() expects to be called by
		 * a zilog's get_data while holding a range lock. This call only
		 * comes from dbuf_dirty() callers who must also hold a range lock.
		 */
		ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
		ASSERT(db->db_level == 0);

		if (db->db_blkid == DMU_BONUS_BLKID ||
		    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
			return;

		ASSERT(db->db_data_pending != dr);

		/* free this block */
		if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
			zio_free(db->db_objset->os_spa, txg, bp);

		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		dr->dt.dl.dr_nopwrite = B_FALSE;

		/*
		 * Release the already-written buffer, so we leave it in
		 * a consistent dirty state. Note that all callers are
		 * modifying the buffer, so they will immediately do
		 * another (redundant) arc_release(). Therefore, leave
		 * the buf thawed to save the effort of freezing &
		 * immediately re-thawing it.
		 */
		arc_release(dr->dt.dl.dr_data, db);
	}

	/*
	 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
	 * data blocks in the free range, so that any future readers will find
	 * empty blocks.
	 *
	 * dn:          dnode whose dbufs are examined
	 * start_blkid: first block id of the range (inclusive)
	 * end_blkid:   last block id of the range (inclusive); clamped to
	 *              dn_maxblkid unless the range refers to the spill block
	 * tx:          transaction in which the range is being freed
	 */
	void
	dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
	    dmu_tx_t *tx)
	{
		dmu_buf_impl_t db_search;
		dmu_buf_impl_t *db, *db_next;
		uint64_t txg = tx->tx_txg;
		avl_index_t where;

		if (end_blkid > dn->dn_maxblkid &&
		    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
			end_blkid = dn->dn_maxblkid;
		dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

		/*
		 * Build a search key (state DB_SEARCH) that is expected not to
		 * match any real dbuf; it only yields the insertion point from
		 * which the walk over the range starts.
		 */
		db_search.db_level = 0;
		db_search.db_blkid = start_blkid;
		db_search.db_state = DB_SEARCH;

		mutex_enter(&dn->dn_dbufs_mtx);
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);

		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

		for (; db != NULL; db = db_next) {
			/* Fetch the successor first: db may be destroyed below. */
			db_next = AVL_NEXT(&dn->dn_dbufs, db);
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);

			if (db->db_level != 0 || db->db_blkid > end_blkid) {
				break;
			}
			ASSERT3U(db->db_blkid, >=, start_blkid);

			/* found a level 0 buffer in the range */
			mutex_enter(&db->db_mtx);
			if (dbuf_undirty(db, tx)) {
				/* mutex has been dropped and dbuf destroyed */
				continue;
			}

			if (db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL ||
			    db->db_state == DB_EVICTING) {
				ASSERT(db->db.db_data == NULL);
				mutex_exit(&db->db_mtx);
				continue;
			}
			if (db->db_state == DB_READ || db->db_state == DB_FILL) {
				/* will be handled in dbuf_read_done or dbuf_rele */
				db->db_freed_in_flight = TRUE;
				mutex_exit(&db->db_mtx);
				continue;
			}
			if (refcount_count(&db->db_holds) == 0) {
				ASSERT(db->db_buf);
				dbuf_destroy(db);
				continue;
			}
			/* The dbuf is referenced */

			if (db->db_last_dirty != NULL) {
				dbuf_dirty_record_t *dr = db->db_last_dirty;

				if (dr->dr_txg == txg) {
					/*
					 * This buffer is "in-use", re-adjust the file
					 * size to reflect that this buffer may
					 * contain new data when we sync.
					 */
					if (db->db_blkid != DMU_SPILL_BLKID &&
					    db->db_blkid > dn->dn_maxblkid)
						dn->dn_maxblkid = db->db_blkid;
					dbuf_unoverride(dr);
				} else {
					/*
					 * This dbuf is not dirty in the open context.
					 * Either uncache it (if it's not referenced in
					 * the open context) or reset its contents to
					 * empty.
					 */
					dbuf_fix_old_data(db, txg);
				}
			}
			/* clear the contents if it's cached */
			if (db->db_state == DB_CACHED) {
				ASSERT(db->db.db_data != NULL);
				arc_release(db->db_buf, db);
				bzero(db->db.db_data, db->db.db_size);
				arc_buf_freeze(db->db_buf);
			}

			mutex_exit(&db->db_mtx);
		}
		mutex_exit(&dn->dn_dbufs_mtx);
	}

	/*
	 * Change the size of a non-bonus dbuf.
	 *
	 * db:   the dbuf to resize
	 * size: new size in bytes
	 * tx:   transaction in which the dbuf is dirtied
	 *
	 * The dbuf is dirtied, a new buffer of the requested size is allocated,
	 * the old contents are copied over (zero-padded when growing), and the
	 * old buffer is destroyed. The caller must hold dn_struct_rwlock as
	 * writer.
	 */
	void
	dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
	{
		arc_buf_t *buf, *obuf;
		int osize = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dnode_t *dn;

		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		/* XXX does this func really need the lock? */
		ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

		/*
		 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
		 * is OK, because there can be no other references to the db
		 * when we are changing its size, so no concurrent DB_FILL can
		 * be happening.
		 */
		/*
		 * XXX we should be doing a dbuf_read, checking the return
		 * value and returning that up to our callers
		 */
		dmu_buf_will_dirty(&db->db, tx);

		/* create the data buffer for the new block */
		buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);

		/* copy old block data to the new block */
		obuf = db->db_buf;
		bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
		/* zero the remainder */
		if (size > osize)
			bzero((uint8_t *)buf->b_data + osize, size - osize);

		mutex_enter(&db->db_mtx);
		dbuf_set_data(db, buf);
		arc_buf_destroy(obuf, db);
		db->db.db_size = size;

		/* A level-0 dirty record for this txg must track the new buffer. */
		if (db->db_level == 0) {
			ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
			db->db_last_dirty->dt.dl.dr_data = buf;
		}
		mutex_exit(&db->db_mtx);

		dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
		DB_DNODE_EXIT(db);
	}

	/*
	 * Release the dbuf's ARC buffer via arc_release(). The assertions
	 * require syncing context and that the objset's phys buffer (or its
	 * dataset being on the synced list) and the parent dbuf's buffer have
	 * already been released.
	 */
	void
	dbuf_release_bp(dmu_buf_impl_t *db)
	{
		objset_t *os = db->db_objset;

		ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
		ASSERT(arc_released(os->os_phys_buf) ||
		    list_link_active(&os->os_dsl_dataset->ds_synced_link));
		ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

		(void) arc_release(db->db_buf, db);
	}

	/*
	 * Handle a dbuf that is dirtied again in a TXG for which it already has
	 * a dirty record. Only level-0, non-bonus dbufs need any work: a pending
	 * override is undone and, unless the dbuf belongs to the meta-dnode
	 * object or is DB_NOFILL, its buffer is thawed. The caller must hold
	 * db_mtx.
	 */
	static void
	dbuf_redirty(dbuf_dirty_record_t *dr)
	{
		dmu_buf_impl_t *dbuf = dr->dr_dbuf;

		ASSERT(MUTEX_HELD(&dbuf->db_mtx));

		if (dbuf->db_level != 0 || dbuf->db_blkid == DMU_BONUS_BLKID)
			return;

		/*
		 * If this buffer has already been written out,
		 * we now need to reset its state.
		 */
		dbuf_unoverride(dr);

		if (dbuf->db.db_object == DMU_META_DNODE_OBJECT ||
		    dbuf->db_state == DB_NOFILL)
			return;

		/* Already released on initial dirty, so just thaw. */
		ASSERT(arc_released(dbuf->db_buf));
		arc_buf_thaw(dbuf->db_buf);
	}

	dbuf_dirty_record_t *
	dbuf_dirty(dmu_buf_impl_t db, dmu_tx_t tx)
	{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t *drp, dr;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	* Shouldn't dirty a regular buffer in syncing context. Private
	* objects may be dirtied in syncing context, but only if they
	* were already pre-dirtied in open context.
	*/
	#ifdef DEBUG
	if (dn->dn_objset->os_dsl_dataset != NULL) {
	rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
	RW_READER, FTAG);
	}
	ASSERT(!dmu_tx_is_syncing(tx) \|\|
	BP_IS_HOLE(dn->dn_objset->os_rootbp) \|\|
	DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\|
	dn->dn_objset->os_dsl_dataset == NULL);
	if (dn->dn_objset->os_dsl_dataset != NULL)
	rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
	#endif
	/*
	* We make this assert for private objects as well, but after we
	* check if we're already dirty. They are allowed to re-dirty
	* in syncing context.
	*/
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT \|\|
	dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx ==
	(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	* XXX make this true for indirects too? The problem is that
	* transactions created with dmu_tx_create_assigned() from
	* syncing context don't bother holding ahead.
	*/
	ASSERT(db->db_level != 0 \|\|
	db->db_state == DB_CACHED \|\| db->db_state == DB_FILL \|\|
	db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	* Don't set dirtyctx to SYNC if we're just modifying this as we
	* initialize the objset.
	*/
	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
	if (dn->dn_objset->os_dsl_dataset != NULL) {
	rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
	RW_READER, FTAG);
	}
	if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
	dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
	DN_DIRTY_SYNC : DN_DIRTY_OPEN);
	ASSERT(dn->dn_dirtyctx_firstset == NULL);
	dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	if (dn->dn_objset->os_dsl_dataset != NULL) {
	rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
	FTAG);
	}
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
	dn->dn_have_spill = B_TRUE;

	/*
	* If this buffer is already dirty, we're done.
	*/
	drp = &db->db_last_dirty;
	ASSERT(drp == NULL \|\| (drp)->dr_txg <= tx->tx_txg \|\|
	db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
	drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
	DB_DNODE_EXIT(db);

	dbuf_redirty(dr);
	mutex_exit(&db->db_mtx);
	return (dr);
	}

	/*
	* Only valid if not already dirty.
	*/
	ASSERT(dn->dn_object == 0 \|\|
	dn->dn_dirtyctx == DN_UNDIRTIED \|\| dn->dn_dirtyctx ==
	(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);

	/*
	* We should only be dirtying in syncing context if it's the
	* mos or we're initializing the os or it's a special object.
	* However, we are allowed to dirty in syncing context provided
	* we already dirtied it in open context. Hence we must make
	* this assertion only if we're not already dirty.
	*/
	os = dn->dn_objset;
	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
	#ifdef DEBUG
	if (dn->dn_objset->os_dsl_dataset != NULL)
	rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT(!dmu_tx_is_syncing(tx) \|\| DMU_OBJECT_IS_SPECIAL(dn->dn_object) \|\|
	os->os_dsl_dataset == NULL \|\| BP_IS_HOLE(os->os_rootbp));
	if (dn->dn_objset->os_dsl_dataset != NULL)
	rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
	#endif
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
	dmu_objset_willuse_space(os, db->db.db_size, tx);
	}

	/*
	* If this buffer is dirty in an old transaction group we need
	* to make a copy of it so that the changes we make in this
	* transaction group won't leak out when we sync the older txg.
	*/
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
	void *data_old = db->db_buf;

	if (db->db_state != DB_NOFILL) {
	if (db->db_blkid == DMU_BONUS_BLKID) {
	dbuf_fix_old_data(db, tx->tx_txg);
	data_old = db->db.db_data;
	} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
	/*
	* Release the data buffer from the cache so
	* that we can modify it without impacting
	* possible other users of this cached data
	* block. Note that indirect blocks and
	* private objects are not released until the
	* syncing state (since they are only modified
	* then).
	*/
	arc_release(db->db_buf, db);
	dbuf_fix_old_data(db, tx->tx_txg);
	data_old = db->db_buf;
	}
	ASSERT(data_old != NULL);
	}
	dr->dt.dl.dr_data = data_old;
	} else {
	mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&dr->dt.di.dr_children,
	sizeof (dbuf_dirty_record_t),
	offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
	dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	* We could have been freed_in_flight between the dbuf_noread
	* and dbuf_dirty. We win, as though the dbuf_noread() had
	* happened after the free.
	*/
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	db->db_blkid != DMU_SPILL_BLKID) {
	mutex_enter(&dn->dn_mtx);
	if (dn->dn_free_ranges[txgoff] != NULL) {
	range_tree_clear(dn->dn_free_ranges[txgoff],
	db->db_blkid, 1);
	}
	mutex_exit(&dn->dn_mtx);
	db->db_freed_in_flight = FALSE;
	}

	/*
	* This buffer is now part of this txg
	*/
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID \|\|
	db->db_blkid == DMU_SPILL_BLKID) {
	mutex_enter(&dn->dn_mtx);
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
	mutex_exit(&dn->dn_mtx);
	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
	}

	/*
	* The dn_struct_rwlock prevents db_blkptr from changing
	* due to a write from syncing context completing
	* while we are running, so we want to acquire it before
	* looking at db_blkptr.
	*/
	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	drop_struct_lock = TRUE;
	}

	/*
	* We need to hold the dn_struct_rwlock to make this assertion,
	* because it protects dn_phys / dn_next_nlevels from changing.
	*/
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) \|\|
	dn->dn_phys->dn_nlevels > db->db_level \|\|
	dn->dn_next_nlevels[txgoff] > db->db_level \|\|
	dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level \|\|
	dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	* If we are overwriting a dedup BP, then unless it is snapshotted,
	* when we get to syncing context we will need to decrement its
	* refcount in the DDT. Prefetch the relevant DDT block so that
	* syncing context won't have to wait for the i/o.
	*/
	ddt_prefetch(os->os_spa, db->db_blkptr);

	if (db->db_level == 0) {
	dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
	ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
	dmu_buf_impl_t *parent = db->db_parent;
	dbuf_dirty_record_t *di;
	int parent_held = FALSE;

	if (db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf) {
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	parent = dbuf_hold_level(dn, db->db_level+1,
	db->db_blkid >> epbs, FTAG);
	ASSERT(parent != NULL);
	parent_held = TRUE;
	}
	if (drop_struct_lock)
	rw_exit(&dn->dn_struct_rwlock);
	ASSERT3U(db->db_level+1, ==, parent->db_level);
	di = dbuf_dirty(parent, tx);
	if (parent_held)
	dbuf_rele(parent, FTAG);

	mutex_enter(&db->db_mtx);
	/*
	* Since we've dropped the mutex, it's possible that
	* dbuf_undirty() might have changed this out from under us.
	*/
	if (db->db_last_dirty == dr \|\|
	dn->dn_object == DMU_META_DNODE_OBJECT) {
	mutex_enter(&di->dt.di.dr_mtx);
	ASSERT3U(di->dr_txg, ==, tx->tx_txg);
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	list_insert_tail(&di->dt.di.dr_children, dr);
	mutex_exit(&di->dt.di.dr_mtx);
	dr->dr_parent = di;
	}
	mutex_exit(&db->db_mtx);
	} else {
	ASSERT(db->db_level+1 == dn->dn_nlevels);
	ASSERT(db->db_blkid < dn->dn_nblkptr);
	ASSERT(db->db_parent == NULL \|\| db->db_parent == dn->dn_dbuf);
	mutex_enter(&dn->dn_mtx);
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
	mutex_exit(&dn->dn_mtx);
	if (drop_struct_lock)
	rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
	}

	/*
	 * Undirty a buffer in the transaction group referenced by the given
	 * transaction.  Return whether this evicted the dbuf.
	 *
	 * The caller must hold db->db_mtx, and db must be a level-0, non-bonus
	 * dbuf.  When B_TRUE is returned the dbuf has been handed to
	 * dbuf_destroy(), which drops db_mtx and frees the dbuf, so the caller
	 * must not touch db afterwards.
	 */
	static boolean_t
	dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
	{
		dnode_t *dn;
		uint64_t txg = tx->tx_txg;
		dbuf_dirty_record_t *dr, **drp;

		ASSERT(txg != 0);

		/*
		 * Due to our use of dn_nlevels below, this can only be called
		 * in open context, unless we are operating on the MOS.
		 * From syncing context, dn_nlevels may be different from the
		 * dn_nlevels used when dbuf was dirtied.
		 */
		ASSERT(db->db_objset ==
		    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
		    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT0(db->db_level);
		ASSERT(MUTEX_HELD(&db->db_mtx));

		/*
		 * If this buffer is not dirty, we're done.
		 * The loop stops at the first record whose txg is <= the
		 * requested one; drp is left pointing at the link to it.
		 */
		for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
			if (dr->dr_txg <= txg)
				break;
		if (dr == NULL || dr->dr_txg < txg)
			return (B_FALSE);
		ASSERT(dr->dr_txg == txg);
		ASSERT(dr->dr_dbuf == db);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

		ASSERT(db->db.db_size != 0);

		dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
		    dr->dr_accounted, txg);

		/* Unlink the record from the dbuf's dirty-record chain. */
		*drp = dr->dr_next;

		/*
		 * Note that there are three places in dbuf_dirty()
		 * where this dirty record may be put on a list.
		 * Make sure to do a list_remove corresponding to
		 * every one of those list_insert calls.
		 */
		if (dr->dr_parent) {
			mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
			list_remove(&dr->dr_parent->dt.di.dr_children, dr);
			mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
		} else if (db->db_blkid == DMU_SPILL_BLKID ||
		    db->db_level + 1 == dn->dn_nlevels) {
			ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
			mutex_enter(&dn->dn_mtx);
			list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
			mutex_exit(&dn->dn_mtx);
		}
		DB_DNODE_EXIT(db);

		if (db->db_state != DB_NOFILL) {
			dbuf_unoverride(dr);

			ASSERT(db->db_buf != NULL);
			ASSERT(dr->dt.dl.dr_data != NULL);
			/* Only destroy the record's buffer if it is a private copy. */
			if (dr->dt.dl.dr_data != db->db_buf)
				arc_buf_destroy(dr->dt.dl.dr_data, db);
		}

		kmem_free(dr, sizeof (dbuf_dirty_record_t));

		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;

		/* Drop the hold that was tagged with this txg. */
		if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
			ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
			dbuf_destroy(db);
			return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Mark the given buffer dirty in tx's transaction group.
	 *
	 * If the buffer is already cached and already has a dirty record for
	 * this txg, only dbuf_redirty() is called.  Otherwise the buffer is
	 * read (dbuf_read() with DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, plus
	 * DB_RF_HAVESTRUCT when the caller write-holds dn_struct_rwlock) and
	 * then passed to dbuf_dirty().
	 */
	void
	dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

		ASSERT(tx->tx_txg != 0);
		ASSERT(!refcount_is_zero(&db->db_holds));

		/*
		 * Quick check for dirtyness.  For already dirty blocks, this
		 * reduces runtime of this function by >90%, and overall performance
		 * by 50% for some workloads (e.g. file deletion with indirect blocks
		 * cached).
		 */
		mutex_enter(&db->db_mtx);
		dbuf_dirty_record_t *dr;
		for (dr = db->db_last_dirty;
		    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
			/*
			 * It's possible that it is already dirty but not cached,
			 * because there are some calls to dbuf_dirty() that don't
			 * go through dmu_buf_will_dirty().
			 */
			if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
				/* This dbuf is already dirty and cached. */
				dbuf_redirty(dr);
				mutex_exit(&db->db_mtx);
				return;
			}
		}
		mutex_exit(&db->db_mtx);

		DB_DNODE_ENTER(db);
		if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
			rf |= DB_RF_HAVESTRUCT;
		DB_DNODE_EXIT(db);
		(void) dbuf_read(db, NULL, rf);
		(void) dbuf_dirty(db, tx);
	}

	/*
	 * Put the buffer into the DB_NOFILL state and then dirty it through
	 * dmu_buf_will_fill().
	 */
	void
	dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		db->db_state = DB_NOFILL;

		dmu_buf_will_fill(db_fake, tx);
	}

	/*
	 * Prepare a held, level-0, non-bonus buffer to be filled by the caller
	 * and dirty it in tx's transaction group.  The buffer is set up via
	 * dbuf_noread() rather than dbuf_read().
	 */
	void
	dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(tx->tx_txg != 0);
		ASSERT(db->db_level == 0);
		ASSERT(!refcount_is_zero(&db->db_holds));

		ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
		    dmu_tx_private_ok(tx));

		dbuf_noread(db);
		(void) dbuf_dirty(db, tx);
	}

	#pragma weak dmu_buf_fill_done = dbuf_fill_done
	/*
	 * Finish a fill: if the dbuf is in DB_FILL, move it to DB_CACHED and
	 * wake any waiters on db_changed.  A level-0 buffer that was freed
	 * while it was being filled is zeroed first.  tx is unused.
	 */
	/* ARGSUSED */
	void
	dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
	{
		mutex_enter(&db->db_mtx);
		DBUF_VERIFY(db);

		if (db->db_state == DB_FILL) {
			if (db->db_level == 0 && db->db_freed_in_flight) {
				ASSERT(db->db_blkid != DMU_BONUS_BLKID);
				/* we were freed while filling */
				/* XXX dbuf_undirty? */
				bzero(db->db.db_data, db->db.db_size);
				db->db_freed_in_flight = FALSE;
			}
			db->db_state = DB_CACHED;
			cv_broadcast(&db->db_changed);
		}
		mutex_exit(&db->db_mtx);
	}

	/*
	 * Dirty the given level-0, non-bonus buffer as DB_NOFILL and override
	 * its dirty record with an embedded block pointer built from data.
	 *
	 * The embedded bp is encoded with the given compression and sizes,
	 * tagged with etype, the dnode's object type, level 0 and byteorder,
	 * and its birth txg is set to the dirty record's txg.
	 */
	void
	dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
	    bp_embedded_type_t etype, enum zio_compress comp,
	    int uncompressed_size, int compressed_size, int byteorder,
	    dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
		struct dirty_leaf *dl;
		dmu_object_type_t type;

		if (etype == BP_EMBEDDED_TYPE_DATA) {
			ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
			    SPA_FEATURE_EMBEDDED_DATA));
		}

		DB_DNODE_ENTER(db);
		type = DB_DNODE(db)->dn_type;
		DB_DNODE_EXIT(db);

		ASSERT0(db->db_level);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		dmu_buf_will_not_fill(dbuf, tx);

		/* The call above must have left a dirty record for this txg. */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		dl = &db->db_last_dirty->dt.dl;
		encode_embedded_bp_compressed(&dl->dr_overridden_by,
		    data, comp, uncompressed_size, compressed_size);
		BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
		BP_SET_TYPE(&dl->dr_overridden_by, type);
		BP_SET_LEVEL(&dl->dr_overridden_by, 0);
		BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

		dl->dr_override_state = DR_OVERRIDDEN;
		dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
	}

	/*
	 * Directly assign a provided arc buf to a given dbuf if it's not referenced
	 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
	 *
	 * In the copy case buf is destroyed here; in the assign case it becomes
	 * the dbuf's data.  Either way the dbuf is dirtied in tx's txg.
	 */
	void
	dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
	{
		ASSERT(!refcount_is_zero(&db->db_holds));
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(db->db_level == 0);
		/* Check buf for NULL before any assertion that inspects it. */
		ASSERT(buf != NULL);
		ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
		ASSERT(arc_buf_lsize(buf) == db->db.db_size);
		ASSERT(tx->tx_txg != 0);

		arc_return_buf(buf, db);
		ASSERT(arc_released(buf));

		mutex_enter(&db->db_mtx);

		/* Wait out any read or fill that is in progress. */
		while (db->db_state == DB_READ || db->db_state == DB_FILL)
			cv_wait(&db->db_changed, &db->db_mtx);

		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

		/*
		 * Someone other than our caller (and the dirty records) holds
		 * this cached dbuf, so copy into the existing data instead of
		 * swapping buffers underneath them.
		 */
		if (db->db_state == DB_CACHED &&
		    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
			mutex_exit(&db->db_mtx);
			(void) dbuf_dirty(db, tx);
			bcopy(buf->b_data, db->db.db_data, db->db.db_size);
			arc_buf_destroy(buf, db);
			xuio_stat_wbuf_copied();
			return;
		}

		xuio_stat_wbuf_nocopy();
		if (db->db_state == DB_CACHED) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			ASSERT(db->db_buf != NULL);
			if (dr != NULL && dr->dr_txg == tx->tx_txg) {
				ASSERT(dr->dt.dl.dr_data == db->db_buf);
				if (!arc_released(db->db_buf)) {
					ASSERT(dr->dt.dl.dr_override_state ==
					    DR_OVERRIDDEN);
					arc_release(db->db_buf, db);
				}
				dr->dt.dl.dr_data = buf;
				arc_buf_destroy(db->db_buf, db);
			} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
				arc_release(db->db_buf, db);
				arc_buf_destroy(db->db_buf, db);
			}
			db->db_buf = NULL;
		}
		ASSERT(db->db_buf == NULL);
		dbuf_set_data(db, buf);
		db->db_state = DB_FILL;
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		dmu_buf_fill_done(&db->db, tx);
	}

	/*
	 * Tear down and free a dbuf that has no remaining holds.
	 *
	 * Called with db->db_mtx held; the mutex is dropped inside this
	 * function and db is returned to dbuf_kmem_cache, so the caller must
	 * not reference db afterwards.  If the dbuf held a reference on an
	 * indirect parent dbuf, that reference is released last.
	 */
	void
	dbuf_destroy(dmu_buf_impl_t *db)
	{
		dnode_t *dn;
		dmu_buf_impl_t *parent = db->db_parent;
		dmu_buf_impl_t *dndb;

		ASSERT(MUTEX_HELD(&db->db_mtx));
		ASSERT(refcount_is_zero(&db->db_holds));

		if (db->db_buf != NULL) {
			arc_buf_destroy(db->db_buf, db);
			db->db_buf = NULL;
		}

		if (db->db_blkid == DMU_BONUS_BLKID) {
			ASSERT(db->db.db_data != NULL);
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
			db->db_state = DB_UNCACHED;
		}

		dbuf_clear_data(db);

		/* Take the dbuf off the dbuf cache list if it is on it. */
		if (multilist_link_active(&db->db_cache_link)) {
			multilist_remove(dbuf_cache, db);
			(void) refcount_remove_many(&dbuf_cache_size,
			    db->db.db_size, db);
		}

		ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
		ASSERT(db->db_data_pending == NULL);

		db->db_state = DB_EVICTING;
		db->db_blkptr = NULL;

		/*
		 * Now that db_state is DB_EVICTING, nobody else can find this via
		 * the hash table.  We can now drop db_mtx, which allows us to
		 * acquire the dn_dbufs_mtx.
		 */
		mutex_exit(&db->db_mtx);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		dndb = dn->dn_dbuf;
		if (db->db_blkid != DMU_BONUS_BLKID) {
			boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
			if (needlock)
				mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			membar_producer();
			DB_DNODE_EXIT(db);
			if (needlock)
				mutex_exit(&dn->dn_dbufs_mtx);
			/*
			 * Decrementing the dbuf count means that the hold corresponding
			 * to the removed dbuf is no longer discounted in dnode_move(),
			 * so the dnode cannot be moved until after we release the hold.
			 * The membar_producer() ensures visibility of the decremented
			 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
			 * release any lock.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;

			dbuf_hash_remove(db);
		} else {
			DB_DNODE_EXIT(db);
		}

		ASSERT(refcount_is_zero(&db->db_holds));

		db->db_parent = NULL;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		ASSERT(db->db_hash_next == NULL);
		ASSERT(db->db_blkptr == NULL);
		ASSERT(db->db_data_pending == NULL);
		ASSERT(!multilist_link_active(&db->db_cache_link));

		kmem_cache_free(dbuf_kmem_cache, db);
		arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

		/*
		 * If this dbuf is referenced from an indirect dbuf,
		 * decrement the ref count on the indirect dbuf.
		 * (db is only used as the hold tag here; it has been freed.)
		 */
		if (parent && parent != dndb)
			dbuf_rele(parent, db);
	}

	/*
	 * Locate the block pointer for (level, blkid) of dn and the dbuf that
	 * contains it.
	 *
	 * On success returns 0, stores the block pointer in *bpp and, where a
	 * parent dbuf exists, stores it in *parentp with a hold (tag NULL) that
	 * the caller must release.  Returns ENOENT when the block has no parent
	 * yet, or the error from holding/reading the indirect parent.
	 *
	 * Note: While bpp will always be updated if the function returns success,
	 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
	 * this happens when the dnode is the meta-dnode, or a userused or groupused
	 * object.
	 */
	static int
	dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
	    dmu_buf_impl_t **parentp, blkptr_t **bpp)
	{
		*parentp = NULL;
		*bpp = NULL;

		ASSERT(blkid != DMU_BONUS_BLKID);

		if (blkid == DMU_SPILL_BLKID) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_have_spill &&
			    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
				*bpp = &dn->dn_phys->dn_spill;
			else
				*bpp = NULL;
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
			mutex_exit(&dn->dn_mtx);
			return (0);
		}

		/* A dnode with no levels on disk is treated as having one. */
		int nlevels =
		    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT3U(level * epbs, <, 64);
		ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
		/*
		 * This assertion shouldn't trip as long as the max indirect block size
		 * is less than 1M.  The reason for this is that up to that point,
		 * the number of levels required to address an entire object with blocks
		 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.  In
		 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
		 * (i.e. we can address the entire object), objects will all use at most
		 * N-1 levels and the assertion won't overflow.  However, once epbs is
		 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66.  Then, 4 levels will not be
		 * enough to address an entire object, so objects will have 5 levels,
		 * but then this assertion will overflow.
		 *
		 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
		 * need to redo this logic to handle overflows.
		 */
		ASSERT(level >= nlevels ||
		    ((nlevels - level - 1) * epbs) +
		    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
		if (level >= nlevels ||
		    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
		    ((nlevels - level - 1) * epbs)) ||
		    (fail_sparse &&
		    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
			/* the buffer has no parent yet */
			return (SET_ERROR(ENOENT));
		} else if (level < nlevels-1) {
			/* this block is referenced from an indirect block */
			int err = dbuf_hold_impl(dn, level+1,
			    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
			if (err)
				return (err);
			err = dbuf_read(*parentp, NULL,
			    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
			if (err) {
				dbuf_rele(*parentp, NULL);
				*parentp = NULL;
				return (err);
			}
			/* Index into the parent's array of block pointers. */
			*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
			    (blkid & ((1ULL << epbs) - 1));
			if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
				ASSERT(BP_IS_HOLE(*bpp));
			return (0);
		} else {
			/* the block is referenced from the dnode */
			ASSERT3U(level, ==, nlevels-1);
			ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
			    blkid < dn->dn_phys->dn_nblkptr);
			if (dn->dn_dbuf) {
				dbuf_add_ref(dn->dn_dbuf, NULL);
				*parentp = dn->dn_dbuf;
			}
			*bpp = &dn->dn_phys->dn_blkptr[blkid];
			return (0);
		}
	}

	/*
	 * Allocate and initialize a dbuf for (level, blkid) of dn.
	 *
	 * Bonus dbufs are returned immediately and are not placed in the hash
	 * table.  For all other dbufs, if another thread inserted a matching
	 * dbuf into the hash table first, the new allocation is freed and the
	 * existing dbuf is returned instead.  A newly inserted dbuf takes a
	 * hold on dn and, when parent is an indirect dbuf (not dn->dn_dbuf),
	 * a reference on parent.
	 *
	 * The caller must hold dn_struct_rwlock.
	 */
	static dmu_buf_impl_t *
	dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
	    dmu_buf_impl_t *parent, blkptr_t *blkptr)
	{
		objset_t *os = dn->dn_objset;
		dmu_buf_impl_t *db, *odb;

		ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
		ASSERT(dn->dn_type != DMU_OT_NONE);

		db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);

		db->db_objset = os;
		db->db.db_object = dn->dn_object;
		db->db_level = level;
		db->db_blkid = blkid;
		db->db_last_dirty = NULL;
		db->db_dirtycnt = 0;
		db->db_dnode_handle = dn->dn_handle;
		db->db_parent = parent;
		db->db_blkptr = blkptr;

		db->db_user = NULL;
		db->db_user_immediate_evict = FALSE;
		db->db_freed_in_flight = FALSE;
		db->db_pending_evict = FALSE;

		if (blkid == DMU_BONUS_BLKID) {
			ASSERT3P(parent, ==, dn->dn_dbuf);
			db->db.db_size = DN_MAX_BONUSLEN -
			    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
			ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
			db->db.db_offset = DMU_BONUS_BLKID;
			db->db_state = DB_UNCACHED;
			/* the bonus dbuf is not placed in the hash table */
			arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
			return (db);
		} else if (blkid == DMU_SPILL_BLKID) {
			db->db.db_size = (blkptr != NULL) ?
			    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
			db->db.db_offset = 0;
		} else {
			int blocksize =
			    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
			db->db.db_size = blocksize;
			db->db.db_offset = db->db_blkid * blocksize;
		}

		/*
		 * Hold the dn_dbufs_mtx while we get the new dbuf
		 * in the hash table *and* added to the dbufs list.
		 * This prevents a possible deadlock with someone
		 * trying to look up this dbuf before its added to the
		 * dn_dbufs list.
		 */
		mutex_enter(&dn->dn_dbufs_mtx);
		db->db_state = DB_EVICTING;
		if ((odb = dbuf_hash_insert(db)) != NULL) {
			/* someone else inserted it first */
			kmem_cache_free(dbuf_kmem_cache, db);
			mutex_exit(&dn->dn_dbufs_mtx);
			return (odb);
		}
		avl_add(&dn->dn_dbufs, db);

		db->db_state = DB_UNCACHED;
		mutex_exit(&dn->dn_dbufs_mtx);
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

		if (parent && parent != dn->dn_dbuf)
			dbuf_add_ref(parent, db);

		ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
		    refcount_count(&dn->dn_holds) > 0);
		(void) refcount_add(&dn->dn_holds, db);
		atomic_inc_32(&dn->dn_dbufs_count);

		dprintf_dbuf(db, "db=%p\n", db);

		return (db);
	}

	typedef struct dbuf_prefetch_arg {
	spa_t dpa_spa; / The spa to issue the prefetch in. */
	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
	int dpa_curlevel; /* The current level that we're reading */
	dnode_t dpa_dnode; / The dnode associated with the prefetch */
	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
	zio_t dpa_zio; / The parent zio_t for all prefetches. */
	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
	} dbuf_prefetch_arg_t;

	/*
	 * Actually issue the prefetch read for the block given.
	 * Holes and embedded block pointers are skipped.  The read is issued
	 * without a completion callback; dpa is not freed here.
	 */
	static void
	dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
	{
		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
			return;

		arc_flags_t aflags =
		    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;

		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
		ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
		ASSERT(dpa->dpa_zio != NULL);
		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
		    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &aflags, &dpa->dpa_zb);
	}

	/*
	 * Called when an indirect block above our prefetch target is read in.  This
	 * will either read in the next indirect block down the tree or issue the actual
	 * prefetch if the next block down is our target.
	 *
	 * private is the dbuf_prefetch_arg_t for the chain.  It is freed here
	 * when the chain terminates (hole, I/O error, or final prefetch
	 * issued); otherwise it is passed on to the next arc_read().  abuf is
	 * always destroyed before returning.
	 */
	static void
	dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
	{
		dbuf_prefetch_arg_t *dpa = private;

		ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
		ASSERT3S(dpa->dpa_curlevel, >, 0);

		/*
		 * The dpa_dnode is only valid if we are called with a NULL
		 * zio. This indicates that the arc_read() returned without
		 * first calling zio_read() to issue a physical read. Once
		 * a physical read is made the dpa_dnode must be invalidated
		 * as the locks guarding it may have been dropped. If the
		 * dpa_dnode is still valid, then we want to add it to the dbuf
		 * cache. To do so, we must hold the dbuf associated with the block
		 * we just prefetched, read its contents so that we associate it
		 * with an arc_buf_t, and then release it.
		 */
		if (zio != NULL) {
			ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
			if (zio->io_flags & ZIO_FLAG_RAW) {
				ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
			} else {
				ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
			}
			ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);

			dpa->dpa_dnode = NULL;
		} else if (dpa->dpa_dnode != NULL) {
			uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
			    (dpa->dpa_epbs * (dpa->dpa_curlevel -
			    dpa->dpa_zb.zb_level));
			dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
			    dpa->dpa_curlevel, curblkid, FTAG);
			(void) dbuf_read(db, NULL,
			    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
			dbuf_rele(db, FTAG);
		}

		/* Step one level down towards the target block. */
		dpa->dpa_curlevel--;

		uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
		    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
		blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
		    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
		if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
			kmem_free(dpa, sizeof (*dpa));
		} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
			ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
			dbuf_issue_final_prefetch(dpa, bp);
			kmem_free(dpa, sizeof (*dpa));
		} else {
			arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
			zbookmark_phys_t zb;

			/* flag if L2ARC eligible, l2arc_noprefetch then decides */
			if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
				iter_aflags |= ARC_FLAG_L2CACHE;

			ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));

			SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
			    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);

			(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
			    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &iter_aflags, &zb);
		}

		arc_buf_destroy(abuf, private);
	}

	/*
	 * Issue prefetch reads for the given block on the given level.  If the indirect
	 * blocks above that block are not in memory, we will read them in
	 * asynchronously.  As a result, this call never blocks waiting for a read to
	 * complete.
	 *
	 * The caller must hold dn_struct_rwlock.  Nothing is issued when the
	 * block is beyond dn_maxblkid, has been freed, is already present as a
	 * dbuf, or resolves to a hole.
	 */
	void
	dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
	    arc_flags_t aflags)
	{
		blkptr_t bp;
		int epbs, nlevels, curlevel;
		uint64_t curblkid;

		ASSERT(blkid != DMU_BONUS_BLKID);
		ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

		if (blkid > dn->dn_maxblkid)
			return;

		if (dnode_block_freed(dn, blkid))
			return;

		/*
		 * This dnode hasn't been written to disk yet, so there's nothing to
		 * prefetch.
		 */
		nlevels = dn->dn_phys->dn_nlevels;
		if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
			return;

		epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
		if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
			return;

		dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
		    level, blkid);
		if (db != NULL) {
			mutex_exit(&db->db_mtx);
			/*
			 * This dbuf already exists.  It is either CACHED, or
			 * (we assume) about to be read or filled.
			 */
			return;
		}

		/*
		 * Find the closest ancestor (indirect block) of the target block
		 * that is present in the cache.  In this indirect block, we will
		 * find the bp that is at curlevel, curblkid.
		 */
		curlevel = level;
		curblkid = blkid;
		while (curlevel < nlevels - 1) {
			int parent_level = curlevel + 1;
			uint64_t parent_blkid = curblkid >> epbs;
			/* Named to avoid shadowing the outer db above. */
			dmu_buf_impl_t *parent_db;

			if (dbuf_hold_impl(dn, parent_level, parent_blkid,
			    FALSE, TRUE, FTAG, &parent_db) == 0) {
				blkptr_t *bpp = parent_db->db_buf->b_data;
				bp = bpp[P2PHASE(curblkid, 1 << epbs)];
				dbuf_rele(parent_db, FTAG);
				break;
			}

			curlevel = parent_level;
			curblkid = parent_blkid;
		}

		if (curlevel == nlevels - 1) {
			/* No cached indirect blocks found. */
			ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
			bp = dn->dn_phys->dn_blkptr[curblkid];
		}
		if (BP_IS_HOLE(&bp))
			return;

		ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));

		zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
		    ZIO_FLAG_CANFAIL);

		dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
		SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
		    dn->dn_object, level, blkid);
		dpa->dpa_curlevel = curlevel;
		dpa->dpa_prio = prio;
		dpa->dpa_aflags = aflags;
		dpa->dpa_spa = dn->dn_objset->os_spa;
		dpa->dpa_dnode = dn;
		dpa->dpa_epbs = epbs;
		dpa->dpa_zio = pio;

		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
			dpa->dpa_aflags |= ARC_FLAG_L2CACHE;

		/*
		 * If we have the indirect just above us, no need to do the asynchronous
		 * prefetch chain; we'll just run the last step ourselves.  If we're at
		 * a higher level, though, we want to issue the prefetches for all the
		 * indirect blocks asynchronously, so we can go on with whatever we were
		 * doing.
		 */
		if (curlevel == level) {
			ASSERT3U(curblkid, ==, blkid);
			dbuf_issue_final_prefetch(dpa, &bp);
			kmem_free(dpa, sizeof (*dpa));
		} else {
			arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
			zbookmark_phys_t zb;

			/* flag if L2ARC eligible, l2arc_noprefetch then decides */
			if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
				iter_aflags |= ARC_FLAG_L2CACHE;

			SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, curlevel, curblkid);
			(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
			    &bp, dbuf_prefetch_indirect_done, dpa, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &iter_aflags, &zb);
		}
		/*
		 * We use pio here instead of dpa_zio since it's possible that
		 * dpa may have already been freed.
		 */
		zio_nowait(pio);
	}

	/*
	 * Find or create the dbuf for (level, blkid) of dn and take a hold on
	 * it with the given tag.
	 *
	 * Returns with db_holds incremented, and db_mtx not held.
	 * Note: dn_struct_rwlock must be held.
	 *
	 * On success returns 0 and stores the dbuf in *dbp.  *dbp is set to
	 * NULL on entry and stays NULL on failure.  With fail_uncached set,
	 * ENOENT is returned unless the dbuf already exists and is DB_CACHED.
	 * With fail_sparse set, ENOENT is returned when the block pointer is
	 * a hole or cannot be found.
	 */
	int
	dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
	    boolean_t fail_sparse, boolean_t fail_uncached,
	    void *tag, dmu_buf_impl_t **dbp)
	{
		dmu_buf_impl_t *db, *parent = NULL;

		ASSERT(blkid != DMU_BONUS_BLKID);
		ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
		ASSERT3U(dn->dn_nlevels, >, level);

		*dbp = NULL;

		/* dbuf_find() returns with db_mtx held */
		db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

		if (db == NULL) {
			blkptr_t *bp = NULL;
			int err;

			if (fail_uncached)
				return (SET_ERROR(ENOENT));

			ASSERT3P(parent, ==, NULL);
			err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
			if (fail_sparse) {
				if (err == 0 && bp && BP_IS_HOLE(bp))
					err = SET_ERROR(ENOENT);
				if (err) {
					if (parent)
						dbuf_rele(parent, NULL);
					return (err);
				}
			}
			/* ENOENT just means no parent yet; still create the dbuf. */
			if (err && err != ENOENT)
				return (err);
			db = dbuf_create(dn, level, blkid, parent, bp);
		}

		if (fail_uncached && db->db_state != DB_CACHED) {
			mutex_exit(&db->db_mtx);
			return (SET_ERROR(ENOENT));
		}

		if (db->db_buf != NULL)
			ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);

		ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

		/*
		 * If this buffer is currently syncing out, and we are
		 * still referencing it from db_data, we need to make a copy
		 * of it in case we decide we want to dirty it again in this txg.
		 */
		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    dn->dn_object != DMU_META_DNODE_OBJECT &&
		    db->db_state == DB_CACHED && db->db_data_pending) {
			dbuf_dirty_record_t *dr = db->db_data_pending;

			if (dr->dt.dl.dr_data == db->db_buf) {
				arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

				dbuf_set_data(db,
				    arc_alloc_buf(dn->dn_objset->os_spa, db, type,
				    db->db.db_size));
				bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
				    db->db.db_size);
			}
		}

		/* A dbuf sitting in the dbuf cache has no holds; take it off. */
		if (multilist_link_active(&db->db_cache_link)) {
			ASSERT(refcount_is_zero(&db->db_holds));
			multilist_remove(dbuf_cache, db);
			(void) refcount_remove_many(&dbuf_cache_size,
			    db->db.db_size, db);
		}
		(void) refcount_add(&db->db_holds, tag);
		DBUF_VERIFY(db);
		mutex_exit(&db->db_mtx);

		/* NOTE: we can't rele the parent until after we drop the db_mtx */
		if (parent)
			dbuf_rele(parent, NULL);

		ASSERT3P(DB_DNODE(db), ==, dn);
		ASSERT3U(db->db_blkid, ==, blkid);
		ASSERT3U(db->db_level, ==, level);
		*dbp = db;

		return (0);
	}

	/*
	 * Hold the level-0 dbuf for blkid; see dbuf_hold_level().
	 */
	dmu_buf_impl_t *
	dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
	{
		return (dbuf_hold_level(dn, 0, blkid, tag));
	}

	/*
	 * Hold the dbuf for (level, blkid) of dn with fail_sparse and
	 * fail_uncached both off.  Returns the held dbuf, or NULL if
	 * dbuf_hold_impl() reported an error.
	 */
	dmu_buf_impl_t *
	dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
	{
		dmu_buf_impl_t *db;
		int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
		return (err ? NULL : db);
	}

	/*
	 * Create the bonus dbuf for dn and store it in dn->dn_bonus.
	 * The caller must write-hold dn_struct_rwlock, and the dnode must not
	 * already have a bonus dbuf.
	 */
	void
	dbuf_create_bonus(dnode_t *dn)
	{
		dmu_buf_impl_t *bonus;

		ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
		ASSERT(dn->dn_bonus == NULL);

		bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
		dn->dn_bonus = bonus;
	}

	/*
	 * Resize a spill-block dbuf.
	 *
	 * Returns ENOTSUP if db_fake is not the spill block, otherwise 0.
	 * A blksz of 0 selects SPA_MINBLOCKSIZE; any other value is rounded up
	 * to a multiple of SPA_MINBLOCKSIZE.  The resize itself runs under the
	 * dnode's dn_struct_rwlock held as writer.
	 */
	int
	dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dnode_t *dn;

		if (db->db_blkid != DMU_SPILL_BLKID)
			return (SET_ERROR(ENOTSUP));
		if (blksz == 0)
			blksz = SPA_MINBLOCKSIZE;
		ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		dbuf_new_size(db, blksz, tx);
		rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		return (0);
	}

	/*
	 * Free the dnode's spill block by freeing the one-block range
	 * [DMU_SPILL_BLKID, DMU_SPILL_BLKID] in tx.
	 */
	void
	dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
	{
		dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
	}

	#pragma weak dmu_buf_add_ref = dbuf_add_ref
	/*
	 * Add a hold with the given tag to a dbuf that is already held; the
	 * assertion checks that this is not the first hold.
	 */
	void
	dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
	{
		int64_t holds = refcount_add(&db->db_holds, tag);
		ASSERT3S(holds, >, 1);
	}

	#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
	/*
	 * Try to add a hold to db_fake, but only if the dbuf currently found
	 * for (os, obj, blkid) is this same dbuf and it has holds beyond those
	 * accounted for by db_dirtycnt.  Returns B_TRUE if the hold was added.
	 */
	boolean_t
	dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
	    void *tag)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dmu_buf_impl_t *found_db;
		boolean_t result = B_FALSE;

		if (db->db_blkid == DMU_BONUS_BLKID)
			found_db = dbuf_find_bonus(os, obj);
		else
			found_db = dbuf_find(os, obj, 0, blkid);

		if (found_db != NULL) {
			if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
				(void) refcount_add(&db->db_holds, tag);
				result = B_TRUE;
			}
			/*
			 * Release the mutex of the dbuf the lookup returned
			 * (dbuf_find() returns with db_mtx held; presumably
			 * dbuf_find_bonus() does too -- confirm).  When
			 * found_db != db, db's mutex was never taken here.
			 */
			mutex_exit(&found_db->db_mtx);
		}
		return (result);
	}

	/*
	 * Release a hold on the dbuf: takes db_mtx and hands off to
	 * dbuf_rele_and_unlock().
	 *
	 * If you call dbuf_rele() you had better not be referencing the dnode handle
	 * unless you have some other direct or indirect hold on the dnode. (An indirect
	 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
	 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
	 * dnode's parent dbuf evicting its dnode handles.
	 */
	void
	dbuf_rele(dmu_buf_impl_t *db, void *tag)
	{
		mutex_enter(&db->db_mtx);
		dbuf_rele_and_unlock(db, tag);
	}

	/*
	 * Public wrapper: release a hold taken on a dmu_buf_t via dbuf_rele().
	 */
	void
	dmu_buf_rele(dmu_buf_t *db, void *tag)
	{
		dbuf_rele((dmu_buf_impl_t *)db, tag);
	}

	/*
	 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
	 * db_dirtycnt and db_holds to be updated atomically.
	 *
	 * Must be entered with db->db_mtx held (asserted).  Removes the hold that
	 * tag has on db; when that was the last hold the dbuf is destroyed, put on
	 * the dbuf cache, or (for bonus buffers) its dnode hold is dropped.
	 */
	void
	dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
	{
		int64_t holds;

		ASSERT(MUTEX_HELD(&db->db_mtx));
		DBUF_VERIFY(db);

		/*
		 * Remove the reference to the dbuf before removing its hold on the
		 * dnode so we can guarantee in dnode_move() that a referenced bonus
		 * buffer has a corresponding dnode hold.
		 */
		holds = refcount_remove(&db->db_holds, tag);
		ASSERT(holds >= 0);

		/*
		 * We can't freeze indirects if there is a possibility that they
		 * may be modified in the current syncing context.
		 */
		if (db->db_buf != NULL &&
		    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
			arc_buf_freeze(db->db_buf);
		}

		if (holds == db->db_dirtycnt &&
		    db->db_level == 0 && db->db_user_immediate_evict)
			dbuf_evict_user(db);

		if (holds == 0) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dnode_t *dn;
				boolean_t evict_dbuf = db->db_pending_evict;

				/*
				 * If the dnode moves here, we cannot cross this
				 * barrier until the move completes.
				 */
				DB_DNODE_ENTER(db);

				dn = DB_DNODE(db);
				atomic_dec_32(&dn->dn_dbufs_count);

				/*
				 * Decrementing the dbuf count means that the bonus
				 * buffer's dnode hold is no longer discounted in
				 * dnode_move(). The dnode cannot move until after
				 * the dnode_rele() below.
				 */
				DB_DNODE_EXIT(db);

				/*
				 * Do not reference db after its lock is dropped.
				 * Another thread may evict it.
				 */
				mutex_exit(&db->db_mtx);

				if (evict_dbuf)
					dnode_evict_bonus(dn);

				/* db is used only as the hold tag here. */
				dnode_rele(dn, db);
			} else if (db->db_buf == NULL) {
				/*
				 * This is a special case: we never associated this
				 * dbuf with any data allocated from the ARC.
				 */
				ASSERT(db->db_state == DB_UNCACHED ||
				    db->db_state == DB_NOFILL);
				dbuf_destroy(db);
			} else if (arc_released(db->db_buf)) {
				/*
				 * This dbuf has anonymous data associated with it.
				 */
				dbuf_destroy(db);
			} else {
				boolean_t do_arc_evict = B_FALSE;
				blkptr_t bp;
				spa_t *spa = dmu_objset_spa(db->db_objset);

				/*
				 * Copy the block pointer before the dbuf can go
				 * away; bp is only read when do_arc_evict is set.
				 */
				if (!DBUF_IS_CACHEABLE(db) &&
				    db->db_blkptr != NULL &&
				    !BP_IS_HOLE(db->db_blkptr) &&
				    !BP_IS_EMBEDDED(db->db_blkptr)) {
					do_arc_evict = B_TRUE;
					bp = *db->db_blkptr;
				}

				if (!DBUF_IS_CACHEABLE(db) ||
				    db->db_pending_evict) {
					dbuf_destroy(db);
				} else if (!multilist_link_active(&db->db_cache_link)) {
					multilist_insert(dbuf_cache, db);
					(void) refcount_add_many(&dbuf_cache_size,
					    db->db.db_size, db);
					mutex_exit(&db->db_mtx);

					dbuf_evict_notify();
				}

				if (do_arc_evict)
					arc_freed(spa, &bp);
			}
		} else {
			mutex_exit(&db->db_mtx);
		}

	}

	/*
	 * Report the current number of holds on db, as counted by db_holds.
	 * Also exported under the weak alias dmu_buf_refcount.
	 */
	#pragma weak dmu_buf_refcount = dbuf_refcount
	uint64_t
	dbuf_refcount(dmu_buf_impl_t *db)
	{
		uint64_t nholds = refcount_count(&db->db_holds);

		return (nholds);
	}

	/*
	 * Compare-and-swap the user attached to a dbuf, under db_mtx.
	 *
	 * If the current user equals old_user it is replaced with new_user and
	 * old_user is returned; otherwise nothing is changed and the user that
	 * is currently attached is returned.
	 */
	void *
	dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
	    dmu_buf_user_t *new_user)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		mutex_enter(&db->db_mtx);
		dbuf_verify_user(db, DBVU_NOT_EVICTING);
		if (db->db_user == old_user)
			db->db_user = new_user;
		else
			old_user = db->db_user;
		dbuf_verify_user(db, DBVU_NOT_EVICTING);
		mutex_exit(&db->db_mtx);

		return (old_user);
	}

	/*
	 * Attach user to the dbuf if no user is attached yet.  Returns NULL when
	 * user was installed, otherwise the user that is already attached (see
	 * dmu_buf_replace_user()).
	 */
	void *
	dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
	{
		return (dmu_buf_replace_user(db_fake, NULL, user));
	}

	/*
	 * Like dmu_buf_set_user(), but first flags the dbuf with
	 * db_user_immediate_evict.  The flag is set before, and independently of,
	 * the attempt to attach the user, and without taking db_mtx here.
	 */
	void *
	dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		db->db_user_immediate_evict = TRUE;
		return (dmu_buf_set_user(db_fake, user));
	}

	/*
	 * Detach user from the dbuf if it is the user currently attached.
	 * Returns user on success, otherwise whatever user is attached (see
	 * dmu_buf_replace_user()).
	 */
	void *
	dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
	{
		return (dmu_buf_replace_user(db_fake, user, NULL));
	}

	/*
	 * Return the user currently attached to the dbuf (db_user).  The field is
	 * read without taking db_mtx in this function.
	 */
	void *
	dmu_buf_get_user(dmu_buf_t *db_fake)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		dbuf_verify_user(db, DBVU_NOT_EVICTING);
		return (db->db_user);
	}

	/*
	 * Block until all work queued on dbu_evict_taskq has completed.
	 */
	void
	dmu_buf_user_evict_wait(void)
	{
		taskq_wait(dbu_evict_taskq);
	}

	/*
	 * Return the block pointer recorded in the dbuf (db_blkptr); the value is
	 * returned as stored, which elsewhere in this file may be NULL.
	 */
	blkptr_t *
	dmu_buf_get_blkptr(dmu_buf_t *db)
	{
		dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
		return (dbi->db_blkptr);
	}

	/*
	 * Return the objset the dbuf belongs to (db_objset).
	 */
	objset_t *
	dmu_buf_get_objset(dmu_buf_t *db)
	{
		dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
		return (dbi->db_objset);
	}

	/*
	 * Perform DB_DNODE_ENTER() on the dbuf and return its dnode.  The matching
	 * DB_DNODE_EXIT() is issued by dmu_buf_dnode_exit().
	 */
	dnode_t *
	dmu_buf_dnode_enter(dmu_buf_t *db)
	{
		dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
		DB_DNODE_ENTER(dbi);
		return (DB_DNODE(dbi));
	}

	/*
	 * Perform DB_DNODE_EXIT() on the dbuf; the counterpart of
	 * dmu_buf_dnode_enter().
	 */
	void
	dmu_buf_dnode_exit(dmu_buf_t *db)
	{
		dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
		DB_DNODE_EXIT(dbi);
	}

	/*
	 * Make sure db->db_blkptr is set, hooking the dbuf up to its parent if
	 * that has not happened yet.
	 *
	 * Must be called with db->db_mtx held (asserted).  NOTE: when the parent
	 * indirect dbuf has to be held, db_mtx is dropped and re-acquired, so
	 * callers must not rely on state read before the call.
	 */
	static void
	dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
	{
		/* ASSERT(dmu_tx_is_syncing(tx) */
		ASSERT(MUTEX_HELD(&db->db_mtx));

		if (db->db_blkptr != NULL)
			return;

		if (db->db_blkid == DMU_SPILL_BLKID) {
			db->db_blkptr = &dn->dn_phys->dn_spill;
			BP_ZERO(db->db_blkptr);
			return;
		}
		if (db->db_level == dn->dn_phys->dn_nlevels-1) {
			/*
			 * This buffer was allocated at a time when there was
			 * no available blkptrs from the dnode, or it was
			 * inappropriate to hook it in (i.e., nlevels mis-match).
			 */
			ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
			ASSERT(db->db_parent == NULL);
			db->db_parent = dn->dn_dbuf;
			db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
			DBUF_VERIFY(db);
		} else {
			dmu_buf_impl_t *parent = db->db_parent;
			/* epbs: log2 of the number of block pointers per indirect */
			int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

			ASSERT(dn->dn_phys->dn_nlevels > 1);
			if (parent == NULL) {
				mutex_exit(&db->db_mtx);
				rw_enter(&dn->dn_struct_rwlock, RW_READER);
				parent = dbuf_hold_level(dn, db->db_level + 1,
				    db->db_blkid >> epbs, db);
				rw_exit(&dn->dn_struct_rwlock);
				mutex_enter(&db->db_mtx);
				db->db_parent = parent;
			}
			/* Our slot is db_blkid modulo the pointers per indirect. */
			db->db_blkptr = (blkptr_t *)parent->db.db_data +
			    (db->db_blkid & ((1ULL << epbs) - 1));
			DBUF_VERIFY(db);
		}
	}

	/*
	 * Sync one dirty indirect block (db_level > 0) in syncing context: make
	 * sure it has been read, create its write zio via dbuf_write(), sync all
	 * of its dirty children, and only then issue the zio.
	 */
	static void
	dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = dr->dr_dbuf;
		dnode_t *dn;
		zio_t *zio;

		ASSERT(dmu_tx_is_syncing(tx));

		dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

		mutex_enter(&db->db_mtx);

		ASSERT(db->db_level > 0);
		DBUF_VERIFY(db);

		/* Read the block if it hasn't been read yet. */
		if (db->db_buf == NULL) {
			mutex_exit(&db->db_mtx);
			(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
			mutex_enter(&db->db_mtx);
		}
		ASSERT3U(db->db_state, ==, DB_CACHED);
		ASSERT(db->db_buf != NULL);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		/* Indirect block size must match what the dnode thinks it is. */
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		dbuf_check_blkptr(dn, db);
		DB_DNODE_EXIT(db);

		/* Provide the pending dirty record to child dbufs */
		db->db_data_pending = dr;

		mutex_exit(&db->db_mtx);
	+
		dbuf_write(dr, db->db_buf, tx);

		/* Issue our own zio only after all children have been synced. */
		zio = dr->dr_zio;
		mutex_enter(&dr->dt.di.dr_mtx);
		dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_exit(&dr->dt.di.dr_mtx);
		zio_nowait(zio);
	}

	/*
	 * Sync one dirty level-0 dbuf in syncing context.
	 *
	 * Bonus buffers are copied straight into the dnode and their dirty record
	 * is retired here.  Every other buffer gets a write zio from dbuf_write();
	 * the zio is issued immediately, except for meta-dnode blocks, whose dirty
	 * record is put back on the dnode's dirty list instead.
	 */
	static void
	dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
	{
		arc_buf_t **datap = &dr->dt.dl.dr_data;
		dmu_buf_impl_t *db = dr->dr_dbuf;
		dnode_t *dn;
		objset_t *os;
		uint64_t txg = tx->tx_txg;

		ASSERT(dmu_tx_is_syncing(tx));

		dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

		mutex_enter(&db->db_mtx);
		/*
		 * To be synced, we must be dirtied.  But we
		 * might have been freed after the dirty.
		 */
		if (db->db_state == DB_UNCACHED) {
			/* This buffer has been freed since it was dirtied */
			ASSERT(db->db.db_data == NULL);
		} else if (db->db_state == DB_FILL) {
			/* This buffer was freed and is now being re-filled */
			ASSERT(db->db.db_data != dr->dt.dl.dr_data);
		} else {
			ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
		}
		DBUF_VERIFY(db);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		if (db->db_blkid == DMU_SPILL_BLKID) {
			mutex_enter(&dn->dn_mtx);
			dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
			mutex_exit(&dn->dn_mtx);
		}

		/*
		 * If this is a bonus buffer, simply copy the bonus data into the
		 * dnode.  It will be written out when the dnode is synced (and it
		 * will be synced, since it must have been dirty for dbuf_sync to
		 * be called).
		 */
		if (db->db_blkid == DMU_BONUS_BLKID) {
			dbuf_dirty_record_t **drp;

			ASSERT(*datap != NULL);
			ASSERT0(db->db_level);
			ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
			bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
			DB_DNODE_EXIT(db);

			if (*datap != db->db.db_data) {
				zio_buf_free(*datap, DN_MAX_BONUSLEN);
				arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
			}
			db->db_data_pending = NULL;
			/* Unlink dr from the dbuf's chain of dirty records. */
			drp = &db->db_last_dirty;
			while (*drp != dr)
				drp = &(*drp)->dr_next;
			ASSERT(dr->dr_next == NULL);
			ASSERT(dr->dr_dbuf == db);
			*drp = dr->dr_next;
			if (dr->dr_dbuf->db_level != 0) {
				list_destroy(&dr->dt.di.dr_children);
				mutex_destroy(&dr->dt.di.dr_mtx);
			}
			kmem_free(dr, sizeof (dbuf_dirty_record_t));
			ASSERT(db->db_dirtycnt > 0);
			db->db_dirtycnt -= 1;
			/* Drops db_mtx and the hold tagged with this txg. */
			dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
			return;
		}

		os = dn->dn_objset;

		/*
		 * This function may have dropped the db_mtx lock allowing a dmu_sync
		 * operation to sneak in.  As a result, we need to ensure that we
		 * don't check the dr_override_state until we have returned from
		 * dbuf_check_blkptr.
		 */
		dbuf_check_blkptr(dn, db);

		/*
		 * If this buffer is in the middle of an immediate write,
		 * wait for the synchronous IO to complete.
		 */
		while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
			cv_wait(&db->db_changed, &db->db_mtx);
			ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
		}

		if (db->db_state != DB_NOFILL &&
		    dn->dn_object != DMU_META_DNODE_OBJECT &&
		    refcount_count(&db->db_holds) > 1 &&
		    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
		    *datap == db->db_buf) {
			/*
			 * If this buffer is currently "in use" (i.e., there
			 * are active holds and db_data still references it),
			 * then make a copy before we start the write so that
			 * any modifications from the open txg will not leak
			 * into this write.
			 *
			 * NOTE: this copy does not need to be made for
			 * objects only modified in the syncing context (e.g.
			 * meta-dnode blocks).
			 */
			int psize = arc_buf_size(*datap);
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			enum zio_compress compress_type = arc_get_compression(*datap);

			if (compress_type == ZIO_COMPRESS_OFF) {
				*datap = arc_alloc_buf(os->os_spa, db, type, psize);
			} else {
				ASSERT3U(type, ==, ARC_BUFC_DATA);
				int lsize = arc_buf_lsize(*datap);
				*datap = arc_alloc_compressed_buf(os->os_spa, db,
				    psize, lsize, compress_type);
			}
			bcopy(db->db.db_data, (*datap)->b_data, psize);
		}
		db->db_data_pending = dr;

		mutex_exit(&db->db_mtx);

		dbuf_write(dr, *datap, tx);

		ASSERT(!list_link_active(&dr->dr_dirty_node));
		if (dn->dn_object == DMU_META_DNODE_OBJECT) {
			list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
			DB_DNODE_EXIT(db);
		} else {
			/*
			 * Although zio_nowait() does not "wait for an IO", it does
			 * initiate the IO.  If this is an empty write it seems plausible
			 * that the IO could actually be completed before the nowait
			 * returns.  We need to DB_DNODE_EXIT() first in case
			 * zio_nowait() invalidates the dbuf.
			 */
			DB_DNODE_EXIT(db);
			zio_nowait(dr->dr_zio);
		}
	}

	/*
	 * Sync every dirty record on list.  Each record is removed from the list
	 * and handed to dbuf_sync_indirect() or dbuf_sync_leaf() according to its
	 * dbuf's level.  Apart from bonus and spill buffers, every dbuf on the
	 * list must be at the given level (verified).
	 */
	void
	dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
	{
		dbuf_dirty_record_t *dr;

		while ((dr = list_head(list)) != NULL) {
			if (dr->dr_zio != NULL) {
				/*
				 * If we find an already initialized zio then we
				 * are processing the meta-dnode, and we have finished.
				 * The dbufs for all dnodes are put back on the list
				 * during processing, so that we can zio_wait()
				 * these IOs after initiating all child IOs.
				 */
				ASSERT3U(dr->dr_dbuf->db.db_object, ==,
				    DMU_META_DNODE_OBJECT);
				break;
			}
			if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
				VERIFY3U(dr->dr_dbuf->db_level, ==, level);
			}
			list_remove(list, dr);
			if (dr->dr_dbuf->db_level > 0)
				dbuf_sync_indirect(dr, tx);
			else
				dbuf_sync_leaf(dr, tx);
		}
	}

	/*
	 * "Ready" callback for a dbuf write (passed to arc_write() by
	 * dbuf_write(), and called by the nofill/override ready wrappers).
	 *
	 * Charges the change in on-disk size to the dnode, updates dn_maxblkid
	 * for level-0 blocks, computes the fill count stored in the new block
	 * pointer, and finally points db->db_blkptr at the zio's block pointer.
	 */
	/* ARGSUSED */
	static void
	dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
	{
		dmu_buf_impl_t *db = vdb;
		dnode_t *dn;
		blkptr_t *bp = zio->io_bp;
		blkptr_t *bp_orig = &zio->io_bp_orig;
		spa_t *spa = zio->io_spa;
		int64_t delta;
		uint64_t fill = 0;
		int i;

		ASSERT3P(db->db_blkptr, !=, NULL);
		ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		/*
		 * Only charge the difference from what an earlier invocation for
		 * this zio already charged (io_prev_space_delta).
		 */
		delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
		dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
		zio->io_prev_space_delta = delta;

		if (bp->blk_birth != 0) {
			ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
			    BP_GET_TYPE(bp) == dn->dn_type) ||
			    (db->db_blkid == DMU_SPILL_BLKID &&
			    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
			    BP_IS_EMBEDDED(bp));
			ASSERT(BP_GET_LEVEL(bp) == db->db_level);
		}

		mutex_enter(&db->db_mtx);

	#ifdef ZFS_DEBUG
		if (db->db_blkid == DMU_SPILL_BLKID) {
			ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
			ASSERT(!(BP_IS_HOLE(bp)) &&
			    db->db_blkptr == &dn->dn_phys->dn_spill);
		}
	#endif

		if (db->db_level == 0) {
			mutex_enter(&dn->dn_mtx);
			if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
			    db->db_blkid != DMU_SPILL_BLKID)
				dn->dn_phys->dn_maxblkid = db->db_blkid;
			mutex_exit(&dn->dn_mtx);

			if (dn->dn_type == DMU_OT_DNODE) {
				/* fill = number of allocated dnodes in the block */
				dnode_phys_t *dnp = db->db.db_data;
				for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
				    i--, dnp++) {
					if (dnp->dn_type != DMU_OT_NONE)
						fill++;
				}
			} else {
				if (BP_IS_HOLE(bp)) {
					fill = 0;
				} else {
					fill = 1;
				}
			}
		} else {
			/* fill = sum of the fill counts of all non-hole children */
			blkptr_t *ibp = db->db.db_data;
			ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
			for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
				if (BP_IS_HOLE(ibp))
					continue;
				fill += BP_GET_FILL(ibp);
			}
		}
		DB_DNODE_EXIT(db);

		if (!BP_IS_EMBEDDED(bp))
			bp->blk_fill = fill;

		mutex_exit(&db->db_mtx);

		/* NOTE(review): dn is used here after DB_DNODE_EXIT(). */
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		db->db_blkptr = bp;
		rw_exit(&dn->dn_struct_rwlock);
	}

	/* ARGSUSED */
	/*
	 * This function gets called just prior to running through the compression
	 * stage of the zio pipeline.  If we're an indirect block comprised of only
	 * holes, then we want this indirect to be compressed away to a hole.  In
	 * order to do that we must zero out any information about the holes that
	 * this indirect points to before we try to compress it.
	 */
	static void
	dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
	{
		dmu_buf_impl_t *db = vdb;
		dnode_t *dn;
		blkptr_t *bp;
		unsigned int epbs, i;

		ASSERT3U(db->db_level, >, 0);
		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		/* epbs: log2 of the number of block pointers in this indirect */
		epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
		ASSERT3U(epbs, <, 31);

		/* Determine if all our children are holes */
		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
			if (!BP_IS_HOLE(bp))
				break;
		}

		/*
		 * If all the children are holes, then zero them all out so that
		 * we may get compressed away.
		 */
		if (i == 1 << epbs) {
			/*
			 * We only found holes.  Grab the rwlock to prevent
			 * anybody from reading the blocks we're about to
			 * zero out.
			 */
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			bzero(db->db.db_data, db->db.db_size);
			rw_exit(&dn->dn_struct_rwlock);
		}
		DB_DNODE_EXIT(db);
	}

	/*
	 * The SPA will call this callback several times for each zio - once
	 * for every physical child i/o (zio->io_phys_children times).  This
	 * allows the DMU to monitor the progress of each logical i/o.  For example,
	 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
	 * block.  There may be a long delay before all copies/fragments are completed,
	 * so this callback allows us to retire dirty space gradually, as the physical
	 * i/os complete.
	 */
	/* ARGSUSED */
	static void
	dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
	{
		dmu_buf_impl_t *db = arg;
		objset_t *os = db->db_objset;
		dsl_pool_t *dp = dmu_objset_pool(os);
		dbuf_dirty_record_t *dr;
		int delta = 0;

		dr = db->db_data_pending;
		ASSERT3U(dr->dr_txg, ==, zio->io_txg);

		/*
		 * The callback will be called io_phys_children times.  Retire one
		 * portion of our dirty space each time we are called.  Any rounding
		 * error will be cleaned up by dsl_pool_sync()'s call to
		 * dsl_pool_undirty_space().
		 */
		delta = dr->dr_accounted / zio->io_phys_children;
		dsl_pool_undirty_space(dp, delta, zio->io_txg);
	}

	/*
	 * "Done" callback for a dbuf write.  Performs the dataset accounting for
	 * the old and new block pointers (skipped for rewrites and nopwrites),
	 * retires the pending dirty record, wakes waiters on db_changed and drops
	 * the hold that was tagged with the syncing txg.
	 */
	/* ARGSUSED */
	static void
	dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
	{
		dmu_buf_impl_t *db = vdb;
		blkptr_t *bp_orig = &zio->io_bp_orig;
		blkptr_t *bp = db->db_blkptr;
		objset_t *os = db->db_objset;
		dmu_tx_t *tx = os->os_synctx;
		dbuf_dirty_record_t **drp, *dr;

		ASSERT0(zio->io_error);
		ASSERT(db->db_blkptr == bp);

		/*
		 * For nopwrites and rewrites we ensure that the bp matches our
		 * original and bypass all the accounting.
		 */
		if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
			ASSERT(BP_EQUAL(bp, bp_orig));
		} else {
			dsl_dataset_t *ds = os->os_dsl_dataset;
			(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
			dsl_dataset_block_born(ds, bp, tx);
		}

		mutex_enter(&db->db_mtx);

		DBUF_VERIFY(db);

		/* Find the pending record in the dirty chain and unlink it. */
		drp = &db->db_last_dirty;
		while ((dr = *drp) != db->db_data_pending)
			drp = &dr->dr_next;
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		ASSERT(dr->dr_dbuf == db);
		ASSERT(dr->dr_next == NULL);
		*drp = dr->dr_next;

	#ifdef ZFS_DEBUG
		if (db->db_blkid == DMU_SPILL_BLKID) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
			ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
			    db->db_blkptr == &dn->dn_phys->dn_spill);
			DB_DNODE_EXIT(db);
		}
	#endif

		if (db->db_level == 0) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
			if (db->db_state != DB_NOFILL) {
				/* Free the private copy made for this write, if any. */
				if (dr->dt.dl.dr_data != db->db_buf)
					arc_buf_destroy(dr->dt.dl.dr_data, db);
			}
		} else {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
			ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
			if (!BP_IS_HOLE(db->db_blkptr)) {
				int epbs =
				    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
				ASSERT3U(db->db_blkid, <=,
				    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
				    db->db.db_size);
			}
			DB_DNODE_EXIT(db);
			mutex_destroy(&dr->dt.di.dr_mtx);
			list_destroy(&dr->dt.di.dr_children);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));

		cv_broadcast(&db->db_changed);
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		db->db_data_pending = NULL;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
	}

	/*
	 * zio "ready" callback for NOFILL writes: forwards to dbuf_write_ready()
	 * with no ARC buffer, using the zio's private pointer as the dbuf.
	 */
	static void
	dbuf_write_nofill_ready(zio_t *zio)
	{
		void *vdb = zio->io_private;

		dbuf_write_ready(zio, NULL, vdb);
	}

	/*
	 * zio "done" callback for NOFILL writes: forwards to dbuf_write_done()
	 * with no ARC buffer, using the zio's private pointer as the dbuf.
	 */
	static void
	dbuf_write_nofill_done(zio_t *zio)
	{
		void *vdb = zio->io_private;

		dbuf_write_done(zio, NULL, vdb);
	}

	/*
	 * zio "ready" callback for overridden writes.  Here the zio's private
	 * pointer is the dirty record, so the dbuf is fetched from it before
	 * forwarding to dbuf_write_ready().
	 */
	static void
	dbuf_write_override_ready(zio_t *zio)
	{
		dbuf_dirty_record_t *record = zio->io_private;

		dbuf_write_ready(zio, NULL, record->dr_dbuf);
	}

	/*
	 * zio "done" callback for overridden writes.  If the block pointer that
	 * was finally written differs from the one supplied by the override, the
	 * overriding block is freed (unless it is a hole) and the dirty data is
	 * released from the ARC.  Then the common dbuf_write_done() processing
	 * runs and the zio's abd, if any, is put.
	 */
	static void
	dbuf_write_override_done(zio_t *zio)
	{
		dbuf_dirty_record_t *record = zio->io_private;
		dmu_buf_impl_t *dbuf = record->dr_dbuf;
		blkptr_t *override_bp = &record->dt.dl.dr_overridden_by;

		mutex_enter(&dbuf->db_mtx);
		if (!BP_EQUAL(zio->io_bp, override_bp)) {
			if (!BP_IS_HOLE(override_bp)) {
				dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg,
				    override_bp);
			}
			arc_release(record->dt.dl.dr_data, dbuf);
		}
		mutex_exit(&dbuf->db_mtx);

		dbuf_write_done(zio, NULL, dbuf);

		if (zio->io_abd == NULL)
			return;
		abd_put(zio->io_abd);
	}

	+/*
	+ * Context passed to dbuf_remap_impl_callback() through its void *arg;
	+ * filled in by dbuf_remap_impl() for each block pointer it tries to remap.
	+ */
	+typedef struct dbuf_remap_impl_callback_arg {
	+ objset_t *drica_os;		/* objset of the dnode being remapped */
	+ uint64_t drica_blk_birth;	/* blk_birth of the original block pointer */
	+ dmu_tx_t *drica_tx;		/* transaction handed on to the callback */
	+} dbuf_remap_impl_callback_arg_t;
	+
	+/*
	+ * Callback given to spa_remap_blkptr() by dbuf_remap_impl().  It receives
	+ * a (vdev, offset, size) triple plus the dbuf_remap_impl_callback_arg_t
	+ * context.  For the pool's meta objset the range is marked obsolete
	+ * directly; for any other objset it is reported to the owning dataset
	+ * together with the original birth txg.
	+ */
	+static void
	+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
	+    void *arg)
	+{
	+	dbuf_remap_impl_callback_arg_t *ctx = arg;
	+	objset_t *os = ctx->drica_os;
	+	spa_t *spa = dmu_objset_spa(os);
	+
	+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
	+
	+	if (os != spa_meta_objset(spa)) {
	+		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
	+		    size, ctx->drica_blk_birth, ctx->drica_tx);
	+		return;
	+	}
	+
	+	spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size,
	+	    ctx->drica_tx);
	+}
	+
	+/*
	+ * Try to remap the single block pointer bp, which belongs to dn.  The
	+ * remap is attempted on a private copy; only when spa_remap_blkptr()
	+ * reports a change is the copy stored back into *bp.
	+ */
	+static void
	+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
	+{
	+	blkptr_t bp_copy = *bp;
	+	spa_t *spa = dmu_objset_spa(dn->dn_objset);
	+	dbuf_remap_impl_callback_arg_t drica;
	+
	+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
	+
	+	drica.drica_os = dn->dn_objset;
	+	drica.drica_blk_birth = bp->blk_birth;
	+	drica.drica_tx = tx;
	+	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
	+	    &drica)) {
	+		/*
	+		 * The struct_rwlock prevents dbuf_read_impl() from
	+		 * dereferencing the BP while we are changing it.  To
	+		 * avoid lock contention, only grab it when we are actually
	+		 * changing the BP.
	+		 */
	+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	+		*bp = bp_copy;
	+		rw_exit(&dn->dn_struct_rwlock);
	+	}
	+}
	+
	+/*
	+ * Report whether dbuf_remap() would change this indirect dbuf.  Each block
	+ * pointer in the dbuf is copied and the copy is offered to
	+ * spa_remap_blkptr() (without a callback); the scan stops at the first
	+ * pointer that would be remapped.  The dbuf itself is never modified.
	+ */
	+boolean_t
	+dbuf_can_remap(const dmu_buf_impl_t *db)
	+{
	+	spa_t *spa = dmu_objset_spa(db->db_objset);
	+	blkptr_t *bps = db->db.db_data;
	+	boolean_t remappable = B_FALSE;
	+	int i = 0;
	+
	+	ASSERT3U(db->db_level, >, 0);
	+	ASSERT3S(db->db_state, ==, DB_CACHED);
	+
	+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
	+
	+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+	while (!remappable && i < db->db.db_size >> SPA_BLKPTRSHIFT) {
	+		blkptr_t probe = bps[i];
	+
	+		if (spa_remap_blkptr(spa, &probe, NULL, NULL))
	+			remappable = B_TRUE;
	+		i++;
	+	}
	+	spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	+	return (remappable);
	+}
	+
	+/*
	+ * Report whether any of the block pointers stored directly in the dnode
	+ * (dn_phys->dn_blkptr[0 .. dn_nblkptr-1]) would be changed by a remap.
	+ * A dnode with zero levels never needs one.  Only copies of the pointers
	+ * are probed; the dnode is not modified.
	+ */
	+boolean_t
	+dnode_needs_remap(const dnode_t *dn)
	+{
	+	spa_t *spa = dmu_objset_spa(dn->dn_objset);
	+	boolean_t needed = B_FALSE;
	+	int j = 0;
	+
	+	if (dn->dn_phys->dn_nlevels == 0)
	+		return (B_FALSE);
	+
	+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
	+
	+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+	while (!needed && j < dn->dn_phys->dn_nblkptr) {
	+		blkptr_t probe = dn->dn_phys->dn_blkptr[j];
	+
	+		if (spa_remap_blkptr(spa, &probe, NULL, NULL))
	+			needed = B_TRUE;
	+		j++;
	+	}
	+	spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	+	return (needed);
	+}
	+
	+/*
	+ * Remap any existing BP's to concrete vdevs, if possible.
	+ *
	+ * Does nothing unless the device removal feature is active.  For an
	+ * indirect dbuf every block pointer it contains is offered to
	+ * dbuf_remap_impl(); for a level-0 dbuf of the meta-dnode object, every
	+ * block pointer of every dnode stored in the buffer is.  Other level-0
	+ * dbufs are left alone.
	+ */
	+static void
	+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
	+{
	+	spa_t *spa = dmu_objset_spa(db->db_objset);
	+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
	+
	+	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
	+		return;
	+
	+	if (db->db_level > 0) {
	+		blkptr_t *bp = db->db.db_data;
	+		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
	+			dbuf_remap_impl(dn, &bp[i], tx);
	+		}
	+	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
	+		dnode_phys_t *dnp = db->db.db_data;
	+		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
	+		    DMU_OT_DNODE);
	+		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
	+			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
	+				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
	+			}
	+		}
	+	}
	+}
	+
	+
	/*
	 * Issue I/O to commit a dirty buffer to disk.
	 *
	 * Creates the write zio for dirty record dr as a child of the parent's
	 * zio (the parent indirect block's pending zio, or the dnode's zio when
	 * the parent is the dnode itself) and stores it in dr->dr_zio.  The zio is
	 * only created here; the callers issue it.  Must run in syncing context.
	 */
	static void
	dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = dr->dr_dbuf;
		dnode_t *dn;
		objset_t *os;
		dmu_buf_impl_t *parent = db->db_parent;
		uint64_t txg = tx->tx_txg;
		zbookmark_phys_t zb;
		zio_prop_t zp;
		zio_t *zio;
		int wp_flag = 0;

		ASSERT(dmu_tx_is_syncing(tx));

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		os = dn->dn_objset;

		if (db->db_state != DB_NOFILL) {
			if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
				/*
				 * Private object buffers are released here rather
				 * than in dbuf_dirty() since they are only modified
				 * in the syncing context and we don't want the
				 * overhead of making multiple copies of the data.
				 */
				if (BP_IS_HOLE(db->db_blkptr)) {
					arc_buf_thaw(data);
				} else {
					dbuf_release_bp(db);
				}
	+			dbuf_remap(dn, db, tx);
			}
		}

		if (parent != dn->dn_dbuf) {
			/* Our parent is an indirect block. */
			/* We have a dirty parent that has been scheduled for write. */
			ASSERT(parent && parent->db_data_pending);
			/* Our parent's buffer is one level closer to the dnode. */
			ASSERT(db->db_level == parent->db_level-1);
			/*
			 * We're about to modify our parent's db_data by modifying
			 * our block pointer, so the parent must be released.
			 */
			ASSERT(arc_released(parent->db_buf));
			zio = parent->db_data_pending->dr_zio;
		} else {
			/* Our parent is the dnode itself. */
			ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
			    db->db_blkid != DMU_SPILL_BLKID) ||
			    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
			zio = dn->dn_zio;
		}

		ASSERT(db->db_level == 0 || data == db->db_buf);
		ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
		ASSERT(zio);

		SET_BOOKMARK(&zb, os->os_dsl_dataset ?
		    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
		    db->db.db_object, db->db_level, db->db_blkid);

		if (db->db_blkid == DMU_SPILL_BLKID)
			wp_flag = WP_SPILL;
		wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

		dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
		DB_DNODE_EXIT(db);

		/*
		 * We copy the blkptr now (rather than when we instantiate the dirty
		 * record), because its value can change between open context and
		 * syncing context.  We do not need to hold dn_struct_rwlock to read
		 * db_blkptr because we are in syncing context.
		 */
		dr->dr_bp_copy = *db->db_blkptr;

		if (db->db_level == 0 &&
		    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
			/*
			 * The BP for this block has been provided by open context
			 * (by dmu_sync() or dmu_buf_write_embedded()).
			 */
			abd_t *contents = (data != NULL) ?
			    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;

			dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
			    contents, db->db.db_size, db->db.db_size, &zp,
			    dbuf_write_override_ready, NULL, NULL,
			    dbuf_write_override_done,
			    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
			mutex_enter(&db->db_mtx);
			dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
			zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
			    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
			mutex_exit(&db->db_mtx);
		} else if (db->db_state == DB_NOFILL) {
			ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
			    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
			dr->dr_zio = zio_write(zio, os->os_spa, txg,
			    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
			    dbuf_write_nofill_ready, NULL, NULL,
			    dbuf_write_nofill_done, db,
			    ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
		} else {
			ASSERT(arc_released(data));

			/*
			 * For indirect blocks, we want to setup the children
			 * ready callback so that we can properly handle an indirect
			 * block that only contains holes.
			 */
			arc_done_func_t *children_ready_cb = NULL;
			if (db->db_level != 0)
				children_ready_cb = dbuf_write_children_ready;

			dr->dr_zio = arc_write(zio, os->os_spa, txg,
			    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
			    &zp, dbuf_write_ready, children_ready_cb,
			    dbuf_write_physdone, dbuf_write_done, db,
			    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		}
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c (revision 332525)
	@@ -1,1164 +1,1164 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/zio.h>
	#include <sys/ddt.h>
	#include <sys/zap.h>
	#include <sys/dmu_tx.h>
	#include <sys/arc.h>
	#include <sys/dsl_pool.h>
	#include <sys/zio_checksum.h>
	#include <sys/zio_compress.h>
	#include <sys/dsl_scan.h>
	#include <sys/abd.h>

	/*
	* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
	* Non-zero enables prefetching; the default is 1 (enabled).
	*/
	int zfs_dedup_prefetch = 1;

	/*
	* Sysctl registration: declare the existing _vfs_zfs node, create a
	* "dedup" child node under it, and expose zfs_dedup_prefetch as the
	* integer "prefetch" below that node (CTLFLAG_RWTUN).
	*/
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
	SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch,
	0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");

	/*
	* Operations vector for each DDT storage type, indexed by enum ddt_type.
	* The ddt_object_*() functions dispatch through ddt_ops[type].
	*/
	static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,
	};

	/*
	* Printable name of each DDT class, indexed by enum ddt_class.  Used by
	* ddt_object_name() when composing an object's name.
	*/
	static const char *ddt_class_name[DDT_CLASSES] = {
	"ditto",
	"duplicate",
	"unique",
	};

	/*
	 * Create the on-disk object for the given DDT type and class.
	 *
	 * The object number slot must be empty on entry.  After the type-specific
	 * create operation has filled it in, the object is registered by name in
	 * the pool directory and its histogram is added to the DDT statistics
	 * object.  Any failure is fatal (VERIFY).
	 */
	static void
	ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    dmu_tx_t *tx)
	{
		char name[DDT_NAMELEN];
		uint64_t *objectp;
		objset_t *os;
		spa_t *spa;
		boolean_t prehash;

		spa = ddt->ddt_spa;
		os = ddt->ddt_os;
		objectp = &ddt->ddt_object[type][class];
		prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP;

		ddt_object_name(ddt, type, class, name);

		ASSERT(*objectp == 0);
		VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
		ASSERT(*objectp != 0);

		/* Make the new object discoverable by name. */
		VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
		    sizeof (uint64_t), 1, objectp, tx) == 0);

		/* Store the histogram as an array of uint64_t words. */
		VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
		    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
		    &ddt->ddt_histogram[type][class], tx) == 0);
	}

	/*
	 * Destroy the on-disk object for the given DDT type and class.
	 *
	 * The object must exist and be empty.  Its name is removed from the pool
	 * directory and from the DDT statistics object, the type-specific destroy
	 * operation is run, and the cached statistics and object number are
	 * cleared.  Any failure is fatal (VERIFY).
	 */
	static void
	ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    dmu_tx_t *tx)
	{
		char name[DDT_NAMELEN];
		uint64_t count;
		uint64_t *objectp;
		objset_t *os;
		spa_t *spa;

		spa = ddt->ddt_spa;
		os = ddt->ddt_os;
		objectp = &ddt->ddt_object[type][class];

		ddt_object_name(ddt, type, class, name);

		ASSERT(*objectp != 0);
		VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
		ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
		VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
		VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
		VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
		bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));

		/* Mark the slot as "no object". */
		*objectp = 0;
	}

	/*
	 * Load the object number, histogram and cached statistics for the given
	 * DDT type and class.
	 *
	 * Returns 0 on success.  The error from the pool directory lookup (for
	 * example when the object does not exist) or from counting the entries is
	 * returned to the caller; failure to read the histogram or the object
	 * info is fatal (VERIFY).
	 */
	static int
	ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
	{
		ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
		dmu_object_info_t doi;
		uint64_t count;
		char name[DDT_NAMELEN];
		int error;

		ddt_object_name(ddt, type, class, name);

		error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
		    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);

		if (error != 0)
			return (error);

		VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
		    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
		    &ddt->ddt_histogram[type][class]));

		/*
		 * Seed the cached statistics.
		 */
		VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);

		error = ddt_object_count(ddt, type, class, &count);
		if (error != 0)
			return (error);

		ddo->ddo_count = count;
		/* doi_physical_blocks_512 counts 512-byte units; << 9 gives bytes */
		ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
		ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;

		return (0);
	}

	/*
	 * Write the in-core histogram for the given DDT type and class to the
	 * DDT statistics object and refresh the cached object statistics.
	 * Any failure is fatal (VERIFY).
	 */
	static void
	ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    dmu_tx_t *tx)
	{
		char name[DDT_NAMELEN];
		uint64_t count;
		dmu_object_info_t doi;
		ddt_object_t *ddo;

		ddt_object_name(ddt, type, class, name);

		VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
		    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
		    &ddt->ddt_histogram[type][class], tx) == 0);

		/*
		 * Cache DDT statistics; this is the only time they'll change.
		 */
		VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
		VERIFY(ddt_object_count(ddt, type, class, &count) == 0);

		ddo = &ddt->ddt_object_stats[type][class];
		ddo->ddo_count = count;
		ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
		ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
	}

	/*
	 * Look dde up in the object for the given DDT type and class.  Returns
	 * ENOENT when that object does not exist, otherwise the result of the
	 * type-specific lookup operation.
	 */
	static int
	ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    ddt_entry_t *dde)
	{
		if (ddt_object_exists(ddt, type, class)) {
			return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
			    ddt->ddt_object[type][class], dde));
		}

		return (SET_ERROR(ENOENT));
	}

	/*
	 * Ask the type's ops vector to prefetch dde from the (type, class)
	 * object.  Does nothing if that object does not exist.
	 */
	static void
	ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    ddt_entry_t *dde)
	{
		if (ddt_object_exists(ddt, type, class)) {
			ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
			    ddt->ddt_object[type][class], dde);
		}
	}

	/*
	 * Store dde in the (type, class) object within transaction tx.
	 * The object must already exist (asserted).  Returns the result of the
	 * type's ddt_op_update.
	 */
	int
	ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    ddt_entry_t *dde, dmu_tx_t *tx)
	{
		ASSERT(ddt_object_exists(ddt, type, class));

		return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
		    ddt->ddt_object[type][class], dde, tx));
	}

	/*
	 * Remove dde from the (type, class) object within transaction tx.
	 * The object must already exist (asserted).  Returns the result of the
	 * type's ddt_op_remove.
	 */
	static int
	ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    ddt_entry_t *dde, dmu_tx_t *tx)
	{
		ASSERT(ddt_object_exists(ddt, type, class));

		return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
		    ddt->ddt_object[type][class], dde, tx));
	}

	/*
	 * Fetch the next entry of the (type, class) object into dde, using
	 * *walk as the iteration cursor.  The object must exist (asserted).
	 * Returns the result of the type's ddt_op_walk.
	 */
	int
	ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    uint64_t *walk, ddt_entry_t *dde)
	{
		ASSERT(ddt_object_exists(ddt, type, class));

		return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
		    ddt->ddt_object[type][class], dde, walk));
	}

	/*
	 * Store the number of entries in the (type, class) object in *count.
	 * The object must exist (asserted).  Returns the result of the type's
	 * ddt_op_count.
	 */
	int
	ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    uint64_t *count)
	{
		ASSERT(ddt_object_exists(ddt, type, class));

		return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
		    ddt->ddt_object[type][class], count));
	}

	/*
	 * Fill *doi with the DMU object info of the (type, class) object.
	 * Returns ENOENT if the object does not exist, otherwise the result of
	 * dmu_object_info().
	 */
	int
	ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    dmu_object_info_t *doi)
	{
		if (ddt_object_exists(ddt, type, class)) {
			return (dmu_object_info(ddt->ddt_os,
			    ddt->ddt_object[type][class], doi));
		}

		return (SET_ERROR(ENOENT));
	}

	/*
	 * Report whether the (type, class) object exists, i.e. whether its
	 * recorded object number is non-zero.
	 */
	boolean_t
	ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
	{
		return (ddt->ddt_object[type][class] != 0 ? B_TRUE : B_FALSE);
	}

	/*
	 * Format the directory name of the (type, class) object into name from
	 * the checksum name, the ops name and the class name.
	 *
	 * name must point to a buffer of at least DDT_NAMELEN bytes, which is
	 * what the callers in this file pass.  The write is bounded by that
	 * size so an unexpectedly long component cannot overrun the buffer.
	 */
	void
	ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
	    char *name)
	{
		(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
		    zio_checksum_table[ddt->ddt_checksum].ci_name,
		    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
	}

	/*
	 * Copy the DVAs of ddp into bp and set bp's birth to (txg, ddp's
	 * physical birth).  txg must be non-zero (asserted).
	 */
	void
	ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
	{
		ASSERT(txg != 0);

		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
			bp->blk_dva[d] = ddp->ddp_dva[d];
		BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
	}

	/*
	 * Build a block pointer in *bp from a DDT key and, optionally, one of
	 * its physical variants.
	 *
	 * bp is zeroed first.  If ddp is non-NULL its DVAs and birth are copied
	 * in; otherwise those stay zero.  Checksum, sizes and compression come
	 * from ddk; the type is DMU_OT_DEDUP, level 0, dedup bit clear, host
	 * byte order.
	 */
	void
	ddt_bp_create(enum zio_checksum checksum,
	    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
	{
		BP_ZERO(bp);

		if (ddp != NULL)
			ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

		bp->blk_cksum = ddk->ddk_cksum;
		bp->blk_fill = 1;

		BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
		BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
		BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
		BP_SET_CHECKSUM(bp, checksum);
		BP_SET_TYPE(bp, DMU_OT_DEDUP);
		BP_SET_LEVEL(bp, 0);
		BP_SET_DEDUP(bp, 0);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	}

	/*
	 * Derive a DDT key from a block pointer: copy the checksum, clear the
	 * property word, then record logical size, physical size and
	 * compression from bp.
	 */
	void
	ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
	{
		ddk->ddk_cksum = bp->blk_cksum;
		ddk->ddk_prop = 0;

		DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
	}

	/*
	 * Populate an unused physical variant from bp: copy all DVAs and record
	 * bp's physical birth.  ddp must not already have a birth (asserted).
	 */
	void
	ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
	{
		ASSERT(ddp->ddp_phys_birth == 0);

		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
			ddp->ddp_dva[d] = bp->blk_dva[d];
		ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
	}

	/*
	 * Reset a physical variant to all zeroes.
	 */
	void
	ddt_phys_clear(ddt_phys_t *ddp)
	{
		bzero(ddp, sizeof (ddt_phys_t));
	}

	/*
	 * Take one more reference on a physical variant.
	 */
	void
	ddt_phys_addref(ddt_phys_t *ddp)
	{
		ddp->ddp_refcnt += 1;
	}

	/*
	 * Drop one reference on a physical variant.  The count must be
	 * positive when viewed as a signed value (asserted).
	 */
	void
	ddt_phys_decref(ddt_phys_t *ddp)
	{
		ASSERT((int64_t)ddp->ddp_refcnt > 0);
		ddp->ddp_refcnt -= 1;
	}

	/*
	 * Free the block described by (ddk, ddp) in txg and clear ddp.
	 *
	 * The block pointer is built before ddp is cleared, because clearing
	 * wipes the DVAs that zio_free() needs.
	 */
	void
	ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
	{
		blkptr_t blk;

		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		ddt_phys_clear(ddp);
		zio_free(ddt->ddt_spa, txg, &blk);
	}

	ddt_phys_t *
	ddt_phys_select(const ddt_entry_t dde, const blkptr_t bp)
	{
	ddt_phys_t ddp = (ddt_phys_t )dde->dde_phys;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
	if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
	BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
	return (ddp);
	}
	return (NULL);
	}

	/*
	 * Sum the reference counts of the physical variants from
	 * DDT_PHYS_SINGLE through DDT_PHYS_TRIPLE, inclusive.
	 */
	uint64_t
	ddt_phys_total_refcnt(const ddt_entry_t *dde)
	{
		uint64_t total = 0;
		int p = DDT_PHYS_SINGLE;

		while (p <= DDT_PHYS_TRIPLE) {
			total += dde->dde_phys[p].ddp_refcnt;
			p++;
		}

		return (total);
	}

	/*
	 * Compute the statistics contribution of a single entry into *dds.
	 *
	 * dds is zeroed, then every physical variant with a non-zero birth adds
	 * one block plus its logical, physical and allocated sizes, and the
	 * same quantities multiplied by its reference count to the "ref"
	 * totals.
	 */
	static void
	ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
	{
		spa_t *spa = ddt->ddt_spa;
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_key_t *ddk = &dde->dde_key;
		uint64_t lsize = DDK_GET_LSIZE(ddk);
		uint64_t psize = DDK_GET_PSIZE(ddk);

		bzero(dds, sizeof (*dds));

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			uint64_t dsize = 0;
			uint64_t refcnt = ddp->ddp_refcnt;

			/* Unused variant: contributes nothing. */
			if (ddp->ddp_phys_birth == 0)
				continue;

			for (int d = 0; d < SPA_DVAS_PER_BP; d++)
				dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);

			dds->dds_blocks += 1;
			dds->dds_lsize += lsize;
			dds->dds_psize += psize;
			dds->dds_dsize += dsize;

			dds->dds_ref_blocks += refcnt;
			dds->dds_ref_lsize += lsize * refcnt;
			dds->dds_ref_psize += psize * refcnt;
			dds->dds_ref_dsize += dsize * refcnt;
		}
	}

	/*
	 * Add (neg == 0) or subtract (neg == -1ULL) *src to/from *dst, treating
	 * both structures as flat arrays of uint64_t.
	 *
	 * (x ^ neg) - neg is x when neg is 0 and the two's-complement negation
	 * of x when neg is all ones, so one loop serves both directions.
	 */
	void
	ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
	{
		const uint64_t *s = (const uint64_t *)src;
		uint64_t *d = (uint64_t *)dst;
		uint64_t *d_end = (uint64_t *)(dst + 1);

		ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */

		while (d < d_end)
			*d++ += (*s++ ^ neg) - neg;
	}

	/*
	 * Add (neg == 0) or remove (neg == -1ULL) an entry's statistics to/from
	 * the histogram of its current type and class.
	 *
	 * The bucket is the index of the highest set bit of the entry's
	 * referenced-block count, so that count must be non-zero (asserted).
	 */
	static void
	ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
	{
		ddt_stat_t dds;
		ddt_histogram_t *ddh;
		int bucket;

		ddt_stat_generate(ddt, dde, &dds);

		bucket = highbit64(dds.dds_ref_blocks) - 1;
		ASSERT(bucket >= 0);

		ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];

		ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
	}

	/*
	 * Add every one of the 64 buckets of *src into the matching bucket of
	 * *dst.
	 */
	void
	ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
	{
		for (int h = 0; h < 64; h++)
			ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
	}

	/*
	 * Collapse a histogram into a single total: zero *dds, then add all 64
	 * buckets of *ddh into it.
	 */
	void
	ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
	{
		bzero(dds, sizeof (*dds));

		for (int h = 0; h < 64; h++)
			ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
	}

	/*
	 * Return B_TRUE iff every 64-bit word of the histogram is zero.
	 */
	boolean_t
	ddt_histogram_empty(const ddt_histogram_t *ddh)
	{
		const uint64_t *s = (const uint64_t *)ddh;
		const uint64_t *s_end = (const uint64_t *)(ddh + 1);

		while (s < s_end) {
			if (*s++ != 0)
				return (B_FALSE);
		}

		return (B_TRUE);
	}

	/*
	 * Accumulate the cached per-object statistics of every DDT in the pool
	 * into *ddo_total, then turn the space totals into per-entry averages.
	 *
	 * Totals are added to whatever *ddo_total already holds; it is not
	 * zeroed here.
	 */
	void
	ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
	{
		/* Sum the statistics we cached in ddt_object_sync(). */
		for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
			ddt_t *ddt = spa->spa_ddt[c];
			for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
				for (enum ddt_class class = 0; class < DDT_CLASSES;
				    class++) {
					ddt_object_t *ddo =
					    &ddt->ddt_object_stats[type][class];
					ddo_total->ddo_count += ddo->ddo_count;
					ddo_total->ddo_dspace += ddo->ddo_dspace;
					ddo_total->ddo_mspace += ddo->ddo_mspace;
				}
			}
		}

		/* ... and compute the averages. */
		if (ddo_total->ddo_count != 0) {
			ddo_total->ddo_dspace /= ddo_total->ddo_count;
			ddo_total->ddo_mspace /= ddo_total->ddo_count;
		}
	}

	/*
	 * Add the cached histogram of every (checksum, type, class) table in
	 * the pool into *ddh.  *ddh is not zeroed here.
	 */
	void
	ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
	{
		for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
			ddt_t *ddt = spa->spa_ddt[c];
			for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
				for (enum ddt_class class = 0; class < DDT_CLASSES;
				    class++) {
					ddt_histogram_add(ddh,
					    &ddt->ddt_histogram_cache[type][class]);
				}
			}
		}
	}

	/*
	 * Compute pool-wide dedup totals into *dds_total by summing all cached
	 * histograms and collapsing the result.
	 *
	 * The scratch histogram comes from kmem_zalloc() rather than the stack
	 * and is freed before returning.
	 */
	void
	ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
	{
		ddt_histogram_t *ddh_total;

		ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
		ddt_get_dedup_histogram(spa, ddh_total);
		ddt_histogram_stat(dds_total, ddh_total);
		kmem_free(ddh_total, sizeof (ddt_histogram_t));
	}

	/*
	 * Return the pool-wide difference between referenced allocated size
	 * and actual allocated size of deduplicated blocks.
	 */
	uint64_t
	ddt_get_dedup_dspace(spa_t *spa)
	{
		ddt_stat_t totals = { 0 };
		uint64_t referenced, allocated;

		ddt_get_dedup_stats(spa, &totals);

		referenced = totals.dds_ref_dsize;
		allocated = totals.dds_dsize;

		return (referenced - allocated);
	}

	/*
	 * Return the pool dedup ratio as a percentage: referenced allocated
	 * size * 100 / actual allocated size.  Returns 100 when nothing is
	 * allocated.
	 */
	uint64_t
	ddt_get_pool_dedup_ratio(spa_t *spa)
	{
		ddt_stat_t totals = { 0 };

		ddt_get_dedup_stats(spa, &totals);

		if (totals.dds_dsize != 0)
			return (totals.dds_ref_dsize * 100 / totals.dds_dsize);

		return (100);
	}

	/*
	 * Return how many additional copies of an entry's data are wanted.
	 *
	 * References are summed over the SINGLE..TRIPLE variants: committed
	 * refs, plus the parent count of any lead zio (pending refs), plus one
	 * for ddp_willref if it names that variant.  Each variant with any
	 * references adds its index p to the number of copies present.
	 *
	 * One copy is desired once there is any reference, a second at
	 * spa_dedup_ditto references and a third at spa_dedup_ditto squared.
	 * A threshold of 0 or above UINT32_MAX is clamped to UINT32_MAX, which
	 * also keeps the square within 64 bits.
	 */
	int
	ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
	{
		spa_t *spa = ddt->ddt_spa;
		uint64_t total_refcnt = 0;
		uint64_t ditto = spa->spa_dedup_ditto;
		int total_copies = 0;
		int desired_copies = 0;

		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
			ddt_phys_t *ddp = &dde->dde_phys[p];
			zio_t *zio = dde->dde_lead_zio[p];
			uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
			if (zio != NULL)
				refcnt += zio->io_parent_count;	/* pending refs */
			if (ddp == ddp_willref)
				refcnt++;			/* caller's ref */
			if (refcnt != 0) {
				total_refcnt += refcnt;
				total_copies += p;
			}
		}

		if (ditto == 0 || ditto > UINT32_MAX)
			ditto = UINT32_MAX;

		if (total_refcnt >= 1)
			desired_copies++;
		if (total_refcnt >= ditto)
			desired_copies++;
		if (total_refcnt >= ditto * ditto)
			desired_copies++;

		return (MAX(desired_copies, total_copies) - total_copies);
	}

	/*
	 * Count the copies held by an entry's DITTO variant: the number of
	 * valid DVAs, less the gang bit of the first DVA.
	 */
	int
	ddt_ditto_copies_present(ddt_entry_t *dde)
	{
		dva_t *dva = dde->dde_phys[DDT_PHYS_DITTO].ddp_dva;
		int copies = 0 - DVA_GET_GANG(dva);

		for (int d = 0; d < SPA_DVAS_PER_BP; d++) {
			if (DVA_IS_VALID(&dva[d]))
				copies++;
		}

		ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);

		return (copies);
	}

	/*
	 * Compress s_len bytes at src into dst, prefixed by a one-byte header.
	 *
	 * The header holds the compression function, with
	 * DDT_COMPRESS_BYTEORDER_MASK OR-ed in when ZFS_HOST_BYTEORDER is
	 * non-zero.  ZLE is tried first; if the compressor returns s_len the
	 * data is stored uncompressed.  d_len must be at least s_len + 1
	 * (asserted).  Returns the bytes written, header included.
	 */
	size_t
	ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
	{
		uchar_t *version = dst++;
		int cpfunc = ZIO_COMPRESS_ZLE;
		zio_compress_info_t *ci = &zio_compress_table[cpfunc];
		size_t c_len;

		ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */

		c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);

		if (c_len == s_len) {
			cpfunc = ZIO_COMPRESS_OFF;
			bcopy(src, dst, s_len);
		}

		*version = cpfunc;
		/* CONSTCOND */
		if (ZFS_HOST_BYTEORDER)
			*version |= DDT_COMPRESS_BYTEORDER_MASK;

		return (c_len + 1);
	}

	/*
	 * Inverse of ddt_compress(): read the header byte at src, then
	 * decompress (or plainly copy, when the function has no decompressor)
	 * the payload into dst.
	 *
	 * If the byte-order flag in the header differs from the host's, the
	 * output is byte-swapped as an array of uint64_t.  The decompressor's
	 * return value is ignored.
	 */
	void
	ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
	{
		uchar_t version = *src++;
		int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
		zio_compress_info_t *ci = &zio_compress_table[cpfunc];

		if (ci->ci_decompress != NULL)
			(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
		else
			bcopy(src, dst, d_len);

		if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
		    (ZFS_HOST_BYTEORDER != 0))
			byteswap_uint64_array(dst, d_len);
	}

	/*
	 * Return the pool's DDT for checksum function c.
	 */
	ddt_t *
	ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
	{
		ddt_t *table = spa->spa_ddt[c];

		return (table);
	}

	/*
	 * Return the pool's DDT for the checksum function recorded in bp.
	 */
	ddt_t *
	ddt_select(spa_t *spa, const blkptr_t *bp)
	{
		return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
	}

	/*
	 * Acquire the table's ddt_lock mutex.
	 */
	void
	ddt_enter(ddt_t *ddt)
	{
	mutex_enter(&ddt->ddt_lock);
	}

	/*
	 * Release the table's ddt_lock mutex.
	 */
	void
	ddt_exit(ddt_t *ddt)
	{
	mutex_exit(&ddt->ddt_lock);
	}

	/*
	 * Allocate a zero-filled in-core entry for key *ddk, with its
	 * condition variable initialized.  Sleeps for memory (KM_SLEEP).
	 */
	static ddt_entry_t *
	ddt_alloc(const ddt_key_t *ddk)
	{
		ddt_entry_t *entry = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);

		cv_init(&entry->dde_cv, NULL, CV_DEFAULT, NULL);
		entry->dde_key = *ddk;

		return (entry);
	}

	/*
	 * Destroy an in-core entry: free its repair ABD if it has one, destroy
	 * its condition variable and release the memory.  The entry must not
	 * be loading and must have no lead zio outstanding (asserted).
	 */
	static void
	ddt_free(ddt_entry_t *dde)
	{
		ASSERT(!dde->dde_loading);

		for (int p = 0; p < DDT_PHYS_TYPES; p++) {
			ASSERT(dde->dde_lead_zio[p] == NULL);
		}

		if (dde->dde_repair_abd != NULL) {
			abd_free(dde->dde_repair_abd);
		}

		cv_destroy(&dde->dde_cv);
		kmem_free(dde, sizeof (ddt_entry_t));
	}

	/*
	 * Remove dde from the table's in-core tree and free it.
	 * The caller must hold ddt_lock (asserted).
	 */
	void
	ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
	{
		ASSERT(MUTEX_HELD(&ddt->ddt_lock));

		avl_remove(&ddt->ddt_tree, dde);
		ddt_free(dde);
	}

	/*
	 * Find, or with add set create, the in-core entry for bp and make sure
	 * it has been loaded from the on-disk tables.
	 *
	 * The caller must hold ddt_lock (asserted).  The lock is dropped while
	 * the on-disk objects are searched and retaken before returning.
	 * Returns NULL only when the entry is not in the tree and add is false.
	 *
	 * NOTE(review): this span is a rendered patch hunk.  Lines starting
	 * with "- " / "+ " are the removed / added sides, and the pointer
	 * declarators of ddt and bp appear to have been lost in rendering
	 * (both are dereferenced or passed as pointers below).
	 */
	ddt_entry_t *
	ddt_lookup(ddt_t ddt, const blkptr_t bp, boolean_t add)
	{
	ddt_entry_t *dde, dde_search;
	enum ddt_type type;
	enum ddt_class class;
	avl_index_t where;
	int error;

	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

	/* Only the key of the on-stack search entry is filled in. */
	ddt_key_fill(&dde_search.dde_key, bp);

	dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
	if (dde == NULL) {
	if (!add)
	return (NULL);
	dde = ddt_alloc(&dde_search.dde_key);
	avl_insert(&ddt->ddt_tree, dde, where);
	}

	/* Another thread is loading this entry; wait for it to finish. */
	while (dde->dde_loading)
	cv_wait(&dde->dde_cv, &ddt->ddt_lock);

	if (dde->dde_loaded)
	return (dde);

	/* Mark the entry as loading before the lock is dropped. */
	dde->dde_loading = B_TRUE;

	ddt_exit(ddt);

	error = ENOENT;

	/* Stop at the first (type, class) object that does not say ENOENT. */
	for (type = 0; type < DDT_TYPES; type++) {
	for (class = 0; class < DDT_CLASSES; class++) {
	error = ddt_object_lookup(ddt, type, class, dde);
	- if (error != ENOENT)
	+ if (error != ENOENT) {
	+ ASSERT0(error);
	break;
	+ }
	}
	if (error != ENOENT)
	break;
	}

	- ASSERT(error == 0 \|\| error == ENOENT);
	-
	ddt_enter(ddt);

	ASSERT(dde->dde_loaded == B_FALSE);
	ASSERT(dde->dde_loading == B_TRUE);

	dde->dde_type = type; /* will be DDT_TYPES if no entry found */
	dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
	dde->dde_loaded = B_TRUE;
	dde->dde_loading = B_FALSE;

	/* Found on disk: subtract its current stats from the histogram. */
	if (error == 0)
	ddt_stat_update(ddt, dde, -1ULL);

	/* Wake any threads waiting in the cv_wait() loop above. */
	cv_broadcast(&dde->dde_cv);

	return (dde);
	}

	/*
	 * Issue prefetches for the DDT entry that bp would map to, across every
	 * type and class of its table.
	 *
	 * Does nothing if zfs_dedup_prefetch is off, bp is NULL, or bp does not
	 * have the dedup bit set.
	 */
	void
	ddt_prefetch(spa_t *spa, const blkptr_t *bp)
	{
		ddt_t *ddt;
		ddt_entry_t dde;

		if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
			return;

		/*
		 * We only remove the DDT once all tables are empty and only
		 * prefetch dedup blocks when there are entries in the DDT.
		 * Thus no locking is required as the DDT can't disappear on us.
		 */
		ddt = ddt_select(spa, bp);
		ddt_key_fill(&dde.dde_key, bp);

		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
				ddt_object_prefetch(ddt, type, class, &dde);
			}
		}
	}

	/*
	 * AVL comparator for DDT entries: compare the keys word by word as
	 * arrays of DDT_KEY_WORDS uint64_t values.  Returns -1, 0 or 1.
	 */
	int
	ddt_entry_compare(const void *x1, const void *x2)
	{
		const ddt_entry_t *dde1 = x1;
		const ddt_entry_t *dde2 = x2;
		const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
		const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;

		for (int i = 0; i < DDT_KEY_WORDS; i++) {
			if (u1[i] < u2[i])
				return (-1);
			if (u1[i] > u2[i])
				return (1);
		}

		return (0);
	}

	/*
	 * Allocate and initialize an empty in-core DDT for checksum function c:
	 * its lock, the live-entry tree and the repair tree, both ordered by
	 * ddt_entry_compare().  Sleeps for memory (KM_SLEEP).
	 */
	static ddt_t *
	ddt_table_alloc(spa_t *spa, enum zio_checksum c)
	{
		ddt_t *table = kmem_zalloc(sizeof (ddt_t), KM_SLEEP);

		table->ddt_checksum = c;
		table->ddt_spa = spa;
		table->ddt_os = spa->spa_meta_objset;

		mutex_init(&table->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
		avl_create(&table->ddt_tree, ddt_entry_compare,
		    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
		avl_create(&table->ddt_repair_tree, ddt_entry_compare,
		    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));

		return (table);
	}

	/*
	 * Tear down an in-core DDT.  Both trees must already be empty
	 * (asserted).
	 */
	static void
	ddt_table_free(ddt_t *ddt)
	{
		ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
		ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);

		avl_destroy(&ddt->ddt_tree);
		avl_destroy(&ddt->ddt_repair_tree);

		mutex_destroy(&ddt->ddt_lock);

		kmem_free(ddt, sizeof (ddt_t));
	}

	/*
	 * Set the pool's dedup checksum to ZIO_DEDUPCHECKSUM and allocate one
	 * empty in-core DDT per checksum function.
	 */
	void
	ddt_create(spa_t *spa)
	{
		enum zio_checksum c;

		spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;

		for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
		}
	}

	/*
	 * Create the in-core DDTs and load their on-disk state.
	 *
	 * A missing DDT stats object (ENOENT) is not an error: the tables are
	 * left empty and 0 is returned.  Objects missing for a particular
	 * (type, class) are skipped in the same way.  Any other error is
	 * returned to the caller.
	 */
	int
	ddt_load(spa_t *spa)
	{
		int err;

		ddt_create(spa);

		err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
		    &spa->spa_ddt_stat_object);
		if (err == ENOENT)
			return (0);
		if (err != 0)
			return (err);

		for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
			ddt_t *table = spa->spa_ddt[c];

			for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
				for (enum ddt_class class = 0; class < DDT_CLASSES;
				    class++) {
					err = ddt_object_load(table, type, class);
					if (err == 0 || err == ENOENT)
						continue;
					return (err);
				}
			}

			/*
			 * Seed the cached histograms.
			 */
			bcopy(table->ddt_histogram, &table->ddt_histogram_cache,
			    sizeof (table->ddt_histogram));
		}

		return (0);
	}

	/*
	 * Free every in-core DDT the pool still has and clear its slot.
	 */
	void
	ddt_unload(spa_t *spa)
	{
		for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
			if (!spa->spa_ddt[c])
				continue;

			ddt_table_free(spa->spa_ddt[c]);
			spa->spa_ddt[c] = NULL;
		}
	}

	/*
	 * Report whether bp has an on-disk DDT entry in any class from 0
	 * through max_class.
	 *
	 * Blocks without the dedup bit never match.  When max_class is
	 * DDT_CLASS_UNIQUE every dedup block matches and no lookup is done.
	 */
	boolean_t
	ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
	{
		ddt_t *ddt;
		ddt_entry_t dde;

		if (!BP_GET_DEDUP(bp))
			return (B_FALSE);

		if (max_class == DDT_CLASS_UNIQUE)
			return (B_TRUE);

		ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];

		ddt_key_fill(&dde.dde_key, bp);

		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class <= max_class; class++) {
				if (ddt_object_lookup(ddt, type, class, &dde) == 0)
					return (B_TRUE);
			}
		}

		return (B_FALSE);
	}

	/*
	 * Allocate a private entry for bp to be used for repair and try to
	 * load it from the non-UNIQUE on-disk classes.
	 *
	 * An entry is always returned.  If no lookup succeeded, its physical
	 * variants are zeroed.
	 */
	ddt_entry_t *
	ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
	{
		ddt_key_t ddk;
		ddt_entry_t *dde;

		ddt_key_fill(&ddk, bp);

		dde = ddt_alloc(&ddk);

		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
				/*
				 * We can only do repair if there are multiple copies
				 * of the block.  For anything in the UNIQUE class,
				 * there's definitely only one copy, so don't even try.
				 */
				if (class != DDT_CLASS_UNIQUE &&
				    ddt_object_lookup(ddt, type, class, dde) == 0)
					return (dde);
			}
		}

		bzero(dde->dde_phys, sizeof (dde->dde_phys));

		return (dde);
	}

	/*
	 * Finish with an entry obtained from ddt_repair_start().
	 *
	 * The entry is queued on the repair tree if it carries repair data, the
	 * pool is writeable and no entry with the same key is already queued.
	 * Otherwise it is freed.  Takes and releases ddt_lock.
	 */
	void
	ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
	{
		avl_index_t where;

		ddt_enter(ddt);

		if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
		    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
			avl_insert(&ddt->ddt_repair_tree, dde, where);
		else
			ddt_free(dde);

		ddt_exit(ddt);
	}

	/*
	 * Completion callback for the repair zio: free the repair entry that
	 * was passed as the zio's private data.
	 */
	static void
	ddt_repair_entry_done(zio_t *zio)
	{
		ddt_free(zio->io_private);
	}

	/*
	 * Rewrite the copies of an entry using the repair data held by rdde.
	 *
	 * A null zio parented to rio collects the rewrites; when it completes,
	 * ddt_repair_entry_done() frees rdde.  A variant is rewritten only if
	 * it is in use and its birth and DVAs are identical in dde and rdde.
	 */
	static void
	ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
	{
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *rddp = rdde->dde_phys;
		ddt_key_t *ddk = &dde->dde_key;
		ddt_key_t *rddk = &rdde->dde_key;
		zio_t *zio;
		blkptr_t blk;

		zio = zio_null(rio, rio->io_spa, NULL,
		    ddt_repair_entry_done, rdde, rio->io_flags);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
			if (ddp->ddp_phys_birth == 0 ||
			    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
			    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
				continue;
			ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
			zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
			    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
			    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
		}

		zio_nowait(zio);
	}

	/*
	 * Drain the table's repair tree, issuing repair writes under rio.
	 * Only runs in the first sync pass.
	 *
	 * Each queued entry is removed under ddt_lock; the lock is then dropped
	 * while the current on-disk entry is loaded and the rewrites are
	 * issued.  The successor is fetched before removal, because the loop
	 * advances after the lock has been dropped and retaken.
	 */
	static void
	ddt_repair_table(ddt_t *ddt, zio_t *rio)
	{
		spa_t *spa = ddt->ddt_spa;
		ddt_entry_t *dde, *rdde_next, *rdde;
		avl_tree_t *t = &ddt->ddt_repair_tree;
		blkptr_t blk;

		if (spa_sync_pass(spa) > 1)
			return;

		ddt_enter(ddt);
		for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
			rdde_next = AVL_NEXT(t, rdde);
			avl_remove(&ddt->ddt_repair_tree, rdde);
			ddt_exit(ddt);
			ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
			dde = ddt_repair_start(ddt, &blk);
			ddt_repair_entry(ddt, dde, rdde, rio);
			ddt_repair_done(ddt, dde);
			ddt_enter(ddt);
		}
		ddt_exit(ddt);
	}

	/*
	 * Write one in-core entry back to the on-disk tables in tx/txg.
	 *
	 * 1. Free variants that are no longer needed: a DITTO variant when no
	 *    ditto copies are needed, any other variant when its refcount is
	 *    zero.  Sum the remaining refcounts of the non-DITTO variants.
	 * 2. Pick the new class: DITTO if a ditto variant remains, DUPLICATE
	 *    if more than one reference remains, else UNIQUE.
	 * 3. If the entry came from disk and its type or class changes, or it
	 *    has no references left, remove it from its old object.
	 * 4. If references remain, add its stats to the histogram, create the
	 *    target object if needed and store the entry there.
	 */
	static void
	ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
	{
		dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_key_t *ddk = &dde->dde_key;
		enum ddt_type otype = dde->dde_type;
		enum ddt_type ntype = DDT_TYPE_CURRENT;
		enum ddt_class oclass = dde->dde_class;
		enum ddt_class nclass;
		uint64_t total_refcnt = 0;

		ASSERT(dde->dde_loaded);
		ASSERT(!dde->dde_loading);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			ASSERT(dde->dde_lead_zio[p] == NULL);
			ASSERT((int64_t)ddp->ddp_refcnt >= 0);
			if (ddp->ddp_phys_birth == 0) {
				ASSERT(ddp->ddp_refcnt == 0);
				continue;
			}
			if (p == DDT_PHYS_DITTO) {
				if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
					ddt_phys_free(ddt, ddk, ddp, txg);
				continue;
			}
			if (ddp->ddp_refcnt == 0)
				ddt_phys_free(ddt, ddk, ddp, txg);
			total_refcnt += ddp->ddp_refcnt;
		}

		if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
			nclass = DDT_CLASS_DITTO;
		else if (total_refcnt > 1)
			nclass = DDT_CLASS_DUPLICATE;
		else
			nclass = DDT_CLASS_UNIQUE;

		/* otype == DDT_TYPES means the entry was not found on disk. */
		if (otype != DDT_TYPES &&
		    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
			VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
			ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
		}

		if (total_refcnt != 0) {
			dde->dde_type = ntype;
			dde->dde_class = nclass;
			ddt_stat_update(ddt, dde, 0);
			if (!ddt_object_exists(ddt, ntype, nclass))
				ddt_object_create(ddt, ntype, nclass, tx);
			VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);

			/*
			 * If the class changes, the order that we scan this bp
			 * changes.  If it decreases, we could miss it, so
			 * scan it right now.  (This covers both class changing
			 * while we are doing ddt_walk(), and when we are
			 * traversing.)
			 */
			if (nclass < oclass) {
				dsl_scan_ddt_entry(dp->dp_scan,
				    ddt->ddt_checksum, dde, tx);
			}
		}
	}

	/*
	 * Sync all dirty in-core entries of one table to disk in tx/txg.
	 *
	 * Does nothing if the in-core tree is empty.  Creates the DDT stats
	 * object on first use, syncs and frees every in-core entry, syncs the
	 * per-object statistics, destroys the objects of any type whose
	 * classes together hold no entries, and finally refreshes the cached
	 * histograms.
	 */
	static void
	ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
	{
		spa_t *spa = ddt->ddt_spa;
		ddt_entry_t *dde;
		void *cookie = NULL;

		if (avl_numnodes(&ddt->ddt_tree) == 0)
			return;

		ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);

		if (spa->spa_ddt_stat_object == 0) {
			spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
			    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_DDT_STATS, tx);
		}

		while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
			ddt_sync_entry(ddt, dde, tx, txg);
			ddt_free(dde);
		}

		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			uint64_t add, count = 0;
			for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
				if (ddt_object_exists(ddt, type, class)) {
					ddt_object_sync(ddt, type, class, tx);
					VERIFY(ddt_object_count(ddt, type, class,
					    &add) == 0);
					count += add;
				}
			}
			/* Nothing left in any class of this type: drop its objects. */
			for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
				if (count == 0 && ddt_object_exists(ddt, type, class))
					ddt_object_destroy(ddt, type, class, tx);
			}
		}

		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
		    sizeof (ddt->ddt_histogram));
	}

	/*
	 * Sync every DDT of the pool for txg: write dirty entries and issue
	 * queued repairs, wait for the repair I/O, then commit the transaction.
	 * txg must be the pool's currently syncing txg (asserted).
	 *
	 * NOTE(review): this span is a rendered patch hunk.  The "- " / "+ "
	 * lines are the removed / added flag arguments of zio_root(), and "\|"
	 * appears to be an escaped bitwise OR.
	 */
	void
	ddt_sync(spa_t *spa, uint64_t txg)
	{
	dmu_tx_t *tx;
	zio_t *rio = zio_root(spa, NULL, NULL,
	- ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE);
	+ ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \| ZIO_FLAG_SELF_HEAL);

	ASSERT(spa_syncing_txg(spa) == txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
	ddt_t *ddt = spa->spa_ddt[c];
	/* Skip checksum slots that have no table. */
	if (ddt == NULL)
	continue;
	ddt_sync_table(ddt, tx, txg);
	ddt_repair_table(ddt, rio);
	}

	/* Repair I/O is best effort: the result of the wait is ignored. */
	(void) zio_wait(rio);

	dmu_tx_commit(tx);
	}

	/*
	 * Return the next on-disk DDT entry after bookmark *ddb in *dde and
	 * advance the bookmark.
	 *
	 * Iteration order is class (outermost), then type, then checksum, then
	 * the cursor within one object.  When an object is exhausted or does
	 * not exist, the cursor resets to 0 and the next checksum is tried,
	 * and so on outwards.  Returns 0 with *dde filled in, ENOENT when the
	 * whole walk is finished, or another error from ddt_object_walk().
	 */
	int
	ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
	{
		do {
			do {
				do {
					ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
					int error = ENOENT;
					if (ddt_object_exists(ddt, ddb->ddb_type,
					    ddb->ddb_class)) {
						error = ddt_object_walk(ddt,
						    ddb->ddb_type, ddb->ddb_class,
						    &ddb->ddb_cursor, dde);
					}
					dde->dde_type = ddb->ddb_type;
					dde->dde_class = ddb->ddb_class;
					if (error == 0)
						return (0);
					if (error != ENOENT)
						return (error);
					ddb->ddb_cursor = 0;
				} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
				ddb->ddb_checksum = 0;
			} while (++ddb->ddb_type < DDT_TYPES);
			ddb->ddb_type = 0;
		} while (++ddb->ddb_class < DDT_CLASSES);

		return (SET_ERROR(ENOENT));
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 332525)
	@@ -1,2535 +1,2659 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	*/
	/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
	/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
	/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */

	#include <sys/dmu.h>
	#include <sys/dmu_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/dbuf.h>
	#include <sys/dnode.h>
	#include <sys/zfs_context.h>
	#include <sys/dmu_objset.h>
	#include <sys/dmu_traverse.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_prop.h>
	#include <sys/dmu_zfetch.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/zap.h>
	#include <sys/zio_checksum.h>
	#include <sys/zio_compress.h>
	#include <sys/sa.h>
	#include <sys/zfeature.h>
	#include <sys/abd.h>
	#ifdef _KERNEL
	#include <sys/racct.h>
	#include <sys/vm.h>
	#include <sys/zfs_znode.h>
	#endif

	/*
	 * Enable/disable nopwrite feature.
	 *
	 * Defaults to 1 (enabled).  Exported as the "nopwrite_enabled" node
	 * under the _vfs_zfs sysctl tree with CTLFLAG_RDTUN.
	 */
	int zfs_nopwrite_enabled = 1;
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
	&zfs_nopwrite_enabled, 0, "Enable nopwrite feature");

	/*
	 * Tunable to control percentage of dirtied blocks from frees in one TXG.
	 * After this threshold is crossed, additional dirty blocks from frees
	 * wait until the next TXG.
	 * A value of zero will disable this throttle.
	 *
	 * The variable is unsigned, so it is exported with SYSCTL_UINT; a
	 * signed node would let a negative value be stored and then read back
	 * as a huge percentage.
	 */
	uint32_t zfs_per_txg_dirty_frees_percent = 30;
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
	    &zfs_per_txg_dirty_frees_percent, 0,
	    "Percentage of dirtied blocks from frees in one txg");

	/* NOTE(review): every line below is an added ("+") line of the patch. */
	+/*
	+ * This can be used for testing, to ensure that certain actions happen
	+ * while in the middle of a remap (which might otherwise complete too
	+ * quickly).
	+ */
	+int zfs_object_remap_one_indirect_delay_ticks = 0;
	+
	/*
	 * Table of DMU object type descriptions, DMU_OT_NUMTYPES entries long.
	 * Each entry gives the byteswap class (a DMU_BSWAP_* value), a boolean
	 * flag and a human-readable name.
	 *
	 * NOTE(review): the boolean is presumably the "is metadata" flag, and
	 * entry order presumably has to match the object type enum; neither is
	 * visible in this chunk, so confirm against the type definitions.
	 */
	const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
	{ DMU_BSWAP_ZAP, TRUE, "object directory" },
	{ DMU_BSWAP_UINT64, TRUE, "object array" },
	{ DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
	{ DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
	{ DMU_BSWAP_UINT64, TRUE, "bpobj" },
	{ DMU_BSWAP_UINT64, TRUE, "bpobj header" },
	{ DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
	{ DMU_BSWAP_UINT64, TRUE, "SPA space map" },
	{ DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
	{ DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
	{ DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
	{ DMU_BSWAP_UINT64, TRUE, "DSL directory" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
	{ DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL props" },
	{ DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
	{ DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
	{ DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
	{ DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
	{ DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
	{ DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
	{ DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
	{ DMU_BSWAP_UINT8, FALSE, "zvol object" },
	{ DMU_BSWAP_ZAP, TRUE, "zvol prop" },
	{ DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
	{ DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
	{ DMU_BSWAP_ZAP, TRUE, "other ZAP" },
	{ DMU_BSWAP_ZAP, TRUE, "persistent error log" },
	{ DMU_BSWAP_UINT8, TRUE, "SPA history" },
	{ DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
	{ DMU_BSWAP_ZAP, TRUE, "Pool properties" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
	{ DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
	{ DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
	{ DMU_BSWAP_UINT8, TRUE, "FUID table" },
	{ DMU_BSWAP_UINT64, TRUE, "FUID table size" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
	{ DMU_BSWAP_ZAP, TRUE, "scan work queue" },
	{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
	{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
	{ DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
	{ DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
	{ DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
	{ DMU_BSWAP_UINT8, TRUE, "System attributes" },
	{ DMU_BSWAP_ZAP, TRUE, "SA master node" },
	{ DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
	{ DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
	{ DMU_BSWAP_ZAP, TRUE, "scan translations" },
	{ DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
	{ DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
	{ DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
	{ DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
	};

	/*
	 * Table of byteswap routines, DMU_BSWAP_NUMFUNCS entries long.  Each
	 * entry pairs a byteswap function with a short name.
	 *
	 * NOTE(review): entry order presumably has to match the DMU_BSWAP_*
	 * constants used to index it; confirm against their definition.
	 */
	const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{ byteswap_uint8_array, "uint8" },
	{ byteswap_uint16_array, "uint16" },
	{ byteswap_uint32_array, "uint32" },
	{ byteswap_uint64_array, "uint64" },
	{ zap_byteswap, "zap" },
	{ dnode_buf_byteswap, "dnode" },
	{ dmu_objset_byteswap, "objset" },
	{ zfs_znode_byteswap, "znode" },
	{ zfs_oldacl_byteswap, "oldacl" },
	{ zfs_acl_byteswap, "acl" }
	};

	/*
	 * Hold, without reading, the level-0 dbuf of dn that covers byte
	 * offset "offset", and return it through *dbp.
	 *
	 * dn_struct_rwlock is taken as reader around dbuf_hold().  Returns 0
	 * on success; if no dbuf could be held, *dbp is set to NULL and EIO is
	 * returned.
	 */
	int
	dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
	    void *tag, dmu_buf_t **dbp)
	{
		uint64_t blkid;
		dmu_buf_impl_t *db;

		blkid = dbuf_whichblock(dn, 0, offset);
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		db = dbuf_hold(dn, blkid, tag);
		rw_exit(&dn->dn_struct_rwlock);

		if (db == NULL) {
			*dbp = NULL;
			return (SET_ERROR(EIO));
		}

		*dbp = &db->db;
		return (0);
	}
	/*
	 * Hold, without reading, the level-0 dbuf of (os, object) that covers
	 * byte offset "offset", and return it through *dbp.
	 *
	 * The dnode is held only for the duration of the call.  Returns the
	 * dnode_hold() error if the object cannot be held (*dbp is left
	 * untouched), EIO with *dbp set to NULL if no dbuf could be held, and
	 * 0 on success.
	 */
	int
	dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
	    void *tag, dmu_buf_t **dbp)
	{
		dnode_t *dn;
		uint64_t blkid;
		dmu_buf_impl_t *db;
		int err;

		err = dnode_hold(os, object, FTAG, &dn);
		if (err)
			return (err);
		blkid = dbuf_whichblock(dn, 0, offset);
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		db = dbuf_hold(dn, blkid, tag);
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);

		if (db == NULL) {
			*dbp = NULL;
			return (SET_ERROR(EIO));
		}

		*dbp = &db->db;
		return (err);
	}

	/*
	 * Hold and read the level-0 dbuf of dn that covers "offset".
	 *
	 * The read may fail (DB_RF_CANFAIL); DMU_READ_NO_PREFETCH in flags
	 * additionally suppresses prefetch.  On a read error the hold is
	 * dropped and *dbp is set to NULL.  Returns 0 or an error.
	 */
	int
	dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
	    void *tag, dmu_buf_t **dbp, int flags)
	{
		int err;
		int db_flags = DB_RF_CANFAIL;

		if (flags & DMU_READ_NO_PREFETCH)
			db_flags |= DB_RF_NOPREFETCH;

		err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
		if (err == 0) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
			err = dbuf_read(db, NULL, db_flags);
			if (err != 0) {
				dbuf_rele(db, tag);
				*dbp = NULL;
			}
		}

		return (err);
	}

	/*
	 * Hold and read the level-0 dbuf of (os, object) that covers "offset".
	 *
	 * The read may fail (DB_RF_CANFAIL); DMU_READ_NO_PREFETCH in flags
	 * additionally suppresses prefetch.  On a read error the hold is
	 * dropped and *dbp is set to NULL.  Returns 0 or an error.
	 */
	int
	dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
	    void *tag, dmu_buf_t **dbp, int flags)
	{
		int err;
		int db_flags = DB_RF_CANFAIL;

		if (flags & DMU_READ_NO_PREFETCH)
			db_flags |= DB_RF_NOPREFETCH;

		err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
		if (err == 0) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
			err = dbuf_read(db, NULL, db_flags);
			if (err != 0) {
				dbuf_rele(db, tag);
				*dbp = NULL;
			}
		}

		return (err);
	}

	/*
	 * Return DN_MAX_BONUSLEN.
	 */
	int
	dmu_bonus_max(void)
	{
	return (DN_MAX_BONUSLEN);
	}

	/*
	 * Set the bonus buffer length of the dnode behind db_fake to newsize
	 * in transaction tx.
	 *
	 * Returns EINVAL if db_fake is not the dnode's bonus buffer, or if
	 * newsize is negative or larger than the buffer's db_size.  Returns 0
	 * on success.
	 */
	int
	dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dnode_t *dn;
		int error;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		if (dn->dn_bonus != db) {
			error = SET_ERROR(EINVAL);
		} else if (newsize < 0 || newsize > db_fake->db_size) {
			error = SET_ERROR(EINVAL);
		} else {
			dnode_setbonuslen(dn, newsize, tx);
			error = 0;
		}

		DB_DNODE_EXIT(db);
		return (error);
	}

	/*
	 * Set the bonus buffer type of the dnode behind db_fake to "type" in
	 * transaction tx.
	 *
	 * Returns EINVAL if type is not a valid object type or db_fake is not
	 * the dnode's bonus buffer.  Returns 0 on success.
	 */
	int
	dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dnode_t *dn;
		int error;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		if (!DMU_OT_IS_VALID(type)) {
			error = SET_ERROR(EINVAL);
		} else if (dn->dn_bonus != db) {
			error = SET_ERROR(EINVAL);
		} else {
			dnode_setbonus_type(dn, type, tx);
			error = 0;
		}

		DB_DNODE_EXIT(db);
		return (error);
	}

	/*
	 * Return the bonus buffer type of the dnode behind db_fake.
	 */
	dmu_object_type_t
	dmu_get_bonustype(dmu_buf_t *db_fake)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dnode_t *dn;
		dmu_object_type_t type;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		type = dn->dn_bonustype;
		DB_DNODE_EXIT(db);

		return (type);
	}

	/*
	 * Remove the spill block of (os, object) in transaction tx.
	 *
	 * Returns the dnode_hold() error if the object cannot be held; in that
	 * case nothing else is done, since dn is not valid and must not be
	 * dereferenced.  Returns 0 on success.
	 */
	int
	dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
	{
		dnode_t *dn;
		int error;

		error = dnode_hold(os, object, FTAG, &dn);
		if (error != 0)
			return (error);

		dbuf_rm_spill(dn, tx);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		dnode_rm_spill(dn, tx);
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);

		return (0);
	}

	/*
	 * Hold and read the bonus buffer of (os, object), returning it through
	 * *dbp.
	 *
	 * Returns the dnode_hold() error if the object cannot be held, else 0.
	 * The bonus dbuf is created on first use; the read itself must succeed
	 * (VERIFY).
	 */
	int
	dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
	{
		dnode_t *dn;
		dmu_buf_impl_t *db;
		int error;

		error = dnode_hold(os, object, FTAG, &dn);
		if (error)
			return (error);

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus == NULL) {
			/*
			 * Retake the lock as writer and re-check: another thread
			 * may have created the bonus dbuf in between.
			 */
			rw_exit(&dn->dn_struct_rwlock);
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			if (dn->dn_bonus == NULL)
				dbuf_create_bonus(dn);
		}
		db = dn->dn_bonus;

		/* as long as the bonus buf is held, the dnode will be held */
		if (refcount_add(&db->db_holds, tag) == 1) {
			VERIFY(dnode_add_ref(dn, db));
			atomic_inc_32(&dn->dn_dbufs_count);
		}

		/*
		 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
		 * hold and incrementing the dbuf count to ensure that dnode_move() sees
		 * a dnode hold for every dbuf.
		 */
		rw_exit(&dn->dn_struct_rwlock);

		dnode_rele(dn, FTAG);

		VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

		*dbp = &db->db;
		return (0);
	}

	/*
	* returns ENOENT, EIO, or 0.
	*
	* This interface will allocate a blank spill dbuf when a spill blk
	* doesn't already exist on the dnode.
	*
	* if you only want to find an already existing spill db, then
	* dmu_spill_hold_existing() should be used.
	*
	* 'flags' is passed on to dbuf_read(). If it contains DB_RF_HAVESTRUCT,
	* dn_struct_rwlock is not taken here (dmu_spill_hold_existing() passes
	* that flag while already holding the lock as reader).
	*
	* On success *dbp is set to the held spill buffer. If the read fails
	* the hold is released, *dbp is left untouched and the dbuf_read()
	* error is returned.
	*/
	int
	dmu_spill_hold_by_dnode(dnode_t dn, uint32_t flags, void tag, dmu_buf_t **dbp)
	{
	dmu_buf_impl_t *db = NULL;
	int err;

	if ((flags & DB_RF_HAVESTRUCT) == 0)
	rw_enter(&dn->dn_struct_rwlock, RW_READER);

	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
	rw_exit(&dn->dn_struct_rwlock);

	ASSERT(db != NULL);
	err = dbuf_read(db, NULL, flags);
	if (err == 0)
	*dbp = &db->db;
	else
	dbuf_rele(db, tag);
	return (err);
	}

	/*
	* Hold the spill buffer of the dnode behind 'bonus', but only if a
	* spill block already exists.
	*
	* Returns EINVAL if the pool's spa_version() is below SPA_VERSION_SA,
	* ENOENT if dn_have_spill is not set, and otherwise the result of
	* dmu_spill_hold_by_dnode().
	*/
	int
	dmu_spill_hold_existing(dmu_buf_t bonus, void tag, dmu_buf_t **dbp)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
	err = SET_ERROR(EINVAL);
	} else {
	rw_enter(&dn->dn_struct_rwlock, RW_READER);

	if (!dn->dn_have_spill) {
	err = SET_ERROR(ENOENT);
	} else {
	/*
	* DB_RF_HAVESTRUCT: the struct rwlock is already held
	* here, so the callee must not take it again.
	*/
	err = dmu_spill_hold_by_dnode(dn,
	DB_RF_HAVESTRUCT \| DB_RF_CANFAIL, tag, dbp);
	}

	rw_exit(&dn->dn_struct_rwlock);
	}

	DB_DNODE_EXIT(db);
	return (err);
	}

	/*
	* Hold the spill buffer of the dnode behind 'bonus' by calling
	* dmu_spill_hold_by_dnode() with DB_RF_CANFAIL. Returns that function's
	* result.
	*/
	int
	dmu_spill_hold_by_bonus(dmu_buf_t bonus, void tag, dmu_buf_t **dbp)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
	DB_DNODE_EXIT(db);

	return (err);
	}

	/*
	* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
	* to take a held dnode rather than <os, object> -- the lookup is wasteful,
	* and can induce severe lock contention when writing to several files
	* whose dnodes are in the same block.
	*
	* Hold every dbuf of 'dn' that overlaps [offset, offset + length) on
	* behalf of 'tag'. When 'read' is set, reads are issued for each dbuf
	* and waited for before returning.
	*
	* On success returns 0 and stores the number of buffers in *numbufsp
	* and the kmem_zalloc()ed array in *dbpp. On failure every hold taken
	* so far is dropped with dmu_buf_rele_array() and EIO or the zio_wait()
	* error is returned.
	*
	* Prefetch is triggered unless 'flags' contains DMU_READ_NO_PREFETCH.
	*/
	static int
	dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
	boolean_t read, void tag, int numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
	{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	/*
	* Note: We directly notify the prefetch code of this read, so that
	* we can tell it about the multi-block read. dbuf_read() only knows
	* about the one block it is accessing.
	*/
	dbuf_flags = DB_RF_CANFAIL \| DB_RF_NEVERWAIT \| DB_RF_HAVESTRUCT \|
	DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
	/* number of blocks touched by [offset, offset + length) */
	int blkshift = dn->dn_datablkshift;
	nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
	P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
	} else {
	/*
	* dn_datablkshift is zero: the whole access has to fit within
	* dn_datablksz, i.e. a single block.
	*/
	if (offset + length > dn->dn_datablksz) {
	zfs_panic_recover("zfs: accessing past end of object "
	"%llx/%llx (size=%u access=%llu+%llu)",
	(longlong_t)dn->dn_objset->
	os_dsl_dataset->ds_object,
	(longlong_t)dn->dn_object, dn->dn_datablksz,
	(longlong_t)offset, (longlong_t)length);
	rw_exit(&dn->dn_struct_rwlock);
	return (SET_ERROR(EIO));
	}
	nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t ) nblks, KM_SLEEP);

	#if defined(_KERNEL) && defined(RACCT)
	/* charge writes to the current process when racct is enabled */
	if (racct_enable && !read) {
	PROC_LOCK(curproc);
	racct_add_force(curproc, RACCT_WRITEBPS, length);
	racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
	PROC_UNLOCK(curproc);
	}
	#endif

	/* root zio that parents every read issued in the loop below */
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, 0, offset);
	for (i = 0; i < nblks; i++) {
	dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
	if (db == NULL) {
	/*
	* dbp was zero-filled, so the release below only drops
	* the holds taken on earlier iterations.
	*/
	rw_exit(&dn->dn_struct_rwlock);
	dmu_buf_rele_array(dbp, nblks, tag);
	zio_nowait(zio);
	return (SET_ERROR(EIO));
	}

	/* initiate async i/o */
	if (read)
	(void) dbuf_read(db, zio, dbuf_flags);
	#ifdef _KERNEL
	else
	curthread->td_ru.ru_oublock++;
	#endif
	dbp[i] = &db->db;
	}

	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
	DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
	dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
	read && DNODE_IS_CACHEABLE(dn));
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
	dmu_buf_rele_array(dbp, nblks, tag);
	return (err);
	}

	/* wait for other io to complete */
	if (read) {
	for (i = 0; i < nblks; i++) {
	dmu_buf_impl_t db = (dmu_buf_impl_t )dbp[i];
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ \|\|
	db->db_state == DB_FILL)
	cv_wait(&db->db_changed, &db->db_mtx);
	/* still uncached after waiting: report as an I/O error */
	if (db->db_state == DB_UNCACHED)
	err = SET_ERROR(EIO);
	mutex_exit(&db->db_mtx);
	if (err) {
	dmu_buf_rele_array(dbp, nblks, tag);
	return (err);
	}
	}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
	}

	/*
	* <os, object> front end for dmu_buf_hold_array_by_dnode(): hold the
	* dnode, call the by-dnode routine with DMU_READ_PREFETCH, release the
	* dnode. Returns the dnode_hold() error or the by-dnode result.
	*/
	static int
	dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t length, int read, void tag, int numbufsp, dmu_buf_t ***dbpp)
	{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
	return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	numbufsp, dbpp, DMU_READ_PREFETCH);

	dnode_rele(dn, FTAG);

	return (err);
	}

	/*
	* Same as dmu_buf_hold_array(), but the dnode is reached through an
	* already held dbuf (db_fake) instead of an <os, object> lookup.
	* Returns the result of dmu_buf_hold_array_by_dnode().
	*/
	int
	dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
	uint64_t length, boolean_t read, void tag, int numbufsp,
	dmu_buf_t ***dbpp)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )db_fake;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	numbufsp, dbpp, DMU_READ_PREFETCH);
	DB_DNODE_EXIT(db);

	return (err);
	}

	/*
	* Release every non-NULL buffer in the array on behalf of 'tag' and
	* free the array itself. NULL slots are skipped. With numbufs == 0 the
	* function returns without touching or freeing anything.
	*/
	void
	dmu_buf_rele_array(dmu_buf_t *dbp_fake, int numbufs, void tag)
	{
	int i;
	dmu_buf_impl_t dbp = (dmu_buf_impl_t )dbp_fake;

	if (numbufs == 0)
	return;

	for (i = 0; i < numbufs; i++) {
	if (dbp[i])
	dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t ) numbufs);
	}

	/*
	* Issue prefetch i/os for the given blocks. If level is greater than 0, the
	* indirect blocks prefetched will be those that point to the blocks containing
	* the data starting at offset, and continuing to offset + len.
	*
	* Note that if the indirect blocks above the blocks being prefetched are not in
	* cache, they will be asynchronously read in.
	*
	* A len of 0 requests the bonus buffer instead: the block of the meta
	* dnode that contains 'object' is prefetched. Object 0 and objects at
	* or above DN_MAX_OBJECT are ignored in that mode. Errors are not
	* reported; a failed dnode_hold() simply returns.
	*/
	void
	dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
	uint64_t len, zio_priority_t pri)
	{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, err;

	if (len == 0) { /* they're interested in the bonus buffer */
	dn = DMU_META_DNODE(os);

	if (object == 0 \|\| object >= DN_MAX_OBJECT)
	return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = dbuf_whichblock(dn, level,
	object * sizeof (dnode_phys_t));
	dbuf_prefetch(dn, level, blkid, pri, 0);
	rw_exit(&dn->dn_struct_rwlock);
	return;
	}

	/*
	* XXX - Note, if the dnode for the requested object is not
	* already cached, we will do a synchronous read in the
	* dnode_hold() call. The same is true for any indirects.
	*/
	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
	return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	/*
	* offset + len - 1 is the last byte we want to prefetch for, and offset
	* is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
	* last block we want to prefetch, and dbuf_whichblock(dn, level,
	* offset) is the first. Then the number we need to prefetch is the
	* last - first + 1.
	*/
	if (level > 0 \|\| dn->dn_datablkshift != 0) {
	nblks = dbuf_whichblock(dn, level, offset + len - 1) -
	dbuf_whichblock(dn, level, offset) + 1;
	} else {
	/* one block at most: prefetch it only if offset lies inside it */
	nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
	blkid = dbuf_whichblock(dn, level, offset);
	for (int i = 0; i < nblks; i++)
	dbuf_prefetch(dn, level, blkid + i, pri, 0);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	}

	/*
	* Get the next "chunk" of file data to free. We traverse the file from
	* the end so that the file gets shorter over time (if we crash in the
	* middle, this will leave us in a better state). We find allocated file
	* data by simply searching the allocated level 1 indirects.
	*
	* On input, *start should be the first offset that does not need to be
	* freed (e.g. "offset + length"). On return, *start will be the first
	* offset that should be freed.
	*
	* *start is never moved below 'minimum'. Returns 0, or the error from
	* dnode_next_offset() other than ESRCH.
	*/
	static int
	get_next_chunk(dnode_t dn, uint64_t start, uint64_t minimum)
	{
	/* upper bound on the number of L1 indirects walked per call */
	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t iblkrange =
	dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

	ASSERT3U(minimum, <=, *start);

	/* small enough range: free all of it in one chunk */
	if (start - minimum <= iblkrange maxblks) {
	*start = minimum;
	return (0);
	}
	ASSERT(ISP2(iblkrange));

	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
	int err;

	/*
	* dnode_next_offset(BACKWARDS) will find an allocated L1
	* indirect block at or before the input offset. We must
	* decrement *start so that it is at the end of the region
	* to search.
	*/
	(*start)--;
	err = dnode_next_offset(dn,
	DNODE_FIND_BACKWARDS, start, 2, 1, 0);

	/* if there are no indirect blocks before start, we are done */
	if (err == ESRCH) {
	*start = minimum;
	break;
	} else if (err != 0) {
	return (err);
	}

	/* set start to the beginning of this L1 indirect */
	start = P2ALIGN(start, iblkrange);
	}
	/* the alignment above may have stepped below the caller's floor */
	if (*start < minimum)
	*start = minimum;
	return (0);
	}

	/*
	* If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
	* otherwise return false.
	* Used below in dmu_free_long_range_impl() to enable abort when unmounting.
	*
	* Outside of _KERNEL builds the check is compiled out and the function
	* always returns B_FALSE.
	*/
	/ARGSUSED/
	static boolean_t
	dmu_objset_zfs_unmounting(objset_t *os)
	{
	#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS)
	return (zfs_get_vfs_flag_unmounted(os));
	#endif
	return (B_FALSE);
	}

	/*
	* Free [offset, offset + length) of 'dn' one chunk at a time, working
	* backwards from the end, with each chunk freed in its own transaction.
	*
	* length may be DMU_OBJECT_END; it is clipped to the object size
	* computed from dn_maxblkid and dn_datablksz. Returns 0 on success,
	* EINTR if the objset is being unmounted, or the error from
	* get_next_chunk() or dmu_tx_assign().
	*/
	static int
	dmu_free_long_range_impl(objset_t os, dnode_t dn, uint64_t offset,
	uint64_t length)
	{
	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	int err;
	uint64_t dirty_frees_threshold;
	dsl_pool_t *dp = dmu_objset_pool(os);

	/* nothing to free beyond the end of the object */
	if (offset >= object_size)
	return (0);

	/*
	* Percentages above 100 are not used as given; the threshold then
	* falls back to a quarter of zfs_dirty_data_max.
	*/
	if (zfs_per_txg_dirty_frees_percent <= 100)
	dirty_frees_threshold =
	zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
	else
	dirty_frees_threshold = zfs_dirty_data_max / 4;

	if (length == DMU_OBJECT_END \|\| offset + length > object_size)
	length = object_size - offset;

	while (length != 0) {
	uint64_t chunk_end, chunk_begin, chunk_len;
	uint64_t long_free_dirty_all_txgs = 0;
	dmu_tx_t *tx;

	if (dmu_objset_zfs_unmounting(dn->dn_objset))
	return (SET_ERROR(EINTR));

	chunk_end = chunk_begin = offset + length;

	/* move chunk_begin backwards to the beginning of this chunk */
	err = get_next_chunk(dn, &chunk_begin, offset);
	if (err)
	return (err);
	ASSERT3U(chunk_begin, >=, offset);
	ASSERT3U(chunk_begin, <=, chunk_end);

	chunk_len = chunk_end - chunk_begin;

	/* total bytes of long frees accounted across all TXG slots */
	mutex_enter(&dp->dp_lock);
	for (int t = 0; t < TXG_SIZE; t++) {
	long_free_dirty_all_txgs +=
	dp->dp_long_free_dirty_pertxg[t];
	}
	mutex_exit(&dp->dp_lock);

	/*
	* To avoid filling up a TXG with just frees wait for
	* the next TXG to open before freeing more chunks if
	* we have reached the threshold of frees
	*/
	if (dirty_frees_threshold != 0 &&
	long_free_dirty_all_txgs >= dirty_frees_threshold) {
	/* length is unchanged, so the same chunk is retried */
	txg_wait_open(dp, 0);
	continue;
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);

	/*
	* Mark this transaction as typically resulting in a net
	* reduction in space used.
	*/
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
	dmu_tx_abort(tx);
	return (err);
	}

	/* account this chunk against the txg it was assigned to */
	mutex_enter(&dp->dp_lock);
	dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
	chunk_len;
	mutex_exit(&dp->dp_lock);
	DTRACE_PROBE3(free__long__range,
	uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
	uint64_t, dmu_tx_get_txg(tx));
	dnode_free_range(dn, chunk_begin, chunk_len, tx);
	dmu_tx_commit(tx);

	length -= chunk_len;
	}
	return (0);
	}

	/*
	* Free [offset, offset + length) of 'object' via
	* dmu_free_long_range_impl(). Returns the dnode_hold() error or the
	* result of the impl routine.
	*/
	int
	dmu_free_long_range(objset_t *os, uint64_t object,
	uint64_t offset, uint64_t length)
	{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
	return (err);
	err = dmu_free_long_range_impl(os, dn, offset, length);

	/*
	* It is important to zero out the maxblkid when freeing the entire
	* file, so that (a) subsequent calls to dmu_free_long_range_impl()
	* will take the fast path, and (b) dnode_reallocate() can verify
	* that the entire file has been freed.
	*/
	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
	dn->dn_maxblkid = 0;

	dnode_rele(dn, FTAG);
	return (err);
	}

	/*
	* Free all data of 'object' with dmu_free_long_range(), then free the
	* object itself with dmu_object_free() in a separate transaction.
	* Returns the first error met, or 0.
	*/
	int
	dmu_free_long_object(objset_t *os, uint64_t object)
	{
	dmu_tx_t *tx;
	int err;

	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
	if (err != 0)
	return (err);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err == 0) {
	err = dmu_object_free(os, object, tx);
	dmu_tx_commit(tx);
	} else {
	/* a tx that could not be assigned is aborted, not committed */
	dmu_tx_abort(tx);
	}

	return (err);
	}

	/*
	* Free [offset, offset + size) of 'object' within the caller's
	* transaction 'tx'. A size of -1ULL is accepted by the assertion below
	* as a special value. Returns the dnode_hold() error, otherwise 0.
	*/
	int
	dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size, dmu_tx_t *tx)
	{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);
	if (err)
	return (err);
	ASSERT(offset < UINT64_MAX);
	/* offset + size must not wrap around */
	ASSERT(size == -1ULL \|\| size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
	}

	/*
	* Copy 'size' bytes starting at 'offset' of 'dn' into 'buf'. 'flags' is
	* handed to dmu_buf_hold_array_by_dnode().
	*
	* Returns 0, or the error from dmu_buf_hold_array_by_dnode(); on error
	* the copy stops and the rest of 'buf' is left as it was.
	*/
	static int
	dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
	void *buf, uint32_t flags)
	{
	dmu_buf_t **dbp;
	int numbufs, err = 0;

	/*
	* Deal with odd block sizes, where there can't be data past the first
	* block. If we ever do the tail block optimization, we will need to
	* handle that here as well.
	*/
	if (dn->dn_maxblkid == 0) {
	/*
	* Zero the part of buf that lies beyond the single data block
	* and read only the part inside it.
	*/
	int newsz = offset > dn->dn_datablksz ? 0 :
	MIN(size, dn->dn_datablksz - offset);
	bzero((char *)buf + newsz, size - newsz);
	size = newsz;
	}

	while (size > 0) {
	/* each pass handles at most DMU_MAX_ACCESS / 2 bytes */
	uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
	int i;

	/*
	* NB: we could do this block-at-a-time, but it's nice
	* to be reading in parallel.
	*/
	err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
	TRUE, FTAG, &numbufs, &dbp, flags);
	if (err)
	break;

	for (i = 0; i < numbufs; i++) {
	int tocpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];

	ASSERT(size > 0);

	/* bufoff: where 'offset' falls inside this buffer */
	bufoff = offset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	bcopy((char *)db->db_data + bufoff, buf, tocpy);

	offset += tocpy;
	size -= tocpy;
	buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	return (err);
	}

	/*
	* Read 'size' bytes at 'offset' of 'object' into 'buf': hold the dnode,
	* call dmu_read_impl(), release the dnode. Returns the dnode_hold()
	* error or the dmu_read_impl() result.
	*/
	int
	dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	void *buf, uint32_t flags)
	{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
	return (err);

	err = dmu_read_impl(dn, offset, size, buf, flags);
	dnode_rele(dn, FTAG);
	return (err);
	}

	/*
	* Variant of dmu_read() for callers that already have the dnode; it
	* forwards directly to dmu_read_impl() and returns its result.
	*/
	int
	dmu_read_by_dnode(dnode_t dn, uint64_t offset, uint64_t size, void buf,
	uint32_t flags)
	{
	return (dmu_read_impl(dn, offset, size, buf, flags));
	}

	/*
	* Copy 'size' bytes from 'buf' into the held buffers dbp[0..numbufs),
	* starting at object offset 'offset', as part of transaction 'tx'.
	* The caller holds the buffers and releases them afterwards.
	*/
	static void
	dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
	const void buf, dmu_tx_t tx)
	{
	int i;

	for (i = 0; i < numbufs; i++) {
	int tocpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];

	ASSERT(size > 0);

	bufoff = offset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	/* only the first and last buffer may be partially written */
	ASSERT(i == 0 \|\| i == numbufs-1 \|\| tocpy == db->db_size);

	/*
	* A buffer that is overwritten completely goes through the
	* will_fill/fill_done pair; a partial write dirties it.
	*/
	if (tocpy == db->db_size)
	dmu_buf_will_fill(db, tx);
	else
	dmu_buf_will_dirty(db, tx);

	bcopy(buf, (char *)db->db_data + bufoff, tocpy);

	if (tocpy == db->db_size)
	dmu_buf_fill_done(db, tx);

	offset += tocpy;
	size -= tocpy;
	buf = (char *)buf + tocpy;
	}
	}

	/*
	* Write 'size' bytes from 'buf' at 'offset' of 'object' in transaction
	* 'tx'. A size of 0 is a no-op. Holding the buffers is wrapped in
	* VERIFY0(), so a failure there is not returned to the caller.
	*/
	void
	dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	const void buf, dmu_tx_t tx)
	{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
	return;

	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
	FALSE, FTAG, &numbufs, &dbp));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	}

	/*
	* Same as dmu_write(), but for a caller that already has the dnode.
	* A size of 0 is a no-op; holding the buffers is wrapped in VERIFY0().
	*/
	void
	dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
	const void buf, dmu_tx_t tx)
	{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
	return;

	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
	FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	+}
	+
	+static int
	+dmu_object_remap_one_indirect(objset_t os, dnode_t dn,
	+ uint64_t last_removal_txg, uint64_t offset)
	+{ /*
	+ * Hold the level-1 dbuf of 'dn' that covers 'offset' and dirty it in a
	+ * new transaction when all of these hold: its block pointer's birth
	+ * txg is <= last_removal_txg, it can be read, and dbuf_can_remap()
	+ * agrees. The hold is always released, followed by a
	+ * delay(zfs_object_remap_one_indirect_delay_ticks).
	+ *
	+ * Returns 0, or the dmu_tx_assign() error.
	+ */
	+ uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
	+ int err = 0;
	+
	+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
	+ dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
	+ ASSERT3P(dbuf, !=, NULL);
	+
	+ /*
	+ * If the block hasn't been written yet, this default will ensure
	+ * we don't try to remap it.
	+ * (UINT64_MAX can never be <= last_removal_txg, which is asserted
	+ * below to differ from UINT64_MAX.)
	+ */
	+ uint64_t birth = UINT64_MAX;
	+ ASSERT3U(last_removal_txg, !=, UINT64_MAX);
	+ if (dbuf->db_blkptr != NULL)
	+ birth = dbuf->db_blkptr->blk_birth;
	+ rw_exit(&dn->dn_struct_rwlock);
	+
	+ /*
	+ * If this L1 was already written after the last removal, then we've
	+ * already tried to remap it.
	+ */
	+ if (birth <= last_removal_txg &&
	+ dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
	+ dbuf_can_remap(dbuf)) {
	+ dmu_tx_t *tx = dmu_tx_create(os);
	+ dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
	+ err = dmu_tx_assign(tx, TXG_WAIT);
	+ if (err == 0) {
	+ (void) dbuf_dirty(dbuf, tx);
	+ dmu_tx_commit(tx);
	+ } else {
	+ dmu_tx_abort(tx);
	+ }
	+ }
	+
	+ dbuf_rele(dbuf, FTAG);
	+
	+ delay(zfs_object_remap_one_indirect_delay_ticks);
	+
	+ return (err);
	+}
	+
	+/*
	+ * Remap all blockpointers in the object, if possible, so that they reference
	+ * only concrete vdevs.
	+ *
	+ * To do this, iterate over the L0 blockpointers and remap any that reference
	+ * an indirect vdev. Note that we only examine L0 blockpointers; since we
	+ * cannot guarantee that we can remap all blockpointer anyways (due to split
	+ * blocks), we do not want to make the code unnecessarily complicated to
	+ * catch the unlikely case that there is an L1 block on an indirect vdev that
	+ * contains no indirect blockpointers.
	+ *
	+ * In this function the iteration is over the object's non-hole L1
	+ * indirect blocks, each of which is handed to
	+ * dmu_object_remap_one_indirect(). An object with dn_nlevels <= 1 has
	+ * no indirect blocks; its dnode is dirtied instead when
	+ * dnode_needs_remap() says so.
	+ *
	+ * Returns 0 on success, EINTR when a signal is pending, or the error
	+ * from dnode_hold(), dmu_tx_assign() or dmu_object_remap_one_indirect().
	+ */
	+int
	+dmu_object_remap_indirects(objset_t *os, uint64_t object,
	+ uint64_t last_removal_txg)
	+{
	+ uint64_t offset, l1span;
	+ int err;
	+ dnode_t *dn;
	+
	+ err = dnode_hold(os, object, FTAG, &dn);
	+ if (err != 0) {
	+ return (err);
	+ }
	+
	+ if (dn->dn_nlevels <= 1) {
	+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
	+ err = SET_ERROR(EINTR);
	+ }
	+
	+ /*
	+ * If the dnode has no indirect blocks, we cannot dirty them.
	+ * We still want to remap the blkptr(s) in the dnode if
	+ * appropriate, so mark it as dirty.
	+ */
	+ if (err == 0 && dnode_needs_remap(dn)) {
	+ dmu_tx_t *tx = dmu_tx_create(os);
	+ dmu_tx_hold_bonus(tx, dn->dn_object);
	+ if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
	+ dnode_setdirty(dn, tx);
	+ dmu_tx_commit(tx);
	+ } else {
	+ dmu_tx_abort(tx);
	+ }
	+ }
	+
	+ dnode_rele(dn, FTAG);
	+ return (err);
	+ }
	+
	+ offset = 0;
	+ l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
	+ dn->dn_datablkshift);
	+ /*
	+ * Find the next L1 indirect that is not a hole.
	+ * After each one, offset is advanced by l1span (the bytes covered
	+ * by one L1 indirect) so the search resumes behind it.
	+ */
	+ while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
	+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
	+ err = SET_ERROR(EINTR);
	+ break;
	+ }
	+ if ((err = dmu_object_remap_one_indirect(os, dn,
	+ last_removal_txg, offset)) != 0) {
	+ break;
	+ }
	+ offset += l1span;
	+ }
	+
	+ dnode_rele(dn, FTAG);
	+ return (err);
	}

	/*
	* Hold the buffers covering [offset, offset + size) of 'object' without
	* reading them and call dmu_buf_will_not_fill() on each in transaction
	* 'tx'. A size of 0 is a no-op; holding the buffers is wrapped in
	* VERIFY().
	*/
	void
	dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	dmu_tx_t *tx)
	{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
	return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
	dmu_buf_t *db = dbp[i];

	dmu_buf_will_not_fill(db, tx);
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	}

	/*
	* Hold, without reading, the buffer at 'offset' of 'object' and pass
	* 'data' with its embedded type, compression, sizes and byte order to
	* dmu_buf_write_embedded(); then release the buffer.
	*
	* etype and comp are asserted to be below NUM_BP_EMBEDDED_TYPES and
	* ZIO_COMPRESS_FUNCTIONS. The hold is wrapped in VERIFY0().
	*/
	void
	dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
	void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
	int compressed_size, int byteorder, dmu_tx_t *tx)
	{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	FTAG, &db));

	dmu_buf_write_embedded(db,
	data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
	uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
	}

	/*
	* DMU support for xuio
	*
	* xuio_ksp is the kstat handle for the xuio statistics; it is set by
	* xuio_stat_init() and reset to NULL by xuio_stat_fini().
	*/
	kstat_t *xuio_ksp = NULL;

	/*
	* Prepare 'xuio' for 'nblk' loaned buffers: allocate a zeroed iovec
	* array and a zeroed dmu_xuio_t with an arc_buf_t pointer array of
	* nblk entries, attach the latter as the xuio's private data, and add
	* nblk to the on-loan read or write counter. Always returns 0.
	*/
	int
	dmu_xuio_init(xuio_t *xuio, int nblk)
	{
	dmu_xuio_t *priv;
	uio_t *uio = &xuio->xu_uio;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	/* remember the iovec array so dmu_xuio_fini() can free it */
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
	XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
	XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
	}

	/*
	* Undo dmu_xuio_init(): free the iovec array, the buffer pointer array
	* and the private structure, then subtract the count from the on-loan
	* read or write counter.
	*/
	void
	dmu_xuio_fini(xuio_t *xuio)
	{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	/* read the count before priv is freed below */
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
	XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
	XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
	}

	/*
	* Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
	* and increase priv->next by 1.
	*
	* The slot index and off + n are only checked with ASSERT(). Always
	* returns 0.
	*/
	int
	dmu_xuio_add(xuio_t xuio, arc_buf_t abuf, offset_t off, size_t n)
	{
	struct iovec *iov;
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int i = priv->next++;

	ASSERT(i < priv->cnt);
	ASSERT(off + n <= arc_buf_lsize(abuf));
	iov = uio->uio_iov + i;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[i] = abuf;
	return (0);
	}

	/*
	* Return the number of buffer slots of 'xuio' (priv->cnt, as set by
	* dmu_xuio_init()).
	*/
	int
	dmu_xuio_cnt(xuio_t *xuio)
	{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	return (priv->cnt);
	}

	/*
	* Return the arc buffer stored in slot 'i' of 'xuio'. The index is
	* only checked with ASSERT().
	*/
	arc_buf_t *
	dmu_xuio_arcbuf(xuio_t *xuio, int i)
	{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
	}

	/*
	* Set slot 'i' of the xuio's buffer array to NULL. The buffer itself is
	* not released here.
	*/
	void
	dmu_xuio_clear(xuio_t *xuio, int i)
	{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
	}

	/*
	* Create the virtual named kstat "zfs:0:xuio_stats" backed by
	* xuio_stats and install it. If kstat_create() returns NULL nothing is
	* installed and xuio_ksp is NULL.
	*/
	static void
	xuio_stat_init(void)
	{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp != NULL) {
	xuio_ksp->ks_data = &xuio_stats;
	kstat_install(xuio_ksp);
	}
	}

	/*
	* Delete the xuio kstat if it exists and reset xuio_ksp to NULL.
	*/
	static void
	xuio_stat_fini(void)
	{
	if (xuio_ksp != NULL) {
	kstat_delete(xuio_ksp);
	xuio_ksp = NULL;
	}
	}

	/*
	* Bump the xuiostat_wbuf_copied counter.
	*/
	void
	xuio_stat_wbuf_copied(void)
	{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}

	/*
	* Bump the xuiostat_wbuf_nocopy counter.
	*/
	void
	xuio_stat_wbuf_nocopy(void)
	{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
	}

	#ifdef _KERNEL
	/*
	* Read 'size' bytes of 'dn', starting at uio->uio_loffset, into 'uio'.
	*
	* For an xuio (only recognised when UIO_XUIO is defined) each buffer is
	* loaned via dbuf_loan_arcbuf() and registered with dmu_xuio_add();
	* otherwise the data is copied with uiomove() or, outside illumos,
	* vn_io_fault_uiomove().
	*
	* Returns 0, the dmu_buf_hold_array_by_dnode() error, or the first
	* error from the per-buffer transfer.
	*/
	static int
	dmu_read_uio_dnode(dnode_t dn, uio_t uio, uint64_t size)
	{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	* NB: we could do this block-at-a-time, but it's nice
	* to be reading in parallel.
	*/
	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	TRUE, FTAG, &numbufs, &dbp, 0);
	if (err)
	return (err);

	#ifdef UIO_XUIO
	if (uio->uio_extflg == UIO_XUIO)
	xuio = (xuio_t *)uio;
	#endif

	for (i = 0; i < numbufs; i++) {
	int tocpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];

	ASSERT(size > 0);

	bufoff = uio->uio_loffset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	if (xuio) {
	dmu_buf_impl_t dbi = (dmu_buf_impl_t )db;
	arc_buf_t *dbuf_abuf = dbi->db_buf;
	arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
	err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
	if (!err) {
	/* no uiomove here, so advance the uio by hand */
	uio->uio_resid -= tocpy;
	uio->uio_loffset += tocpy;
	}

	/*
	* The statistic depends on whether the loaned buffer is
	* the dbuf's own arc buffer or a different one.
	*/
	if (abuf == dbuf_abuf)
	XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
	else
	XUIOSTAT_BUMP(xuiostat_rbuf_copied);
	} else {
	#ifdef illumos
	err = uiomove((char *)db->db_data + bufoff, tocpy,
	UIO_READ, uio);
	#else
	err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
	tocpy, uio);
	#endif
	}
	if (err)
	break;

	size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
	}

	/*
	* Read 'size' bytes into the uio buffer.
	* From object zdb->db_object.
	* Starting at offset uio->uio_loffset.
	*
	* If the caller already has a dbuf in the target object
	* (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
	* because we don't have to find the dnode_t for the object.
	*
	* A size of 0 returns 0 immediately; otherwise the result of
	* dmu_read_uio_dnode() is returned.
	*/
	int
	dmu_read_uio_dbuf(dmu_buf_t zdb, uio_t uio, uint64_t size)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
	return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_read_uio_dnode(dn, uio, size);
	DB_DNODE_EXIT(db);

	return (err);
	}

	/*
	* Read 'size' bytes into the uio buffer.
	* From the specified object
	* Starting at offset uio->uio_loffset.
	*
	* A size of 0 returns 0 immediately. Otherwise returns the
	* dnode_hold() error or the result of dmu_read_uio_dnode().
	*/
	int
	dmu_read_uio(objset_t os, uint64_t object, uio_t uio, uint64_t size)
	{
	dnode_t *dn;
	int err;

	if (size == 0)
	return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
	return (err);

	err = dmu_read_uio_dnode(dn, uio, size);

	dnode_rele(dn, FTAG);

	return (err);
	}

	/*
	* Write 'size' bytes from 'uio' into 'dn', starting at
	* uio->uio_loffset, as part of transaction 'tx'.
	*
	* Returns 0, the dmu_buf_hold_array_by_dnode() error, or the first
	* error from the uio copy; the loop stops at the first such error.
	*/
	static int
	dmu_write_uio_dnode(dnode_t dn, uio_t uio, uint64_t size, dmu_tx_t *tx)
	{
	dmu_buf_t **dbp;
	int numbufs;
	int err = 0;
	int i;

	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err)
	return (err);

	for (i = 0; i < numbufs; i++) {
	int tocpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];

	ASSERT(size > 0);

	bufoff = uio->uio_loffset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	/* only the first and last buffer may be partially written */
	ASSERT(i == 0 \|\| i == numbufs-1 \|\| tocpy == db->db_size);

	if (tocpy == db->db_size)
	dmu_buf_will_fill(db, tx);
	else
	dmu_buf_will_dirty(db, tx);

	#ifdef illumos
	/*
	* XXX uiomove could block forever (eg. nfs-backed
	* pages). There needs to be a uiolockdown() function
	* to lock the pages in memory, so that uiomove won't
	* block.
	*/
	err = uiomove((char *)db->db_data + bufoff, tocpy,
	UIO_WRITE, uio);
	#else
	err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
	uio);
	#endif

	/* fill_done is called even when the copy above failed */
	if (tocpy == db->db_size)
	dmu_buf_fill_done(db, tx);

	if (err)
	break;

	size -= tocpy;
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
	}

	/*
	* Write 'size' bytes from the uio buffer.
	* To object zdb->db_object.
	* Starting at offset uio->uio_loffset.
	*
	* If the caller already has a dbuf in the target object
	* (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
	* because we don't have to find the dnode_t for the object.
	*
	* A size of 0 returns 0 immediately; otherwise the result of
	* dmu_write_uio_dnode() is returned.
	*/
	int
	dmu_write_uio_dbuf(dmu_buf_t zdb, uio_t uio, uint64_t size,
	dmu_tx_t *tx)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
	return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_write_uio_dnode(dn, uio, size, tx);
	DB_DNODE_EXIT(db);

	return (err);
	}

	/*
	* Write 'size' bytes from the uio buffer.
	* To the specified object.
	* Starting at offset uio->uio_loffset.
	*
	* A size of 0 returns 0 immediately. Otherwise returns the
	* dnode_hold() error or the result of dmu_write_uio_dnode().
	*/
	int
	dmu_write_uio(objset_t os, uint64_t object, uio_t uio, uint64_t size,
	dmu_tx_t *tx)
	{
	dnode_t *dn;
	int err;

	if (size == 0)
	return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
	return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);

	dnode_rele(dn, FTAG);

	return (err);
	}

	#ifdef illumos
	/*
	* illumos variant: copy 'size' bytes from the page list starting at
	* 'pp' into 'object' at 'offset', as part of transaction 'tx'. Pages
	* are walked through p_next, one PAGESIZE step at a time.
	*
	* A size of 0 returns 0. Otherwise returns the dmu_buf_hold_array()
	* error, or 0 once all buffers have been filled.
	*/
	int
	dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	page_t pp, dmu_tx_t tx)
	{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
	return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	FALSE, FTAG, &numbufs, &dbp);
	if (err)
	return (err);

	for (i = 0; i < numbufs; i++) {
	int tocpy, copied, thiscpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];
	caddr_t va;

	ASSERT(size > 0);
	ASSERT3U(db->db_size, >=, PAGESIZE);

	bufoff = offset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	ASSERT(i == 0 \|\| i == numbufs-1 \|\| tocpy == db->db_size);

	if (tocpy == db->db_size)
	dmu_buf_will_fill(db, tx);
	else
	dmu_buf_will_dirty(db, tx);

	/* map each page, copy up to PAGESIZE bytes from it, unmap */
	for (copied = 0; copied < tocpy; copied += PAGESIZE) {
	ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
	thiscpy = MIN(PAGESIZE, tocpy - copied);
	va = zfs_map_page(pp, S_READ);
	bcopy(va, (char *)db->db_data + bufoff, thiscpy);
	zfs_unmap_page(pp, va);
	pp = pp->p_next;
	bufoff += PAGESIZE;
	}

	if (tocpy == db->db_size)
	dmu_buf_fill_done(db, tx);

	offset += tocpy;
	size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
	}

	#else /* !illumos */

	/*
	* Non-illumos variant: copy 'size' bytes from the page array 'ma' into
	* 'object' at 'offset', as part of transaction 'tx'. Pages are mapped
	* through an sf_buf and the array is advanced one entry per PAGESIZE
	* step.
	*
	* A size of 0 returns 0. Otherwise returns the dmu_buf_hold_array()
	* error, or 0 once all buffers have been filled.
	*/
	int
	dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	vm_page_t ma, dmu_tx_t tx)
	{
	dmu_buf_t **dbp;
	struct sf_buf *sf;
	int numbufs, i;
	int err;

	if (size == 0)
	return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	FALSE, FTAG, &numbufs, &dbp);
	if (err)
	return (err);

	for (i = 0; i < numbufs; i++) {
	int tocpy, copied, thiscpy;
	int bufoff;
	dmu_buf_t *db = dbp[i];
	caddr_t va;

	ASSERT(size > 0);
	ASSERT3U(db->db_size, >=, PAGESIZE);

	bufoff = offset - db->db_offset;
	tocpy = (int)MIN(db->db_size - bufoff, size);

	ASSERT(i == 0 \|\| i == numbufs-1 \|\| tocpy == db->db_size);

	if (tocpy == db->db_size)
	dmu_buf_will_fill(db, tx);
	else
	dmu_buf_will_dirty(db, tx);

	/* map each page, copy up to PAGESIZE bytes from it, unmap */
	for (copied = 0; copied < tocpy; copied += PAGESIZE) {
	ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
	thiscpy = MIN(PAGESIZE, tocpy - copied);
	va = zfs_map_page(*ma, &sf);
	bcopy(va, (char *)db->db_data + bufoff, thiscpy);
	zfs_unmap_page(sf);
	ma += 1;
	bufoff += PAGESIZE;
	}

	if (tocpy == db->db_size)
	dmu_buf_fill_done(db, tx);

	offset += tocpy;
	size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
	}

	/*
	* Fill the `count` exclusively-busied pages in ma[] with the contents of
	* `object`, and opportunistically populate up to *rbehind pages before and
	* *rahead pages after the requested run from the dbufs that are held anyway.
	*
	* last_size is the number of bytes wanted from the last page; it is used to
	* size the dmu_buf_hold_array() request.
	*
	* Returns 0 on success, with *rbehind and *rahead rewritten to the number
	* of extra pages actually filled, or the error from dmu_buf_hold_array().
	*
	* NOTE(review): the body uses os as a handle, indexes ma[] and dereferences
	* rbehind/rahead, so these parameters are presumably pointers whose '*'
	* declarators (and the '\|' escapes below) were mangled in this rendering --
	* confirm against the source tree.
	*/
	int
	dmu_read_pages(objset_t os, uint64_t object, vm_page_t ma, int count,
	int rbehind, int rahead, int last_size)
	{
	struct sf_buf *sf;
	vm_object_t vmobj;
	vm_page_t m;
	dmu_buf_t **dbp;
	dmu_buf_t *db;
	caddr_t va;
	int numbufs, i;
	int bufoff, pgoff, tocpy;
	int mi, di;
	int err;

	/* The requested pages must be contiguous in the VM object. */
	ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
	ASSERT(last_size <= PAGE_SIZE);

	/* Hold every dbuf that overlaps the requested byte range. */
	err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
	IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
	if (err != 0)
	return (err);

	#ifdef DEBUG
	IMPLY(last_size < PAGE_SIZE, *rahead == 0);
	if (dbp[0]->db_offset != 0 \|\| numbufs > 1) {
	for (i = 0; i < numbufs; i++) {
	ASSERT(ISP2(dbp[i]->db_size));
	ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
	ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
	}
	}
	#endif

	vmobj = ma[0]->object;
	zfs_vmobject_wlock(vmobj);

	/*
	* Read-behind: walk backwards from the first requested page, copying
	* from the first held dbuf, until a page cannot be grabbed or an
	* already-valid page is found.
	*/
	db = dbp[0];
	for (i = 0; i < *rbehind; i++) {
	m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
	VM_ALLOC_NORMAL \| VM_ALLOC_NOWAIT \| VM_ALLOC_NOBUSY);
	if (m == NULL)
	break;
	if (m->valid != 0) {
	ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
	break;
	}
	ASSERT(m->dirty == 0);
	ASSERT(!pmap_page_is_mapped(m));

	ASSERT(db->db_size > PAGE_SIZE);
	bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
	va = zfs_map_page(m, &sf);
	bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
	zfs_unmap_page(sf);
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_lock(m);
	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
	vm_page_activate(m);
	else
	vm_page_deactivate(m);
	vm_page_unlock(m);
	}
	*rbehind = i;

	/*
	* Main copy: advance through pages (mi, pgoff) and dbufs (di, bufoff)
	* in lock step, copying the largest chunk that fits in both.
	*/
	bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
	pgoff = 0;
	for (mi = 0, di = 0; mi < count && di < numbufs; ) {
	if (pgoff == 0) {
	/* Starting a new page: map it. */
	m = ma[mi];
	vm_page_assert_xbusied(m);
	ASSERT(m->valid == 0);
	ASSERT(m->dirty == 0);
	ASSERT(!pmap_page_is_mapped(m));
	va = zfs_map_page(m, &sf);
	}
	if (bufoff == 0)
	db = dbp[di];

	ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
	db->db_offset + bufoff);

	/*
	* We do not need to clamp the copy size by the file
	* size as the last block is zero-filled beyond the
	* end of file anyway.
	*/
	tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
	bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);

	pgoff += tocpy;
	ASSERT(pgoff <= PAGESIZE);
	if (pgoff == PAGESIZE) {
	/* Page complete: unmap, mark valid, move on. */
	zfs_unmap_page(sf);
	m->valid = VM_PAGE_BITS_ALL;
	ASSERT(mi < count);
	mi++;
	pgoff = 0;
	}

	bufoff += tocpy;
	ASSERT(bufoff <= db->db_size);
	if (bufoff == db->db_size) {
	/* Buffer exhausted: move on to the next one. */
	ASSERT(di < numbufs);
	di++;
	bufoff = 0;
	}
	}

	#ifdef DEBUG
	/*
	* Three possibilities:
	* - last requested page ends at a buffer boundary and , thus,
	* all pages and buffers have been iterated;
	* - all requested pages are filled, but the last buffer
	* has not been exhausted;
	* the read-ahead is possible only in this case;
	* - all buffers have been read, but the last page has not been
	* fully filled;
	* this is only possible if the file has only a single buffer
	* with a size that is not a multiple of the page size.
	*/
	if (mi == count) {
	ASSERT(di >= numbufs - 1);
	IMPLY(*rahead != 0, di == numbufs - 1);
	IMPLY(*rahead != 0, bufoff != 0);
	ASSERT(pgoff == 0);
	}
	if (di == numbufs) {
	ASSERT(mi >= count - 1);
	ASSERT(*rahead == 0);
	IMPLY(pgoff == 0, mi == count);
	if (pgoff != 0) {
	ASSERT(mi == count - 1);
	ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
	}
	}
	#endif
	/* Buffers ran out mid-page: zero the remainder of that page. */
	if (pgoff != 0) {
	bzero(va + pgoff, PAGESIZE - pgoff);
	zfs_unmap_page(sf);
	m->valid = VM_PAGE_BITS_ALL;
	}

	/*
	* Read-ahead: same idea as read-behind, walking forward from the last
	* requested page and using the last dbuf touched above.
	*/
	for (i = 0; i < *rahead; i++) {
	m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
	VM_ALLOC_NORMAL \| VM_ALLOC_NOWAIT \| VM_ALLOC_NOBUSY);
	if (m == NULL)
	break;
	if (m->valid != 0) {
	ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
	break;
	}
	ASSERT(m->dirty == 0);
	ASSERT(!pmap_page_is_mapped(m));

	ASSERT(db->db_size > PAGE_SIZE);
	bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
	tocpy = MIN(db->db_size - bufoff, PAGESIZE);
	va = zfs_map_page(m, &sf);
	bcopy((char *)db->db_data + bufoff, va, tocpy);
	if (tocpy < PAGESIZE) {
	/* Partial tail of the buffer: zero-fill the rest. */
	ASSERT(i == *rahead - 1);
	ASSERT((db->db_size & PAGE_MASK) != 0);
	bzero(va + tocpy, PAGESIZE - tocpy);
	}
	zfs_unmap_page(sf);
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_lock(m);
	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
	vm_page_activate(m);
	else
	vm_page_deactivate(m);
	vm_page_unlock(m);
	}
	*rahead = i;
	zfs_vmobject_wunlock(vmobj);

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
	}
	#endif /* illumos */
	#endif /* _KERNEL */

	/*
	 * Allocate a loaned anonymous arc buffer of `size` bytes.
	 *
	 * The buffer is loaned (arc_loan_buf()) from the spa of the objset that
	 * the dbuf behind `handle` belongs to.  Returns the loaned arc_buf_t.
	 */
	arc_buf_t *
	dmu_request_arcbuf(dmu_buf_t *handle, int size)
	{
		/* The public dmu_buf_t handle is viewed as its dmu_buf_impl_t. */
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

		return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
	}

	/*
	* Free a loaned arc buffer.
	*
	* The loan is first handed back with arc_return_buf() and the buffer is
	* then destroyed; both calls use this function's FTAG as the tag.
	*/
	void
	dmu_return_arcbuf(arc_buf_t *buf)
	{
	arc_return_buf(buf, FTAG);
	arc_buf_destroy(buf, FTAG);
	}

	/*
	* When possible directly assign passed loaned arc buffer to a dbuf.
	* If this is not possible copy the contents of passed arc buf via
	* dmu_write().
	*
	* handle - any held dbuf of the target object (used to reach its dnode)
	* offset - byte offset in the object at which buf's data belongs
	* buf - loaned arc buffer; it is consumed on both paths
	* tx - transaction the assignment or write is part of
	*
	* NOTE(review): handle and buf are used as pointers in the body; their '*'
	* declarators look to have been lost in this rendering -- confirm.
	*/
	void
	dmu_assign_arcbuf(dmu_buf_t handle, uint64_t offset, arc_buf_t buf,
	dmu_tx_t *tx)
	{
	dmu_buf_impl_t dbuf = (dmu_buf_impl_t )handle;
	dnode_t *dn;
	dmu_buf_impl_t *db;
	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
	uint64_t blkid;

	/* Look up and hold the dbuf that covers `offset`. */
	DB_DNODE_ENTER(dbuf);
	dn = DB_DNODE(dbuf);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = dbuf_whichblock(dn, 0, offset);
	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(dbuf);

	/*
	* We can only assign if the offset is aligned, the arc buf is the
	* same size as the dbuf, and the dbuf is not metadata.
	*
	* NOTE(review): only the alignment and size conditions are tested
	* by the expression below.
	*/
	if (offset == db->db.db_offset && blksz == db->db.db_size) {
	#ifdef _KERNEL
	/* Account the write against the current thread / process. */
	curthread->td_ru.ru_oublock++;
	#ifdef RACCT
	if (racct_enable) {
	PROC_LOCK(curproc);
	racct_add_force(curproc, RACCT_WRITEBPS, blksz);
	racct_add_force(curproc, RACCT_WRITEIOPS, 1);
	PROC_UNLOCK(curproc);
	}
	#endif /* RACCT */
	#endif /* _KERNEL */
	dbuf_assign_arcbuf(db, buf, tx);
	dbuf_rele(db, FTAG);
	} else {
	objset_t *os;
	uint64_t object;

	/* compressed bufs must always be assignable to their dbuf */
	ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
	ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));

	/* Capture objset and object number while the dnode is pinned. */
	DB_DNODE_ENTER(dbuf);
	dn = DB_DNODE(dbuf);
	os = dn->dn_objset;
	object = dn->dn_object;
	DB_DNODE_EXIT(dbuf);

	/* Fall back to a copy, then give the loaned buffer back. */
	dbuf_rele(db, FTAG);
	dmu_write(os, object, offset, blksz, buf->b_data, tx);
	dmu_return_arcbuf(buf);
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}
	}

	/*
	* Per-I/O state handed to the dmu_sync() zio callbacks; allocated by
	* dmu_sync() / dmu_sync_late_arrival() and freed by the "done" callback.
	*/
	typedef struct {
	/* dirty record being synced; NULL on the late-arrival path */
	dbuf_dirty_record_t *dsa_dr;
	/* caller's completion callback, invoked with dsa_zgd and the zio error */
	dmu_sync_cb_t *dsa_done;
	/* caller's zgd, passed through to dsa_done */
	zgd_t *dsa_zgd;
	/* tx created by the late-arrival path; NULL on the normal path */
	dmu_tx_t *dsa_tx;
	} dmu_sync_arg_t;

	/*
	 * "Ready" callback for the write issued by dmu_sync().
	 *
	 * If the zio has no error, patch up its block pointer:
	 *  - a hole gets its logical size set to the dbuf size;
	 *  - any other non-embedded bp (asserted to be level 0) gets a fill
	 *    count of 1.
	 *
	 * zio  - the write zio
	 * buf  - unused (the late-arrival wrapper passes NULL)
	 * varg - the dmu_sync_arg_t for this I/O
	 */
	/* ARGSUSED */
	static void
	dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
	{
		dmu_sync_arg_t *dsa = varg;
		dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
		blkptr_t *bp = zio->io_bp;

		if (zio->io_error == 0) {
			if (BP_IS_HOLE(bp)) {
				/*
				 * A block of zeros may compress to a hole, but the
				 * block size still needs to be known for replay.
				 */
				BP_SET_LSIZE(bp, db->db_size);
			} else if (!BP_IS_EMBEDDED(bp)) {
				ASSERT(BP_GET_LEVEL(bp) == 0);
				bp->blk_fill = 1;
			}
		}
	}

	/*
	 * zio "ready" hook for the late-arrival write: forwards to
	 * dmu_sync_ready() with no arc buffer and the zio's private argument.
	 */
	static void
	dmu_sync_late_arrival_ready(zio_t *zio)
	{
		void *sync_arg = zio->io_private;

		dmu_sync_ready(zio, NULL, sync_arg);
	}

	/*
	* "Done" callback for the write issued by dmu_sync().
	*
	* Under db_mtx, records the outcome in the dirty record: on success the
	* resulting block pointer is stored as the override and the state becomes
	* DR_OVERRIDDEN; on error the state returns to DR_NOT_OVERRIDDEN.  Waiters
	* on db_changed are then woken, the caller's callback is invoked with the
	* zio error, and the dmu_sync_arg_t is freed.
	*
	* NOTE(review): zio and buf are used as pointers; their '*' declarators
	* look to have been lost in this rendering -- confirm.
	*/
	/* ARGSUSED */
	static void
	dmu_sync_done(zio_t zio, arc_buf_t buf, void *varg)
	{
	dmu_sync_arg_t *dsa = varg;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	if (zio->io_error == 0) {
	dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
	if (dr->dt.dl.dr_nopwrite) {
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint8_t chksum = BP_GET_CHECKSUM(bp_orig);

	/* A nopwrite must have left the block pointer untouched. */
	ASSERT(BP_EQUAL(bp, bp_orig));
	VERIFY(BP_EQUAL(bp, db->db_blkptr));
	ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
	ASSERT(zio_checksum_table[chksum].ci_flags &
	ZCHECKSUM_FLAG_NOPWRITE);
	}
	dr->dt.dl.dr_overridden_by = *zio->io_bp;
	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
	dr->dt.dl.dr_copies = zio->io_prop.zp_copies;

	/*
	* Old style holes are filled with all zeros, whereas
	* new-style holes maintain their lsize, type, level,
	* and birth time (see zio_write_compress). While we
	* need to reset the BP_SET_LSIZE() call that happened
	* in dmu_sync_ready for old style holes, we do not
	* want to wipe out the information contained in new
	* style holes. Thus, only zero out the block pointer if
	* it's an old style hole.
	*/
	if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
	dr->dt.dl.dr_overridden_by.blk_birth == 0)
	BP_ZERO(&dr->dt.dl.dr_overridden_by);
	} else {
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	kmem_free(dsa, sizeof (*dsa));
	}

	/*
	* "Done" callback for the write issued by dmu_sync_late_arrival().
	*
	* If a real (non-hole) block was written successfully it is freed with
	* zio_free() in the zio's txg.  The tx taken by dmu_sync_late_arrival() is
	* then committed, the caller's callback is invoked with the zio error, and
	* the abd wrapper and the dmu_sync_arg_t are released.
	*/
	static void
	dmu_sync_late_arrival_done(zio_t *zio)
	{
	blkptr_t *bp = zio->io_bp;
	dmu_sync_arg_t *dsa = zio->io_private;
	blkptr_t *bp_orig = &zio->io_bp_orig;

	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
	ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
	ASSERT(BP_IS_HOLE(bp_orig) \|\| !BP_EQUAL(bp, bp_orig));
	ASSERT(zio->io_bp->blk_birth == zio->io_txg);
	ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
	zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
	}

	dmu_tx_commit(dsa->dsa_tx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	abd_put(zio->io_abd);
	kmem_free(dsa, sizeof (*dsa));
	}

	/*
	* dmu_sync() slow path: write the dbuf's current contents as a brand new
	* block in a freshly assigned tx instead of touching the dirty record.
	*
	* Returns 0 once the write zio has been issued (completion is reported
	* through `done`), or EIO if the tx could not be assigned.
	*
	* NOTE(review): pio, os, done, zgd, zp and zb are all used as pointers;
	* their '*' declarators look to have been lost in this rendering -- confirm.
	*/
	static int
	dmu_sync_late_arrival(zio_t pio, objset_t os, dmu_sync_cb_t done, zgd_t zgd,
	zio_prop_t zp, zbookmark_phys_t zb)
	{
	dmu_sync_arg_t *dsa;
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
	dmu_tx_abort(tx);
	/* Make zl_get_data do txg_wait_synced() */
	return (SET_ERROR(EIO));
	}

	/*
	* In order to prevent the zgd's lwb from being free'd prior to
	* dmu_sync_late_arrival_done() being called, we have to ensure
	* the lwb's "max txg" takes this tx's txg into account.
	*/
	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));

	/* No dirty record on this path; the tx is committed in the done cb. */
	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = NULL;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = tx;

	/*
	* Since we are currently syncing this txg, it's nontrivial to
	* determine what BP to nopwrite against, so we disable nopwrite.
	*
	* When syncing, the db_blkptr is initially the BP of the previous
	* txg. We can not nopwrite against it because it will be changed
	* (this is similar to the non-late-arrival case where the dbuf is
	* dirty in a future txg).
	*
	* Then dbuf_write_ready() sets bp_blkptr to the location we will write.
	* We can not nopwrite against it because although the BP will not
	* (typically) be changed, the data has not yet been persisted to this
	* location.
	*
	* Finally, when dbuf_write_done() is called, it is theoretically
	* possible to always nopwrite, because the data that was written in
	* this txg is the same data that we are trying to write. However we
	* would need to check that this dbuf is not dirty in any future
	* txg's (as we do in the normal dmu_sync() path). For simplicity, we
	* don't nopwrite in this case.
	*/
	zp->zp_nopwrite = B_FALSE;

	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
	abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
	zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
	dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
	dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

	return (0);
	}

	/*
	* Intent log support: sync the block associated with db to disk.
	* N.B. and XXX: the caller is responsible for making sure that the
	* data isn't changing while dmu_sync() is writing it.
	*
	* Return values:
	*
	* EEXIST: this txg has already been synced, so there's nothing to do.
	* The caller should not log the write.
	*
	* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
	* The caller should not log the write.
	*
	* EALREADY: this block is already in the process of being synced.
	* The caller should track its progress (somehow).
	*
	* EIO: could not do the I/O.
	* The caller should do a txg_wait_synced().
	*
	* 0: the I/O has been initiated.
	* The caller should log this blkptr in the done callback.
	* It is possible that the I/O will fail, in which case
	* the error will be reported to the done callback and
	* propagated to pio from zio_done().
	*
	* NOTE(review): pio and done are used as pointers, and db is dereferenced
	* after a cast from zgd->zgd_db; the '*' declarators and the '\|' escapes
	* look to have been mangled in this rendering -- confirm.
	*/
	int
	dmu_sync(zio_t pio, uint64_t txg, dmu_sync_cb_t done, zgd_t *zgd)
	{
	dmu_buf_impl_t db = (dmu_buf_impl_t )zgd->zgd_db;
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *dsa;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	dnode_t *dn;

	ASSERT(pio != NULL);
	ASSERT(txg != 0);

	SET_BOOKMARK(&zb, ds->ds_object,
	db->db.db_object, db->db_level, db->db_blkid);

	/* Compute the write policy while the dnode is pinned. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
	DB_DNODE_EXIT(db);

	/*
	* If we're frozen (running ziltest), we always need to generate a bp.
	*/
	if (txg > spa_freeze_txg(os->os_spa))
	return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));

	/*
	* Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
	* and us. If we determine that this txg is not yet syncing,
	* but it begins to sync a moment later, that's OK because the
	* sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
	*/
	mutex_enter(&db->db_mtx);

	if (txg <= spa_last_synced_txg(os->os_spa)) {
	/*
	* This txg has already synced. There's nothing to do.
	*/
	mutex_exit(&db->db_mtx);
	return (SET_ERROR(EEXIST));
	}

	if (txg <= spa_syncing_txg(os->os_spa)) {
	/*
	* This txg is currently syncing, so we can't mess with
	* the dirty record anymore; just write a new log block.
	*/
	mutex_exit(&db->db_mtx);
	return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
	}

	/* Find the dirty record that belongs to this txg. */
	dr = db->db_last_dirty;
	while (dr && dr->dr_txg != txg)
	dr = dr->dr_next;

	if (dr == NULL) {
	/*
	* There's no dr for this dbuf, so it must have been freed.
	* There's no need to log writes to freed blocks, so we're done.
	*/
	mutex_exit(&db->db_mtx);
	return (SET_ERROR(ENOENT));
	}

	ASSERT(dr->dr_next == NULL \|\| dr->dr_next->dr_txg < txg);

	if (db->db_blkptr != NULL) {
	/*
	* We need to fill in zgd_bp with the current blkptr so that
	* the nopwrite code can check if we're writing the same
	* data that's already on disk. We can only nopwrite if we
	* are sure that after making the copy, db_blkptr will not
	* change until our i/o completes. We ensure this by
	* holding the db_mtx, and only allowing nopwrite if the
	* block is not already dirty (see below). This is verified
	* by dmu_sync_done(), which VERIFYs that the db_blkptr has
	* not changed.
	*/
	zgd->zgd_bp = db->db_blkptr;
	}

	/*
	* Assume the on-disk data is X, the current syncing data (in
	* txg - 1) is Y, and the current in-memory data is Z (currently
	* in dmu_sync).
	*
	* We usually want to perform a nopwrite if X and Z are the
	* same. However, if Y is different (i.e. the BP is going to
	* change before this write takes effect), then a nopwrite will
	* be incorrect - we would override with X, which could have
	* been freed when Y was written.
	*
	* (Note that this is not a concern when we are nop-writing from
	* syncing context, because X and Y must be identical, because
	* all previous txgs have been synced.)
	*
	* Therefore, we disable nopwrite if the current BP could change
	* before this TXG. There are two ways it could change: by
	* being dirty (dr_next is non-NULL), or by being freed
	* (dnode_block_freed()). This behavior is verified by
	* zio_done(), which VERIFYs that the override BP is identical
	* to the on-disk BP.
	*/
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dr->dr_next != NULL \|\| dnode_block_freed(dn, db->db_blkid))
	zp.zp_nopwrite = B_FALSE;
	DB_DNODE_EXIT(db);

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC \|\|
	dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
	/*
	* We have already issued a sync write for this buffer,
	* or this buffer has already been synced. It could not
	* have been dirtied since, or we would have cleared the state.
	*/
	mutex_exit(&db->db_mtx);
	return (SET_ERROR(EALREADY));
	}

	/* Claim the dirty record for this sync before dropping db_mtx. */
	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	/* Freed by dmu_sync_done(). */
	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = NULL;

	zio_nowait(arc_write(pio, os->os_spa, txg,
	zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
	&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
	ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

	return (0);
	}

	/*
	 * Apply a new block size to `object` by holding its dnode and calling
	 * dnode_set_blksz() with `size`, `ibs` and `tx`.
	 *
	 * Returns the dnode_hold() error if the object cannot be held, otherwise
	 * the result of dnode_set_blksz().
	 */
	int
	dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
	    dmu_tx_t *tx)
	{
		dnode_t *node;
		int rc = dnode_hold(os, object, FTAG, &node);

		if (rc == 0) {
			rc = dnode_set_blksz(node, size, ibs, tx);
			dnode_rele(node, FTAG);
		}
		return (rc);
	}

	/*
	 * Record `checksum` as the checksum function of `object` and mark the
	 * dnode dirty in `tx`.  The object must exist (the hold is VERIFYed).
	 */
	void
	dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
	    dmu_tx_t *tx)
	{
		dnode_t *node;

		/*
		 * Send streams carry every object's checksum function, so only
		 * values a receiving system can understand are acceptable here.
		 */
		ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);

		VERIFY0(dnode_hold(os, object, FTAG, &node));
		ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);

		node->dn_checksum = checksum;
		dnode_setdirty(node, tx);

		dnode_rele(node, FTAG);
	}

	/*
	 * Record `compress` as the compression function of `object` and mark the
	 * dnode dirty in `tx`.  The object must exist (the hold is VERIFYed).
	 */
	void
	dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
	    dmu_tx_t *tx)
	{
		dnode_t *node;

		/*
		 * Send streams carry every object's compression function, so only
		 * values a receiving system can understand are acceptable here.
		 */
		ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);

		VERIFY0(dnode_hold(os, object, FTAG, &node));

		node->dn_compress = compress;
		dnode_setdirty(node, tx);

		dnode_rele(node, FTAG);
	}

	/*
	* When non-zero, dmu_write_policy() selects ZIO_COMPRESS_EMPTY for
	* metadata instead of the default compression.  Exported as the
	* vfs.zfs.mdcomp_disable sysctl.
	*/
	int zfs_mdcomp_disable = 0;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
	&zfs_mdcomp_disable, 0, "Disable metadata compression");

	/*
	* When the "redundant_metadata" property is set to "most", only indirect
	* blocks of this level and higher will have an additional ditto block.
	*/
	int zfs_redundant_metadata_most_ditto_level = 2;

	/*
	* Fill in *zp with the checksum, compression, copies, dedup and nopwrite
	* settings to use when writing a block of dnode `dn` at `level` in objset
	* `os`.  `wp` is a mask of WP_* flags (WP_SPILL, WP_NOFILL and WP_DMU_SYNC
	* are examined here).  A NULL dn is treated as type DMU_OT_OBJSET.
	*
	* NOTE(review): os and dn are used as pointers; their '*' declarators and
	* the '\|' escapes look to have been mangled in this rendering -- confirm.
	*/
	void
	dmu_write_policy(objset_t os, dnode_t dn, int level, int wp, zio_prop_t *zp)
	{
	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
	boolean_t ismd = (level > 0 \|\| DMU_OT_IS_METADATA(type) \|\|
	(wp & WP_SPILL));
	enum zio_checksum checksum = os->os_checksum;
	enum zio_compress compress = os->os_compress;
	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
	boolean_t dedup = B_FALSE;
	boolean_t nopwrite = B_FALSE;
	boolean_t dedup_verify = os->os_dedup_verify;
	int copies = os->os_copies;

	/*
	* We maintain different write policies for each of the following
	* types of data:
	* 1. metadata
	* 2. preallocated blocks (i.e. level-0 blocks of a dump device)
	* 3. all other level 0 blocks
	*/
	if (ismd) {
	if (zfs_mdcomp_disable) {
	compress = ZIO_COMPRESS_EMPTY;
	} else {
	/*
	* XXX -- we should design a compression algorithm
	* that specializes in arrays of bps.
	*/
	compress = zio_compress_select(os->os_spa,
	ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
	}

	/*
	* Metadata always gets checksummed. If the data
	* checksum is multi-bit correctable, and it's not a
	* ZBT-style checksum, then it's suitable for metadata
	* as well. Otherwise, the metadata checksum defaults
	* to fletcher4.
	*/
	if (!(zio_checksum_table[checksum].ci_flags &
	ZCHECKSUM_FLAG_METADATA) \|\|
	(zio_checksum_table[checksum].ci_flags &
	ZCHECKSUM_FLAG_EMBEDDED))
	checksum = ZIO_CHECKSUM_FLETCHER_4;

	/* One extra copy for metadata covered by redundant_metadata. */
	if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL \|\|
	(os->os_redundant_metadata ==
	ZFS_REDUNDANT_METADATA_MOST &&
	(level >= zfs_redundant_metadata_most_ditto_level \|\|
	DMU_OT_IS_METADATA(type) \|\| (wp & WP_SPILL))))
	copies++;
	} else if (wp & WP_NOFILL) {
	ASSERT(level == 0);

	/*
	* If we're writing preallocated blocks, we aren't actually
	* writing them so don't set any policy properties. These
	* blocks are currently only used by an external subsystem
	* outside of zfs (i.e. dump) and not written by the zio
	* pipeline.
	*/
	compress = ZIO_COMPRESS_OFF;
	checksum = ZIO_CHECKSUM_NOPARITY;
	} else {
	/* Ordinary level-0 data: per-dnode settings refine the objset's. */
	compress = zio_compress_select(os->os_spa, dn->dn_compress,
	compress);

	checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
	zio_checksum_select(dn->dn_checksum, checksum) :
	dedup_checksum;

	/*
	* Determine dedup setting. If we are in dmu_sync(),
	* we won't actually dedup now because that's all
	* done in syncing context; but we do want to use the
	* dedup checksum. If the checksum is not strong
	* enough to ensure unique signatures, force
	* dedup_verify.
	*/
	if (dedup_checksum != ZIO_CHECKSUM_OFF) {
	dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
	if (!(zio_checksum_table[checksum].ci_flags &
	ZCHECKSUM_FLAG_DEDUP))
	dedup_verify = B_TRUE;
	}

	/*
	* Enable nopwrite if we have secure enough checksum
	* algorithm (see comment in zio_nop_write) and
	* compression is enabled. We don't enable nopwrite if
	* dedup is enabled as the two features are mutually
	* exclusive.
	*/
	nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
	ZCHECKSUM_FLAG_NOPWRITE) &&
	compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
	}

	/* Publish the computed policy. */
	zp->zp_checksum = checksum;
	zp->zp_compress = compress;
	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);

	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
	zp->zp_level = level;
	/* Never ask for more copies than the pool can replicate. */
	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
	zp->zp_dedup = dedup;
	zp->zp_dedup_verify = dedup && dedup_verify;
	zp->zp_nopwrite = nopwrite;
	}

	/*
	 * Search `object` for the next hole (hole != 0) or, otherwise, with no
	 * search flags, starting from *off, by way of dnode_next_offset().
	 *
	 * Dirty state of the object is synced out first so that the on-disk
	 * block pointers are current.  Returns the first error from
	 * dmu_object_wait_synced(), dnode_hold() or dnode_next_offset().
	 * `off` is handed to dnode_next_offset() as an in/out offset
	 * (presumably updated with the result -- verify against that function).
	 */
	int
	dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
	{
		dnode_t *dn;
		int err;

		/*
		 * Sync any current changes before
		 * we go trundling through the block pointers.
		 */
		err = dmu_object_wait_synced(os, object);
		if (err) {
			return (err);
		}

		err = dnode_hold(os, object, FTAG, &dn);
		if (err) {
			return (err);
		}

		err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
		dnode_rele(dn, FTAG);

		return (err);
	}

	/*
	 * If the dnode of `object` is linked on any of its per-txg dirty lists,
	 * wait for the pool to sync (txg_wait_synced() with txg 0) so that the
	 * object's on-disk state is up to date.  A more efficient version could
	 * wait only for the highest dirty txg.
	 *
	 * Returns the dnode_hold() error if the object cannot be held, else 0.
	 */
	int
	dmu_object_wait_synced(objset_t *os, uint64_t object)
	{
		dnode_t *node;
		boolean_t dirty = B_FALSE;
		int rc;

		rc = dnode_hold(os, object, FTAG, &node);
		if (rc != 0)
			return (rc);

		/* Stop at the first txg slot in which the dnode is dirty. */
		for (int slot = 0; slot < TXG_SIZE && !dirty; slot++) {
			if (list_link_active(&node->dn_dirty_link[slot]))
				dirty = B_TRUE;
		}
		dnode_rele(node, FTAG);

		if (dirty)
			txg_wait_synced(dmu_objset_pool(os), 0);

		return (0);
	}

	/*
	 * Fill *doi with a snapshot of the dnode's properties (block sizes, types,
	 * bonus size, indirection levels, checksum/compression, space used, max
	 * offset and fill count).
	 *
	 * dn_struct_rwlock (reader) and dn_mtx are held for the duration of the
	 * copy.
	 */
	void
	dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
	{
		dnode_phys_t *dnp;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		mutex_enter(&dn->dn_mtx);

		dnp = dn->dn_phys;

		doi->doi_data_block_size = dn->dn_datablksz;
		/* An indirect block shift of 0 is reported as size 0. */
		doi->doi_metadata_block_size = dn->dn_indblkshift ?
		    1ULL << dn->dn_indblkshift : 0;
		doi->doi_type = dn->dn_type;
		doi->doi_bonus_type = dn->dn_bonustype;
		doi->doi_bonus_size = dn->dn_bonuslen;
		doi->doi_indirection = dn->dn_nlevels;
		doi->doi_checksum = dn->dn_checksum;
		doi->doi_compress = dn->dn_compress;
		doi->doi_nblkptr = dn->dn_nblkptr;
		/* Used bytes rounded to the nearest 512-byte unit. */
		doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
		doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
		/* Total fill is the sum over the dnode's top-level block pointers. */
		doi->doi_fill_count = 0;
		for (int i = 0; i < dnp->dn_nblkptr; i++)
			doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);

		mutex_exit(&dn->dn_mtx);
		rw_exit(&dn->dn_struct_rwlock);
	}

	/*
	 * Get information on a DMU object.
	 * If doi is NULL, just indicates whether the object exists.
	 *
	 * Returns the dnode_hold() error if the object cannot be held, else 0.
	 */
	int
	dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
	{
		dnode_t *dn;
		int err = dnode_hold(os, object, FTAG, &dn);

		if (err)
			return (err);

		if (doi != NULL)
			dmu_object_info_from_dnode(dn, doi);

		dnode_rele(dn, FTAG);
		return (0);
	}

	/*
	 * Like dmu_object_info(), but faster; can be used when you have a held
	 * dbuf in hand.  The dnode is reached through the dbuf instead of being
	 * looked up and held by object number.
	 */
	void
	dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

		DB_DNODE_ENTER(db);
		dmu_object_info_from_dnode(DB_DNODE(db), doi);
		DB_DNODE_EXIT(db);
	}

	/*
	 * Faster still when you only care about the size.
	 * This is specifically optimized for zfs_getattr().
	 *
	 * Stores the dnode's data block size in *blksize and, in *nblk512, the
	 * dnode's used bytes rounded to SPA_MINBLOCKSIZE units plus one unit for
	 * the dnode itself.
	 */
	void
	dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
	    u_longlong_t *nblk512)
	{
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);

		*blksize = dn->dn_datablksz;
		/* add 1 for dnode space */
		*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
		    SPA_MINBLOCKSHIFT) + 1;
		DB_DNODE_EXIT(db);
	}

	/*
	 * Byte-swap, in place, every 64-bit word of the `size`-byte buffer vbuf.
	 * `size` must be a multiple of 8 (asserted).
	 */
	void
	byteswap_uint64_array(void *vbuf, size_t size)
	{
		uint64_t *buf = vbuf;
		size_t count = size >> 3;
		/* size_t index: matches `count`, no signed/unsigned comparison. */
		size_t i;

		ASSERT((size & 7) == 0);

		for (i = 0; i < count; i++)
			buf[i] = BSWAP_64(buf[i]);
	}

	/*
	 * Byte-swap, in place, every 32-bit word of the `size`-byte buffer vbuf.
	 * `size` must be a multiple of 4 (asserted).
	 */
	void
	byteswap_uint32_array(void *vbuf, size_t size)
	{
		uint32_t *buf = vbuf;
		size_t count = size >> 2;
		/* size_t index: matches `count`, no signed/unsigned comparison. */
		size_t i;

		ASSERT((size & 3) == 0);

		for (i = 0; i < count; i++)
			buf[i] = BSWAP_32(buf[i]);
	}

	/*
	 * Byte-swap, in place, every 16-bit word of the `size`-byte buffer vbuf.
	 * `size` must be a multiple of 2 (asserted).
	 */
	void
	byteswap_uint16_array(void *vbuf, size_t size)
	{
		uint16_t *buf = vbuf;
		size_t count = size >> 1;
		/* size_t index: matches `count`, no signed/unsigned comparison. */
		size_t i;

		ASSERT((size & 1) == 0);

		for (i = 0; i < count; i++)
			buf[i] = BSWAP_16(buf[i]);
	}

	/*
	* Byte-array counterpart of the byteswap_uintNN_array() routines.
	* Single bytes have no byte order, so this is intentionally a no-op;
	* both arguments are unused.
	*/
	/* ARGSUSED */
	void
	byteswap_uint8_array(void *vbuf, size_t size)
	{
	}

	/*
	* Initialize the DMU and the subsystems it sits on by calling each
	* subsystem's *_init() routine in order.  dmu_fini() undoes this.
	*/
	void
	dmu_init(void)
	{
	abd_init();
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();
	dmu_objset_init();
	dnode_init();
	zfetch_init();
	zio_compress_init();
	l2arc_init();
	arc_init();
	dbuf_init();
	}

	/*
	* Tear down everything set up by dmu_init() by calling each subsystem's
	* *_fini() routine.
	*/
	void
	dmu_fini(void)
	{
	arc_fini(); /* arc depends on l2arc, so arc must go first */
	l2arc_fini();
	zfetch_fini();
	zio_compress_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();
	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
	abd_fini();
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 332525)
	@@ -1,2240 +1,2347 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Nexenta Systems, Inc.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/cred.h>
	#include <sys/zfs_context.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_deleg.h>
	#include <sys/dnode.h>
	#include <sys/dbuf.h>
	#include <sys/zvol.h>
	#include <sys/dmu_tx.h>
	#include <sys/zap.h>
	#include <sys/zil.h>
	#include <sys/dmu_impl.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/sa.h>
	#include <sys/zfs_onexit.h>
	#include <sys/dsl_destroy.h>
	#include <sys/vdev.h>
	+#include <sys/zfeature.h>

	/*
	* Needed to close a window in dnode_move() that allows the objset to be freed
	* before it can be safely accessed.
	*/
	krwlock_t os_lock;

	/*
	* Tunable to overwrite the maximum number of threads for the parallelization
	* of dmu_objset_find_dp, needed to speed up the import of pools with many
	* datasets.
	* Default is 4 times the number of leaf vdevs.
	*/
	int dmu_find_threads = 0;

	/*
	* Backfill lower metadnode objects after this many have been freed.
	* Backfilling negatively impacts object creation rates, so only do it
	* if there are enough holes to fill.
	*/
	int dmu_rescan_dnode_threshold = 131072;

	/* Forward declaration; the definition is not in this part of the file. */
	static void dmu_objset_find_dp_cb(void *arg);

	/*
	* One-time objset layer setup: initializes the global os_lock.
	*/
	void
	dmu_objset_init(void)
	{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
	}

	/*
	* Objset layer teardown: destroys the global os_lock.
	*/
	void
	dmu_objset_fini(void)
	{
	rw_destroy(&os_lock);
	}

	/*
	 * Accessor: the spa recorded in the objset's os_spa field.
	 */
	spa_t *
	dmu_objset_spa(objset_t *os)
	{
		spa_t *spa = os->os_spa;

		return (spa);
	}

	/*
	 * Accessor: the zilog recorded in the objset's os_zil field.
	 */
	zilog_t *
	dmu_objset_zil(objset_t *os)
	{
		zilog_t *zilog = os->os_zil;

		return (zilog);
	}

	/*
	 * Return the dsl_pool for an objset: taken from the dataset's dsl_dir
	 * when the objset has a dataset with a dsl_dir, otherwise obtained from
	 * the spa via spa_get_dsl().
	 */
	dsl_pool_t *
	dmu_objset_pool(objset_t *os)
	{
		dsl_dataset_t *dataset = os->os_dsl_dataset;

		/* No dataset, or a dataset without a dsl_dir: ask the spa. */
		if (dataset == NULL || !dataset->ds_dir)
			return (spa_get_dsl(os->os_spa));

		return (dataset->ds_dir->dd_pool);
	}

	/*
	 * Accessor: the dataset recorded in the objset's os_dsl_dataset field
	 * (other code in this file treats it as possibly NULL).
	 */
	dsl_dataset_t *
	dmu_objset_ds(objset_t *os)
	{
		dsl_dataset_t *dataset = os->os_dsl_dataset;

		return (dataset);
	}

	/*
	 * Accessor: the objset type stored in the objset's physical header
	 * (os_phys->os_type).
	 */
	dmu_objset_type_t
	dmu_objset_type(objset_t *os)
	{
		dmu_objset_type_t type = os->os_phys->os_type;

		return (type);
	}

	/*
	 * Write the name of the objset's dataset into `buf` by delegating to
	 * dsl_dataset_name().  The caller provides the buffer; the size it must
	 * have is dictated by dsl_dataset_name() and is not checked here.
	 */
	void
	dmu_objset_name(objset_t *os, char *buf)
	{
		dsl_dataset_name(os->os_dsl_dataset, buf);
	}

	/*
	 * Return the object number of the objset's dataset, or 0 when the objset
	 * has no dataset.
	 */
	uint64_t
	dmu_objset_id(objset_t *os)
	{
		dsl_dataset_t *dataset = os->os_dsl_dataset;

		if (dataset == NULL)
			return (0);

		return (dataset->ds_object);
	}

	/*
	 * Accessor: the sync setting cached in the objset's os_sync field.
	 */
	zfs_sync_type_t
	dmu_objset_syncprop(objset_t *os)
	{
		zfs_sync_type_t sync = os->os_sync;

		return (sync);
	}

	/*
	 * Accessor: the logbias setting cached in the objset's os_logbias field.
	 */
	zfs_logbias_op_t
	dmu_objset_logbias(objset_t *os)
	{
		zfs_logbias_op_t logbias = os->os_logbias;

		return (logbias);
	}

	/*
	 * Change callback: cache the new checksum setting on the objset passed
	 * as `arg`, resolved through zio_checksum_select() with
	 * ZIO_CHECKSUM_ON_VALUE as the second argument.
	 */
	static void
	checksum_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *objset = arg;

		/* The value must already have been resolved from "inherit". */
		ASSERT(newval != ZIO_CHECKSUM_INHERIT);

		objset->os_checksum =
		    zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
	}

	/*
	 * Change callback: cache the new compression setting on the objset
	 * passed as `arg`, resolved through zio_compress_select() with
	 * ZIO_COMPRESS_ON as the last argument.
	 */
	static void
	compression_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *objset = arg;

		/* Inheritance and range checks happen before we are called. */
		ASSERT(newval != ZIO_COMPRESS_INHERIT);

		objset->os_compress =
		    zio_compress_select(objset->os_spa, newval, ZIO_COMPRESS_ON);
	}

	/*
	 * Change callback: cache the new copies count on the objset passed as
	 * `arg`.
	 */
	static void
	copies_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *objset = arg;

		/*
		 * Inheritance and range checks happen before we are called, so
		 * the value must lie in 1 .. spa_max_replication().
		 */
		ASSERT(newval > 0);
		ASSERT(newval <= spa_max_replication(objset->os_spa));

		objset->os_copies = newval;
	}

	static void
	dedup_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	* Inheritance should have been done by now.
	*/
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
	}

	static void
	primary_cache_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval == ZFS_CACHE_ALL \|\| newval == ZFS_CACHE_NONE \|\|
	newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
	}

	static void
	secondary_cache_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval == ZFS_CACHE_ALL \|\| newval == ZFS_CACHE_NONE \|\|
	newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
	}

	static void
	sync_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval == ZFS_SYNC_STANDARD \|\| newval == ZFS_SYNC_ALWAYS \|\|
	newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
	zil_set_sync(os->os_zil, newval);
	}

	static void
	redundant_metadata_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL \|\|
	newval == ZFS_REDUNDANT_METADATA_MOST);

	os->os_redundant_metadata = newval;
	}

	static void
	logbias_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY \|\|
	newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
	zil_set_logbias(os->os_zil, newval);
	}

	/*
	 * Change callback: cache the new record size on the objset passed as
	 * `arg`.  No validation is performed here.
	 */
	static void
	recordsize_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *objset = arg;

		objset->os_recordsize = newval;
	}

	/*
	 * Byte-swap an objset_phys_t in place.
	 *
	 * `size` must be either OBJSET_OLD_PHYS_SIZE or sizeof (objset_phys_t)
	 * (asserted).  The meta dnode, ZIL header, type and flags are always
	 * swapped; the userused/groupused dnodes are swapped only for the
	 * full-size layout.
	 */
	void
	dmu_objset_byteswap(void *buf, size_t size)
	{
		objset_phys_t *osp = buf;

		ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
		dnode_byteswap(&osp->os_meta_dnode);
		byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
		osp->os_type = BSWAP_64(osp->os_type);
		osp->os_flags = BSWAP_64(osp->os_flags);
		if (size == sizeof (objset_phys_t)) {
			dnode_byteswap(&osp->os_userused_dnode);
			dnode_byteswap(&osp->os_groupused_dnode);
		}
	}

	/*
	 * CRC-based hash over the objset_t pointer and the object number.
	 *
	 * One byte of the pointer and the three low bytes of the object number
	 * are run through the zfs_crc64_table; the remaining high bits of both
	 * are then XORed into the result.
	 */
	static uint64_t
	dnode_hash(const objset_t *os, uint64_t obj)
	{
		uintptr_t addr = (uintptr_t)os;
		uint64_t hash = -1ULL;

		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		/*
		 * Skip the low 6 bits of the pointer: an objset_t is larger than
		 * 2^6 bytes, so those bits carry little entropy.
		 */
		hash = (hash >> 8) ^ zfs_crc64_table[(hash ^ (addr >> 6)) & 0xFF];

		/* Feed object-number bytes 0, 1 and 2, lowest first. */
		for (int shift = 0; shift <= 16; shift += 8) {
			hash = (hash >> 8) ^
			    zfs_crc64_table[(hash ^ (obj >> shift)) & 0xFF];
		}

		hash ^= (addr >> 14) ^ (obj >> 24);

		return (hash);
	}

	/*
	 * Multilist index callback for dnodes: maps the dnode_t passed as obj
	 * to a sublist of ml by hashing its objset and object number with
	 * dnode_hash() and reducing modulo the number of sublists.
	 */
	unsigned int
	dnode_multilist_index_func(multilist_t *ml, void *obj)
	{
		dnode_t *dn = obj;
		return (dnode_hash(dn->dn_objset, dn->dn_object) %
		    multilist_get_num_sublists(ml));
	}

	/*
	* Instantiates the objset_t in-memory structure corresponding to the
	* objset_phys_t that's pointed to by the specified blkptr_t.
	*
	* ds is the owning dataset, or NULL for the meta-objset; bp is the root
	* block pointer. On success the new objset_t is stored in *osp and 0 is
	* returned. On failure everything allocated here is released and the
	* error is returned (ECKSUM from the read is reported as EIO).
	*
	* NOTE(review): spa and ds are declared without '*' although ds is
	* compared against NULL and dereferenced below -- the pointer declarators
	* appear to have been lost; confirm against the real prototype.
	*/
	int
	dmu_objset_open_impl(spa_t spa, dsl_dataset_t ds, blkptr_t *bp,
	objset_t **osp)
	{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL \|\| MUTEX_HELD(&ds->ds_opening_lock));

	+ /*
	+ * The $ORIGIN dataset (if it exists) doesn't have an associated
	+ * objset, so there's no reason to open it. The $ORIGIN dataset
	+ * will not exist on pools older than SPA_VERSION_ORIGIN.
	+ */
	+ if (ds != NULL && spa_get_dsl(spa) != NULL &&
	+ spa_get_dsl(spa)->dp_origin_snap != NULL) {
	+ ASSERT3P(ds->ds_dir, !=,
	+ spa_get_dsl(spa)->dp_origin_snap->ds_dir);
	+ }
	+
	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	/* A non-hole root bp is read through the ARC into os_phys_buf. */
	if (!BP_IS_HOLE(os->os_rootbp)) {
	arc_flags_t aflags = ARC_FLAG_WAIT;
	zbookmark_phys_t zb;
	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
	ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	if (DMU_OS_IS_L2CACHEABLE(os))
	aflags \|= ARC_FLAG_L2CACHE;

	dprintf_bp(os->os_rootbp, "reading %s", "");
	err = arc_read(NULL, spa, os->os_rootbp,
	arc_getbuf_func, &os->os_phys_buf,
	ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
	if (err != 0) {
	kmem_free(os, sizeof (objset_t));
	/* convert checksum errors into IO errors */
	if (err == ECKSUM)
	err = SET_ERROR(EIO);
	return (err);
	}

	/* Increase the blocksize if we are permitted. */
	if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
	arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
	/*
	* Copy the shorter on-disk image into a zeroed full-size
	* buffer and switch os_phys_buf over to it.
	*/
	arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
	ARC_BUFC_METADATA, sizeof (objset_phys_t));
	bzero(buf->b_data, sizeof (objset_phys_t));
	bcopy(os->os_phys_buf->b_data, buf->b_data,
	arc_buf_size(os->os_phys_buf));
	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
	os->os_phys_buf = buf;
	}

	os->os_phys = os->os_phys_buf->b_data;
	os->os_flags = os->os_phys->os_flags;
	} else {
	/* Hole: nothing to read, start from a zeroed objset_phys_t. */
	int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
	sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
	os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
	ARC_BUFC_METADATA, size);
	os->os_phys = os->os_phys_buf->b_data;
	bzero(os->os_phys, size);
	}

	/*
	* Note: the changed_cb will be called once before the register
	* func returns, thus changing the checksum/compression from the
	* default (fletcher2/off). Snapshots don't need to know about
	* checksum/compression/copies.
	*/
	if (ds != NULL) {
	boolean_t needlock = B_FALSE;

	/*
	* Note: it's valid to open the objset if the dataset is
	* long-held, in which case the pool_config lock will not
	* be held.
	*/
	if (!dsl_pool_config_held(dmu_objset_pool(os))) {
	needlock = B_TRUE;
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	}
	/*
	* Register the callbacks one after another; the first failure
	* short-circuits every later registration.
	*/
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
	primary_cache_changed_cb, os);
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
	secondary_cache_changed_cb, os);
	}
	if (!ds->ds_is_snapshot) {
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_CHECKSUM),
	checksum_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_COMPRESSION),
	compression_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_COPIES),
	copies_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_DEDUP),
	dedup_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_LOGBIAS),
	logbias_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_SYNC),
	sync_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(
	ZFS_PROP_REDUNDANT_METADATA),
	redundant_metadata_changed_cb, os);
	}
	if (err == 0) {
	err = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
	recordsize_changed_cb, os);
	}
	}
	if (needlock)
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	/* A registration failed: release the buffer and the objset. */
	if (err != 0) {
	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
	kmem_free(os, sizeof (objset_t));
	return (err);
	}
	} else {
	/* It's the meta-objset. */
	os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
	os->os_compress = ZIO_COMPRESS_ON;
	os->os_copies = spa_max_replication(spa);
	os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
	os->os_dedup_verify = B_FALSE;
	os->os_logbias = ZFS_LOGBIAS_LATENCY;
	os->os_sync = ZFS_SYNC_STANDARD;
	os->os_primary_cache = ZFS_CACHE_ALL;
	os->os_secondary_cache = ZFS_CACHE_ALL;
	}

	/* Snapshots keep the zeroed in-memory ZIL header from kmem_zalloc(). */
	if (ds == NULL \|\| !ds->ds_is_snapshot)
	os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	/* One dirty-dnode multilist for each of the TXG_SIZE slots. */
	for (i = 0; i < TXG_SIZE; i++) {
	os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
	offsetof(dnode_t, dn_dirty_link[i]),
	dnode_multilist_index_func);
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	offsetof(dmu_buf_impl_t, db_link));

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	dnode_special_open(os, &os->os_phys->os_meta_dnode,
	DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
	/* The user/group-used dnodes are opened only for a full-size buffer. */
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
	dnode_special_open(os, &os->os_phys->os_userused_dnode,
	DMU_USERUSED_OBJECT, &os->os_userused_dnode);
	dnode_special_open(os, &os->os_phys->os_groupused_dnode,
	DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
	}

	*osp = os;
	return (0);
	}

	/*
	 * Returns, in *osp, the objset_t cached on ds, opening it first with
	 * dmu_objset_open_impl() if ds->ds_objset is still NULL.
	 *
	 * The lookup and open are serialized by ds->ds_opening_lock. Returns 0
	 * on success or the error from dmu_objset_open_impl(); in the error
	 * case *osp is set to the (still NULL) ds->ds_objset.
	 */
	int
	dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
	{
		int err = 0;

		/*
		 * We shouldn't be doing anything with dsl_dataset_t's unless the
		 * pool_config lock is held, or the dataset is long-held.
		 */
		ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
		    dsl_dataset_long_held(ds));

		mutex_enter(&ds->ds_opening_lock);
		if (ds->ds_objset == NULL) {
			objset_t *os;

			/* The block pointer is read under ds_bp_rwlock. */
			rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
			err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
			    ds, dsl_dataset_get_blkptr(ds), &os);
			rrw_exit(&ds->ds_bp_rwlock, FTAG);

			if (err == 0) {
				mutex_enter(&ds->ds_lock);
				ASSERT(ds->ds_objset == NULL);
				ds->ds_objset = os;
				mutex_exit(&ds->ds_lock);
			}
		}
		*osp = ds->ds_objset;
		mutex_exit(&ds->ds_opening_lock);
		return (err);
	}

	/*
	 * Holds the pool while the objset is held. Therefore only one objset
	 * can be held at a time.
	 *
	 * Takes a pool hold and a dataset hold on name, both with tag, and
	 * returns the dataset's objset in *osp. On any failure every hold taken
	 * so far is released and the error is returned. On success both holds
	 * remain; dmu_objset_rele() drops them.
	 */
	int
	dmu_objset_hold(const char *name, void *tag, objset_t **osp)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int err;

		err = dsl_pool_hold(name, tag, &dp);
		if (err != 0)
			return (err);
		err = dsl_dataset_hold(dp, name, tag, &ds);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		err = dmu_objset_from_ds(ds, osp);
		if (err != 0) {
			dsl_dataset_rele(ds, tag);
			dsl_pool_rele(dp, tag);
		}

		return (err);
	}

	/*
	 * Common tail of the dmu_objset_own*() entry points: resolves the
	 * objset of an already-owned dataset and validates it.
	 *
	 * Returns EINVAL when type is not DMU_OST_ANY and differs from the
	 * objset's on-disk type, EROFS when a writable open of a snapshot is
	 * requested, or the error from dmu_objset_from_ds(). On every failure
	 * the dataset is disowned with tag before returning.
	 */
	static int
	dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
	    boolean_t readonly, void *tag, objset_t **osp)
	{
		int err;

		err = dmu_objset_from_ds(ds, osp);
		if (err != 0) {
			dsl_dataset_disown(ds, tag);
		} else if (type != DMU_OST_ANY &&
		    type != (*osp)->os_phys->os_type) {
			dsl_dataset_disown(ds, tag);
			return (SET_ERROR(EINVAL));
		} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
			dsl_dataset_disown(ds, tag);
			return (SET_ERROR(EROFS));
		}
		return (err);
	}

	/*
	 * dsl_pool must not be held when this is called.
	 * Upon successful return, there will be a longhold on the dataset,
	 * and the dsl_pool will not be held.
	 *
	 * Owns the dataset called name with tag and returns its objset in *osp
	 * after the type/readonly checks in dmu_objset_own_impl(). The pool is
	 * held with FTAG only for the duration of this call.
	 */
	int
	dmu_objset_own(const char *name, dmu_objset_type_t type,
	    boolean_t readonly, void *tag, objset_t **osp)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int err;

		err = dsl_pool_hold(name, FTAG, &dp);
		if (err != 0)
			return (err);
		err = dsl_dataset_own(dp, name, tag, &ds);
		if (err != 0) {
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
		dsl_pool_rele(dp, FTAG);

		return (err);
	}

	/*
	 * Like dmu_objset_own(), but identifies the dataset by object number
	 * obj within pool dp instead of by name. No pool hold is taken here.
	 * Returns the error from dsl_dataset_own_obj() or from
	 * dmu_objset_own_impl().
	 */
	int
	dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
	    boolean_t readonly, void *tag, objset_t **osp)
	{
		dsl_dataset_t *ds;
		int err;

		err = dsl_dataset_own_obj(dp, obj, tag, &ds);
		if (err != 0)
			return (err);

		return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
	}

	/*
	 * Releases the dataset hold and then the pool hold, both with tag, for
	 * the objset os. The pool pointer is fetched before the dataset is
	 * released.
	 */
	void
	dmu_objset_rele(objset_t *os, void *tag)
	{
		dsl_pool_t *dp = dmu_objset_pool(os);
		dsl_dataset_rele(os->os_dsl_dataset, tag);
		dsl_pool_rele(dp, tag);
	}

	/*
	 * When we are called, ds MUST refer to a dataset that is owned by
	 * 'tag'; that is, it is held and long held by 'tag' and ds_owner == tag.
	 * We then release and reacquire ownership of the dataset while holding
	 * the pool config lock, so that no intervening namespace or ownership
	 * changes can occur.
	 *
	 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire
	 * to release the hold on its dataset and acquire a new one on the
	 * dataset of the same name so that it can be partially torn down and
	 * reconstructed.
	 *
	 * The newly owned dataset is returned in *newds; failing to re-own it
	 * is fatal (VERIFY0).
	 */
	void
	dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
	    void *tag)
	{
		dsl_pool_t *dp;
		char name[ZFS_MAX_DATASET_NAME_LEN];

		VERIFY3P(ds, !=, NULL);
		VERIFY3P(ds->ds_owner, ==, tag);
		VERIFY(dsl_dataset_long_held(ds));

		/* Capture the name and pool before ds is disowned. */
		dsl_dataset_name(ds, name);
		dp = ds->ds_dir->dd_pool;
		dsl_pool_config_enter(dp, FTAG);
		dsl_dataset_disown(ds, tag);
		VERIFY0(dsl_dataset_own(dp, name, tag, newds));
		dsl_pool_config_exit(dp, FTAG);
	}

	/*
	 * Gives up the ownership, taken with tag, of the dataset behind os.
	 */
	void
	dmu_objset_disown(objset_t *os, void *tag)
	{
		dsl_dataset_disown(os->os_dsl_dataset, tag);
	}

	/*
	* Calls dnode_evict_dbufs() on every dnode in os->os_dnodes that already
	* has a hold, and then on the group-used, user-used and meta dnodes.
	*
	* os_lock is dropped around each dnode_evict_dbufs() call; a stack
	* marker node keeps the walk's position in the list meanwhile.
	*/
	void
	dmu_objset_evict_dbufs(objset_t *os)
	{
	dnode_t dn_marker;
	dnode_t *dn;

	mutex_enter(&os->os_lock);
	dn = list_head(&os->os_dnodes);
	while (dn != NULL) {
	/*
	* Skip dnodes without holds. We have to do this dance
	* because dnode_add_ref() only works if there is already a
	* hold. If the dnode has no holds, then it has no dbufs.
	*/
	if (dnode_add_ref(dn, FTAG)) {
	/* Park the marker right after dn before dropping the lock. */
	list_insert_after(&os->os_dnodes, dn, &dn_marker);
	mutex_exit(&os->os_lock);

	dnode_evict_dbufs(dn);
	dnode_rele(dn, FTAG);

	/* Resume from the marker, then take it back off the list. */
	mutex_enter(&os->os_lock);
	dn = list_next(&os->os_dnodes, &dn_marker);
	list_remove(&os->os_dnodes, &dn_marker);
	} else {
	dn = list_next(&os->os_dnodes, dn);
	}
	}
	mutex_exit(&os->os_lock);

	/* The special dnodes are handled separately, outside os_lock. */
	if (DMU_USERUSED_DNODE(os) != NULL) {
	dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
	dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
	}
	dnode_evict_dbufs(DMU_META_DNODE(os));
	}

	/*
	 * Objset eviction processing is split into two pieces.
	 * The first marks the objset as evicting, evicts any dbufs that
	 * have a refcount of zero, and then queues up the objset for the
	 * second phase of eviction. Once os->os_dnodes has been cleared by
	 * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
	 * The second phase closes the special dnodes, dequeues the objset from
	 * the list of those undergoing eviction, and finally frees the objset.
	 *
	 * NOTE: Due to asynchronous eviction processing (invocation of
	 * dnode_buf_pageout()), it is possible for the meta dnode for the
	 * objset to have no holds even though os->os_dnodes is not empty.
	 *
	 * This function is the first phase. When os->os_dnodes is already
	 * empty after registering, it runs the second phase,
	 * dmu_objset_evict_done(), directly.
	 */
	void
	dmu_objset_evict(objset_t *os)
	{
		dsl_dataset_t *dataset = os->os_dsl_dataset;
		int no_dnodes;

		for (int slot = 0; slot < TXG_SIZE; slot++)
			ASSERT(!dmu_objset_is_dirty(os, slot));

		if (dataset != NULL)
			dsl_prop_unregister_all(dataset, os);

		if (os->os_sa)
			sa_tear_down(os);

		dmu_objset_evict_dbufs(os);

		/* Register and sample the dnode list under os_lock. */
		mutex_enter(&os->os_lock);
		spa_evicting_os_register(os->os_spa, os);
		no_dnodes = list_is_empty(&os->os_dnodes);
		mutex_exit(&os->os_lock);

		if (no_dnodes)
			dmu_objset_evict_done(os);
	}

	/*
	 * Second phase of objset eviction: requires an empty os->os_dnodes,
	 * closes the special dnodes, frees the ZIL and the phys buffer,
	 * destroys the objset's locks and dirty-dnode multilists, deregisters
	 * the objset from its spa and frees it.
	 */
	void
	dmu_objset_evict_done(objset_t *os)
	{
		ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

		dnode_special_close(&os->os_meta_dnode);
		if (DMU_USERUSED_DNODE(os) != NULL) {
			dnode_special_close(&os->os_userused_dnode);
			dnode_special_close(&os->os_groupused_dnode);
		}
		zil_free(os->os_zil);

		arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

		/*
		 * This is a barrier to prevent the objset from going away in
		 * dnode_move() until we can safely ensure that the objset is
		 * still in use. We consider the objset valid before the barrier
		 * and invalid after the barrier.
		 */
		rw_enter(&os_lock, RW_READER);
		rw_exit(&os_lock);

		mutex_destroy(&os->os_lock);
		mutex_destroy(&os->os_userused_lock);
		mutex_destroy(&os->os_obj_lock);
		mutex_destroy(&os->os_user_ptr_lock);

		for (int slot = 0; slot < TXG_SIZE; slot++)
			multilist_destroy(os->os_dirty_dnodes[slot]);

		spa_evicting_os_deregister(os->os_spa, os);
		kmem_free(os, sizeof (objset_t));
	}

	/*
	 * Returns dsl_dir_snap_cmtime() for the dsl_dir of the dataset that
	 * backs os.
	 */
	timestruc_t
	dmu_objset_snap_cmtime(objset_t *os)
	{
		dsl_dir_t *dir = os->os_dsl_dataset->ds_dir;

		return (dsl_dir_snap_cmtime(dir));
	}

	/*
	 * called from dsl for meta-objset
	 *
	 * Initializes a new objset of the given type in syncing context and
	 * returns it. With a dataset, the objset comes from
	 * dmu_objset_from_ds(); with ds == NULL (the MOS) it is opened straight
	 * from bp. The meta dnode is allocated, the objset type is recorded,
	 * user accounting is marked complete when enabled, and the dataset is
	 * dirtied in tx.
	 */
	objset_t *
	dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
	    dmu_objset_type_t type, dmu_tx_t *tx)
	{
		objset_t *os;
		dnode_t *mdn;

		ASSERT(dmu_tx_is_syncing(tx));

		if (ds != NULL)
			VERIFY0(dmu_objset_from_ds(ds, &os));
		else
			VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

		mdn = DMU_META_DNODE(os);

		dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
		    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

		/*
		 * We don't want to have to increase the meta-dnode's nlevels
		 * later, because then we could do it in quiescing context while
		 * we are also accessing it in open context.
		 *
		 * This precaution is not necessary for the MOS (ds == NULL),
		 * because the MOS is only updated in syncing context.
		 * This is most fortunate: the MOS is the only objset that
		 * needs to be synced multiple times as spa_sync() iterates
		 * to convergence, so minimizing its dn_nlevels matters.
		 */
		if (ds != NULL) {
			int levels = 1;

			/*
			 * Determine the number of levels necessary for the
			 * meta-dnode to contain DN_MAX_OBJECT dnodes. Note that
			 * in order to ensure that we do not overflow 64 bits,
			 * there has to be a nlevels that gives us a number of
			 * blocks > DN_MAX_OBJECT but < 2^64. Therefore,
			 * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
			 * less than (64 - log2(DN_MAX_OBJECT)) (16).
			 */
			while ((uint64_t)mdn->dn_nblkptr <<
			    (mdn->dn_datablkshift - DNODE_SHIFT +
			    (levels - 1) *
			    (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
			    DN_MAX_OBJECT)
				levels++;

			mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
			    mdn->dn_nlevels = levels;
		}

		ASSERT(type != DMU_OST_NONE);
		ASSERT(type != DMU_OST_ANY);
		ASSERT(type < DMU_OST_NUMTYPES);
		os->os_phys->os_type = type;
		if (dmu_objset_userused_enabled(os)) {
			os->os_phys->os_flags |=
			    OBJSET_FLAG_USERACCOUNTING_COMPLETE;
			os->os_flags = os->os_phys->os_flags;
		}

		dsl_dataset_dirty(ds, tx);

		return (os);
	}

	/*
	 * Argument bundle handed to dmu_objset_create_check() and
	 * dmu_objset_create_sync() through dsl_sync_task().
	 */
	typedef struct dmu_objset_create_arg {
		/* Name of the dataset to create. */
		const char *doca_name;
		/* Credential used for the limit check and the creation. */
		cred_t *doca_cred;
		/* Optional callback run on the new objset; may be NULL. */
		void (*doca_userfunc)(objset_t *os, void *arg,
		    cred_t *cr, dmu_tx_t *tx);
		/* Opaque argument passed through to doca_userfunc. */
		void *doca_userarg;
		/* Type recorded in the new objset. */
		dmu_objset_type_t doca_type;
		/* Flags passed to dsl_dataset_create_sync(). */
		uint64_t doca_flags;
	} dmu_objset_create_arg_t;

	/ARGSUSED/
	static int
	dmu_objset_create_check(void arg, dmu_tx_t tx)
	{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	int error;

	if (strchr(doca->doca_name, '@') != NULL)
	return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
	return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
	if (error != 0)
	return (error);
	if (tail == NULL) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EEXIST));
	}
	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	doca->doca_cred);
	dsl_dir_rele(pdd, FTAG);

	return (error);
	}

	/*
	 * Sync half of the create sync task; arg is a
	 * dmu_objset_create_arg_t.
	 *
	 * Creates the dataset under its parent directory, initializes its
	 * objset through dmu_objset_create_impl(), runs the optional user
	 * callback on it and logs a "create" history record. Failures of the
	 * holds taken here are fatal (VERIFY0).
	 */
	static void
	dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
	{
		dmu_objset_create_arg_t *doca = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *pdd;
		const char *tail;
		dsl_dataset_t *ds;
		uint64_t obj;
		blkptr_t *bp;
		objset_t *os;

		VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

		obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
		    doca->doca_cred, tx);

		VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
		/* The block pointer is fetched and used under ds_bp_rwlock. */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		bp = dsl_dataset_get_blkptr(ds);
		os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
		    ds, bp, doca->doca_type, tx);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		if (doca->doca_userfunc != NULL) {
			doca->doca_userfunc(os, doca->doca_userarg,
			    doca->doca_cred, tx);
		}

		spa_history_log_internal_ds(ds, "create", tx, "");
		dsl_dataset_rele(ds, FTAG);
		dsl_dir_rele(pdd, FTAG);
	}

	/*
	 * Creates the dataset called name with an objset of the given type.
	 *
	 * flags are handed to dsl_dataset_create_sync(); func, when not NULL,
	 * is invoked on the new objset with arg from the sync half. The work
	 * runs as a sync task using the caller's CRED(); the dsl_sync_task()
	 * result is returned.
	 */
	int
	dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
	    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx),
	    void *arg)
	{
		dmu_objset_create_arg_t doca;

		doca.doca_name = name;
		doca.doca_cred = CRED();
		doca.doca_flags = flags;
		doca.doca_userfunc = func;
		doca.doca_userarg = arg;
		doca.doca_type = type;

		return (dsl_sync_task(name,
		    dmu_objset_create_check, dmu_objset_create_sync, &doca,
		    5, ZFS_SPACE_CHECK_NORMAL));
	}

	/*
	* Argument bundle handed to dmu_objset_clone_check() and
	* dmu_objset_clone_sync() through dsl_sync_task().
	*/
	typedef struct dmu_objset_clone_arg {
	/* Name of the clone to create. */
	const char *doca_clone;
	/* Name of the dataset to clone from; the check requires a snapshot. */
	const char *doca_origin;
	/* Credential used for the limit check and the creation. */
	cred_t *doca_cred;
	} dmu_objset_clone_arg_t;

	/ARGSUSED/
	static int
	dmu_objset_clone_check(void arg, dmu_tx_t tx)
	{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_dir_t *pdd;
	const char *tail;
	int error;
	dsl_dataset_t *origin;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	if (strchr(doca->doca_clone, '@') != NULL)
	return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
	return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
	if (error != 0)
	return (error);
	if (tail == NULL) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EEXIST));
	}

	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	doca->doca_cred);
	if (error != 0) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EDQUOT));
	}
	dsl_dir_rele(pdd, FTAG);

	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
	if (error != 0)
	return (error);

	/* You can only clone snapshots, not the head datasets. */
	if (!origin->ds_is_snapshot) {
	dsl_dataset_rele(origin, FTAG);
	return (SET_ERROR(EINVAL));
	}
	dsl_dataset_rele(origin, FTAG);

	return (0);
	}

	/*
	 * Sync half of the clone sync task; arg is a dmu_objset_clone_arg_t.
	 *
	 * Creates the clone dataset from the origin under the clone's parent
	 * directory and logs a "clone" history record naming the origin and
	 * its object number. Failures of the holds taken here are fatal
	 * (VERIFY0).
	 */
	static void
	dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
	{
		dmu_objset_clone_arg_t *doca = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *pdd;
		const char *tail;
		dsl_dataset_t *origin, *ds;
		uint64_t obj;
		char namebuf[ZFS_MAX_DATASET_NAME_LEN];

		VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
		VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

		obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
		    doca->doca_cred, tx);

		VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
		dsl_dataset_name(origin, namebuf);
		spa_history_log_internal_ds(ds, "clone", tx,
		    "origin=%s (%llu)", namebuf, origin->ds_object);
		dsl_dataset_rele(ds, FTAG);
		dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(pdd, FTAG);
	}

	/*
	* Creates the dataset named by clone from the dataset named by origin,
	* running dmu_objset_clone_check() and dmu_objset_clone_sync() as a sync
	* task with the caller's CRED(). Returns the dsl_sync_task() result.
	*
	* NOTE(review): both parameters are stored into const char * fields
	* below, so their '*' declarators appear to be missing -- confirm
	* against the real prototype.
	*/
	int
	dmu_objset_clone(const char clone, const char origin)
	{
	dmu_objset_clone_arg_t doca;

	doca.doca_clone = clone;
	doca.doca_origin = origin;
	doca.doca_cred = CRED();

	return (dsl_sync_task(clone,
	dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
	5, ZFS_SPACE_CHECK_NORMAL));
	+}
	+
	+/*
	+ * Calls dmu_object_remap_indirects() with last_removed_txg on each object
	+ * that dmu_object_next() yields for os, starting from object 0.
	+ *
	+ * Returns 0 when the walk ends with ESRCH, otherwise the error that ended
	+ * it. ENOENT and EEXIST from an individual object do not stop the walk.
	+ */
	+static int
	+dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
	+{
	+ int error = 0;
	+ uint64_t object = 0;
	+ while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
	+ error = dmu_object_remap_indirects(os, object,
	+ last_removed_txg);
	+ /*
	+ * If the ZPL removed the object before we managed to dnode_hold
	+ * it, we would get an ENOENT. If the ZPL declares its intent
	+ * to remove the object (dnode_free) before we manage to
	+ * dnode_hold it, we would get an EEXIST. In either case, we
	+ * want to continue remapping the other objects in the objset;
	+ * in all other cases, we want to break early.
	+ */
	+ if (error != 0 && error != ENOENT && error != EEXIST) {
	+ break;
	+ }
	+ }
	+ /* ESRCH is the normal end of the walk, not a failure. */
	+ if (error == ESRCH) {
	+ error = 0;
	+ }
	+ return (error);
	+}
	+
	+/*
	+ * Remaps the indirect blocks of every object in the filesystem fsname and
	+ * then records the txg at which the remap started.
	+ *
	+ * Returns ENOTSUP if SPA_FEATURE_OBSOLETE_COUNTS is not enabled and EINVAL
	+ * if fsname is a snapshot. Returns 0 without doing any work when no
	+ * removal has happened, or when the recorded last-remap txg is newer than
	+ * the last removal. Otherwise returns the error from the hold, the walk
	+ * or dsl_dir_update_last_remap_txg().
	+ */
	+int
	+dmu_objset_remap_indirects(const char *fsname)
	+{
	+ int error = 0;
	+ objset_t *os = NULL;
	+ uint64_t last_removed_txg;
	+ uint64_t remap_start_txg;
	+ dsl_dir_t *dd;
	+
	+ error = dmu_objset_hold(fsname, FTAG, &os);
	+ if (error != 0) {
	+ return (error);
	+ }
	+ dd = dmu_objset_ds(os)->ds_dir;
	+
	+ if (!spa_feature_is_enabled(dmu_objset_spa(os),
	+ SPA_FEATURE_OBSOLETE_COUNTS)) {
	+ dmu_objset_rele(os, FTAG);
	+ return (SET_ERROR(ENOTSUP));
	+ }
	+
	+ if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
	+ dmu_objset_rele(os, FTAG);
	+ return (SET_ERROR(EINVAL));
	+ }
	+
	+ /*
	+ * If there has not been a removal, we're done.
	+ */
	+ last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
	+ if (last_removed_txg == -1ULL) {
	+ dmu_objset_rele(os, FTAG);
	+ return (0);
	+ }
	+
	+ /*
	+ * If we have remapped since the last removal, we're done.
	+ */
	+ if (dsl_dir_is_zapified(dd)) {
	+ uint64_t last_remap_txg;
	+ if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
	+ dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
	+ sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
	+ last_remap_txg > last_removed_txg) {
	+ dmu_objset_rele(os, FTAG);
	+ return (0);
	+ }
	+ }
	+
	+ /*
	+ * Trade the pool hold taken by dmu_objset_hold() for a long hold on
	+ * the dataset before starting the walk.
	+ */
	+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
	+
	+ remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
	+ error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
	+ if (error == 0) {
	+ /*
	+ * We update the last_remap_txg to be the start txg so that
	+ * we can guarantee that every block older than last_remap_txg
	+ * that can be remapped has been remapped.
	+ */
	+ error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
	+ }
	+
	+ /* The pool hold is already gone, so only the dataset is released. */
	+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
	+
	+ return (error);
	}

	/*
	 * Takes the single snapshot "fsname@snapname" by building a one-entry
	 * nvlist and passing it to dsl_dataset_snapshot(). Returns that call's
	 * result. The temporary name string and the nvlist are freed here.
	 */
	int
	dmu_objset_snapshot_one(const char *fsname, const char *snapname)
	{
		int err;
		char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
		nvlist_t *snaps = fnvlist_alloc();

		fnvlist_add_boolean(snaps, longsnap);
		strfree(longsnap);
		err = dsl_dataset_snapshot(snaps, NULL, NULL);
		fnvlist_free(snaps);
		return (err);
	}

	/*
	 * Drains one dirty-dnode sublist: every dnode is removed from list,
	 * added (with a new reference) to its objset's os_synced_dnodes
	 * multilist when that exists, and then synced with dnode_sync().
	 */
	static void
	dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
	{
		dnode_t *dn;

		while ((dn = multilist_sublist_head(list)) != NULL) {
			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
			ASSERT(dn->dn_dbuf->db_data_pending);
			/*
			 * Initialize dn_zio outside dnode_sync() because the
			 * meta-dnode needs to set it outside dnode_sync().
			 */
			dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
			ASSERT(dn->dn_zio);

			ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
			multilist_sublist_remove(list, dn);

			multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
			if (newlist != NULL) {
				/* The list pointer doubles as the hold tag. */
				(void) dnode_add_ref(dn, newlist);
				multilist_insert(newlist, dn);
			}

			dnode_sync(dn, tx);
		}
	}

	/*
	 * "Ready" callback for the objset root block write; arg is the
	 * objset_t. Recomputes the fill count of the block pointer being
	 * written and publishes that pointer as os->os_rootbp, taking the
	 * dataset's ds_bp_rwlock as writer when the objset has a dataset.
	 */
	/* ARGSUSED */
	static void
	dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
	{
		blkptr_t *bp = zio->io_bp;
		objset_t *os = arg;
		dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
		ASSERT0(BP_GET_LEVEL(bp));

		/*
		 * Update rootbp fill count: it should be the number of objects
		 * allocated in the object set (not counting the "special"
		 * objects that are stored in the objset_phys_t -- the meta
		 * dnode and user/group accounting objects).
		 */
		bp->blk_fill = 0;
		for (int i = 0; i < dnp->dn_nblkptr; i++)
			bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
		if (os->os_dsl_dataset != NULL) {
			rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER,
			    FTAG);
		}
		os->os_rootbp = bp;
		if (os->os_dsl_dataset != NULL)
			rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
	}

	/*
	 * "Done" callback for the objset root block write; arg is the
	 * objset_t. Unless the write was an in-place rewrite, the old block is
	 * killed and the new one is born against the dataset in the objset's
	 * syncing tx. The block pointer attached to the zio is freed here.
	 */
	/* ARGSUSED */
	static void
	dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
	{
		blkptr_t *bp = zio->io_bp;
		blkptr_t *bp_orig = &zio->io_bp_orig;
		objset_t *os = arg;

		if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
			ASSERT(BP_EQUAL(bp, bp_orig));
		} else {
			dsl_dataset_t *ds = os->os_dsl_dataset;
			dmu_tx_t *tx = os->os_synctx;

			(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
			dsl_dataset_block_born(ds, bp, tx);
		}
		kmem_free(bp, sizeof (*bp));
	}

	/*
	* Per-task argument for sync_dnodes_task(); the task frees it.
	*/
	typedef struct sync_dnodes_arg {
	/* Multilist whose sublist the task locks and drains. */
	multilist_t *sda_list;
	/* Index of that sublist within sda_list. */
	int sda_sublist_idx;
	/* NOTE(review): not read or written by any code visible here. */
	multilist_t *sda_newlist;
	/* Transaction passed on to dmu_objset_sync_dnodes(). */
	dmu_tx_t *sda_tx;
	} sync_dnodes_arg_t;

	static void
	sync_dnodes_task(void *arg)
	{
	sync_dnodes_arg_t *sda = arg;

	multilist_sublist_t *ms =
	multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);

	dmu_objset_sync_dnodes(ms, sda->sda_tx);

	multilist_sublist_unlock(ms);

	kmem_free(sda, sizeof (*sda));
	}


	/*
	 * called from dsl
	 *
	 * Writes out the objset os for the syncing transaction tx: creates the
	 * root block write as a child of pio, syncs the special dnodes, fans
	 * the dirty dnodes of this txg out to the pool's sync taskq and waits
	 * for them, issues the meta dnode's dirty-record zios, syncs the ZIL
	 * and finally issues the root block write without waiting for it.
	 */
	void
	dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
	{
		int txgoff;
		zbookmark_phys_t zb;
		zio_prop_t zp;
		zio_t *zio;
		list_t *list;
		dbuf_dirty_record_t *dr;
		/*
		 * The write gets its own copy of the root block pointer;
		 * dmu_objset_write_done() frees it with
		 * kmem_free(bp, sizeof (*bp)), so it must be a full blkptr_t.
		 */
		blkptr_t *blkptr_copy =
		    kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
		*blkptr_copy = *os->os_rootbp;

		dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

		ASSERT(dmu_tx_is_syncing(tx));
		/* XXX the write_done callback should really give us the tx... */
		os->os_synctx = tx;

		if (os->os_dsl_dataset == NULL) {
			/*
			 * This is the MOS. If we have upgraded,
			 * spa_max_replication() could change, so reset
			 * os_copies here.
			 */
			os->os_copies = spa_max_replication(os->os_spa);
		}

		/*
		 * Create the root block IO
		 */
		SET_BOOKMARK(&zb, os->os_dsl_dataset ?
		    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		arc_release(os->os_phys_buf, &os->os_phys_buf);

		dmu_write_policy(os, NULL, 0, 0, &zp);

		zio = arc_write(pio, os->os_spa, tx->tx_txg,
		    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
		    &zp, dmu_objset_write_ready, NULL, NULL,
		    dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED, &zb);

		/*
		 * Sync special dnodes - the parent IO for the sync is the root
		 * block
		 */
		DMU_META_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_META_DNODE(os), tx);

		os->os_phys->os_flags = os->os_flags;

		if (DMU_USERUSED_DNODE(os) &&
		    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
			DMU_USERUSED_DNODE(os)->dn_zio = zio;
			dnode_sync(DMU_USERUSED_DNODE(os), tx);
			DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
			dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
		}

		txgoff = tx->tx_txg & TXG_MASK;

		if (dmu_objset_userused_enabled(os)) {
			/*
			 * We must create the list here because it uses the
			 * dn_dirty_link[] of this txg. But it may already
			 * exist because we call dsl_dataset_sync() twice per
			 * txg.
			 */
			if (os->os_synced_dnodes == NULL) {
				os->os_synced_dnodes =
				    multilist_create(sizeof (dnode_t),
				    offsetof(dnode_t, dn_dirty_link[txgoff]),
				    dnode_multilist_index_func);
			} else {
				ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
				    offsetof(dnode_t, dn_dirty_link[txgoff]));
			}
		}

		/* One task per dirty sublist; wait for all of them below. */
		for (int i = 0;
		    i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]);
		    i++) {
			sync_dnodes_arg_t *sda =
			    kmem_alloc(sizeof (*sda), KM_SLEEP);
			sda->sda_list = os->os_dirty_dnodes[txgoff];
			sda->sda_sublist_idx = i;
			sda->sda_tx = tx;
			(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
			    sync_dnodes_task, sda, 0);
			/* callback frees sda */
		}
		taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);

		list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
		while ((dr = list_head(list)) != NULL) {
			ASSERT0(dr->dr_dbuf->db_level);
			list_remove(list, dr);
			if (dr->dr_zio)
				zio_nowait(dr->dr_zio);
		}

		/* Enable dnode backfill if enough objects have been freed. */
		if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
			os->os_rescan_dnodes = B_TRUE;
			os->os_freed_dnodes = 0;
		}

		/*
		 * Free intent log blocks up to this tx.
		 */
		zil_sync(os->os_zil, tx);
		os->os_phys->os_zil_header = os->os_zil_header;
		zio_nowait(zio);
	}

	/*
	 * Reports whether os has any dnode on the dirty multilist for the slot
	 * that txg maps to (txg & TXG_MASK).
	 */
	boolean_t
	dmu_objset_is_dirty(objset_t *os, uint64_t txg)
	{
		multilist_t *dirty = os->os_dirty_dnodes[txg & TXG_MASK];

		return (!multilist_is_empty(dirty));
	}

	/* Space-accounting callback per objset type; NULL when none is set. */
	static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

	/*
	 * Installs cb as the callback for objset type ost, replacing whatever
	 * was stored in that slot. ost is used as the array index unchecked.
	 */
	void
	dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
	{
		objset_used_cb_t **slot = &used_cbs[ost];

		*slot = cb;
	}

	/*
	 * User/group space accounting applies to os only when all three hold:
	 * the pool version is at least SPA_VERSION_USERSPACE, a callback is
	 * registered for the objset's type, and the objset has a userused
	 * dnode. The checks run in that order and stop at the first failure.
	 */
	boolean_t
	dmu_objset_userused_enabled(objset_t *os)
	{
		if (spa_version(os->os_spa) < SPA_VERSION_USERSPACE)
			return (0);
		if (used_cbs[os->os_phys->os_type] == NULL)
			return (0);
		return (DMU_USERUSED_DNODE(os) != NULL);
	}

	/*
	* One pending usage change for a single user or group id, kept in an
	* AVL tree until do_userquota_cacheflush() applies it.
	*/
	typedef struct userquota_node {
	/* Id this entry is for; the key compared by userquota_compare(). */
	uint64_t uqn_id;
	/* Signed sum of the deltas accumulated for uqn_id. */
	int64_t uqn_delta;
	/* AVL linkage (the offset handed to avl_create()). */
	avl_node_t uqn_node;
	} userquota_node_t;

	/*
	* Pending usage deltas gathered by one userquota_updates_task() run.
	*/
	typedef struct userquota_cache {
	/* userquota_node_t entries flushed to DMU_USERUSED_OBJECT. */
	avl_tree_t uqc_user_deltas;
	/* userquota_node_t entries flushed to DMU_GROUPUSED_OBJECT. */
	avl_tree_t uqc_group_deltas;
	} userquota_cache_t;

	/*
	 * AVL comparator for userquota_node_t: orders entries by uqn_id.
	 * Returns -1, 0 or 1 when l's id is less than, equal to or greater
	 * than r's id.
	 */
	static int
	userquota_compare(const void *l, const void *r)
	{
		const userquota_node_t *luqn = l;
		const userquota_node_t *ruqn = r;

		if (luqn->uqn_id < ruqn->uqn_id)
			return (-1);
		if (luqn->uqn_id > ruqn->uqn_id)
			return (1);
		return (0);
	}

	/*
	 * Applies every cached delta in cache to the objset's user-used and
	 * group-used ZAP objects in tx, freeing each node as it goes, and then
	 * destroys both AVL trees. Must be called in syncing context.
	 */
	static void
	do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache,
	    dmu_tx_t *tx)
	{
		void *cookie;
		userquota_node_t *uqn;

		ASSERT(dmu_tx_is_syncing(tx));

		cookie = NULL;
		while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
		    &cookie)) != NULL) {
			/*
			 * os_userused_lock protects against concurrent calls to
			 * zap_increment_int(). It's needed because
			 * zap_increment_int() is not thread-safe (i.e. not
			 * atomic).
			 */
			mutex_enter(&os->os_userused_lock);
			VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
			    uqn->uqn_id, uqn->uqn_delta, tx));
			mutex_exit(&os->os_userused_lock);
			kmem_free(uqn, sizeof (*uqn));
		}
		avl_destroy(&cache->uqc_user_deltas);

		/* Same again for the group deltas, under the same lock. */
		cookie = NULL;
		while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
		    &cookie)) != NULL) {
			mutex_enter(&os->os_userused_lock);
			VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
			    uqn->uqn_id, uqn->uqn_delta, tx));
			mutex_exit(&os->os_userused_lock);
			kmem_free(uqn, sizeof (*uqn));
		}
		avl_destroy(&cache->uqc_group_deltas);
	}

	/*
	 * Adds delta to the entry for id in the tree avl, first inserting a
	 * zero-initialized entry when the id is not present yet.
	 */
	static void
	userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
	{
		userquota_node_t key = { .uqn_id = id };
		avl_index_t where;
		userquota_node_t *node;

		node = avl_find(avl, &key, &where);
		if (node == NULL) {
			node = kmem_zalloc(sizeof (userquota_node_t), KM_SLEEP);
			node->uqn_id = id;
			avl_insert(avl, node, where);
		}
		node->uqn_delta += delta;
	}

	/*
	 * Records a usage change of DNODE_SIZE + used bytes against both user
	 * and group in cache. The change is negated when subtract is set.
	 * Nothing is recorded unless flags has DNODE_FLAG_USERUSED_ACCOUNTED.
	 */
	static void
	do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
	    uint64_t user, uint64_t group, boolean_t subtract)
	{
		int64_t delta;

		if (!(flags & DNODE_FLAG_USERUSED_ACCOUNTED))
			return;

		delta = DNODE_SIZE + used;
		if (subtract)
			delta = -delta;

		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
	}

	/*
	* Per-task argument for userquota_updates_task(); the task frees it.
	*/
	typedef struct userquota_updates_arg {
	/* Objset whose os_synced_dnodes multilist is processed. */
	objset_t *uua_os;
	/* Index of the sublist this task handles. */
	int uua_sublist_idx;
	/* Transaction used when the cached deltas are flushed. */
	dmu_tx_t *uua_tx;
	} userquota_updates_arg_t;

	static void
	userquota_updates_task(void *arg)
	{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;
	dmu_tx_t *tx = uua->uua_tx;
	dnode_t *dn;
	userquota_cache_t cache = { 0 };

	multilist_sublist_t *list =
	multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);

	ASSERT(multilist_sublist_head(list) == NULL \|\|
	dmu_objset_userused_enabled(os));
	avl_create(&cache.uqc_user_deltas, userquota_compare,
	sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	avl_create(&cache.uqc_group_deltas, userquota_compare,
	sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));

	while ((dn = multilist_sublist_head(list)) != NULL) {
	int flags;
	ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE \|\|
	dn->dn_phys->dn_flags &
	DNODE_FLAG_USERUSED_ACCOUNTED);

	flags = dn->dn_id_flags;
	ASSERT(flags);
	if (flags & DN_ID_OLD_EXIST) {
	do_userquota_update(&cache,
	dn->dn_oldused, dn->dn_oldflags,
	dn->dn_olduid, dn->dn_oldgid, B_TRUE);
	}
	if (flags & DN_ID_NEW_EXIST) {
	do_userquota_update(&cache,
	DN_USED_BYTES(dn->dn_phys),
	dn->dn_phys->dn_flags, dn->dn_newuid,
	dn->dn_newgid, B_FALSE);
	}

	mutex_enter(&dn->dn_mtx);
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
	dn->dn_olduid = dn->dn_newuid;
	dn->dn_oldgid = dn->dn_newgid;
	dn->dn_id_flags \|= DN_ID_OLD_EXIST;
	if (dn->dn_bonuslen == 0)
	dn->dn_id_flags \|= DN_ID_CHKED_SPILL;
	else
	dn->dn_id_flags \|= DN_ID_CHKED_BONUS;
	}
	dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
	mutex_exit(&dn->dn_mtx);

	multilist_sublist_remove(list, dn);
	dnode_rele(dn, os->os_synced_dnodes);
	}
	do_userquota_cacheflush(os, &cache, tx);
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
	}

	/*
	 * Dispatch one userquota_updates_task() per sublist of
	 * os->os_synced_dnodes onto the pool's dp_sync_taskq.
	 *
	 * Does nothing when user accounting is not enabled for 'os'.  If the
	 * user-used dnode is still DMU_OT_NONE, the user/group used ZAP objects
	 * are created first.  The tasks are only dispatched here; the caller is
	 * expected to taskq_wait() for them.
	 */
	void
	dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
	{
		if (!dmu_objset_userused_enabled(os))
			return;

		/* Allocate the user/groupused objects if necessary. */
		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
			VERIFY0(zap_create_claim(os,
			    DMU_USERUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
			VERIFY0(zap_create_claim(os,
			    DMU_GROUPUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		}

		for (int i = 0;
		    i < multilist_get_num_sublists(os->os_synced_dnodes); i++) {
			userquota_updates_arg_t *uua =
			    kmem_alloc(sizeof (*uua), KM_SLEEP);
			uua->uua_os = os;
			uua->uua_sublist_idx = i;
			uua->uua_tx = tx;
			/* note: caller does taskq_wait() */
			(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
			    userquota_updates_task, uua, 0);
			/* callback frees uua */
		}
	}

	/*
	 * Returns a pointer to the data from which the uid/gid can be found.
	 *
	 * If the dbuf is not dirty, its current data is returned.  Otherwise
	 * the dirty record for the syncing transaction group (tx->tx_txg) is
	 * looked up; if none can be found then NULL is returned.  In the NULL
	 * case it is assumed the uid/gid aren't changing.
	 */
	static void *
	dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
	{
		dbuf_dirty_record_t *dr, **drp;
		void *data;

		if (db->db_dirtycnt == 0)
			return (db->db.db_data);  /* Nothing is changing */

		/* Walk the dirty-record chain looking for this txg's record. */
		for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
			if (dr->dr_txg == tx->tx_txg)
				break;

		if (dr == NULL) {
			data = NULL;
		} else {
			dnode_t *dn;

			DB_DNODE_ENTER(dr->dr_dbuf);
			dn = DB_DNODE(dr->dr_dbuf);

			/* Spill-block records hold their data one level down. */
			if (dn->dn_bonuslen == 0 &&
			    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
				data = dr->dt.dl.dr_data->b_data;
			else
				data = dr->dt.dl.dr_data;

			DB_DNODE_EXIT(dr->dr_dbuf);
		}

		return (data);
	}

	/*
	 * Determine the uid/gid that own 'dn' and record them in the dnode.
	 *
	 * With 'before' set, the ids go to dn_olduid/dn_oldgid and
	 * DN_ID_OLD_EXIST is set on success; otherwise they go to
	 * dn_newuid/dn_newgid and DN_ID_NEW_EXIST is set on success.  The ids
	 * are extracted by the per-objset-type callback in used_cbs[] from
	 * either the bonus buffer or, for DMU_OT_SA dnodes without a bonus
	 * buffer, the spill block.
	 *
	 * Does nothing when user accounting is disabled for the objset, or when
	 * 'before' is set and the ids have already been checked.
	 */
	void
	dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
	{
		objset_t *os = dn->dn_objset;
		void *data = NULL;
		dmu_buf_impl_t *db = NULL;
		uint64_t *user = NULL;
		uint64_t *group = NULL;
		int flags = dn->dn_id_flags;
		int error;
		boolean_t have_spill = B_FALSE;

		if (!dmu_objset_userused_enabled(dn->dn_objset))
			return;

		if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
		    DN_ID_CHKED_SPILL)))
			return;

		if (before && dn->dn_bonuslen != 0)
			data = DN_BONUS(dn->dn_phys);
		else if (!before && dn->dn_bonuslen != 0) {
			if (dn->dn_bonus) {
				db = dn->dn_bonus;
				mutex_enter(&db->db_mtx);
				data = dmu_objset_userquota_find_data(db, tx);
			} else {
				data = DN_BONUS(dn->dn_phys);
			}
		} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
			int rf = 0;

			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
				rf |= DB_RF_HAVESTRUCT;
			error = dmu_spill_hold_by_dnode(dn,
			    rf | DB_RF_MUST_SUCCEED,
			    FTAG, (dmu_buf_t **)&db);
			ASSERT(error == 0);
			mutex_enter(&db->db_mtx);
			data = (before) ? db->db.db_data :
			    dmu_objset_userquota_find_data(db, tx);
			have_spill = B_TRUE;
		} else {
			/* Neither a bonus buffer nor an SA spill block to look at. */
			mutex_enter(&dn->dn_mtx);
			dn->dn_id_flags |= DN_ID_CHKED_BONUS;
			mutex_exit(&dn->dn_mtx);
			return;
		}

		if (before) {
			ASSERT(data);
			user = &dn->dn_olduid;
			group = &dn->dn_oldgid;
		} else if (data) {
			user = &dn->dn_newuid;
			group = &dn->dn_newgid;
		}

		/*
		 * Must always call the callback in case the object
		 * type has changed and that type isn't an object type to track
		 */
		error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
		    user, group);

		/*
		 * Preserve existing uid/gid when the callback can't determine
		 * what the new uid/gid are and the callback returned EEXIST.
		 * The EEXIST error tells us to just use the existing uid/gid.
		 * If we don't know what the old values are then just assign
		 * them to 0, since that is a new file being created.
		 */
		if (!before && data == NULL && error == EEXIST) {
			if (flags & DN_ID_OLD_EXIST) {
				dn->dn_newuid = dn->dn_olduid;
				dn->dn_newgid = dn->dn_oldgid;
			} else {
				dn->dn_newuid = 0;
				dn->dn_newgid = 0;
			}
			error = 0;
		}

		if (db)
			mutex_exit(&db->db_mtx);

		mutex_enter(&dn->dn_mtx);
		if (error == 0 && before)
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
		if (error == 0 && !before)
			dn->dn_id_flags |= DN_ID_NEW_EXIST;

		if (have_spill) {
			dn->dn_id_flags |= DN_ID_CHKED_SPILL;
		} else {
			dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		mutex_exit(&dn->dn_mtx);
		/* Only the spill path took a hold on db. */
		if (have_spill)
			dmu_buf_rele((dmu_buf_t *)db, FTAG);
	}

	/*
	 * Report whether OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in the
	 * objset's on-disk flags (os_phys->os_flags).  The masked flag value
	 * itself is returned, so treat the result as zero / non-zero.
	 */
	boolean_t
	dmu_objset_userspace_present(objset_t *os)
	{
	return (os->os_phys->os_flags &
	OBJSET_FLAG_USERACCOUNTING_COMPLETE);
	}

	/*
	 * Upgrade an objset so that user/group space accounting is complete.
	 *
	 * Returns 0 immediately if accounting is already present, ENOTSUP if
	 * user accounting is not enabled for the objset, EINVAL for snapshots,
	 * and EINTR if a signal is pending.  Per-object failures to hold the
	 * bonus buffer or assign a transaction are skipped, not reported.
	 */
	int
	dmu_objset_userspace_upgrade(objset_t *os)
	{
		uint64_t obj;
		int err = 0;

		if (dmu_objset_userspace_present(os))
			return (0);
		if (!dmu_objset_userused_enabled(os))
			return (SET_ERROR(ENOTSUP));
		if (dmu_objset_is_snapshot(os))
			return (SET_ERROR(EINVAL));

		/*
		 * We simply need to mark every object dirty, so that it will be
		 * synced out and now accounted.  If this is called
		 * concurrently, or if we already did some work before crashing,
		 * that's fine, since we track each object's accounted state
		 * independently.
		 */

		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
			dmu_tx_t *tx;
			dmu_buf_t *db;
			int objerr;

			if (issig(JUSTLOOKING) && issig(FORREAL))
				return (SET_ERROR(EINTR));

			objerr = dmu_bonus_hold(os, obj, FTAG, &db);
			if (objerr != 0)
				continue;
			tx = dmu_tx_create(os);
			dmu_tx_hold_bonus(tx, obj);
			objerr = dmu_tx_assign(tx, TXG_WAIT);
			if (objerr != 0) {
				/* Drop the bonus hold taken above before skipping. */
				dmu_buf_rele(db, FTAG);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_buf_will_dirty(db, tx);
			dmu_buf_rele(db, FTAG);
			dmu_tx_commit(tx);
		}

		os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		txg_wait_synced(dmu_objset_pool(os), 0);
		return (0);
	}

	/*
	 * Fetch space and object counts for the objset by forwarding all four
	 * output pointers to dsl_dataset_space() on os->os_dsl_dataset.
	 */
	void
	dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
	    uint64_t *usedobjsp, uint64_t *availobjsp)
	{
		dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
		    usedobjsp, availobjsp);
	}

	/*
	 * Return the fsid guid of the dataset backing this objset, as reported
	 * by dsl_dataset_fsid_guid().
	 */
	uint64_t
	dmu_objset_fsid_guid(objset_t *os)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;

		return (dsl_dataset_fsid_guid(ds));
	}

	/*
	 * Fill in 'stat' with the objset type and, when the objset has a
	 * backing dataset, whatever dsl_dataset_fast_stat() provides.
	 */
	void
	dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
	{
		stat->dds_type = os->os_phys->os_type;
		if (os->os_dsl_dataset)
			dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
	}

	/*
	 * Add the objset's statistics to the nvlist 'nv': the dataset stats
	 * (when there is a backing dataset), the objset type, and whether user
	 * accounting is present.
	 */
	void
	dmu_objset_stats(objset_t *os, nvlist_t *nv)
	{
		/* Only the meta objset is expected to lack a dataset. */
		ASSERT(os->os_dsl_dataset ||
		    os->os_phys->os_type == DMU_OST_META);

		if (os->os_dsl_dataset != NULL)
			dsl_dataset_stats(os->os_dsl_dataset, nv);

		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
		    os->os_phys->os_type);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
		    dmu_objset_userspace_present(os));
	}

	/*
	 * Return the ds_is_snapshot value of the objset's dataset, or B_FALSE
	 * when the objset has no dataset.
	 */
	int
	dmu_objset_is_snapshot(objset_t *os)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;

		if (ds == NULL)
			return (B_FALSE);
		return (ds->ds_is_snapshot);
	}

	/*
	 * Look up snapshot 'name' in the dataset's snapshot-names ZAP using
	 * normalized matching (MT_NORMALIZE).  'real', 'maxlen' and 'conflict'
	 * are passed straight through to zap_lookup_norm().
	 *
	 * Returns ENOENT if the dataset has no snapshot-names ZAP, otherwise
	 * the result of zap_lookup_norm().
	 */
	int
	dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
	    boolean_t *conflict)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;
		uint64_t ignored;

		if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
			return (SET_ERROR(ENOENT));

		return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
		    MT_NORMALIZE, real, maxlen, conflict));
	}

	/*
	 * Return the snapshot at serialized ZAP cursor position *offp and
	 * advance *offp to the following entry.
	 *
	 * On success the snapshot name is copied into 'name' (capacity
	 * 'namelen'), and, when the pointers are non-NULL, *idp receives the
	 * entry's first integer and *case_conflict its normalization-conflict
	 * flag.  The pool config lock must be held.
	 *
	 * Returns ENOENT when the dataset has no snapshot-names ZAP or the
	 * cursor has no entry, and ENAMETOOLONG when the name plus terminator
	 * does not fit in 'namelen'.
	 */
	int
	dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;
		zap_cursor_t cursor;
		zap_attribute_t attr;

		ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

		if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
			return (SET_ERROR(ENOENT));

		zap_cursor_init_serialized(&cursor,
		    ds->ds_dir->dd_pool->dp_meta_objset,
		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

		if (zap_cursor_retrieve(&cursor, &attr) != 0) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENOENT));
		}

		if (strlen(attr.za_name) + 1 > namelen) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENAMETOOLONG));
		}

		/* Length was checked above, so the copy cannot overflow 'name'. */
		(void) strcpy(name, attr.za_name);
		if (idp)
			*idp = attr.za_first_integer;
		if (case_conflict)
			*case_conflict = attr.za_normalization_conflict;
		zap_cursor_advance(&cursor);
		*offp = zap_cursor_serialize(&cursor);
		zap_cursor_fini(&cursor);

		return (0);
	}

	/*
	 * Return the child directory at serialized ZAP cursor position *offp
	 * and advance *offp to the following entry.
	 *
	 * On success the child's name is copied into 'name' (capacity
	 * 'namelen') and, if 'idp' is non-NULL, *idp receives the entry's first
	 * integer.
	 *
	 * Returns ENOENT when 'os' is not the head dataset of its directory or
	 * the cursor has no entry, and ENAMETOOLONG when the name plus
	 * terminator does not fit in 'namelen'.
	 */
	int
	dmu_dir_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *idp, uint64_t *offp)
	{
		dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
		zap_cursor_t cursor;
		zap_attribute_t attr;

		/* there is no next dir on a snapshot! */
		if (os->os_dsl_dataset->ds_object !=
		    dsl_dir_phys(dd)->dd_head_dataset_obj)
			return (SET_ERROR(ENOENT));

		zap_cursor_init_serialized(&cursor,
		    dd->dd_pool->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

		if (zap_cursor_retrieve(&cursor, &attr) != 0) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENOENT));
		}

		if (strlen(attr.za_name) + 1 > namelen) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENAMETOOLONG));
		}

		/* Length was checked above, so the copy cannot overflow 'name'. */
		(void) strcpy(name, attr.za_name);
		if (idp)
			*idp = attr.za_first_integer;
		zap_cursor_advance(&cursor);
		*offp = zap_cursor_serialize(&cursor);
		zap_cursor_fini(&cursor);

		return (0);
	}

	typedef struct dmu_objset_find_ctx {
	taskq_t *dc_tq;
	dsl_pool_t *dc_dp;
	uint64_t dc_ddobj;
	char dc_ddname; / last component of ddobj's name */
	int (dc_func)(dsl_pool_t , dsl_dataset_t , void );
	void *dc_arg;
	int dc_flags;
	kmutex_t *dc_error_lock;
	int *dc_error;
	} dmu_objset_find_ctx_t;

	/*
	 * Process one directory for dmu_objset_find_dp(): optionally recurse
	 * into (or dispatch tasks for) its child directories, optionally call
	 * dc_func on each of its snapshots, then call dc_func on the head
	 * dataset itself.
	 *
	 * The first non-zero error is recorded in *dc_error under
	 * dc_error_lock.  This function always consumes 'dcp': it frees
	 * dc_ddname (when set) and the context itself before returning.
	 */
	static void
	dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
	{
		dsl_pool_t *dp = dcp->dc_dp;
		dsl_dir_t *dd;
		dsl_dataset_t *ds;
		zap_cursor_t zc;
		zap_attribute_t *attr;
		uint64_t thisobj;
		int err = 0;

		/* don't process if there already was an error */
		if (*dcp->dc_error != 0)
			goto out;

		/*
		 * Note: passing the name (dc_ddname) here is optional, but it
		 * improves performance because we don't need to call
		 * zap_value_search() to determine the name.
		 */
		err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
		if (err != 0)
			goto out;

		/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
		if (dd->dd_myname[0] == '$') {
			dsl_dir_rele(dd, FTAG);
			goto out;
		}

		thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
		attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

		/*
		 * Iterate over all children.
		 */
		if (dcp->dc_flags & DS_FIND_CHILDREN) {
			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_child_dir_zapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				/*
				 * Each child gets its own copy of the context
				 * (it is freed by whoever processes it), with
				 * the object number and name overridden.
				 */
				dmu_objset_find_ctx_t *child_dcp =
				    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
				*child_dcp = *dcp;
				child_dcp->dc_ddobj = attr->za_first_integer;
				child_dcp->dc_ddname = spa_strdup(attr->za_name);
				if (dcp->dc_tq != NULL)
					(void) taskq_dispatch(dcp->dc_tq,
					    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
				else
					dmu_objset_find_dp_impl(child_dcp);
			}
			zap_cursor_fini(&zc);
		}

		/*
		 * Iterate over all snapshots.
		 */
		if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
			dsl_dataset_t *ds;
			err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

			if (err == 0) {
				uint64_t snapobj;

				snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
				dsl_dataset_rele(ds, FTAG);

				for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
				    zap_cursor_retrieve(&zc, attr) == 0;
				    (void) zap_cursor_advance(&zc)) {
					ASSERT3U(attr->za_integer_length, ==,
					    sizeof (uint64_t));
					ASSERT3U(attr->za_num_integers, ==, 1);

					err = dsl_dataset_hold_obj(dp,
					    attr->za_first_integer, FTAG, &ds);
					if (err != 0)
						break;
					err = dcp->dc_func(dp, ds, dcp->dc_arg);
					dsl_dataset_rele(ds, FTAG);
					if (err != 0)
						break;
				}
				zap_cursor_fini(&zc);
			}
		}

		kmem_free(attr, sizeof (zap_attribute_t));

		if (err != 0) {
			dsl_dir_rele(dd, FTAG);
			goto out;
		}

		/*
		 * Apply to self.
		 */
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		/*
		 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
		 * that the dir will remain cached, and we won't have to re-instantiate
		 * it (which could be expensive due to finding its name via
		 * zap_value_search()).
		 */
		dsl_dir_rele(dd, FTAG);
		if (err != 0)
			goto out;
		err = dcp->dc_func(dp, ds, dcp->dc_arg);
		dsl_dataset_rele(ds, FTAG);

	out:
		if (err != 0) {
			mutex_enter(dcp->dc_error_lock);
			/* only keep first error */
			if (*dcp->dc_error == 0)
				*dcp->dc_error = err;
			mutex_exit(dcp->dc_error_lock);
		}

		if (dcp->dc_ddname != NULL)
			spa_strfree(dcp->dc_ddname);
		kmem_free(dcp, sizeof (*dcp));
	}

	/*
	 * Taskq entry point for dmu_objset_find_dp(): run
	 * dmu_objset_find_dp_impl() on 'arg' with the pool config lock held.
	 */
	static void
	dmu_objset_find_dp_cb(void *arg)
	{
		dmu_objset_find_ctx_t *ctx = arg;
		/* Saved up front: the impl frees ctx before we unlock. */
		dsl_pool_t *pool = ctx->dc_dp;

		/*
		 * The pool config lock is required because several
		 * assert(pool_config_held) checks sit further down the stack.
		 * Taking it through dsl_pool_config_enter is risky: we could
		 * stall behind a pending writer, and that writer can only be
		 * granted the lock once our parent thread drops it, which
		 * would deadlock.  The _prio interface lets us go ahead of a
		 * pending writer.
		 */
		dsl_pool_config_enter_prio(pool, FTAG);
		dmu_objset_find_dp_impl(ctx);
		dsl_pool_config_exit(pool, FTAG);
	}

	/*
	* Find objsets under and including ddobj, call func(ds) on each.
	* The order for the enumeration is completely undefined.
	* func is called with dsl_pool_config held.
	*/
	int
	dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
	int func(dsl_pool_t , dsl_dataset_t , void ), void arg, int flags)
	{
	int error = 0;
	taskq_t *tq = NULL;
	int ntasks;
	dmu_objset_find_ctx_t *dcp;
	kmutex_t err_lock;

	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
	dcp->dc_tq = NULL;
	dcp->dc_dp = dp;
	dcp->dc_ddobj = ddobj;
	dcp->dc_ddname = NULL;
	dcp->dc_func = func;
	dcp->dc_arg = arg;
	dcp->dc_flags = flags;
	dcp->dc_error_lock = &err_lock;
	dcp->dc_error = &error;

	if ((flags & DS_FIND_SERIALIZE) \|\| dsl_pool_config_held_writer(dp)) {
	/*
	* In case a write lock is held we can't make use of
	* parallelism, as down the stack of the worker threads
	* the lock is asserted via dsl_pool_config_held.
	* In case of a read lock this is solved by getting a read
	* lock in each worker thread, which isn't possible in case
	* of a writer lock. So we fall back to the synchronous path
	* here.
	* In the future it might be possible to get some magic into
	* dsl_pool_config_held in a way that it returns true for
	* the worker threads so that a single lock held from this
	* thread suffices. For now, stay single threaded.
	*/
	dmu_objset_find_dp_impl(dcp);
	mutex_destroy(&err_lock);

	return (error);
	}

	ntasks = dmu_find_threads;
	if (ntasks == 0)
	ntasks = vdev_count_leaves(dp->dp_spa) * 4;
	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
	INT_MAX, 0);
	if (tq == NULL) {
	kmem_free(dcp, sizeof (*dcp));
	mutex_destroy(&err_lock);

	return (SET_ERROR(ENOMEM));
	}
	dcp->dc_tq = tq;

	/* dcp will be freed by task */
	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);

	/*
	* PORTING: this code relies on the property of taskq_wait to wait
	* until no more tasks are queued and no more tasks are active. As
	* we always queue new tasks from within other tasks, task_wait
	* reliably waits for the full recursion to finish, even though we
	* enqueue new tasks after taskq_wait has been called.
	* On platforms other than illumos, taskq_wait may not have this
	* property.
	*/
	taskq_wait(tq);
	taskq_destroy(tq);
	mutex_destroy(&err_lock);

	return (error);
	}

	/*
	 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
	 * The dp_config_rwlock must not be held when this is called, and it
	 * will not be held when the callback is called.
	 * Therefore this function should only be used when the pool is not changing
	 * (e.g. in syncing context), or the callback can deal with the possible races.
	 *
	 * Children (DS_FIND_CHILDREN) are visited first, then snapshots
	 * (DS_FIND_SNAPSHOTS), then 'name' itself.  The walk stops at the first
	 * non-zero value returned by 'func' or by a lookup, and that value is
	 * returned.
	 */
	static int
	dmu_objset_find_impl(spa_t *spa, const char *name,
	    int func(const char *, void *), void *arg, int flags)
	{
		dsl_dir_t *dd;
		dsl_pool_t *dp = spa_get_dsl(spa);
		dsl_dataset_t *ds;
		zap_cursor_t zc;
		zap_attribute_t *attr;
		char *child;
		uint64_t thisobj;
		int err;

		dsl_pool_config_enter(dp, FTAG);

		err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
		if (err != 0) {
			dsl_pool_config_exit(dp, FTAG);
			return (err);
		}

		/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
		if (dd->dd_myname[0] == '$') {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			return (0);
		}

		thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
		attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

		/*
		 * Iterate over all children.
		 */
		if (flags & DS_FIND_CHILDREN) {
			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_child_dir_zapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				child = kmem_asprintf("%s/%s", name, attr->za_name);
				/* The recursive call takes the config lock itself. */
				dsl_pool_config_exit(dp, FTAG);
				err = dmu_objset_find_impl(spa, child,
				    func, arg, flags);
				dsl_pool_config_enter(dp, FTAG);
				strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);

			if (err != 0) {
				dsl_dir_rele(dd, FTAG);
				dsl_pool_config_exit(dp, FTAG);
				kmem_free(attr, sizeof (zap_attribute_t));
				return (err);
			}
		}

		/*
		 * Iterate over all snapshots.
		 */
		if (flags & DS_FIND_SNAPSHOTS) {
			err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

			if (err == 0) {
				uint64_t snapobj;

				snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
				dsl_dataset_rele(ds, FTAG);

				for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
				    zap_cursor_retrieve(&zc, attr) == 0;
				    (void) zap_cursor_advance(&zc)) {
					ASSERT3U(attr->za_integer_length, ==,
					    sizeof (uint64_t));
					ASSERT3U(attr->za_num_integers, ==, 1);

					child = kmem_asprintf("%s@%s",
					    name, attr->za_name);
					/* The callback runs without the config lock. */
					dsl_pool_config_exit(dp, FTAG);
					err = func(child, arg);
					dsl_pool_config_enter(dp, FTAG);
					strfree(child);
					if (err != 0)
						break;
				}
				zap_cursor_fini(&zc);
			}
		}

		dsl_dir_rele(dd, FTAG);
		kmem_free(attr, sizeof (zap_attribute_t));
		dsl_pool_config_exit(dp, FTAG);

		if (err != 0)
			return (err);

		/* Apply to self. */
		return (func(name, arg));
	}

	/*
	 * Open the pool that 'name' belongs to and run dmu_objset_find_impl()
	 * on it; see the comment above dmu_objset_find_impl() for the locking
	 * rules and callback contract.
	 *
	 * Returns the spa_open() error if the pool cannot be opened, otherwise
	 * the result of the walk.
	 */
	int
	dmu_objset_find(char *name, int func(const char *, void *), void *arg,
	    int flags)
	{
		spa_t *spa;
		int error;

		error = spa_open(name, &spa, FTAG);
		if (error != 0)
			return (error);
		error = dmu_objset_find_impl(spa, name, func, arg, flags);
		spa_close(spa, FTAG);
		return (error);
	}

	/*
	 * Store the consumer's private pointer in the objset.  The caller must
	 * hold os->os_user_ptr_lock.
	 */
	void
	dmu_objset_set_user(objset_t *os, void *user_ptr)
	{
		ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
		os->os_user_ptr = user_ptr;
	}

	/*
	 * Fetch the private pointer previously stored in the objset.  The
	 * caller must hold os->os_user_ptr_lock.
	 */
	void *
	dmu_objset_get_user(objset_t *os)
	{
		void *user_ptr;

		ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
		user_ptr = os->os_user_ptr;
		return (user_ptr);
	}

	/*
	 * Determine name of filesystem, given name of snapshot.
	 * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
	 *
	 * Copies everything before the first '@' of 'snapname' into 'buf'.
	 * Returns EINVAL if 'snapname' contains no '@', ENAMETOOLONG if the
	 * part before it is ZFS_MAX_DATASET_NAME_LEN bytes or longer, else 0.
	 */
	int
	dmu_fsname(const char *snapname, char *buf)
	{
		const char *atp = strchr(snapname, '@');

		if (atp == NULL)
			return (SET_ERROR(EINVAL));
		if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		/* +1 so strlcpy copies the whole prefix and then terminates it. */
		(void) strlcpy(buf, snapname, atp - snapname + 1);
		return (0);
	}

	/*
	 * Call when we think we're going to write/free space in open context to track
	 * the amount of dirty data in the open txg, which is also the amount
	 * of memory that can not be evicted until this txg syncs.
	 *
	 * Nothing is recorded for an objset without a dataset.  The dsl_dir is
	 * charged the worst-case allocated size; the pool's dirty-space count
	 * is charged the raw 'space'.
	 */
	void
	dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;
		int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);

		if (ds != NULL) {
			dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
			dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
		}
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 332525)
	@@ -1,1320 +1,1337 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/dmu.h>
	#include <sys/dmu_impl.h>
	#include <sys/dbuf.h>
	#include <sys/dmu_tx.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_pool.h>
	#include <sys/zap_impl.h>
	#include <sys/spa.h>
	#include <sys/sa.h>
	#include <sys/sa_impl.h>
	#include <sys/zfs_context.h>
	#include <sys/varargs.h>

	/*
	 * Signature of a per-hold worker: given the transaction, the held
	 * dnode and the two hold arguments.
	 */
	typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
	    uint64_t arg1, uint64_t arg2);


	/*
	 * Allocate a zeroed transaction bound to dsl_dir 'dd' (which may be
	 * NULL), with empty hold and callback lists and tx_start set to the
	 * current high-resolution time.  tx_pool is only set when 'dd' is
	 * non-NULL.
	 */
	dmu_tx_t *
	dmu_tx_create_dd(dsl_dir_t *dd)
	{
		dmu_tx_t *newtx;

		newtx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);

		newtx->tx_dir = dd;
		if (dd != NULL) {
			newtx->tx_pool = dd->dd_pool;
		}

		list_create(&newtx->tx_holds, sizeof (dmu_tx_hold_t),
		    offsetof(dmu_tx_hold_t, txh_node));
		list_create(&newtx->tx_callbacks, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		newtx->tx_start = gethrtime();

		return (newtx);
	}

	/*
	 * Create a transaction for objset 'os': built on the dsl_dir of the
	 * objset's dataset, with tx_objset pointing back at 'os'.
	 */
	dmu_tx_t *
	dmu_tx_create(objset_t *os)
	{
		dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
		dmu_tx_t *newtx = dmu_tx_create_dd(dd);

		newtx->tx_objset = os;
		return (newtx);
	}

	/*
	 * Create a transaction that is already assigned to transaction group
	 * 'txg' of pool 'dp'.  It has no dsl_dir and is marked tx_anyobj.
	 */
	dmu_tx_t *
	dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
	{
		dmu_tx_t *assigned = dmu_tx_create_dd(NULL);

		txg_verify(dp->dp_spa, txg);

		assigned->tx_pool = dp;
		assigned->tx_txg = txg;
		assigned->tx_anyobj = TRUE;
		return (assigned);
	}

	/*
	 * Returns tx->tx_anyobj, which dmu_tx_create_assigned() sets to TRUE.
	 * Callers use this as the "is this a syncing-context tx" test.
	 */
	int
	dmu_tx_is_syncing(dmu_tx_t *tx)
	{
	return (tx->tx_anyobj);
	}

	/*
	 * Returns tx->tx_anyobj; currently the same test as
	 * dmu_tx_is_syncing().
	 */
	int
	dmu_tx_private_ok(dmu_tx_t *tx)
	{
	return (tx->tx_anyobj);
	}

	/*
	 * Allocate a hold of the given type on dnode 'dn' (which may be NULL)
	 * and append it to tx->tx_holds.
	 *
	 * When 'dn' is non-NULL a reference is added to dn_holds; if the tx is
	 * already assigned to a txg, the dnode is also tagged with that txg and
	 * a dn_tx_holds reference is taken.  Always returns the new hold.
	 */
	static dmu_tx_hold_t *
	dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
	    uint64_t arg1, uint64_t arg2)
	{
		dmu_tx_hold_t *txh;

		if (dn != NULL) {
			(void) refcount_add(&dn->dn_holds, tx);
			if (tx->tx_txg != 0) {
				mutex_enter(&dn->dn_mtx);
				/*
				 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
				 * problem, but there's no way for it to happen (for
				 * now, at least).
				 */
				ASSERT(dn->dn_assigned_txg == 0);
				dn->dn_assigned_txg = tx->tx_txg;
				(void) refcount_add(&dn->dn_tx_holds, tx);
				mutex_exit(&dn->dn_mtx);
			}
		}

		txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
		txh->txh_tx = tx;
		txh->txh_dnode = dn;
		refcount_create(&txh->txh_space_towrite);
		refcount_create(&txh->txh_memory_tohold);
		txh->txh_type = type;
		txh->txh_arg1 = arg1;
		txh->txh_arg2 = arg2;
		list_insert_tail(&tx->tx_holds, txh);

		return (txh);
	}

	/*
	 * Create a hold on 'object' of objset 'os' and add it to 'tx'.
	 *
	 * For DMU_NEW_OBJECT no dnode is looked up and the hold's dnode is
	 * NULL.  If dnode_hold() fails, the error is stored in tx->tx_err and
	 * NULL is returned.
	 */
	static dmu_tx_hold_t *
	dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
	    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
	{
		dnode_t *dn = NULL;
		dmu_tx_hold_t *txh;
		int err;

		if (object != DMU_NEW_OBJECT) {
			err = dnode_hold(os, object, FTAG, &dn);
			if (err != 0) {
				tx->tx_err = err;
				return (NULL);
			}
		}
		txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
		/* The hold took its own reference; drop the lookup's. */
		if (dn != NULL)
			dnode_rele(dn, FTAG);
		return (txh);
	}

	/*
	 * Add a THT_NEWOBJECT hold on 'dn' to 'tx', unless the tx is a syncing
	 * one.
	 */
	void
	dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
	{
		/*
		 * If we're syncing, they can manipulate any object anyhow, and
		 * the hold on the dnode_t can cause problems.
		 */
		if (!dmu_tx_is_syncing(tx))
			(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
	}

	/*
	 * This function reads specified data from disk.  The specified data will
	 * be needed to perform the transaction -- i.e, it will be read after
	 * we do dmu_tx_assign().  There are two reasons that we read the data now
	 * (before dmu_tx_assign()):
	 *
	 * 1. Reading it now has potentially better performance.  The transaction
	 * has not yet been assigned, so the TXG is not held open, and also the
	 * caller typically has less locks held when calling dmu_tx_hold_*() than
	 * after the transaction has been assigned.  This reduces the lock (and txg)
	 * hold times, thus reducing lock contention.
	 *
	 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
	 * that are detected before they start making changes to the DMU state
	 * (i.e. now).  Once the transaction has been assigned, and some DMU
	 * state has been changed, it can be difficult to recover from an i/o
	 * error (e.g. to undo the changes already made in memory at the DMU
	 * layer).  Typically code to do so does not exist in the caller -- it
	 * assumes that the data has already been cached and thus i/o errors are
	 * not possible.
	 *
	 * It has been observed that the i/o initiated here can be a performance
	 * problem, and it appears to be optional, because we don't look at the
	 * data which is read.  However, removing this read would only serve to
	 * move the work elsewhere (after the dmu_tx_assign()), where it may
	 * have a greater impact on performance (in addition to the impact on
	 * fault tolerance noted above).
	 *
	 * Returns EIO if the dbuf cannot be held, otherwise the result of
	 * dbuf_read().
	 */
	static int
	dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
	{
		int err;
		dmu_buf_impl_t *db;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		db = dbuf_hold_level(dn, level, blkid, FTAG);
		rw_exit(&dn->dn_struct_rwlock);
		if (db == NULL)
			return (SET_ERROR(EIO));
		err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
		dbuf_rele(db, FTAG);
		return (err);
	}

	/*
	 * Account for a write of 'len' bytes at offset 'off' against hold
	 * 'txh', and pre-read the blocks the write will need so that i/o
	 * errors surface now.  Any i/o error is stored in txh->txh_tx->tx_err.
	 * A zero-length write is a no-op; with no dnode only the space is
	 * counted.
	 */
	/* ARGSUSED */
	static void
	dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
	{
		dnode_t *dn = txh->txh_dnode;
		int err = 0;

		if (len == 0)
			return;

		(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

		/*
		 * NOTE(review): this EFBIG value is never stored in tx_err; it
		 * is either dropped by the early return below or overwritten.
		 * Left as-is to preserve behavior -- confirm intent.
		 */
		if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
			err = SET_ERROR(EFBIG);

		if (dn == NULL)
			return;

		/*
		 * For i/o error checking, read the blocks that will be needed
		 * to perform the write: the first and last level-0 blocks (if
		 * they are not aligned, i.e. if they are partial-block writes),
		 * and all the level-1 blocks.
		 */
		if (dn->dn_maxblkid == 0) {
			if (off < dn->dn_datablksz &&
			    (off > 0 || len < dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			uint64_t start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}

			/* last level-0 block */
			uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
			if (end != start && end <= dn->dn_maxblkid &&
			    P2PHASE(off + len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (uint64_t i = (start >> shft) + 1;
				    i < end >> shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err != 0) {
						txh->txh_tx->tx_err = err;
					}
				}
			}

			err = zio_wait(zio);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	}

	/*
	 * Charge DNODE_SIZE bytes to the hold's space-to-write count.
	 */
	static void
	dmu_tx_count_dnode(dmu_tx_hold_t *txh)
	{
	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
	}

	/*
	 * Declare that the (not yet assigned) transaction will write 'len'
	 * bytes at offset 'off' of 'object' in the tx's objset.  If the object
	 * cannot be held, the hold helper records the error in the tx and
	 * nothing is counted.
	 */
	void
	dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
	{
		dmu_tx_hold_t *txh;

		ASSERT0(tx->tx_txg);
		ASSERT3U(len, <=, DMU_MAX_ACCESS);
		/* off + len - 1 must not wrap past UINT64_MAX. */
		ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

		txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
		    object, THT_WRITE, off, len);
		if (txh != NULL) {
			dmu_tx_count_write(txh, off, len);
			dmu_tx_count_dnode(txh);
		}
	}

	/*
	 * Add a THT_WRITE hold on 'object' to a not-yet-assigned tx and charge
	 * it for one indirect block (1 << dn_indblkshift bytes) plus the dnode.
	 * Returns without counting anything if the object cannot be held.
	 *
	 * NOTE(review): dn is dereferenced without a NULL check, and
	 * dmu_tx_hold_object_impl() leaves the hold's dnode NULL for
	 * DMU_NEW_OBJECT -- confirm callers never pass DMU_NEW_OBJECT.
	 */
	void
	+dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
	+{
	+ dmu_tx_hold_t *txh;
	+
	+ ASSERT(tx->tx_txg == 0); /* tx must not be assigned yet */
	+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	+ object, THT_WRITE, 0, 0);
	+ if (txh == NULL)
	+ return;
	+
	+ dnode_t *dn = txh->txh_dnode;
	+ (void) refcount_add_many(&txh->txh_space_towrite,
	+ 1ULL << dn->dn_indblkshift, FTAG); /* one indirect block */
	+ dmu_tx_count_dnode(txh);
	+}
	+
	/*
	 * Same as dmu_tx_hold_write(), but for a dnode the caller already has
	 * instead of an object number.
	 */
	+void
	dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
	{
		dmu_tx_hold_t *txh;

		ASSERT0(tx->tx_txg);
		ASSERT3U(len, <=, DMU_MAX_ACCESS);
		/* off + len - 1 must not wrap past UINT64_MAX. */
		ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

		txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
		if (txh != NULL) {
			dmu_tx_count_write(txh, off, len);
			dmu_tx_count_dnode(txh);
		}
	}

	/*
	* This function marks the transaction as being a "net free". The end
	* result is that refquotas will be disabled for this transaction, and
	* this transaction will be able to use half of the pool space overhead
	* (see dsl_pool_adjustedsize()). Therefore this function should only
	* be called for transactions that we expect will not cause a net increase
	* in the amount of space used (but it's OK if that is occasionally not true).
	*
	* The only state changed here is tx->tx_netfree, which is set to B_TRUE.
	*/
	void
	dmu_tx_mark_netfree(dmu_tx_t *tx)
	{
	tx->tx_netfree = B_TRUE;
	}

	/*
	 * Account for freeing the range [off, off + len) of the hold's dnode
	 * and pre-read the blocks that will be touched, so that i/o errors are
	 * found before the tx is assigned.  Errors are stored in tx->tx_err.
	 *
	 * 'len' may be DMU_OBJECT_END, meaning "to the end of the object".  A
	 * range starting at or beyond the end of the object only charges the
	 * dnode.
	 */
	static void
	dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
	{
		dmu_tx_t *tx;
		dnode_t *dn;
		int err;

		tx = txh->txh_tx;
		ASSERT(tx->tx_txg == 0);

		dn = txh->txh_dnode;
		dmu_tx_count_dnode(txh);

		if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
			return;
		if (len == DMU_OBJECT_END)
			len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;


		/*
		 * For i/o error checking, we read the first and last level-0
		 * blocks if they are not aligned, and all the level-1 blocks.
		 *
		 * Note:  dbuf_free_range() assumes that we have not instantiated
		 * any level-0 dbufs that will be completely freed.  Therefore we must
		 * exercise care to not read or count the first and last blocks
		 * if they are blocksize-aligned.
		 */
		if (dn->dn_datablkshift == 0) {
			if (off != 0 || len < dn->dn_datablksz)
				dmu_tx_count_write(txh, 0, dn->dn_datablksz);
		} else {
			/* first block will be modified if it is not aligned */
			if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
				dmu_tx_count_write(txh, off, 1);
			/* last block will be modified if it is not aligned */
			if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
				dmu_tx_count_write(txh, off + len, 1);
		}

		/*
		 * Check level-1 blocks.
		 */
		if (dn->dn_nlevels > 1) {
			int shift = dn->dn_datablkshift + dn->dn_indblkshift -
			    SPA_BLKPTRSHIFT;
			uint64_t start = off >> shift;
			uint64_t end = (off + len) >> shift;

			ASSERT(dn->dn_indblkshift != 0);

			/*
			 * dnode_reallocate() can result in an object with indirect
			 * blocks having an odd data block size.  In this case,
			 * just check the single block.
			 */
			if (dn->dn_datablkshift == 0)
				start = end = 0;

			zio_t *zio = zio_root(tx->tx_pool->dp_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
			for (uint64_t i = start; i <= end; i++) {
				uint64_t ibyte = i << shift;
				/* Jump i to wherever dnode_next_offset() lands. */
				err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
				i = ibyte >> shift;
				if (err == ESRCH || i > end)
					break;
				if (err != 0) {
					tx->tx_err = err;
					(void) zio_wait(zio);
					return;
				}

				(void) refcount_add_many(&txh->txh_memory_tohold,
				    1 << dn->dn_indblkshift, FTAG);

				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					tx->tx_err = err;
					(void) zio_wait(zio);
					return;
				}
			}
			err = zio_wait(zio);
			if (err != 0) {
				tx->tx_err = err;
				return;
			}
		}
	}

	/*
	 * Declare that the transaction will free 'len' bytes starting at 'off'
	 * in 'object' of the tx's objset.  When the object cannot be held,
	 * nothing further is counted.
	 */
	void
	dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
	{
		dmu_tx_hold_t *hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
		    object, THT_FREE, off, len);

		if (hold == NULL)
			return;

		(void) dmu_tx_hold_free_impl(hold, off, len);
	}

	/*
	 * Variant of dmu_tx_hold_free() for callers that already have the
	 * dnode in hand, so no lookup by object number is needed.
	 */
	void
	dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
	{
		dmu_tx_hold_t *txh;

		txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
		if (txh != NULL)
			(void) dmu_tx_hold_free_impl(txh, off, len);
	}

	/*
	 * Account for a ZAP update under the hold txh: charge the worst-case
	 * write space and, when the hold has a dnode, read the blocks the update
	 * would touch so that i/o errors are recorded in tx_err before the tx is
	 * assigned. name may be NULL when the entry name is not known.
	 */
	static void
	dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
	{
		dmu_tx_t *tx = txh->txh_tx;
		dnode_t *dn;
		int err;

		ASSERT(tx->tx_txg == 0);

		dn = txh->txh_dnode;

		dmu_tx_count_dnode(txh);

		/*
		 * Modifying an almost-full microzap is around the worst case (128KB)
		 *
		 * If it is a fat zap, the worst case would be 7*16KB=112KB:
		 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
		 * - 4 new blocks written if adding:
		 *   - 2 blocks for possibly split leaves,
		 *   - 2 grown ptrtbl blocks
		 */
		(void) refcount_add_many(&txh->txh_space_towrite,
		    MZAP_MAX_BLKSZ, FTAG);

		/* No dnode on this hold, so there is nothing on disk to check. */
		if (dn == NULL)
			return;

		ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

		if (dn->dn_maxblkid == 0 || name == NULL) {
			/*
			 * This is a microzap (only one block), or we don't know
			 * the name. Check the first block for i/o errors.
			 */
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				tx->tx_err = err;
			}
		} else {
			/*
			 * Access the name so that we'll check for i/o errors to
			 * the leaf blocks, etc. We ignore ENOENT, as this name
			 * may not yet exist.
			 */
			err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
			if (err == EIO || err == ECKSUM || err == ENXIO) {
				tx->tx_err = err;
			}
		}
	}

	/*
	 * Declare that this tx will modify the ZAP object `object`. `add` and
	 * `name` are stored as the hold's two arguments; `name` (may be NULL) is
	 * also used to check the relevant ZAP blocks for i/o errors.
	 */
	void
	dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
	{
		dmu_tx_hold_t *txh;

		ASSERT0(tx->tx_txg);

		txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
		    object, THT_ZAP, add, (uintptr_t)name);
		if (txh != NULL)
			dmu_tx_hold_zap_impl(txh, name);
	}

	/*
	 * Variant of dmu_tx_hold_zap() for callers that already hold the dnode.
	 * dn must not be NULL.
	 */
	void
	dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
	{
		dmu_tx_hold_t *txh;

		ASSERT0(tx->tx_txg);
		ASSERT(dn != NULL);

		txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
		if (txh != NULL)
			dmu_tx_hold_zap_impl(txh, name);
	}

	/*
	 * Declare that this tx will modify the bonus buffer of `object`.
	 * Must be called before the tx is assigned to a txg.
	 */
	void
	dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
	{
		ASSERT(tx->tx_txg == 0);

		dmu_tx_hold_t *hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
		    object, THT_BONUS, 0, 0);

		/* Only a successfully created hold is charged for its dnode. */
		if (hold == NULL)
			return;

		dmu_tx_count_dnode(hold);
	}

	/*
	 * Variant of dmu_tx_hold_bonus() for callers that already hold the
	 * dnode.
	 */
	void
	dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
	{
		dmu_tx_hold_t *txh;

		ASSERT0(tx->tx_txg);

		txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
		if (txh)
			dmu_tx_count_dnode(txh);
	}

	/*
	 * Reserve `space` bytes of write space in this tx without tying the
	 * reservation to an existing object (the hold is taken against
	 * DMU_NEW_OBJECT).
	 */
	void
	dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
	{
		dmu_tx_hold_t *txh;

		ASSERT(tx->tx_txg == 0);

		txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
		    DMU_NEW_OBJECT, THT_SPACE, space, 0);

		/*
		 * Check the hold before using it, as every other
		 * dmu_tx_hold_*() caller of dmu_tx_hold_object_impl() does.
		 */
		if (txh != NULL) {
			(void) refcount_add_many(&txh->txh_space_towrite,
			    space, FTAG);
		}
	}

	#ifdef ZFS_DEBUG
	/*
	 * Debug-only sanity check: verify that the dbuf being dirtied is covered
	 * by one of the holds taken on this (already assigned) tx. A hold
	 * matches when it refers to the dbuf's dnode (or to no dnode) and its
	 * type/range covers the dbuf's block id. Panics if no hold matches.
	 */
	void
	dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
	{
		boolean_t match_object = B_FALSE;
		boolean_t match_offset = B_FALSE;

		DB_DNODE_ENTER(db);
		dnode_t *dn = DB_DNODE(db);
		ASSERT(tx->tx_txg != 0);
		ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
		ASSERT3U(dn->dn_object, ==, db->db.db_object);

		/* A tx flagged tx_anyobj is exempt from hold checking. */
		if (tx->tx_anyobj) {
			DB_DNODE_EXIT(db);
			return;
		}

		/* XXX No checking on the meta dnode for now */
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			DB_DNODE_EXIT(db);
			return;
		}

		for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
		    txh = list_next(&tx->tx_holds, txh)) {
			ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
			if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
				match_object = TRUE;
			if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
				/*
				 * Translate the hold's byte range (arg1 = offset,
				 * arg2 = length) into block ids at this dbuf's
				 * level. A shift of 64 or more would be undefined,
				 * so such ranges collapse to block 0.
				 */
				int datablkshift = dn->dn_datablkshift ?
				    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
				int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				int shift = datablkshift + epbs * db->db_level;
				uint64_t beginblk = shift >= 64 ? 0 :
				    (txh->txh_arg1 >> shift);
				uint64_t endblk = shift >= 64 ? 0 :
				    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
				uint64_t blkid = db->db_blkid;

				/* XXX txh_arg2 better not be zero... */

				dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
				    txh->txh_type, beginblk, endblk);

				switch (txh->txh_type) {
				case THT_WRITE:
					if (blkid >= beginblk && blkid <= endblk)
						match_offset = TRUE;
					/*
					 * We will let this hold work for the bonus
					 * or spill buffer so that we don't need to
					 * hold it when creating a new object.
					 */
					if (blkid == DMU_BONUS_BLKID ||
					    blkid == DMU_SPILL_BLKID)
						match_offset = TRUE;
					/*
					 * They might have to increase nlevels,
					 * thus dirtying the new TLIBs. Or they
					 * might have to change the block size,
					 * thus dirtying the new lvl=0 blk=0.
					 */
					if (blkid == 0)
						match_offset = TRUE;
					break;
				case THT_FREE:
					/*
					 * We will dirty all the level 1 blocks in
					 * the free range and perhaps the first and
					 * last level 0 block.
					 */
					if (blkid >= beginblk && (blkid <= endblk ||
					    txh->txh_arg2 == DMU_OBJECT_END))
						match_offset = TRUE;
					break;
				case THT_SPILL:
					if (blkid == DMU_SPILL_BLKID)
						match_offset = TRUE;
					break;
				case THT_BONUS:
					if (blkid == DMU_BONUS_BLKID)
						match_offset = TRUE;
					break;
				case THT_ZAP:
					match_offset = TRUE;
					break;
				case THT_NEWOBJECT:
					match_object = TRUE;
					break;
				default:
					ASSERT(!"bad txh_type");
				}
			}
			if (match_object && match_offset) {
				DB_DNODE_EXIT(db);
				return;
			}
		}
		DB_DNODE_EXIT(db);
		panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
		    (u_longlong_t)db->db.db_object, db->db_level,
		    (u_longlong_t)db->db_blkid);
	}
	#endif

	/*
	* Upper bound on the per-transaction delay computed by dmu_tx_delay()
	* (100ms, i.e. 10 transactions per second). If we can't do 10 iops,
	* something is wrong. Let us go ahead and hit zfs_dirty_data_max.
	*/
	hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
	/* Resolution passed to the kernel sleep primitives in dmu_tx_delay(). */
	int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

	/*
	* We delay transactions when we've determined that the backend storage
	* isn't able to accommodate the rate of incoming writes.
	*
	* If there is already a transaction waiting, we delay relative to when
	* that transaction finishes waiting. This way the calculated min_time
	* is independent of the number of threads concurrently executing
	* transactions.
	*
	* If we are the only waiter, wait relative to when the transaction
	* started, rather than the current time. This credits the transaction for
	* "time already served", e.g. reading indirect blocks.
	*
	* The minimum time for a transaction to take is calculated as:
	* min_time = scale * (dirty - min) / (max - dirty)
	* min_time is then capped at zfs_delay_max_ns.
	*
	* The delay has two degrees of freedom that can be adjusted via tunables.
	* The percentage of dirty data at which we start to delay is defined by
	* zfs_delay_min_dirty_percent. This should typically be at or above
	* zfs_vdev_async_write_active_max_dirty_percent so that we only start to
	* delay after writing at full speed has failed to keep up with the incoming
	* write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
	* speaking, this variable determines the amount of delay at the midpoint of
	* the curve.
	*
	* delay
	* 10ms +-------------------------------------------------------------*+
	* \| *\|
	* 9ms + *+
	* \| *\|
	* 8ms + *+
	* \| * \|
	* 7ms + * +
	* \| * \|
	* 6ms + * +
	* \| * \|
	* 5ms + * +
	* \| * \|
	* 4ms + * +
	* \| * \|
	* 3ms + * +
	* \| * \|
	* 2ms + (midpoint) * +
	* \| \| ** \|
	* 1ms + v *** +
	* \| zfs_delay_scale ----------> ******** \|
	* 0 +-------------------------------------*********----------------+
	* 0% <- zfs_dirty_data_max -> 100%
	*
	* Note that since the delay is added to the outstanding time remaining on the
	* most recent transaction, the delay is effectively the inverse of IOPS.
	* Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
	* was chosen such that small changes in the amount of accumulated dirty data
	* in the first 3/4 of the curve yield relatively small differences in the
	* amount of delay.
	*
	* The effects can be easier to understand when the amount of delay is
	* represented on a log scale:
	*
	* delay
	* 100ms +-------------------------------------------------------------++
	* + +
	* \| \|
	* + *+
	* 10ms + *+
	* + ** +
	* \| (midpoint) ** \|
	* + \| ** +
	* 1ms + v **** +
	* + zfs_delay_scale ----------> ***** +
	* \| **** \|
	* + **** +
	* 100us + ** +
	* + * +
	* \| * \|
	* + * +
	* 10us + * +
	* + +
	* \| \|
	* + +
	* +--------------------------------------------------------------+
	* 0% <- zfs_dirty_data_max -> 100%
	*
	* Note here that only as the amount of dirty data approaches its limit does
	* the delay start to increase rapidly. The goal of a properly tuned system
	* should be to keep the amount of dirty data out of that range by first
	* ensuring that the appropriate limits are set for the I/O scheduler to reach
	* optimal throughput on the backend storage, and then by changing the value
	* of zfs_delay_scale to increase the steepness of the curve.
	*/
	/*
	* Throttle this transaction according to the amount of dirty data.
	*
	* tx: the transaction to delay; tx_start is credited as time served.
	* dirty: the pool's dirty byte count as sampled by the caller; must be
	* below zfs_dirty_data_max.
	*
	* Returns without sleeping when dirty is at or below the delay threshold,
	* or when the transaction has already been running longer than the
	* computed minimum time.
	*/
	static void
	dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
	{
	dsl_pool_t *dp = tx->tx_pool;
	/* Dirty-byte level below which no delay is imposed. */
	uint64_t delay_min_bytes =
	zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	hrtime_t wakeup, min_tx_time, now;

	if (dirty <= delay_min_bytes)
	return;

	/*
	* The caller has already waited until we are under the max.
	* We make them pass us the amount of dirty data so we don't
	* have to handle the case of it being >= the max, which could
	* cause a divide-by-zero if it's == the max.
	*/
	ASSERT3U(dirty, <, zfs_dirty_data_max);

	now = gethrtime();
	/* min_time = scale * (dirty - min) / (max - dirty) */
	min_tx_time = zfs_delay_scale *
	(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	/* The uncapped minimum has already elapsed; nothing to wait for. */
	if (now > tx->tx_start + min_tx_time)
	return;

	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	uint64_t, min_tx_time);

	/*
	* Pick the later of "our start + min_tx_time" and "the previous
	* waiter's wakeup + min_tx_time", and publish it as the pool's new
	* last wakeup, all under dp_lock.
	*/
	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + min_tx_time,
	dp->dp_last_wakeup + min_tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

	/* Sleep until the absolute time `wakeup`, using the platform's primitive. */
	#ifdef _KERNEL
	#ifdef illumos
	mutex_enter(&curthread->t_delay_lock);
	while (cv_timedwait_hires(&curthread->t_delay_cv,
	&curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
	CALLOUT_FLAG_ABSOLUTE \| CALLOUT_FLAG_ROUNDUP) > 0)
	continue;
	mutex_exit(&curthread->t_delay_lock);
	#else
	pause_sbt("dmu_tx_delay", nstosbt(wakeup),
	nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE);
	#endif
	#else
	/* Userland build: convert the absolute wakeup into a relative sleep. */
	hrtime_t delta = wakeup - gethrtime();
	struct timespec ts;
	ts.tv_sec = delta / NANOSEC;
	ts.tv_nsec = delta % NANOSEC;
	(void) nanosleep(&ts, NULL);
	#endif
	}

	/*
	* This routine attempts to assign the transaction to a transaction group.
	* To do so, we must determine if there is sufficient free space on disk.
	*
	* If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
	* on it), then it is assumed that there is sufficient free space,
	* unless there's insufficient slop space in the pool (see the comment
	* above spa_slop_shift in spa_misc.c).
	*
	* If it is not a "netfree" transaction, then if the data already on disk
	* is over the allowed usage (e.g. quota), this will fail with EDQUOT or
	* ENOSPC. Otherwise, if the current rough estimate of pending changes,
	* plus the rough estimate of this transaction's changes, may exceed the
	* allowed usage, then this will fail with ERESTART, which will cause the
	* caller to wait for the pending changes to be written to disk (by waiting
	* for the next TXG to open), and then check the space usage again.
	*
	* The rough estimate of pending changes is comprised of the sum of:
	*
	* - this transaction's holds' txh_space_towrite
	*
	* - dd_tempreserved[], which is the sum of in-flight transactions'
	* holds' txh_space_towrite (i.e. those transactions that have called
	* dmu_tx_assign() but not yet called dmu_tx_commit()).
	*
	* - dd_space_towrite[], which is the amount of dirtied dbufs.
	*
	* Note that all of these values are inflated by spa_get_worst_case_asize(),
	* which means that we may get ERESTART well before we are actually in danger
	* of running out of space, but this also mitigates any small inaccuracies
	* in the rough estimate (e.g. txh_space_towrite doesn't take into account
	* indirect blocks, and dd_space_towrite[] doesn't take into account changes
	* to the MOS).
	*
	* Note that due to this algorithm, it is possible to exceed the allowed
	* usage by one transaction. Also, as we approach the allowed usage,
	* we will allow a very limited amount of changes into each TXG, thus
	* decreasing performance.
	*/
	/*
	* Make one attempt to assign tx to the currently open txg.
	*
	* Returns 0 on success, tx_err if a hold recorded an error, EIO or
	* ERESTART when the pool is suspended, ERESTART when the caller must
	* wait (dirty-data throttle, or a dnode still assigned to the previous
	* txg), or the error from dsl_dir_tempreserve_space().
	*/
	static int
	dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
	{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	/* An error recorded while taking holds fails the assignment outright. */
	if (tx->tx_err)
	return (tx->tx_err);

	if (spa_suspended(spa)) {
	/*
	* If the user has indicated a blocking failure mode
	* then return ERESTART which will block in dmu_tx_wait().
	* Otherwise, return EIO so that an error can get
	* propagated back to the VOP calls.
	*
	* Note that we always honor the txg_how flag regardless
	* of the failuremode setting.
	*/
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
	!(txg_how & TXG_WAIT))
	return (SET_ERROR(EIO));

	return (SET_ERROR(ERESTART));
	}

	/*
	* Unless this tx has already been through the throttle, ask
	* dmu_tx_wait() to apply the dirty-data delay before we retry.
	*/
	if (!tx->tx_dirty_delayed &&
	dsl_pool_need_dirty_delay(tx->tx_pool)) {
	tx->tx_wait_dirty = B_TRUE;
	return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	* NB: No error returns are allowed after txg_hold_open, but
	* before processing the dnode holds, due to the
	* dmu_tx_unassign() logic.
	*/

	/* Running totals over all holds, used for the space checks below. */
	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	txh = list_next(&tx->tx_holds, txh)) {
	dnode_t *dn = txh->txh_dnode;
	if (dn != NULL) {
	mutex_enter(&dn->dn_mtx);
	/*
	* The dnode is still assigned to the previous txg.
	* Remember which hold stopped us so that
	* dmu_tx_unassign() only undoes the holds before it
	* and dmu_tx_wait() knows which dnode to wait on.
	*/
	if (dn->dn_assigned_txg == tx->tx_txg - 1) {
	mutex_exit(&dn->dn_mtx);
	tx->tx_needassign_txh = txh;
	return (SET_ERROR(ERESTART));
	}
	if (dn->dn_assigned_txg == 0)
	dn->dn_assigned_txg = tx->tx_txg;
	ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
	(void) refcount_add(&dn->dn_tx_holds, tx);
	mutex_exit(&dn->dn_mtx);
	}
	towrite += refcount_count(&txh->txh_space_towrite);
	tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	/* Only reserve space when the tx has a dsl_dir and will write something. */
	if (tx->tx_dir != NULL && asize != 0) {
	int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
	asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
	if (err != 0)
	return (err);
	}

	return (0);
	}

	/*
	 * Undo a failed dmu_tx_try_assign(): release the txg hold and drop the
	 * per-dnode tx holds taken so far, then remember which txg was tried.
	 */
	static void
	dmu_tx_unassign(dmu_tx_t *tx)
	{
		dmu_tx_hold_t *hold;

		/* The attempt failed before a txg was opened; nothing to undo. */
		if (tx->tx_txg == 0)
			return;

		txg_rele_to_quiesce(&tx->tx_txgh);

		/*
		 * Only the holds ahead of tx_needassign_txh were processed by
		 * dmu_tx_try_assign() (all of them when it is NULL). Drop our
		 * hold on each of their dnodes and wake waiters once a dnode
		 * has no tx holds left.
		 */
		hold = list_head(&tx->tx_holds);
		while (hold != tx->tx_needassign_txh) {
			dnode_t *dn = hold->txh_dnode;

			if (dn != NULL) {
				mutex_enter(&dn->dn_mtx);
				ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

				if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
					dn->dn_assigned_txg = 0;
					cv_broadcast(&dn->dn_notxholds);
				}
				mutex_exit(&dn->dn_mtx);
			}
			hold = list_next(&tx->tx_holds, hold);
		}

		txg_rele_to_sync(&tx->tx_txgh);

		tx->tx_lasttried_txg = tx->tx_txg;
		tx->tx_txg = 0;
	}

	/*
	 * Assign tx to a transaction group. txg_how is a bitmask of:
	 *
	 * TXG_WAIT: if the tx cannot be assigned right now, wait (via
	 * dmu_tx_wait()) and retry until it can. Use this when no locks are
	 * held. With this bit set the function only fails if we're truly out
	 * of space (or over quota).
	 *
	 * Without TXG_WAIT: if the tx can't be assigned to the currently open
	 * txg without blocking, return ERESTART immediately. Use this whenever
	 * locks are held; on ERESTART the caller should drop all locks, call
	 * dmu_tx_wait(), and try again.
	 *
	 * TXG_NOTHROTTLE: this tx should not be delayed due to the ZFS Write
	 * Throttle (see comments in dsl_pool.c for details on the throttle).
	 * This is used by the VFS operations, after they have already called
	 * dmu_tx_wait() (though most likely on a different tx).
	 *
	 * Returns 0 on success, otherwise the error from dmu_tx_try_assign().
	 */
	int
	dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
	{
		int err;

		ASSERT(tx->tx_txg == 0);
		ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
		ASSERT(!dsl_pool_sync_context(tx->tx_pool));

		/* If we might wait, we must not hold the config lock. */
		IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));

		if ((txg_how & TXG_NOTHROTTLE))
			tx->tx_dirty_delayed = B_TRUE;

		for (;;) {
			err = dmu_tx_try_assign(tx, txg_how);
			if (err == 0)
				break;

			/* Roll back whatever the failed attempt set up. */
			dmu_tx_unassign(tx);

			/* Retry only on ERESTART, and only if allowed to block. */
			if (err == ERESTART && (txg_how & TXG_WAIT)) {
				dmu_tx_wait(tx);
				continue;
			}
			return (err);
		}

		txg_rele_to_quiesce(&tx->tx_txgh);

		return (0);
	}

	/*
	* Block until it is worth retrying dmu_tx_try_assign() on this tx.
	* What we wait for depends on why the last attempt failed: the
	* dirty-data throttle, a suspended pool, a dnode still assigned to the
	* previous txg, or simply the next txg opening.
	*/
	void
	dmu_tx_wait(dmu_tx_t *tx)
	{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	if (tx->tx_wait_dirty) {
	/*
	* dmu_tx_try_assign() has determined that we need to wait
	* because we've consumed much or all of the dirty buffer
	* space.
	*/
	mutex_enter(&dp->dp_lock);
	while (dp->dp_dirty_total >= zfs_dirty_data_max)
	cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
	/* Sample the dirty total under the lock; it is now below the max. */
	uint64_t dirty = dp->dp_dirty_total;
	mutex_exit(&dp->dp_lock);

	dmu_tx_delay(tx, dirty);

	tx->tx_wait_dirty = B_FALSE;

	/*
	* Note: setting tx_dirty_delayed only has effect if the
	* caller used TXG_WAIT. Otherwise they are going to
	* destroy this tx and try again. The common case,
	* zfs_write(), uses TXG_WAIT.
	*/
	tx->tx_dirty_delayed = B_TRUE;
	} else if (spa_suspended(spa) \|\| tx->tx_lasttried_txg == 0) {
	/*
	* If the pool is suspended we need to wait until it
	* is resumed. Note that it's possible that the pool
	* has become active after this thread has tried to
	* obtain a tx. If that's the case then tx_lasttried_txg
	* would not have been set.
	*/
	txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
	/*
	* A dnode is assigned to the quiescing txg. Wait for its
	* transaction to complete.
	*/
	dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

	mutex_enter(&dn->dn_mtx);
	while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
	cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
	mutex_exit(&dn->dn_mtx);
	tx->tx_needassign_txh = NULL;
	} else {
	/* Nothing specific to wait on: wait for the next txg to open. */
	txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
	}

	/*
	 * Free a tx: tear down every hold (releasing its dnode reference, if
	 * any), destroy the hold and callback lists, and free the tx itself.
	 */
	static void
	dmu_tx_destroy(dmu_tx_t *tx)
	{
		for (dmu_tx_hold_t *hold = list_head(&tx->tx_holds); hold != NULL;
		    hold = list_head(&tx->tx_holds)) {
			/* Save the dnode; the hold is freed before it is released. */
			dnode_t *held_dn = hold->txh_dnode;

			list_remove(&tx->tx_holds, hold);
			refcount_destroy_many(&hold->txh_space_towrite,
			    refcount_count(&hold->txh_space_towrite));
			refcount_destroy_many(&hold->txh_memory_tohold,
			    refcount_count(&hold->txh_memory_tohold));
			kmem_free(hold, sizeof (dmu_tx_hold_t));
			if (held_dn != NULL)
				dnode_rele(held_dn, tx);
		}

		list_destroy(&tx->tx_callbacks);
		list_destroy(&tx->tx_holds);
		kmem_free(tx, sizeof (dmu_tx_t));
	}

	/*
	 * Commit an assigned tx: drop the per-dnode tx holds, clear any
	 * temporary space reservation, hand the commit callbacks to the txg,
	 * release the txg hold and free the tx.
	 */
	void
	dmu_tx_commit(dmu_tx_t *tx)
	{
		dmu_tx_hold_t *hold;

		ASSERT(tx->tx_txg != 0);

		/*
		 * Release our hold on every dnode referenced by this tx; once a
		 * dnode has no tx holds left, mark it unassigned and wake
		 * anyone waiting on it.
		 */
		hold = list_head(&tx->tx_holds);
		while (hold != NULL) {
			dnode_t *dn = hold->txh_dnode;

			if (dn != NULL) {
				mutex_enter(&dn->dn_mtx);
				ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

				if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
					dn->dn_assigned_txg = 0;
					cv_broadcast(&dn->dn_notxholds);
				}
				mutex_exit(&dn->dn_mtx);
			}
			hold = list_next(&tx->tx_holds, hold);
		}

		if (tx->tx_tempreserve_cookie)
			dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

		if (!list_is_empty(&tx->tx_callbacks))
			txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

		if (tx->tx_anyobj == FALSE)
			txg_rele_to_sync(&tx->tx_txgh);

		dmu_tx_destroy(tx);
	}

	/*
	 * Abandon a tx that was never assigned to a txg. Registered commit
	 * callbacks are invoked with ECANCELED before the tx is freed.
	 */
	void
	dmu_tx_abort(dmu_tx_t *tx)
	{
		ASSERT(tx->tx_txg == 0);

		if (list_is_empty(&tx->tx_callbacks)) {
			dmu_tx_destroy(tx);
			return;
		}

		/* Tell every registered callback that the tx was cancelled. */
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
		dmu_tx_destroy(tx);
	}

	/*
	 * Return the txg this tx is assigned to. The tx must already be
	 * assigned (tx_txg != 0).
	 */
	uint64_t
	dmu_tx_get_txg(dmu_tx_t *tx)
	{
		uint64_t assigned_txg = tx->tx_txg;

		ASSERT(tx->tx_txg != 0);
		return (assigned_txg);
	}

	/*
	 * Return the dsl_pool this tx belongs to; it must be set.
	 */
	dsl_pool_t *
	dmu_tx_pool(dmu_tx_t *tx)
	{
		dsl_pool_t *pool = tx->tx_pool;

		ASSERT(tx->tx_pool != NULL);
		return (pool);
	}

	/*
	 * Register func(data, error) to be invoked for this tx. The callback is
	 * appended to tx_callbacks; dmu_tx_do_callbacks() later calls it and
	 * frees the record allocated here.
	 */
	void
	dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
	{
		dmu_tx_callback_t *dcb;

		dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

		dcb->dcb_func = func;
		dcb->dcb_data = data;

		list_insert_tail(&tx->tx_callbacks, dcb);
	}

	/*
	 * Drain a callback list: each entry is unlinked, its function is invoked
	 * with its data and the given error code, and the entry is then freed.
	 */
	void
	dmu_tx_do_callbacks(list_t *cb_list, int error)
	{
		for (dmu_tx_callback_t *entry = list_head(cb_list); entry != NULL;
		    entry = list_head(cb_list)) {
			list_remove(cb_list, entry);
			entry->dcb_func(entry->dcb_data, error);
			kmem_free(entry, sizeof (dmu_tx_callback_t));
		}
	}

	/*
	* Interface to hold a bunch of attributes.
	* used for creating new files.
	* attrsize is the total size of all attributes
	* to be added during object creation
	*
	* For updating/adding a single attribute dmu_tx_hold_sa() should be used.
	*/

	/*
	 * Hold the necessary attribute names for attribute registration.
	 * This should be a very rare case. If it does happen, it would only
	 * happen on the first write to the file system.
	 *
	 * For every attribute not yet registered, a ZAP hold is taken on the
	 * registration object, or on DMU_NEW_OBJECT if that object does not
	 * exist yet.
	 */
	static void
	dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
	{
		if (!sa->sa_need_attr_registration)
			return;

		for (int i = 0; i != sa->sa_num_attrs; i++) {
			if (!sa->sa_attr_table[i].sa_registered) {
				if (sa->sa_reg_attr_obj)
					dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
					    B_TRUE, sa->sa_attr_table[i].sa_name);
				else
					dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
					    B_TRUE, sa->sa_attr_table[i].sa_name);
			}
		}
	}

	/*
	 * Declare that this tx will modify the spill block of `object`, charging
	 * SPA_OLD_MAXBLOCKSIZE bytes of write space for it.
	 */
	void
	dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
	{
		dmu_tx_hold_t *txh;

		txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
		    THT_SPILL, 0, 0);

		/*
		 * The hold may come back NULL (the other dmu_tx_hold_*() entry
		 * points all check for this), so don't dereference it blindly.
		 */
		if (txh != NULL) {
			(void) refcount_add_many(&txh->txh_space_towrite,
			    SPA_OLD_MAXBLOCKSIZE, FTAG);
		}
	}

	/*
	 * Take the holds needed to create a new object with system attributes.
	 * attrsize is the total size of all attributes to be added during object
	 * creation; if it does not fit in the bonus buffer (or spill is forced),
	 * a spill hold is taken as well.
	 */
	void
	dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
	{
		sa_os_t *sa = tx->tx_objset->os_sa;

		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

		/* No SA master object: only the bonus hold is taken. */
		if (sa->sa_master_obj == 0)
			return;

		if (sa->sa_layout_attr_obj == 0) {
			/*
			 * No layout object yet: hold the master object entries
			 * plus two new ZAP objects.
			 */
			dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
			dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		} else {
			dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
		}

		dmu_tx_sa_registration_hold(sa, tx);

		if (attrsize > DN_MAX_BONUSLEN || sa->sa_force_spill) {
			(void) dmu_tx_hold_object_impl(tx, tx->tx_objset,
			    DMU_NEW_OBJECT, THT_SPILL, 0, 0);
		}
	}

	/*
	 * Hold SA attribute
	 *
	 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
	 *
	 * Takes the holds needed to update system attributes on the object
	 * behind hdl: its bonus buffer, the SA bookkeeping ZAPs that may need
	 * to be created or extended, and the spill block when one may be used.
	 * may_grow additionally holds the layout object (if it exists) and
	 * forces a spill hold.
	 */
	void
	dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
	{
		uint64_t object;
		sa_os_t *sa = tx->tx_objset->os_sa;

		ASSERT(hdl != NULL);

		object = sa_handle_object(hdl);

		dmu_tx_hold_bonus(tx, object);

		/* No SA master object: only the bonus hold is taken. */
		if (sa->sa_master_obj == 0)
			return;

		/*
		 * The registration or layout object is missing, so hold the
		 * master object entries plus two new ZAP objects.
		 */
		if (sa->sa_reg_attr_obj == 0 || sa->sa_layout_attr_obj == 0) {
			dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
			dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		}

		dmu_tx_sa_registration_hold(sa, tx);

		if (may_grow && sa->sa_layout_attr_obj)
			dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

		if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		} else {
			/*
			 * No spill is forced, but the dnode may already have a
			 * spill block.
			 */
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			if (dn->dn_have_spill) {
				ASSERT(tx->tx_txg == 0);
				dmu_tx_hold_spill(tx, object);
			}
			DB_DNODE_EXIT(db);
		}
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 332525)
	@@ -1,362 +1,373 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/dnode.h>
	#include <sys/dmu_objset.h>
	#include <sys/dmu_zfetch.h>
	#include <sys/dmu.h>
	#include <sys/dbuf.h>
	#include <sys/kstat.h>

	/*
	* This tunable disables predictive prefetch. Note that it leaves "prescient"
	* prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
	* prescient prefetch never issues i/os that end up not being needed,
	* so it can't hurt performance.
	*/
	boolean_t zfs_prefetch_disable = B_FALSE;

	/* max # of streams per zfetch */
	uint32_t zfetch_max_streams = 8;
	/* min time before stream reclaim */
	uint32_t zfetch_min_sec_reap = 2;
	/* max bytes to prefetch per stream (default 8MB) */
	uint32_t zfetch_max_distance = 8 * 1024 * 1024;
	/* max bytes to prefetch indirects for per stream (default 64MB) */
	uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
	/* max number of bytes in an array_read in which we allow prefetching (1MB) */
	uint64_t zfetch_array_rd_sz = 1024 * 1024;

	/*
	* sysctl knobs for the tunables above. prefetch_disable hangs directly
	* off the _vfs_zfs node; the remaining knobs live under the "zfetch"
	* child node declared here.
	*/
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
	&zfs_prefetch_disable, 0, "Disable prefetch");
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
	SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
	&zfetch_max_streams, 0, "Max # of streams per zfetch");
	SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
	&zfetch_min_sec_reap, 0, "Min time before stream reclaim");
	SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
	&zfetch_max_distance, 0, "Max bytes to prefetch per stream");
	SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
	&zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
	SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
	&zfetch_array_rd_sz, 0,
	"Number of bytes in a array_read at which we stop prefetching");

	/*
	* Prefetch counters exported through the "zfetchstats" kstat.
	* misses is bumped when an access matches no existing stream;
	* max_streams is bumped when a new stream is not created because the
	* per-file stream limit has been reached. hits is presumably bumped when
	* an access matches a stream (the bump site is outside this excerpt).
	*/
	typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	} zfetch_stats_t;

	/* Names and types of the counters, in the same order as the struct. */
	static zfetch_stats_t zfetch_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "max_streams", KSTAT_DATA_UINT64 },
	};

	/*
	* Atomically increment one counter; stat is a zfetch_stats_t field name.
	* NOTE(review): the expansion already ends in ';', so call sites that
	* add their own ';' produce an empty statement.
	*/
	#define ZFETCHSTAT_BUMP(stat) \
	atomic_inc_64(&zfetch_stats.stat.value.ui64);

	/* kstat handle created by zfetch_init() and deleted by zfetch_fini(). */
	kstat_t *zfetch_ksp;

	/*
	 * Create and install the "zfetchstats" kstat backed by zfetch_stats.
	 * If the kstat cannot be created, the counters are simply not exported.
	 */
	void
	zfetch_init(void)
	{
		zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
		    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL);
		if (zfetch_ksp == NULL)
			return;

		zfetch_ksp->ks_data = &zfetch_stats;
		kstat_install(zfetch_ksp);
	}

	/*
	 * Remove the "zfetchstats" kstat, if zfetch_init() managed to create it.
	 */
	void
	zfetch_fini(void)
	{
		if (zfetch_ksp == NULL)
			return;

		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}

	/*
	 * This takes a pointer to a zfetch structure and a dnode. It performs the
	 * necessary setup for the zfetch structure: records the owning dnode,
	 * creates the (empty) stream list and initializes the rwlock protecting
	 * it. A NULL zf is ignored.
	 */
	void
	dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
	{
		if (zf == NULL)
			return;

		zf->zf_dnode = dno;

		list_create(&zf->zf_stream, sizeof (zstream_t),
		    offsetof(zstream_t, zs_node));

		rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
	}

	/*
	 * Unlink a stream from its zfetch and free it. The caller must hold
	 * zf_rwlock as writer.
	 */
	static void
	dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
	{
		ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zs_lock);
		kmem_free(zs, sizeof (*zs));
	}

	/*
	 * Tear down the state hanging off a zfetch structure: every stream is
	 * removed, then the stream list and the rwlock are destroyed. The
	 * zfetch_t itself is not freed; that is the caller's job.
	 */
	void
	dmu_zfetch_fini(zfetch_t *zf)
	{
		ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

		/* Stream removal requires the writer lock. */
		rw_enter(&zf->zf_rwlock, RW_WRITER);
		for (zstream_t *stream = list_head(&zf->zf_stream); stream != NULL;
		    stream = list_head(&zf->zf_stream))
			dmu_zfetch_stream_remove(zf, stream);
		rw_exit(&zf->zf_rwlock);

		list_destroy(&zf->zf_stream);
		rw_destroy(&zf->zf_rwlock);

		zf->zf_dnode = NULL;
	}

	/*
	 * If there aren't too many streams already, create a new stream.
	 * The "blkid" argument is the next block that we expect this stream to
	 * access. While we're here, clean up old streams (which haven't been
	 * accessed for at least zfetch_min_sec_reap seconds).
	 *
	 * The caller must hold zf_rwlock as writer.
	 */
	static void
	dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
	{
		zstream_t *zs_next;
		int numstreams = 0;

		ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

		/*
		 * Clean up old streams. The successor is fetched before a stream
		 * may be removed, so the walk survives the removal.
		 */
		for (zstream_t *zs = list_head(&zf->zf_stream);
		    zs != NULL; zs = zs_next) {
			zs_next = list_next(&zf->zf_stream, zs);
			if (((gethrtime() - zs->zs_atime) / NANOSEC) >
			    zfetch_min_sec_reap)
				dmu_zfetch_stream_remove(zf, zs);
			else
				numstreams++;
		}

		/*
		 * The maximum number of streams is normally zfetch_max_streams,
		 * but for small files we lower it such that it's at least possible
		 * for all the streams to be non-overlapping.
		 *
		 * If we are already at the maximum number of streams for this file,
		 * even after removing old streams, then don't create this stream.
		 */
		uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
		    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
		    zfetch_max_distance));
		if (numstreams >= max_streams) {
			ZFETCHSTAT_BUMP(zfetchstat_max_streams);
			return;
		}

		/*
		 * Allocate a zeroed stream; the size must be that of the
		 * structure, not of the pointer.
		 */
		zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
		zs->zs_blkid = blkid;
		zs->zs_pf_blkid = blkid;
		zs->zs_ipf_blkid = blkid;
		zs->zs_atime = gethrtime();
		mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);

		list_insert_head(&zf->zf_stream, zs);
	}

	/*
	* This is the predictive prefetch entry point. It associates dnode access
	* specified with blkid and nblks arguments with prefetch stream, predicts
	* further accesses based on that stats and initiates speculative prefetch.
	* fetch_data argument specifies whether actual data blocks should be fetched:
	* FALSE -- prefetch only indirect blocks for predicted data blocks;
	* TRUE -- prefetch predicted data blocks plus following indirect blocks.
	*/
	void
	dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
	{
	zstream_t *zs;
	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
	int64_t pf_ahead_blks, max_blks;
	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid = blkid + nblks;
	+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
	+ return;
	+
	+ /*
	+ * If we haven't yet loaded the indirect vdevs' mappings, we
	+ * can only read from blocks that we carefully ensure are on
	+ * concrete vdevs (or previously-loaded indirect vdevs). So we
	+ * can't allow the predictive prefetcher to attempt reads of other
	+ * blocks (e.g. of the MOS's dnode object).
	+ */
	+ if (!spa_indirect_vdevs_loaded(spa))
	return;

	/*
	* As a fast path for small (single-block) files, ignore access
	* to the first block.
	*/
	if (blkid == 0)
	return;

	rw_enter(&zf->zf_rwlock, RW_READER);

	/*
	* Find matching prefetch stream. Depending on whether the accesses
	* are block-aligned, first block of the new access may either follow
	* the last block of the previous access, or be equal to it.
	*/
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	zs = list_next(&zf->zf_stream, zs)) {
	if (blkid == zs->zs_blkid \|\| blkid + 1 == zs->zs_blkid) {
	mutex_enter(&zs->zs_lock);
	/*
	* zs_blkid could have changed before we
	* acquired zs_lock; re-check them here.
	*/
	if (blkid == zs->zs_blkid) {
	break;
	} else if (blkid + 1 == zs->zs_blkid) {
	blkid++;
	nblks--;
	if (nblks == 0) {
	/* Already prefetched this before. */
	mutex_exit(&zs->zs_lock);
	rw_exit(&zf->zf_rwlock);
	return;
	}
	break;
	}
	mutex_exit(&zs->zs_lock);
	}
	}

	if (zs == NULL) {
	/*
	* This access is not part of any existing stream. Create
	* a new stream for it.
	*/
	ZFETCHSTAT_BUMP(zfetchstat_misses);
	if (rw_tryupgrade(&zf->zf_rwlock))
	dmu_zfetch_stream_create(zf, end_of_access_blkid);
	rw_exit(&zf->zf_rwlock);
	return;
	}

	/*
	* This access was to a block that we issued a prefetch for on
	* behalf of this stream. Issue further prefetches for this stream.
	*
	* Normally, we start prefetching where we stopped
	* prefetching last (zs_pf_blkid). But when we get our first
	* hit on this stream, zs_pf_blkid == zs_blkid, we don't
	* want to prefetch the block we just accessed. In this case,
	* start just after the block we just accessed.
	*/
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

	/*
	* Double our amount of prefetched data, but don't let the
	* prefetch get further ahead than zfetch_max_distance.
	*/
	if (fetch_data) {
	max_dist_blks =
	zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
	/*
	* Previously, we were (zs_pf_blkid - blkid) ahead. We
	* want to now be double that, so read that amount again,
	* plus the amount we are catching up by (i.e. the amount
	* read just now).
	*/
	pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
	max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
	pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
	pf_nblks = 0;
	}

	zs->zs_pf_blkid = pf_start + pf_nblks;

	/*
	* Do the same for indirects, starting from where we stopped last,
	* or where we will stop reading data blocks (and the indirects
	* that point to them).
	*/
	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
	/*
	* We want to double our distance ahead of the data prefetch
	* (or reader, if we are not prefetching data). Previously, we
	* were (zs_ipf_blkid - blkid) ahead. To double that, we read
	* that amount again, plus the amount we are catching up by
	* (i.e. the amount read now + the amount of data prefetched now).
	*/
	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
	ipf_nblks = MIN(pf_ahead_blks, max_blks);
	zs->zs_ipf_blkid = ipf_start + ipf_nblks;

	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;

	zs->zs_atime = gethrtime();
	zs->zs_blkid = end_of_access_blkid;
	mutex_exit(&zs->zs_lock);
	rw_exit(&zf->zf_rwlock);

	/*
	* dbuf_prefetch() is asynchronous (even when it needs to read
	* indirect blocks), but we still prefer to drop our locks before
	* calling it to reduce the time we hold them.
	*/

	for (int i = 0; i < pf_nblks; i++) {
	dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
	ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
	}
	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
	dbuf_prefetch(zf->zf_dnode, 1, iblk,
	ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
	}
	ZFETCHSTAT_BUMP(zfetchstat_hits);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 332525)
	@@ -1,2004 +1,2003 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/dbuf.h>
	#include <sys/dnode.h>
	#include <sys/dmu.h>
	#include <sys/dmu_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_dataset.h>
	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu_zfetch.h>
	#include <sys/range_tree.h>

	/* kmem cache that every in-core dnode_t is allocated from. */
	static kmem_cache_t *dnode_cache;
	/*
	* Define DNODE_STATS to turn on statistic gathering. By default, it is only
	* turned on when DEBUG is also defined.
	*/
	#ifdef DEBUG
	#define DNODE_STATS
	#endif /* DEBUG */

	/* Bump a statistics counter; compiles to nothing without DNODE_STATS. */
	#ifdef DNODE_STATS
	#define DNODE_STAT_ADD(stat) ((stat)++)
	#else
	#define DNODE_STAT_ADD(stat) /* nothing */
	#endif /* DNODE_STATS */

	/*
	* Zero-initialized on-disk dnode; dnode_allocate() compares against it to
	* assert that the dnode being allocated is still entirely zero.
	*/
	static dnode_phys_t dnode_phys_zero;

	/* Defaults applied by dnode_allocate() when the caller passes 0. */
	int zfs_default_bs = SPA_MINBLOCKSHIFT;
	int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
	    &zfs_default_bs, 0, "Default dnode block shift");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
	    &zfs_default_ibs, 0, "Default dnode indirect block shift");

	#ifdef illumos
	/* kmem move callback: (old buffer, new buffer, size, private arg). */
	static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
	#endif

	/*
	 * AVL comparator for a dnode's dn_dbufs tree of dmu_buf_impl_t.
	 *
	 * Ordering keys, most significant first: db_level, db_blkid, then the
	 * DB_SEARCH state (a DB_SEARCH entry sorts before any other entry with
	 * the same level and blkid), and finally the object address so that two
	 * distinct dbufs never compare equal.
	 *
	 * Returns -1, 0 or 1.
	 */
	static int
	dbuf_compare(const void *x1, const void *x2)
	{
		const dmu_buf_impl_t *d1 = x1;
		const dmu_buf_impl_t *d2 = x2;

		if (d1->db_level < d2->db_level) {
			return (-1);
		}
		if (d1->db_level > d2->db_level) {
			return (1);
		}

		if (d1->db_blkid < d2->db_blkid) {
			return (-1);
		}
		if (d1->db_blkid > d2->db_blkid) {
			return (1);
		}

		/* At most one of the two operands may be a DB_SEARCH entry. */
		if (d1->db_state == DB_SEARCH) {
			ASSERT3S(d2->db_state, !=, DB_SEARCH);
			return (-1);
		} else if (d2->db_state == DB_SEARCH) {
			ASSERT3S(d1->db_state, !=, DB_SEARCH);
			return (1);
		}

		/* Final tie-break on address. */
		if ((uintptr_t)d1 < (uintptr_t)d2) {
			return (-1);
		}
		if ((uintptr_t)d1 > (uintptr_t)d2) {
			return (1);
		}
		return (0);
	}

	/*
	 * kmem cache constructor for dnode_t (registered in dnode_init()).
	 *
	 * Initializes the locks, condition variable, refcounts, list links and
	 * the dbuf AVL tree, and puts every field that dnode_dest() asserts on
	 * into its zero/NULL state. dn_objset is left invalidated.
	 *
	 * arg is the dnode_t buffer; unused and kmflag are ignored.
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	dnode_cons(void *arg, void *unused, int kmflag)
	{
		dnode_t *dn = arg;
		int i;

		rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
		mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);

		/*
		 * Every dbuf has a reference, and dropping a tracked reference is
		 * O(number of references), so don't track dn_holds.
		 */
		refcount_create_untracked(&dn->dn_holds);
		refcount_create(&dn->dn_tx_holds);
		list_link_init(&dn->dn_link);

		bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
		bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
		bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
		bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
		bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
		bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
		bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));

		/* Per-txg state: one slot for each of the TXG_SIZE open txgs. */
		for (i = 0; i < TXG_SIZE; i++) {
			list_link_init(&dn->dn_dirty_link[i]);
			dn->dn_free_ranges[i] = NULL;
			list_create(&dn->dn_dirty_records[i],
			    sizeof (dbuf_dirty_record_t),
			    offsetof(dbuf_dirty_record_t, dr_dirty_node));
		}

		dn->dn_allocated_txg = 0;
		dn->dn_free_txg = 0;
		dn->dn_assigned_txg = 0;
		dn->dn_dirtyctx = 0;
		dn->dn_dirtyctx_firstset = NULL;
		dn->dn_bonus = NULL;
		dn->dn_have_spill = B_FALSE;
		dn->dn_zio = NULL;
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		dn->dn_olduid = 0;
		dn->dn_oldgid = 0;
		dn->dn_newuid = 0;
		dn->dn_newgid = 0;
		dn->dn_id_flags = 0;

		dn->dn_dbufs_count = 0;
		avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
		    offsetof(dmu_buf_impl_t, db_link));

		dn->dn_moved = 0;
		/* dnode_create() assigns a valid dn_objset once the dnode is ready. */
		POINTER_INVALIDATE(&dn->dn_objset);
		return (0);
	}

	/* ARGSUSED */
	static void
	dnode_dest(void arg, void unused)
	{
	int i;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	refcount_destroy(&dn->dn_holds);
	refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (i = 0; i < TXG_SIZE; i++) {
	ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
	ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
	list_destroy(&dn->dn_dirty_records[i]);
	ASSERT0(dn->dn_next_nblkptr[i]);
	ASSERT0(dn->dn_next_nlevels[i]);
	ASSERT0(dn->dn_next_indblkshift[i]);
	ASSERT0(dn->dn_next_bonustype[i]);
	ASSERT0(dn->dn_rm_spillblk[i]);
	ASSERT0(dn->dn_next_bonuslen[i]);
	ASSERT0(dn->dn_next_blksz[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirtyctx);
	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
	ASSERT3P(dn->dn_bonus, ==, NULL);
	ASSERT(!dn->dn_have_spill);
	ASSERT3P(dn->dn_zio, ==, NULL);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
	}

	/*
	 * Set up the global dnode_t kmem cache, wiring in dnode_cons/dnode_dest as
	 * constructor and destructor and dnode_move as the move callback.
	 * Must not be called while the cache already exists.
	 */
	void
	dnode_init(void)
	{
		ASSERT(dnode_cache == NULL);

		dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0,
		    dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
		kmem_cache_set_move(dnode_cache, dnode_move);
	}

	/*
	 * Tear down the global dnode_t kmem cache and forget the handle so that
	 * dnode_init() may be called again.
	 */
	void
	dnode_fini(void)
	{
		kmem_cache_destroy(dnode_cache);
		dnode_cache = NULL;
	}


	#ifdef ZFS_DEBUG
	/*
	 * Debug-only consistency check of an in-core dnode.
	 *
	 * A few cheap assertions always run; the full set runs only when
	 * ZFS_DEBUG_DNODE_VERIFY is set in zfs_flags. dn_struct_rwlock is taken
	 * as reader for the duration unless the caller already holds it as
	 * writer.
	 */
	void
	dnode_verify(dnode_t *dn)
	{
		int drop_struct_lock = FALSE;

		ASSERT(dn->dn_phys);
		ASSERT(dn->dn_objset);
		ASSERT(dn->dn_handle->dnh_dnode == dn);

		ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

		if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
			return;

		if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			drop_struct_lock = TRUE;
		}
		/* Geometry checks apply to allocated (or being-allocated) dnodes. */
		if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
			int i;
			ASSERT3U(dn->dn_indblkshift, >=, 0);
			ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
			if (dn->dn_datablkshift) {
				ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
				ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
				ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
			}
			ASSERT3U(dn->dn_nlevels, <=, 30);
			ASSERT(DMU_OT_IS_VALID(dn->dn_type));
			ASSERT3U(dn->dn_nblkptr, >=, 1);
			ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
			ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
			ASSERT3U(dn->dn_datablksz, ==,
			    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
			/* dn_datablkshift is non-zero exactly for power-of-2 sizes. */
			ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
			ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
			    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
			for (i = 0; i < TXG_SIZE; i++) {
				ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
			}
		}
		if (dn->dn_phys->dn_type != DMU_OT_NONE)
			ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
		ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
		if (dn->dn_dbuf != NULL) {
			/* dn_phys must point at this object's slot in its dbuf. */
			ASSERT3P(dn->dn_phys, ==,
			    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
			    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}
	#endif

	/*
	 * Byte-swap one on-disk dnode in place.
	 *
	 * An unallocated dnode (dn_type == DMU_OT_NONE) is simply zeroed.
	 * Otherwise the multi-byte header fields, the block pointer array, the
	 * bonus area (using the byteswap function registered for its bonus type)
	 * and, when flagged, the spill block pointer are swapped.
	 */
	void
	dnode_byteswap(dnode_phys_t *dnp)
	{
		/* View the block pointer array as raw 64-bit words. */
		uint64_t *buf64 = (void *)&dnp->dn_blkptr;
		int i;

		if (dnp->dn_type == DMU_OT_NONE) {
			bzero(dnp, sizeof (dnode_phys_t));
			return;
		}

		dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
		dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
		dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
		dnp->dn_used = BSWAP_64(dnp->dn_used);

		/*
		 * dn_nblkptr is only one byte, so it's OK to read it in either
		 * byte order. We can't read dn_bonuslen.
		 */
		ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
		ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
		for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
			buf64[i] = BSWAP_64(buf64[i]);

		/*
		 * OK to check dn_bonuslen for zero, because it won't matter if
		 * we have the wrong byte order. This is necessary because the
		 * dnode dnode is smaller than a regular dnode.
		 */
		if (dnp->dn_bonuslen != 0) {
			/*
			 * Note that the bonus length calculated here may be
			 * longer than the actual bonus buffer. This is because
			 * we always put the bonus buffer after the last block
			 * pointer (instead of packing it against the end of the
			 * dnode buffer).
			 */
			int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
			size_t len = DN_MAX_BONUSLEN - off;
			ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(dnp->dn_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
		}

		/* Swap SPILL block if we have one */
		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
			byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));

	}

	/*
	 * Byte-swap a buffer holding an array of on-disk dnodes.
	 * size is the buffer length in bytes and must be a multiple of
	 * sizeof (dnode_phys_t).
	 */
	void
	dnode_buf_byteswap(void *vbuf, size_t size)
	{
		dnode_phys_t *dnp = vbuf;
		size_t ndnodes;
		size_t idx;

		ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
		ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);

		/* Convert the byte count into a dnode count. */
		ndnodes = size >> DNODE_SHIFT;
		for (idx = 0; idx < ndnodes; idx++)
			dnode_byteswap(&dnp[idx]);
	}

	/*
	 * Change the bonus buffer length of a held dnode in transaction tx.
	 *
	 * The dnode is dirtied and the new length recorded for tx's txg slot.
	 * A new length of 0 is recorded as DN_ZERO_BONUSLEN rather than 0,
	 * presumably so the slot can be told apart from "no change" (an unset
	 * slot is also 0) -- confirm against the sync code.
	 */
	void
	dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
	{
		ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);

		dnode_setdirty(dn, tx);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		/* The bonus area shares the dnode with the block pointer array. */
		ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
		dn->dn_bonuslen = newsize;
		if (newsize == 0)
			dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
		else
			dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
		rw_exit(&dn->dn_struct_rwlock);
	}

	/*
	 * Change the bonus buffer type of a held dnode in transaction tx.
	 * The dnode is dirtied and the new type recorded for tx's txg slot.
	 */
	void
	dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
	{
		ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
		dnode_setdirty(dn, tx);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		dn->dn_bonustype = newtype;
		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
		rw_exit(&dn->dn_struct_rwlock);
	}

	/*
	 * Mark the dnode's spill block for removal in transaction tx.
	 * The caller must hold the dnode and hold dn_struct_rwlock as writer.
	 */
	void
	dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
	{
		ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
		ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
		dnode_setdirty(dn, tx);
		dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
		dn->dn_have_spill = B_FALSE;
	}

	/*
	 * Record a new data block size (in bytes) in the in-core dnode and derive
	 * the sector count and shift from it. size must be a multiple of
	 * SPA_MINBLOCKSIZE within [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].
	 */
	static void
	dnode_setdblksz(dnode_t *dn, int size)
	{
		ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
		ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
		ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
		/* The sector count must fit the on-disk dn_datablkszsec field. */
		ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
		    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));

		dn->dn_datablksz = size;
		dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;

		/* A shift only exists for power-of-two sizes; 0 otherwise. */
		if (ISP2(size))
			dn->dn_datablkshift = highbit64(size - 1);
		else
			dn->dn_datablkshift = 0;
	}

	/*
	 * Allocate an in-core dnode for on-disk dnode dnp and install it in
	 * handle dnh.
	 *
	 *   os      objset the dnode belongs to
	 *   dnp     on-disk dnode the in-core fields are copied from
	 *   db      dbuf containing dnp (dnode_special_open() passes NULL)
	 *   object  object number
	 *   dnh     handle that will point at the new dnode
	 *
	 * Returns the dnode referenced by dnh: normally the new one, but if
	 * dnh->dnh_dnode was already set by the time os_lock was taken, the new
	 * allocation is freed and the existing dnode is returned instead.
	 */
	static dnode_t *
	dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
	    uint64_t object, dnode_handle_t *dnh)
	{
		dnode_t *dn;

		dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
		ASSERT(!POINTER_IS_VALID(dn->dn_objset));
		dn->dn_moved = 0;

		/*
		 * Defer setting dn_objset until the dnode is ready to be a candidate
		 * for the dnode_move() callback.
		 */
		dn->dn_object = object;
		dn->dn_dbuf = db;
		dn->dn_handle = dnh;
		dn->dn_phys = dnp;

		if (dnp->dn_datablkszsec) {
			dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		} else {
			dn->dn_datablksz = 0;
			dn->dn_datablkszsec = 0;
			dn->dn_datablkshift = 0;
		}
		dn->dn_indblkshift = dnp->dn_indblkshift;
		dn->dn_nlevels = dnp->dn_nlevels;
		dn->dn_type = dnp->dn_type;
		dn->dn_nblkptr = dnp->dn_nblkptr;
		dn->dn_checksum = dnp->dn_checksum;
		dn->dn_compress = dnp->dn_compress;
		dn->dn_bonustype = dnp->dn_bonustype;
		dn->dn_bonuslen = dnp->dn_bonuslen;
		dn->dn_maxblkid = dnp->dn_maxblkid;
		dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
		dn->dn_id_flags = 0;

		dmu_zfetch_init(&dn->dn_zfetch, dn);

		ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

		mutex_enter(&os->os_lock);
		if (dnh->dnh_dnode != NULL) {
			/* Lost the allocation race. */
			mutex_exit(&os->os_lock);
			kmem_cache_free(dnode_cache, dn);
			return (dnh->dnh_dnode);
		}

		/*
		 * Exclude special dnodes from os_dnodes so an empty os_dnodes
		 * signifies that the special dnodes have no references from
		 * their children (the entries in os_dnodes). This allows
		 * dnode_destroy() to easily determine if the last child has
		 * been removed and then complete eviction of the objset.
		 */
		if (!DMU_OBJECT_IS_SPECIAL(object))
			list_insert_head(&os->os_dnodes, dn);
		membar_producer();

		/*
		 * Everything else must be valid before assigning dn_objset
		 * makes the dnode eligible for dnode_move().
		 */
		dn->dn_objset = os;

		dnh->dnh_dnode = dn;
		mutex_exit(&os->os_lock);

		arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
		return (dn);
	}

	/*
	* Free an in-core dnode and return it to dnode_cache.
	*
	* Caller must be holding the dnode handle, which is released upon return.
	*
	* If this was the last non-special dnode on os->os_dnodes and the objset
	* is linked for eviction, dmu_objset_evict_done() is called at the end.
	*/
	static void
	dnode_destroy(dnode_t *dn)
	{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

	/*
	* Invalidating dn_objset under os_lock makes dnode_move() treat this
	* dnode as unknown from here on.
	*/
	mutex_enter(&os->os_lock);
	POINTER_INVALIDATE(&dn->dn_objset);
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
	list_remove(&os->os_dnodes, dn);
	complete_os_eviction =
	list_is_empty(&os->os_dnodes) &&
	list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	zrl_remove(&dn->dn_handle->dnh_zrlock);

	/*
	* Reset the fields below to the values dnode_dest() asserts on when
	* the buffer is eventually destroyed by the cache.
	*/
	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;

	dn->dn_dirtyctx = 0;
	if (dn->dn_dirtyctx_firstset != NULL) {
	kmem_free(dn->dn_dirtyctx_firstset, 1);
	dn->dn_dirtyctx_firstset = NULL;
	}
	if (dn->dn_bonus != NULL) {
	/* NOTE(review): db_mtx is not released here; presumably dbuf_destroy() drops it -- confirm. */
	mutex_enter(&dn->dn_bonus->db_mtx);
	dbuf_destroy(dn->dn_bonus);
	dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_id_flags = 0;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	/* Balances the arc_space_consume() done in dnode_create(). */
	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);

	if (complete_os_eviction)
	dmu_objset_evict_done(os);
	}

	/*
	 * Turn a free (all-zero) dnode into an allocated object in transaction tx.
	 *
	 *   ot         object type; must not be DMU_OT_NONE
	 *   blocksize  data block size in bytes; 0 selects 1 << zfs_default_bs,
	 *              any other value is rounded up to a multiple of
	 *              SPA_MINBLOCKSIZE
	 *   ibs        indirect block shift; 0 selects zfs_default_ibs; the
	 *              result is clamped to
	 *              [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT]
	 *   bonustype  bonus buffer type
	 *   bonuslen   bonus buffer length; must be non-zero unless bonustype is
	 *              DMU_OT_NONE or DMU_OT_SA
	 *
	 * The dnode is dirtied and the new geometry recorded for tx's txg slot.
	 */
	void
	dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
	    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
	{
		int i;

		ASSERT3U(blocksize, <=,
		    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
		if (blocksize == 0)
			blocksize = 1 << zfs_default_bs;
		else
			blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);

		if (ibs == 0)
			ibs = zfs_default_ibs;

		ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);

		dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
		    dn->dn_object, tx->tx_txg, blocksize, ibs);

		/* The dnode must be completely unused, in core and on disk. */
		ASSERT(dn->dn_type == DMU_OT_NONE);
		ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
		ASSERT(ot != DMU_OT_NONE);
		ASSERT(DMU_OT_IS_VALID(ot));
		ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
		    (bonustype == DMU_OT_SA && bonuslen == 0) ||
		    (bonustype != DMU_OT_NONE && bonuslen != 0));
		ASSERT(DMU_OT_IS_VALID(bonustype));
		ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
		ASSERT(dn->dn_type == DMU_OT_NONE);
		ASSERT0(dn->dn_maxblkid);
		ASSERT0(dn->dn_allocated_txg);
		ASSERT0(dn->dn_assigned_txg);
		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
		ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
		ASSERT(avl_is_empty(&dn->dn_dbufs));

		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT0(dn->dn_next_nblkptr[i]);
			ASSERT0(dn->dn_next_nlevels[i]);
			ASSERT0(dn->dn_next_indblkshift[i]);
			ASSERT0(dn->dn_next_bonuslen[i]);
			ASSERT0(dn->dn_next_bonustype[i]);
			ASSERT0(dn->dn_rm_spillblk[i]);
			ASSERT0(dn->dn_next_blksz[i]);
			ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
			ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
			ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
		}

		dn->dn_type = ot;
		dnode_setdblksz(dn, blocksize);
		dn->dn_indblkshift = ibs;
		dn->dn_nlevels = 1;
		/*
		 * Block pointers and the bonus buffer share the same space, so
		 * whatever the bonus buffer does not need goes to block pointers.
		 */
		if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
			dn->dn_nblkptr = 1;
		else
			dn->dn_nblkptr = 1 +
			    ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
		dn->dn_bonustype = bonustype;
		dn->dn_bonuslen = bonuslen;
		dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
		dn->dn_compress = ZIO_COMPRESS_INHERIT;
		dn->dn_dirtyctx = 0;

		dn->dn_free_txg = 0;
		if (dn->dn_dirtyctx_firstset) {
			kmem_free(dn->dn_dirtyctx_firstset, 1);
			dn->dn_dirtyctx_firstset = NULL;
		}

		dn->dn_allocated_txg = tx->tx_txg;
		dn->dn_id_flags = 0;

		dnode_setdirty(dn, tx);
		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
	}

	/*
	 * Re-purpose an existing dnode in transaction tx: give it a new object
	 * type, data block size and bonus type/length.
	 *
	 *   blocksize  new data block size in bytes; must be a multiple of
	 *              SPA_MINBLOCKSIZE within the pool's limits. Changing it
	 *              requires the object to hold at most one block that is a
	 *              hole or has been freed.
	 *   bonuslen   must be non-zero unless bonustype is DMU_OT_NONE or
	 *              DMU_OT_SA
	 *
	 * Unreferenced dbufs are evicted first, and any existing spill block is
	 * scheduled for removal.
	 */
	void
	dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
	    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
	{
		int nblkptr;

		ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
		ASSERT3U(blocksize, <=,
		    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
		ASSERT0(blocksize % SPA_MINBLOCKSIZE);
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
		ASSERT(tx->tx_txg != 0);
		ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
		    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
		    (bonustype == DMU_OT_SA && bonuslen == 0));
		ASSERT(DMU_OT_IS_VALID(bonustype));
		ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);

		/* clean up any unreferenced dbufs */
		dnode_evict_dbufs(dn);

		dn->dn_id_flags = 0;

		/* Record each changed attribute in tx's txg slot. */
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		dnode_setdirty(dn, tx);
		if (dn->dn_datablksz != blocksize) {
			/* change blocksize */
			ASSERT(dn->dn_maxblkid == 0 &&
			    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
			    dnode_block_freed(dn, 0)));
			dnode_setdblksz(dn, blocksize);
			dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
		}
		if (dn->dn_bonuslen != bonuslen)
			dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;

		if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
			nblkptr = 1;
		else
			nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
		if (dn->dn_bonustype != bonustype)
			dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
		if (dn->dn_nblkptr != nblkptr)
			dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
			dbuf_rm_spill(dn, tx);
			dnode_rm_spill(dn, tx);
		}
		rw_exit(&dn->dn_struct_rwlock);

		/* change type */
		dn->dn_type = ot;

		/* change bonus size and type */
		mutex_enter(&dn->dn_mtx);
		dn->dn_bonustype = bonustype;
		dn->dn_bonuslen = bonuslen;
		dn->dn_nblkptr = nblkptr;
		dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
		dn->dn_compress = ZIO_COMPRESS_INHERIT;
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);

		/* fix up the bonus db_size */
		if (dn->dn_bonus) {
			dn->dn_bonus->db.db_size =
			    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
			ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
		}

		dn->dn_allocated_txg = tx->tx_txg;
		mutex_exit(&dn->dn_mtx);
	}

	#ifdef DNODE_STATS
	/*
	* Counters bumped by dnode_move() each time it declines to move a dnode,
	* one per reason. Only present when DNODE_STATS is defined.
	*/
	static struct {
	/* dn_objset pointer was not valid on entry */
	uint64_t dms_dnode_invalid;
	/* dn_objset changed before the global os_lock was acquired */
	uint64_t dms_dnode_recheck1;
	/* dn_objset changed before os->os_lock was acquired */
	uint64_t dms_dnode_recheck2;
	/* dnode is a special object (DMU_OBJECT_IS_SPECIAL) */
	uint64_t dms_dnode_special;
	/* could not lock the dnode handle (zrl_tryenter failed) */
	uint64_t dms_dnode_handle;
	/* could not take dn_struct_rwlock as writer */
	uint64_t dms_dnode_rwlock;
	/* dnode has more holds than dbufs, i.e. is actively referenced */
	uint64_t dms_dnode_active;
	} dnode_move_stats;
	#endif /* DNODE_STATS */

	/*
	 * Transfer the complete state of dnode odn into the freshly constructed
	 * dnode ndn, then reset odn so that it passes the assertions in
	 * dnode_dest().
	 *
	 * None of odn's locks may be held. On return ndn is referenced by the
	 * dnode handle, ndn->dn_moved is 1, and odn has an invalidated dn_objset
	 * and dn_moved == (uint8_t)-1.
	 */
	static void
	dnode_move_impl(dnode_t *odn, dnode_t *ndn)
	{
		int i;

		ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
		ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
		ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
		ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));

		/* Copy fields. */
		ndn->dn_objset = odn->dn_objset;
		ndn->dn_object = odn->dn_object;
		ndn->dn_dbuf = odn->dn_dbuf;
		ndn->dn_handle = odn->dn_handle;
		ndn->dn_phys = odn->dn_phys;
		ndn->dn_type = odn->dn_type;
		ndn->dn_bonuslen = odn->dn_bonuslen;
		ndn->dn_bonustype = odn->dn_bonustype;
		ndn->dn_nblkptr = odn->dn_nblkptr;
		ndn->dn_checksum = odn->dn_checksum;
		ndn->dn_compress = odn->dn_compress;
		ndn->dn_nlevels = odn->dn_nlevels;
		ndn->dn_indblkshift = odn->dn_indblkshift;
		ndn->dn_datablkshift = odn->dn_datablkshift;
		ndn->dn_datablkszsec = odn->dn_datablkszsec;
		ndn->dn_datablksz = odn->dn_datablksz;
		ndn->dn_maxblkid = odn->dn_maxblkid;
		bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
		    sizeof (odn->dn_next_nblkptr));
		bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
		    sizeof (odn->dn_next_nlevels));
		bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
		    sizeof (odn->dn_next_indblkshift));
		bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
		    sizeof (odn->dn_next_bonustype));
		bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
		    sizeof (odn->dn_rm_spillblk));
		bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
		    sizeof (odn->dn_next_bonuslen));
		bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
		    sizeof (odn->dn_next_blksz));
		for (i = 0; i < TXG_SIZE; i++) {
			list_move_tail(&ndn->dn_dirty_records[i],
			    &odn->dn_dirty_records[i]);
		}
		bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
		    sizeof (odn->dn_free_ranges));
		ndn->dn_allocated_txg = odn->dn_allocated_txg;
		ndn->dn_free_txg = odn->dn_free_txg;
		ndn->dn_assigned_txg = odn->dn_assigned_txg;
		ndn->dn_dirtyctx = odn->dn_dirtyctx;
		ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
		ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
		refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
		ASSERT(avl_is_empty(&ndn->dn_dbufs));
		avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
		ndn->dn_dbufs_count = odn->dn_dbufs_count;
		ndn->dn_bonus = odn->dn_bonus;
		ndn->dn_have_spill = odn->dn_have_spill;
		ndn->dn_zio = odn->dn_zio;
		ndn->dn_oldused = odn->dn_oldused;
		ndn->dn_oldflags = odn->dn_oldflags;
		ndn->dn_olduid = odn->dn_olduid;
		ndn->dn_oldgid = odn->dn_oldgid;
		ndn->dn_newuid = odn->dn_newuid;
		ndn->dn_newgid = odn->dn_newgid;
		ndn->dn_id_flags = odn->dn_id_flags;
		dmu_zfetch_init(&ndn->dn_zfetch, NULL);
		list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
		ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;

		/*
		 * Update back pointers. Updating the handle fixes the back pointer of
		 * every descendant dbuf as well as the bonus dbuf.
		 */
		ASSERT(ndn->dn_handle->dnh_dnode == odn);
		ndn->dn_handle->dnh_dnode = ndn;
		if (ndn->dn_zfetch.zf_dnode == odn) {
			ndn->dn_zfetch.zf_dnode = ndn;
		}

		/*
		 * Invalidate the original dnode by clearing all of its back pointers.
		 */
		odn->dn_dbuf = NULL;
		odn->dn_handle = NULL;
		avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
		    offsetof(dmu_buf_impl_t, db_link));
		odn->dn_dbufs_count = 0;
		odn->dn_bonus = NULL;
		odn->dn_zfetch.zf_dnode = NULL;

		/*
		 * Set the low bit of the objset pointer to ensure that dnode_move()
		 * recognizes the dnode as invalid in any subsequent callback.
		 */
		POINTER_INVALIDATE(&odn->dn_objset);

		/*
		 * Satisfy the destructor.
		 */
		for (i = 0; i < TXG_SIZE; i++) {
			list_create(&odn->dn_dirty_records[i],
			    sizeof (dbuf_dirty_record_t),
			    offsetof(dbuf_dirty_record_t, dr_dirty_node));
			odn->dn_free_ranges[i] = NULL;
			/*
			 * dnode_dest() asserts dn_next_nblkptr is zero as well, so
			 * clear it together with the other per-txg fields.
			 */
			odn->dn_next_nblkptr[i] = 0;
			odn->dn_next_nlevels[i] = 0;
			odn->dn_next_indblkshift[i] = 0;
			odn->dn_next_bonustype[i] = 0;
			odn->dn_rm_spillblk[i] = 0;
			odn->dn_next_bonuslen[i] = 0;
			odn->dn_next_blksz[i] = 0;
		}
		odn->dn_allocated_txg = 0;
		odn->dn_free_txg = 0;
		odn->dn_assigned_txg = 0;
		odn->dn_dirtyctx = 0;
		odn->dn_dirtyctx_firstset = NULL;
		odn->dn_have_spill = B_FALSE;
		odn->dn_zio = NULL;
		odn->dn_oldused = 0;
		odn->dn_oldflags = 0;
		odn->dn_olduid = 0;
		odn->dn_oldgid = 0;
		odn->dn_newuid = 0;
		odn->dn_newgid = 0;
		odn->dn_id_flags = 0;

		/*
		 * Mark the dnode.
		 */
		ndn->dn_moved = 1;
		odn->dn_moved = (uint8_t)-1;
	}

	#ifdef illumos
	#ifdef _KERNEL
	/*
	 * kmem move callback for dnode_cache: try to relocate the dnode in buf
	 * to the already-constructed buffer newbuf.
	 *
	 * Returns
	 *   KMEM_CBRC_DONT_KNOW  the buffer does not hold a known, valid dnode
	 *   KMEM_CBRC_NO         the dnode is a special object and never moves
	 *   KMEM_CBRC_LATER      a needed lock was busy or the dnode is actively
	 *                        held right now
	 *   KMEM_CBRC_YES        the dnode was moved to newbuf
	 *
	 * size and arg are unused.
	 */
	/*ARGSUSED*/
	static kmem_cbrc_t
	dnode_move(void *buf, void *newbuf, size_t size, void *arg)
	{
		dnode_t *odn = buf, *ndn = newbuf;
		objset_t *os;
		int64_t refcount;
		uint32_t dbufs;

		/*
		 * The dnode is on the objset's list of known dnodes if the objset
		 * pointer is valid. We set the low bit of the objset pointer when
		 * freeing the dnode to invalidate it, and the memory patterns written
		 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
		 * A newly created dnode sets the objset pointer last of all to indicate
		 * that the dnode is known and in a valid state to be moved by this
		 * function.
		 */
		os = odn->dn_objset;
		if (!POINTER_IS_VALID(os)) {
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
			return (KMEM_CBRC_DONT_KNOW);
		}

		/*
		 * Ensure that the objset does not go away during the move.
		 */
		rw_enter(&os_lock, RW_WRITER);
		if (os != odn->dn_objset) {
			rw_exit(&os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
			return (KMEM_CBRC_DONT_KNOW);
		}

		/*
		 * If the dnode is still valid, then so is the objset. We know that no
		 * valid objset can be freed while we hold os_lock, so we can safely
		 * ensure that the objset remains in use.
		 */
		mutex_enter(&os->os_lock);

		/*
		 * Recheck the objset pointer in case the dnode was removed just before
		 * acquiring the lock.
		 */
		if (os != odn->dn_objset) {
			mutex_exit(&os->os_lock);
			rw_exit(&os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
			return (KMEM_CBRC_DONT_KNOW);
		}

		/*
		 * At this point we know that as long as we hold os->os_lock, the dnode
		 * cannot be freed and fields within the dnode can be safely accessed.
		 * The objset listing this dnode cannot go away as long as this dnode is
		 * on its list.
		 */
		rw_exit(&os_lock);
		if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
			mutex_exit(&os->os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
			return (KMEM_CBRC_NO);
		}
		ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */

		/*
		 * Lock the dnode handle to prevent the dnode from obtaining any new
		 * holds. This also prevents the descendant dbufs and the bonus dbuf
		 * from accessing the dnode, so that we can discount their holds. The
		 * handle is safe to access because we know that while the dnode cannot
		 * go away, neither can its handle. Once we hold dnh_zrlock, we can
		 * safely move any dnode referenced only by dbufs.
		 */
		if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
			mutex_exit(&os->os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
			return (KMEM_CBRC_LATER);
		}

		/*
		 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
		 * We need to guarantee that there is a hold for every dbuf in order to
		 * determine whether the dnode is actively referenced. Falsely matching
		 * a dbuf to an active hold would lead to an unsafe move. It's possible
		 * that a thread already having an active dnode hold is about to add a
		 * dbuf, and we can't compare hold and dbuf counts while the add is in
		 * progress.
		 */
		if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
			zrl_exit(&odn->dn_handle->dnh_zrlock);
			mutex_exit(&os->os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
			return (KMEM_CBRC_LATER);
		}

		/*
		 * A dbuf may be removed (evicted) without an active dnode hold. In that
		 * case, the dbuf count is decremented under the handle lock before the
		 * dbuf's hold is released. This order ensures that if we count the hold
		 * after the dbuf is removed but before its hold is released, we will
		 * treat the unmatched hold as active and exit safely. If we count the
		 * hold before the dbuf is removed, the hold is discounted, and the
		 * removal is blocked until the move completes.
		 */
		refcount = refcount_count(&odn->dn_holds);
		ASSERT(refcount >= 0);
		dbufs = odn->dn_dbufs_count;

		/* We can't have more dbufs than dnode holds. */
		ASSERT3U(dbufs, <=, refcount);
		DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
		    uint32_t, dbufs);

		if (refcount > dbufs) {
			rw_exit(&odn->dn_struct_rwlock);
			zrl_exit(&odn->dn_handle->dnh_zrlock);
			mutex_exit(&os->os_lock);
			DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
			return (KMEM_CBRC_LATER);
		}

		rw_exit(&odn->dn_struct_rwlock);

		/*
		 * At this point we know that anyone with a hold on the dnode is not
		 * actively referencing it. The dnode is known and in a valid state to
		 * move. We're holding the locks needed to execute the critical section.
		 */
		dnode_move_impl(odn, ndn);

		list_link_replace(&odn->dn_link, &ndn->dn_link);
		/* If the dnode was safe to move, the refcount cannot have changed. */
		ASSERT(refcount == refcount_count(&ndn->dn_holds));
		ASSERT(dbufs == ndn->dn_dbufs_count);
		zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
		mutex_exit(&os->os_lock);

		return (KMEM_CBRC_YES);
	}
	#endif /* _KERNEL */
	#endif /* illumos */

	/*
	 * Destroy the dnode attached to the handle 'dnh', tear down the handle's
	 * zrlock and clear the handle.  Polls (via delay()) until every hold on
	 * the dnode has been dropped before destroying it.
	 */
	void
	dnode_special_close(dnode_handle_t *dnh)
	{
		dnode_t *dn = dnh->dnh_dnode;

		/*
		 * Wait for final references to the dnode to clear.  This can
		 * only happen if the arc is asynchronously evicting state that
		 * has a hold on this dnode while we are trying to evict this
		 * dnode.
		 */
		while (refcount_count(&dn->dn_holds) > 0)
			delay(1);
		ASSERT(dn->dn_dbuf == NULL ||
		    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
		zrl_add(&dnh->dnh_zrlock);
		dnode_destroy(dn); /* implicit zrl_remove() */
		zrl_destroy(&dnh->dnh_zrlock);
		dnh->dnh_dnode = NULL;
	}

	/*
	 * Create the in-core dnode for 'object' from the on-disk dnode 'dnp',
	 * attach it to the caller-supplied handle 'dnh' and initialize the
	 * handle's zrlock.  The dnode is created without a containing dbuf
	 * (NULL is passed to dnode_create()).
	 */
	void
	dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
	    dnode_handle_t *dnh)
	{
		dnode_t *dn;

		dn = dnode_create(os, dnp, NULL, object, dnh);
		zrl_init(&dnh->dnh_zrlock);
		DNODE_VERIFY(dn);
	}

	/*
	 * Eviction callback for a dbuf of dnodes: 'dbu' is the dnode_children_t
	 * hung off that dbuf.  Destroys every instantiated child dnode and every
	 * handle lock, then frees the children array itself.
	 */
	static void
	dnode_buf_evict_async(void *dbu)
	{
		dnode_children_t *dnc = dbu;
		int slot = 0;

		while (slot < dnc->dnc_count) {
			dnode_handle_t *handle = &dnc->dnc_children[slot++];

			/*
			 * The dnode handle lock guards against the dnode moving
			 * to another valid address, so there is no need here to
			 * guard against changes to or from NULL.
			 */
			if (handle->dnh_dnode != NULL) {
				dnode_t *victim;

				zrl_add(&handle->dnh_zrlock);
				victim = handle->dnh_dnode;

				/*
				 * A held dnode implies a held containing dbuf,
				 * which would not have been eligible for
				 * eviction; so no holds can remain here.
				 */
				ASSERT(refcount_is_zero(&victim->dn_holds));
				ASSERT(refcount_is_zero(&victim->dn_tx_holds));

				dnode_destroy(victim); /* implicit zrl_remove() */
				zrl_destroy(&handle->dnh_zrlock);
				handle->dnh_dnode = NULL;
			} else {
				/* Never instantiated: only the lock to tear down. */
				zrl_destroy(&handle->dnh_zrlock);
			}
		}

		kmem_free(dnc, sizeof (dnode_children_t) +
		    dnc->dnc_count * sizeof (dnode_handle_t));
	}

	/*
	 * Find the dnode for 'object' in 'os' (instantiating its in-core state
	 * if it does not exist yet) and add a hold on it on behalf of 'tag'.
	 * On success 0 is returned and the held dnode is stored in *dnp.
	 *
	 * 'flag' may contain DNODE_MUST_BE_ALLOCATED and/or DNODE_MUST_BE_FREE
	 * to restrict which dnodes are acceptable.
	 *
	 * errors:
	 * EINVAL - invalid object number.
	 * ENOENT - the dnode was rejected (it is being freed, or it does not
	 *	    satisfy 'flag') and its type is DMU_OT_NONE; also returned
	 *	    when the requested user/group "used" dnode does not exist.
	 * EEXIST - the dnode was rejected and its type is not DMU_OT_NONE.
	 * EIO - i/o error.
	 * Any other error from dbuf_read() is passed through unchanged.
	 * succeeds even for free dnodes.
	 */
	int
	dnode_hold_impl(objset_t *os, uint64_t object, int flag,
	    void *tag, dnode_t **dnp)
	{
		int epb, idx, err;
		int drop_struct_lock = FALSE;
		int type;
		uint64_t blk;
		dnode_t *mdn, *dn;
		dmu_buf_impl_t *db;
		dnode_children_t *children_dnodes;
		dnode_handle_t *dnh;

		/*
		 * If you are holding the spa config lock as writer, you shouldn't
		 * be asking the DMU to do anything unless it's the root pool
		 * which may require us to read from the root filesystem while
		 * holding some (not all) of the locks as writer.
		 */
		ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
		    (spa_is_root(os->os_spa) &&
		    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

		/* The user/group "used" objects live directly in the objset. */
		if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
			dn = (object == DMU_USERUSED_OBJECT) ?
			    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
			if (dn == NULL)
				return (SET_ERROR(ENOENT));
			type = dn->dn_type;
			if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
				return (SET_ERROR(ENOENT));
			if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
				return (SET_ERROR(EEXIST));
			DNODE_VERIFY(dn);
			(void) refcount_add(&dn->dn_holds, tag);
			*dnp = dn;
			return (0);
		}

		if (object == 0 || object >= DN_MAX_OBJECT)
			return (SET_ERROR(EINVAL));

		mdn = DMU_META_DNODE(os);
		ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

		DNODE_VERIFY(mdn);

		/* Take the meta dnode's struct lock unless we already own it. */
		if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
			rw_enter(&mdn->dn_struct_rwlock, RW_READER);
			drop_struct_lock = TRUE;
		}

		blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));

		db = dbuf_hold(mdn, blk, FTAG);
		if (drop_struct_lock)
			rw_exit(&mdn->dn_struct_rwlock);
		if (db == NULL)
			return (SET_ERROR(EIO));
		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
		if (err) {
			dbuf_rele(db, FTAG);
			return (err);
		}

		/* epb: number of dnodes in this block; idx: our slot within it. */
		ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
		epb = db->db.db_size >> DNODE_SHIFT;

		idx = object & (epb-1);

		ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
		children_dnodes = dmu_buf_get_user(&db->db);
		if (children_dnodes == NULL) {
			int i;
			dnode_children_t *winner;
			children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t), KM_SLEEP);
			children_dnodes->dnc_count = epb;
			dnh = &children_dnodes->dnc_children[0];
			for (i = 0; i < epb; i++) {
				zrl_init(&dnh[i].dnh_zrlock);
			}
			dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
			    dnode_buf_evict_async, NULL);
			winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
			if (winner != NULL) {
				/*
				 * Someone else attached a children array first;
				 * discard ours and use theirs.
				 */
				for (i = 0; i < epb; i++) {
					zrl_destroy(&dnh[i].dnh_zrlock);
				}

				kmem_free(children_dnodes, sizeof (dnode_children_t) +
				    epb * sizeof (dnode_handle_t));
				children_dnodes = winner;
			}
		}
		ASSERT(children_dnodes->dnc_count == epb);

		dnh = &children_dnodes->dnc_children[idx];
		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		if (dn == NULL) {
			dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;

			dn = dnode_create(os, phys, db, object, dnh);
		}

		mutex_enter(&dn->dn_mtx);
		type = dn->dn_type;
		if (dn->dn_free_txg ||
		    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
		    ((flag & DNODE_MUST_BE_FREE) &&
		    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
			mutex_exit(&dn->dn_mtx);
			zrl_remove(&dnh->dnh_zrlock);
			dbuf_rele(db, FTAG);
			/* SET_ERROR for consistency with the other error returns. */
			return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
		}
		/* The first hold on the dnode also holds its containing dbuf. */
		if (refcount_add(&dn->dn_holds, tag) == 1)
			dbuf_add_ref(db, dnh);
		mutex_exit(&dn->dn_mtx);

		/* Now we can rely on the hold to prevent the dnode from moving. */
		zrl_remove(&dnh->dnh_zrlock);

		DNODE_VERIFY(dn);
		ASSERT3P(dn->dn_dbuf, ==, db);
		ASSERT3U(dn->dn_object, ==, object);
		dbuf_rele(db, FTAG);

		*dnp = dn;
		return (0);
	}

	/*
	 * Hold the dnode for 'object', which must be allocated.  Returns 0 with
	 * the held dnode stored in *dnp, or the error from dnode_hold_impl()
	 * called with DNODE_MUST_BE_ALLOCATED.
	 */
	int
	dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
	{
		return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
	}

	/*
	 * Can only add a reference if there is already at least one
	 * reference on the dnode.  Returns FALSE if unable to add a
	 * new reference, TRUE once the hold for 'tag' has been added.
	 * The check and the add are done together under dn_mtx.
	 */
	boolean_t
	dnode_add_ref(dnode_t *dn, void *tag)
	{
		mutex_enter(&dn->dn_mtx);
		if (refcount_is_zero(&dn->dn_holds)) {
			mutex_exit(&dn->dn_mtx);
			return (FALSE);
		}
		/* There was at least one hold, so the new count must exceed 1. */
		VERIFY(1 < refcount_add(&dn->dn_holds, tag));
		mutex_exit(&dn->dn_mtx);
		return (TRUE);
	}

	/*
	 * Drop the hold that 'tag' has on 'dn'.  Takes dn_mtx and hands off to
	 * dnode_rele_and_unlock(), which releases the mutex.
	 */
	void
	dnode_rele(dnode_t *dn, void *tag)
	{
		mutex_enter(&dn->dn_mtx);
		dnode_rele_and_unlock(dn, tag);
	}

	/*
	 * Drop the hold that 'tag' has on 'dn' and release dn_mtx, which the
	 * caller must already hold.  When the last hold goes away and the dnode
	 * has a containing dbuf, the dnode's hold on that dbuf is released too.
	 */
	void
	dnode_rele_and_unlock(dnode_t *dn, void *tag)
	{
		uint64_t refs;
		/* Get while the hold prevents the dnode from moving. */
		dmu_buf_impl_t *db = dn->dn_dbuf;
		dnode_handle_t *dnh = dn->dn_handle;

		refs = refcount_remove(&dn->dn_holds, tag);
		mutex_exit(&dn->dn_mtx);

		/*
		 * It's unsafe to release the last hold on a dnode by dnode_rele() or
		 * indirectly by dbuf_rele() while relying on the dnode handle to
		 * prevent the dnode from moving, since releasing the last hold could
		 * result in the dnode's parent dbuf evicting its dnode handles.  For
		 * that reason anyone calling dnode_rele() or dbuf_rele() without some
		 * other direct or indirect hold on the dnode must first drop the dnode
		 * handle.
		 */
		ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);

		/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
		if (refs == 0 && db != NULL) {
			/*
			 * Another thread could add a hold to the dnode handle in
			 * dnode_hold_impl() while holding the parent dbuf.  Since the
			 * hold on the parent dbuf prevents the handle from being
			 * destroyed, the hold on the handle is OK.  We can't yet assert
			 * that the handle has zero references, but that will be
			 * asserted anyway when the handle gets destroyed.
			 */
			dbuf_rele(db, dnh);
		}
	}

	/*
	 * Mark 'dn' dirty in the txg of 'tx'.
	 *
	 * For special objects only the dataset is dirtied.  Otherwise the dnode
	 * is inserted on the objset's dirty-dnode multilist for this txg (a
	 * no-op if it is already linked there), a "dirty hold" tagged with the
	 * txg is added, and the dnode's containing dbuf and the dataset are
	 * dirtied.
	 */
	void
	dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
	{
		objset_t *os = dn->dn_objset;
		uint64_t txg = tx->tx_txg;

		if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
			dsl_dataset_dirty(os->os_dsl_dataset, tx);
			return;
		}

		DNODE_VERIFY(dn);

	#ifdef ZFS_DEBUG
		mutex_enter(&dn->dn_mtx);
		ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
		ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
		mutex_exit(&dn->dn_mtx);
	#endif

		/*
		 * Determine old uid/gid when necessary
		 */
		dmu_objset_userquota_get_ids(dn, B_TRUE, tx);

		multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
		multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);

		/*
		 * If we are already marked dirty, we're done.
		 */
		if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
			multilist_sublist_unlock(mls);
			return;
		}

		ASSERT(!refcount_is_zero(&dn->dn_holds) ||
		    !avl_is_empty(&dn->dn_dbufs));
		ASSERT(dn->dn_datablksz != 0);
		ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
		ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
		ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);

		dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
		    dn->dn_object, txg);

		multilist_sublist_insert_head(mls, dn);

		multilist_sublist_unlock(mls);

		/*
		 * The dnode maintains a hold on its containing dbuf as
		 * long as there are holds on it.  Each instantiated child
		 * dbuf maintains a hold on the dnode.  When the last child
		 * drops its hold, the dnode will drop its hold on the
		 * containing dbuf.  We add a "dirty hold" here so that the
		 * dnode will hang around after we finish processing its
		 * children.
		 */
		VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));

		(void) dbuf_dirty(dn->dn_dbuf, tx);

		dsl_dataset_dirty(os->os_dsl_dataset, tx);
	}

	/*
	 * Mark 'dn' as freed in the txg of 'tx' by recording that txg in
	 * dn_free_txg, then dirty the dnode.  Does nothing if the dnode has no
	 * type (DMU_OT_NONE) or already has dn_free_txg set.
	 */
	void
	dnode_free(dnode_t *dn, dmu_tx_t *tx)
	{
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
			mutex_exit(&dn->dn_mtx);
			return;
		}
		dn->dn_free_txg = tx->tx_txg;
		mutex_exit(&dn->dn_mtx);

		dnode_setdirty(dn, tx);
	}

	/*
	 * Try to change the block size for the indicated dnode.  This can only
	 * succeed if there are no blocks allocated or dirty beyond first block.
	 *
	 * 'size' is rounded up to a multiple of SPA_MINBLOCKSIZE (0 selects
	 * SPA_MINBLOCKSIZE).  'ibs' is the requested indirect block shift; an
	 * 'ibs' equal to the current shift is treated as "no change".
	 *
	 * Returns 0 on success (including when nothing needs to change) and
	 * ENOTSUP when the change cannot be made.
	 */
	int
	dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db;
		int err;

		ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
		if (size == 0)
			size = SPA_MINBLOCKSIZE;
		else
			size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);

		if (ibs == dn->dn_indblkshift)
			ibs = 0;

		if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
			return (0);

		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

		/* Check for any allocated blocks beyond the first */
		if (dn->dn_maxblkid != 0)
			goto fail;

		/* ... and for any in-core data dbuf other than block 0. */
		mutex_enter(&dn->dn_dbufs_mtx);
		for (db = avl_first(&dn->dn_dbufs); db != NULL;
		    db = AVL_NEXT(&dn->dn_dbufs, db)) {
			if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
			    db->db_blkid != DMU_SPILL_BLKID) {
				mutex_exit(&dn->dn_dbufs_mtx);
				goto fail;
			}
		}
		mutex_exit(&dn->dn_dbufs_mtx);

		if (ibs && dn->dn_nlevels != 1)
			goto fail;

		/* resize the old block */
		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
		if (err == 0)
			dbuf_new_size(db, size, tx);
		else if (err != ENOENT)
			goto fail;

		dnode_setdblksz(dn, size);
		dnode_setdirty(dn, tx);
		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
		if (ibs) {
			dn->dn_indblkshift = ibs;
			dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
		}
		/* rele after we have fixed the blocksize in the dnode */
		if (db)
			dbuf_rele(db, FTAG);

		rw_exit(&dn->dn_struct_rwlock);
		return (0);

	fail:
		rw_exit(&dn->dn_struct_rwlock);
		return (SET_ERROR(ENOTSUP));
	}

	/*
	 * Record that block 'blkid' of 'dn' is coming into existence in 'tx':
	 * raise dn_maxblkid and, if the current number of levels cannot address
	 * that block, grow dn_nlevels and move this txg's dirty records under
	 * the newly dirtied top-level indirect.
	 *
	 * The caller holds dn_struct_rwlock, as reader when 'have_read' is set
	 * and as writer otherwise.  Read-holding callers must not rely on the
	 * lock being continuously held: it may be dropped and re-taken while
	 * upgrading, and is downgraded to reader again before returning.
	 */
	void
	dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
	{
		uint64_t txgoff = tx->tx_txg & TXG_MASK;
		int epbs, new_nlevels;
		uint64_t sz;

		ASSERT(blkid != DMU_BONUS_BLKID);

		ASSERT(have_read ?
		    RW_READ_HELD(&dn->dn_struct_rwlock) :
		    RW_WRITE_HELD(&dn->dn_struct_rwlock));

		/*
		 * if we have a read-lock, check to see if we need to do any work
		 * before upgrading to a write-lock.
		 */
		if (have_read) {
			if (blkid <= dn->dn_maxblkid)
				return;

			if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
				rw_exit(&dn->dn_struct_rwlock);
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			}
		}

		/* Re-check: the lock may have been dropped while upgrading. */
		if (blkid <= dn->dn_maxblkid)
			goto out;

		dn->dn_maxblkid = blkid;

		/*
		 * Compute the number of levels necessary to support the new maxblkid.
		 */
		new_nlevels = 1;
		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
		/* The "sz >= dn_nblkptr" test stops the loop if sz wraps. */
		for (sz = dn->dn_nblkptr;
		    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
			new_nlevels++;

		if (new_nlevels > dn->dn_nlevels) {
			int old_nlevels = dn->dn_nlevels;
			dmu_buf_impl_t *db;
			list_t *list;
			dbuf_dirty_record_t *new, *dr, *dr_next;

			dn->dn_nlevels = new_nlevels;

			ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
			dn->dn_next_nlevels[txgoff] = new_nlevels;

			/* dirty the left indirects */
			db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
			ASSERT(db != NULL);
			new = dbuf_dirty(db, tx);
			dbuf_rele(db, FTAG);

			/* transfer the dirty records to the new indirect */
			mutex_enter(&dn->dn_mtx);
			mutex_enter(&new->dt.di.dr_mtx);
			list = &dn->dn_dirty_records[txgoff];
			for (dr = list_head(list); dr; dr = dr_next) {
				/* Fetch the successor first: dr may be unlinked. */
				dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
				if (dr->dr_dbuf->db_level != new_nlevels-1 &&
				    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
				    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
					ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
					list_remove(&dn->dn_dirty_records[txgoff], dr);
					list_insert_tail(&new->dt.di.dr_children, dr);
					dr->dr_parent = new;
				}
			}
			mutex_exit(&new->dt.di.dr_mtx);
			mutex_exit(&dn->dn_mtx);
		}

	out:
		if (have_read)
			rw_downgrade(&dn->dn_struct_rwlock);
	}

	/*
	 * Dirty the level-1 indirect block 'l1blkid' of 'dn' in 'tx'.  Does
	 * nothing if dbuf_hold_level() returns no dbuf for that block.
	 */
	static void
	dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
		if (db != NULL) {
			dmu_buf_will_dirty(&db->db, tx);
			dbuf_rele(db, FTAG);
		}
	}

	/*
	* Free the byte range starting at off and spanning len bytes of the
	* dnode's data, in the transaction tx. A len of DMU_OBJECT_END frees
	* from off through the end of the object. Partial blocks at the head
	* and tail of the range are zeroed in place (only when they are dirty
	* or exist on disk); the whole blocks in between are recorded in
	* dn_free_ranges for this txg, passed to dbuf_free_range(), and the
	* dnode is dirtied. dn_struct_rwlock is held as writer for the
	* duration, except around the dmu_buf_will_dirty() calls.
	*
	* NOTE(review): the parameter list reads "dnode_t dn ... dmu_tx_t tx"
	* although both are dereferenced with "->" below; the pointer
	* declarators appear to have been lost when this text was extracted --
	* confirm against the original file.
	*/
	void
	dnode_free_range(dnode_t dn, uint64_t off, uint64_t len, dmu_tx_t tx)
	{
	dmu_buf_impl_t *db;
	uint64_t blkoff, blkid, nblks;
	int blksz, blkshift, head, tail;
	int trunc = FALSE;
	int epbs;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	blksz = dn->dn_datablksz;
	blkshift = dn->dn_datablkshift;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	/* DMU_OBJECT_END: free everything from off onward (a truncate). */
	if (len == DMU_OBJECT_END) {
	len = UINT64_MAX - off;
	trunc = TRUE;
	}

	/*
	* First, block align the region to free:
	*/
	if (ISP2(blksz)) {
	head = P2NPHASE(off, blksz);
	blkoff = P2PHASE(off, blksz);
	if ((off >> blkshift) > dn->dn_maxblkid)
	goto out;
	} else {
	ASSERT(dn->dn_maxblkid == 0);
	if (off == 0 && len >= blksz) {
	/*
	* Freeing the whole block; fast-track this request.
	* Note that we won't dirty any indirect blocks,
	* which is fine because we will be freeing the entire
	* file and thus all indirect blocks will be freed
	* by free_children().
	*/
	blkid = 0;
	nblks = 1;
	goto done;
	} else if (off >= blksz) {
	/* Freeing past end-of-data */
	goto out;
	} else {
	/* Freeing part of the block. */
	head = blksz - off;
	ASSERT3U(head, >, 0);
	}
	blkoff = off;
	}
	/* zero out any partial block data at the start of the range */
	if (head) {
	ASSERT3U(blkoff + head, ==, blksz);
	if (len < head)
	head = len;
	if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
	TRUE, FALSE, FTAG, &db) == 0) {
	caddr_t data;

	/* don't dirty if it isn't on disk and isn't dirty */
	if (db->db_last_dirty \|\|
	(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
	/* the struct lock is dropped while dirtying the dbuf */
	rw_exit(&dn->dn_struct_rwlock);
	dmu_buf_will_dirty(&db->db, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	data = db->db.db_data;
	bzero(data + blkoff, head);
	}
	dbuf_rele(db, FTAG);
	}
	off += head;
	len -= head;
	}

	/* If the range was less than one block, we're done */
	if (len == 0)
	goto out;

	/* If the remaining range is past end of file, we're done */
	if ((off >> blkshift) > dn->dn_maxblkid)
	goto out;

	ASSERT(ISP2(blksz));
	if (trunc)
	tail = 0;
	else
	tail = P2PHASE(len, blksz);

	ASSERT0(P2PHASE(off, blksz));
	/* zero out any partial block data at the end of the range */
	if (tail) {
	if (len < tail)
	tail = len;
	if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
	TRUE, FALSE, FTAG, &db) == 0) {
	/* don't dirty if not on disk and not dirty */
	if (db->db_last_dirty \|\|
	(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
	/* the struct lock is dropped while dirtying the dbuf */
	rw_exit(&dn->dn_struct_rwlock);
	dmu_buf_will_dirty(&db->db, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	bzero(db->db.db_data, tail);
	}
	dbuf_rele(db, FTAG);
	}
	len -= tail;
	}

	/* If the range did not include a full block, we are done */
	if (len == 0)
	goto out;

	/* What remains is whole blocks: convert bytes to block ids/counts. */
	ASSERT(IS_P2ALIGNED(off, blksz));
	ASSERT(trunc \|\| IS_P2ALIGNED(len, blksz));
	blkid = off >> blkshift;
	nblks = len >> blkshift;
	if (trunc)
	nblks += 1;

	/*
	* Dirty all the indirect blocks in this range. Note that only
	* the first and last indirect blocks can actually be written
	* (if they were partially freed) -- they must be dirtied, even if
	* they do not exist on disk yet. The interior blocks will
	* be freed by free_children(), so they will not actually be written.
	* Even though these interior blocks will not be written, we
	* dirty them for two reasons:
	*
	* - It ensures that the indirect blocks remain in memory until
	* syncing context. (They have already been prefetched by
	* dmu_tx_hold_free(), so we don't have to worry about reading
	* them serially here.)
	*
	* - The dirty space accounting will put pressure on the txg sync
	* mechanism to begin syncing, and to delay transactions if there
	* is a large amount of freeing. Even though these indirect
	* blocks will not be written, we could need to write the same
	* amount of space if we copy the freed BPs into deadlists.
	*/
	if (dn->dn_nlevels > 1) {
	uint64_t first, last;

	/* first/last: level-1 block ids covering the ends of the range */
	first = blkid >> epbs;
	dnode_dirty_l1(dn, first, tx);
	if (trunc)
	last = dn->dn_maxblkid >> epbs;
	else
	last = (blkid + nblks - 1) >> epbs;
	if (last != first)
	dnode_dirty_l1(dn, last, tx);

	int shift = dn->dn_datablkshift + dn->dn_indblkshift -
	SPA_BLKPTRSHIFT;
	for (uint64_t i = first + 1; i < last; i++) {
	/*
	* Set i to the blockid of the next non-hole
	* level-1 indirect block at or after i. Note
	* that dnode_next_offset() operates in terms of
	* level-0-equivalent bytes.
	*/
	uint64_t ibyte = i << shift;
	int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
	&ibyte, 2, 1, 0);
	i = ibyte >> shift;
	if (i >= last)
	break;

	/*
	* Normally we should not see an error, either
	* from dnode_next_offset() or dbuf_hold_level()
	* (except for ESRCH from dnode_next_offset).
	* If there is an i/o error, then when we read
	* this block in syncing context, it will use
	* ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
	* to the "failmode" property. dnode_next_offset()
	* doesn't have a flag to indicate MUSTSUCCEED.
	*/
	if (err != 0)
	break;

	dnode_dirty_l1(dn, i, tx);
	}
	}

	done:
	/*
	* Add this range to the dnode range list.
	* We will finish up this free operation in the syncing phase.
	*/
	mutex_enter(&dn->dn_mtx);
	int txgoff = tx->tx_txg & TXG_MASK;
	if (dn->dn_free_ranges[txgoff] == NULL) {
	- dn->dn_free_ranges[txgoff] =
	- range_tree_create(NULL, NULL, &dn->dn_mtx);
	+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
	}
	/* clear first so that adding the range cannot overlap an existing one */
	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	blkid, nblks, tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
	out:

	rw_exit(&dn->dn_struct_rwlock);
	}

	/*
	 * Report whether the dnode's spill block is marked DN_KILL_SPILLBLK in
	 * any of the TXG_SIZE per-txg slots.  The scan is done under dn_mtx.
	 */
	static boolean_t
	dnode_spill_freed(dnode_t *dn)
	{
		int slot = 0;

		mutex_enter(&dn->dn_mtx);
		/* Advance to the first slot carrying the kill marker, if any. */
		while (slot < TXG_SIZE &&
		    dn->dn_rm_spillblk[slot] != DN_KILL_SPILLBLK)
			slot++;
		mutex_exit(&dn->dn_mtx);

		return (slot < TXG_SIZE);
	}

	/*
	 * Return TRUE if block 'blkid' of 'dn' was freed in a recent txg, FALSE
	 * otherwise.  The bonus buffer is never reported as freed; the spill
	 * block is answered by dnode_spill_freed(); any other block is looked
	 * up in the per-txg dn_free_ranges trees.
	 */
	uint64_t
	dnode_block_freed(dnode_t *dn, uint64_t blkid)
	{
		void *pool = spa_get_dsl(dn->dn_objset->os_spa);
		int slot = 0;

		/*
		 * While the pool is still being opened the dsl pool is not set
		 * yet, but there shouldn't be anything dirty either.
		 */
		if (blkid == DMU_BONUS_BLKID || pool == NULL)
			return (FALSE);

		/* The whole dnode is being freed. */
		if (dn->dn_free_txg)
			return (TRUE);

		if (blkid == DMU_SPILL_BLKID)
			return (dnode_spill_freed(dn));

		mutex_enter(&dn->dn_mtx);
		/* Stop at the first txg slot whose free-range tree has blkid. */
		while (slot < TXG_SIZE &&
		    (dn->dn_free_ranges[slot] == NULL ||
		    !range_tree_contains(dn->dn_free_ranges[slot], blkid, 1)))
			slot++;
		mutex_exit(&dn->dn_mtx);

		return (slot < TXG_SIZE);
	}

	/*
	 * Call from syncing context when we actually write/free space for this
	 * dnode.  Adds 'delta' bytes (which may be negative) to the space
	 * recorded in the on-disk dnode.  Pools older than
	 * SPA_VERSION_DNODE_BYTES store dn_used in units of 1<<DEV_BSHIFT
	 * bytes; newer pools store bytes and set DNODE_FLAG_USED_BYTES.
	 */
	void
	dnode_diduse_space(dnode_t *dn, int64_t delta)
	{
		uint64_t space;
		dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
		    dn, dn->dn_phys,
		    (u_longlong_t)dn->dn_phys->dn_used,
		    (longlong_t)delta);

		mutex_enter(&dn->dn_mtx);
		space = DN_USED_BYTES(dn->dn_phys);
		if (delta > 0) {
			ASSERT3U(space + delta, >=, space); /* no overflow */
		} else {
			ASSERT3U(space, >=, -delta); /* no underflow */
		}
		space += delta;
		if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
			ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
			ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
			dn->dn_phys->dn_used = space >> DEV_BSHIFT;
		} else {
			dn->dn_phys->dn_used = space;
			dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
		}
		mutex_exit(&dn->dn_mtx);
	}

	/*
	* Scans a block at the indicated "level" looking for a hole or data,
	* depending on 'flags'.
	*
	* If level > 0, then we are scanning an indirect block looking at its
	* pointers. If level == 0, then we are looking at a block of dnodes.
	*
	* If we don't find what we are looking for in the block, we return ESRCH.
	* Otherwise, return with *offset pointing to the beginning (if searching
	* forwards) or end (if searching backwards) of the range covered by the
	* block pointer we matched on (or dnode).
	*
	* The basic search algorithm used below by dnode_next_offset() is to
	* use this function to search up the block tree (widen the search) until
	* we find something (i.e., we don't return ESRCH) and then search back
	* down the tree (narrow the search) until we reach our original search
	* level.
	*/
	static int
	dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
	    int lvl, uint64_t blkfill, uint64_t txg)
	{
		dmu_buf_impl_t *db = NULL;
		void *data = NULL;
		uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
		uint64_t epb = 1ULL << epbs;
		uint64_t minfill, maxfill;
		boolean_t hole;
		int i, inc, error, span;

		dprintf("probing object %llu offset %llx level %d of %u\n",
		    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);

		hole = ((flags & DNODE_FIND_HOLE) != 0);
		inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
		ASSERT(txg == 0 || !hole);

		if (lvl == dn->dn_phys->dn_nlevels) {
			/* Top of the tree: scan the block pointers in the dnode. */
			error = 0;
			epb = dn->dn_phys->dn_nblkptr;
			data = dn->dn_phys->dn_blkptr;
		} else {
			uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
			error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
			if (error) {
				if (error != ENOENT)
					return (error);
				if (hole)
					return (0);
				/*
				 * This can only happen when we are searching up
				 * the block tree for data.  We don't really need to
				 * adjust the offset, as we will just end up looking
				 * at the pointer to this block in its parent, and it's
				 * going to be unallocated, so we will skip over it.
				 */
				return (SET_ERROR(ESRCH));
			}
			error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
			if (error) {
				dbuf_rele(db, FTAG);
				return (error);
			}
			data = db->db.db_data;
		}


		if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
		    db->db_blkptr->blk_birth <= txg ||
		    BP_IS_HOLE(db->db_blkptr))) {
			/*
			 * This can only happen when we are searching up the tree
			 * and these conditions mean that we need to keep climbing.
			 */
			error = SET_ERROR(ESRCH);
		} else if (lvl == 0) {
			/* Level 0: the block is an array of on-disk dnodes. */
			dnode_phys_t *dnp = data;
			span = DNODE_SHIFT;
			ASSERT(dn->dn_type == DMU_OT_DNODE);

			for (i = (*offset >> span) & (blkfill - 1);
			    i >= 0 && i < blkfill; i += inc) {
				if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
					break;
				*offset += (1ULL << span) * inc;
			}
			if (i < 0 || i == blkfill)
				error = SET_ERROR(ESRCH);
		} else {
			/* Indirect level: the block is an array of block pointers. */
			blkptr_t *bp = data;
			uint64_t start = *offset;
			span = (lvl - 1) * epbs + dn->dn_datablkshift;
			minfill = 0;
			maxfill = blkfill << ((lvl - 1) * epbs);

			/* A hole search needs a non-full bp; a data search a non-empty one. */
			if (hole)
				maxfill--;
			else
				minfill++;

			/* Work in units of one block pointer's span, then scale back. */
			*offset = *offset >> span;
			for (i = BF64_GET(*offset, 0, epbs);
			    i >= 0 && i < epb; i += inc) {
				if (BP_GET_FILL(&bp[i]) >= minfill &&
				    BP_GET_FILL(&bp[i]) <= maxfill &&
				    (hole || bp[i].blk_birth > txg))
					break;
				if (inc > 0 || *offset > 0)
					*offset += inc;
			}
			*offset = *offset << span;
			if (inc < 0) {
				/* traversing backwards; position offset at the end */
				ASSERT3U(*offset, <=, start);
				*offset = MIN(*offset + (1ULL << span) - 1, start);
			} else if (*offset < start) {
				*offset = start;
			}
			if (i < 0 || i >= epb)
				error = SET_ERROR(ESRCH);
		}

		if (db)
			dbuf_rele(db, FTAG);

		return (error);
	}

	/*
	* Find the next hole, data, or sparse region at or after *offset.
	* The value 'blkfill' tells us how many items we expect to find
	* in an L0 data block; this value is 1 for normal objects,
	* DNODES_PER_BLOCK for the meta dnode, and some fraction of
	* DNODES_PER_BLOCK when searching for sparse regions thereof.
	*
	* Examples:
	*
	* dnode_next_offset(dn, flags, offset, 1, 1, 0);
	* Finds the next/previous hole/data in a file.
	* Used in dmu_offset_next().
	*
	* dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
	* Finds the next free/allocated dnode in an objset's meta-dnode.
	* Only finds objects that have new contents since txg (ie.
	* bonus buffer changes and content removal are ignored).
	* Used in dmu_object_next().
	*
	* dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
	* Finds the next L2 meta-dnode bp that's at most 1/4 full.
	* Used in dmu_object_alloc().
	*/
	int
	dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
	    int minlvl, uint64_t blkfill, uint64_t txg)
	{
		uint64_t initial_offset = *offset;
		int lvl, maxlvl;
		int error = 0;

		/* Take the struct lock unless the caller says it already has it. */
		if (!(flags & DNODE_FIND_HAVELOCK))
			rw_enter(&dn->dn_struct_rwlock, RW_READER);

		if (dn->dn_phys->dn_nlevels == 0) {
			error = SET_ERROR(ESRCH);
			goto out;
		}

		/* dn_datablkshift == 0: handled without walking the tree. */
		if (dn->dn_datablkshift == 0) {
			if (*offset < dn->dn_datablksz) {
				if (flags & DNODE_FIND_HOLE)
					*offset = dn->dn_datablksz;
			} else {
				error = SET_ERROR(ESRCH);
			}
			goto out;
		}

		maxlvl = dn->dn_phys->dn_nlevels;

		/* Widen the search: climb until a level yields something. */
		for (lvl = minlvl; lvl <= maxlvl; lvl++) {
			error = dnode_next_offset_level(dn,
			    flags, offset, lvl, blkfill, txg);
			if (error != ESRCH)
				break;
		}

		/* Narrow the search: walk back down to the requested level. */
		while (error == 0 && --lvl >= minlvl) {
			error = dnode_next_offset_level(dn,
			    flags, offset, lvl, blkfill, txg);
		}

		/*
		 * There's always a "virtual hole" at the end of the object, even
		 * if all BP's which physically exist are non-holes.
		 */
		if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
		    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
			error = 0;
		}

		/* A result on the wrong side of the starting offset is no match. */
		if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
		    initial_offset < *offset : initial_offset > *offset))
			error = SET_ERROR(ESRCH);
	out:
		if (!(flags & DNODE_FIND_HAVELOCK))
			rw_exit(&dn->dn_struct_rwlock);

		return (error);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 332525)
	@@ -1,4057 +1,4252 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 RackTop Systems.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
	* Copyright 2017 Nexenta Systems, Inc.
	*/

	#include <sys/dmu_objset.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dmu_traverse.h>
	#include <sys/dmu_impl.h>
	#include <sys/dmu_send.h>
	#include <sys/dmu_tx.h>
	#include <sys/arc.h>
	#include <sys/zio.h>
	#include <sys/zap.h>
	#include <sys/zfeature.h>
	#include <sys/unique.h>
	#include <sys/zfs_context.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/spa.h>
	+#include <sys/vdev.h>
	#include <sys/zfs_znode.h>
	#include <sys/zfs_onexit.h>
	#include <sys/zvol.h>
	#include <sys/dsl_scan.h>
	#include <sys/dsl_deadlist.h>
	#include <sys/dsl_destroy.h>
	#include <sys/dsl_userhold.h>
	#include <sys/dsl_bookmark.h>
	#include <sys/dmu_send.h>
	#include <sys/zio_checksum.h>
	#include <sys/zio_compress.h>
	#include <zfs_fletcher.h>

	SYSCTL_DECL(_vfs_zfs);

	/*
	* The SPA supports block sizes up to 16MB. However, very large blocks
	* can have an impact on i/o latency (e.g. tying up a spinning disk for
	* ~300ms), and also potentially on the memory allocator. Therefore,
	* we do not allow the recordsize to be set larger than zfs_max_recordsize
	* (default 1MB). Larger blocks can be created by changing this tunable,
	* and pools with larger blocks can always be imported and used, regardless
	* of this setting.
	*/
	int zfs_max_recordsize = 1 * 1024 * 1024;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
	&zfs_max_recordsize, 0,
	"Maximum block size. Expect dragons when tuning this.");

	/*
	* Swap the values of the two uint64_t lvalues x and y.
	* NOTE(review): this expands to a bare { } block rather than
	* do { } while (0), so "if (c) SWITCH64(a, b); else ..." would not
	* parse -- use it only as a standalone statement.
	*/
	#define SWITCH64(x, y) \
	{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
	}

	#define DS_REF_MAX (1ULL << 62)

	/* Accessor for a dataset's phys structure; takes and returns pointers. */
	extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);

	+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
	+ uint64_t obj, dmu_tx_t *tx);
	+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
	+ dmu_tx_t *tx);
	+
	extern int spa_asize_inflation;

	static zil_header_t zero_zil;

	/*
	 * Work out how much of 'delta' should be propagated to the dsl_dir
	 * layer.  With no refreservation the whole delta is passed on.  With
	 * one, only the change in MAX(unique bytes, reservation) is, because
	 * the reserved space has already been partially accounted for in our
	 * ancestors.
	 */
	static int64_t
	parent_delta(dsl_dataset_t *ds, int64_t delta)
	{
		if (ds->ds_reserved != 0) {
			dsl_dataset_phys_t *phys = dsl_dataset_phys(ds);
			uint64_t before = MAX(phys->ds_unique_bytes, ds->ds_reserved);
			uint64_t after = MAX(phys->ds_unique_bytes + delta,
			    ds->ds_reserved);
			int64_t change = (int64_t)(after - before);

			/* The propagated amount can never exceed the raw delta. */
			ASSERT3U(ABS(change), <=, ABS(delta));
			return (change);
		}

		return (delta);
	}

	/*
	 * Account for the birth of block bp in dataset ds.  Must be called in
	 * syncing context.  Holes are ignored.  When ds is NULL the space is
	 * charged to the MOS via dsl_pool_mos_diduse_space() instead.
	 */
	void
	dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
	{
		int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
		int compressed = BP_GET_PSIZE(bp);
		int uncompressed = BP_GET_UCSIZE(bp);
		int64_t delta;

		dprintf_bp(bp, "ds=%p", ds);

		ASSERT(dmu_tx_is_syncing(tx));
		/* It could have been compressed away to nothing */
		if (BP_IS_HOLE(bp))
			return;
		ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
		ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
		if (ds == NULL) {
			dsl_pool_mos_diduse_space(tx->tx_pool,
			    used, compressed, uncompressed);
			return;
		}

		ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		mutex_enter(&ds->ds_lock);
		/* Compute the parent's share before ds_unique_bytes changes. */
		delta = parent_delta(ds, used);
		dsl_dataset_phys(ds)->ds_referenced_bytes += used;
		dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
		dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
		dsl_dataset_phys(ds)->ds_unique_bytes += used;

		if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
			ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
			    B_TRUE;
		}

		spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
		if (f != SPA_FEATURE_NONE)
			ds->ds_feature_activation_needed[f] = B_TRUE;

		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
		    compressed, uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
	}

	+/*
	+ * Called when the specified segment has been remapped, and is thus no
	+ * longer referenced in the head dataset. The vdev must be indirect.
	+ *
	+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
	+ * Otherwise, add this segment to the obsolete spacemap.
	+ *
	+ * "Referenced by a snapshot" is decided by comparing birth against
	+ * ds_prev_snap_txg: a newer segment is handed to
	+ * spa_vdev_indirect_mark_obsolete(); an older one is recorded in
	+ * ds_remap_deadlist as a fake block pointer that carries only the birth
	+ * txg and DVA[0] (vdev, offset, asize).  The remap deadlist is created on
	+ * demand under ds_remap_deadlist_lock.
	+ *
	+ * Must be called in syncing context on a non-snapshot dataset.
	+ *
	+ * NOTE(review): the ASSERT(ds != NULL) in the else branch runs after ds
	+ * has already been dereferenced, so it cannot catch a NULL ds.
	+ */
	+void
	+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
	+ uint64_t size, uint64_t birth, dmu_tx_t *tx)
	+{
	+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	+
	+ ASSERT(dmu_tx_is_syncing(tx));
	+ ASSERT(birth <= tx->tx_txg);
	+ ASSERT(!ds->ds_is_snapshot);
	+
	+ if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
	+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
	+ } else {
	+ blkptr_t fakebp;
	+ dva_t *dva = &fakebp.blk_dva[0];
	+
	+ ASSERT(ds != NULL);
	+
	+ mutex_enter(&ds->ds_remap_deadlist_lock);
	+ if (!dsl_dataset_remap_deadlist_exists(ds)) {
	+ dsl_dataset_create_remap_deadlist(ds, tx);
	+ }
	+ mutex_exit(&ds->ds_remap_deadlist_lock);
	+
	+ BP_ZERO(&fakebp);
	+ fakebp.blk_birth = birth;
	+ DVA_SET_VDEV(dva, vdev);
	+ DVA_SET_OFFSET(dva, offset);
	+ DVA_SET_ASIZE(dva, size);
	+
	+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
	+ }
	+}
	+
	/*
	 * Account for block bp no longer being referenced by dataset ds (or by
	 * the MOS when ds is NULL).  Returns the block's dsize, or 0 for a hole.
	 *
	 * A block born after the most recent snapshot is freed right away;
	 * otherwise it is placed on the dataset's deadlist.  When async is set
	 * the deadlist insertion is deferred through ds_pending_deadlist.
	 */
	int
	dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
	    boolean_t async)
	{
		int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
		int compressed = BP_GET_PSIZE(bp);
		int uncompressed = BP_GET_UCSIZE(bp);

		if (BP_IS_HOLE(bp))
			return (0);

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(bp->blk_birth <= tx->tx_txg);

		if (ds == NULL) {
			dsl_free(tx->tx_pool, tx->tx_txg, bp);
			dsl_pool_mos_diduse_space(tx->tx_pool,
			    -used, -compressed, -uncompressed);
			return (used);
		}
		ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

		ASSERT(!ds->ds_is_snapshot);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);

		if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
			int64_t delta;

			dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
			dsl_free(tx->tx_pool, tx->tx_txg, bp);

			mutex_enter(&ds->ds_lock);
			ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
			    !DS_UNIQUE_IS_ACCURATE(ds));
			delta = parent_delta(ds, -used);
			dsl_dataset_phys(ds)->ds_unique_bytes -= used;
			mutex_exit(&ds->ds_lock);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
			    delta, -compressed, -uncompressed, tx);
			dsl_dir_transfer_space(ds->ds_dir, -used - delta,
			    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
		} else {
			dprintf_bp(bp, "putting on dead list: %s", "");
			if (async) {
				/*
				 * We are here as part of zio's write done
				 * callback, which means we're a zio interrupt
				 * thread.  We can't call dsl_deadlist_insert()
				 * now because it may block waiting for I/O.
				 * Instead, put bp on the deferred queue and let
				 * dsl_pool_sync() finish the job.
				 */
				bplist_append(&ds->ds_pending_deadlist, bp);
			} else {
				dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
			}
			ASSERT3U(ds->ds_prev->ds_object, ==,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj);
			ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
			/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
			if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
			    ds->ds_object && bp->blk_birth >
			    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
				mutex_enter(&ds->ds_prev->ds_lock);
				dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
				mutex_exit(&ds->ds_prev->ds_lock);
			}
			if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
				dsl_dir_transfer_space(ds->ds_dir, used,
				    DD_USED_HEAD, DD_USED_SNAP, tx);
			}
		}
		mutex_enter(&ds->ds_lock);
		ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
		dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
		ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
		dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
		ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
		dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
		mutex_exit(&ds->ds_lock);

		return (used);
	}

	/*
	 * The fsid has to be released synchronously; otherwise a subsequent
	 * mount of the same dataset could fail to unique_insert() the fsid.
	 * That failure would show up as the dataset's fsid changing between
	 * mounts, which makes NFS clients quite unhappy.
	 */
	static void
	dsl_dataset_evict_sync(void *dbu)
	{
		dsl_dataset_t *dataset = dbu;

		ASSERT(dataset->ds_owner == NULL);
		unique_remove(dataset->ds_fsid_guid);
	}

	/*
	 * Asynchronous eviction callback: tears down the in-core dsl_dataset_t
	 * passed in as dbu and frees it.  The dataset must have no owner.
	 */
	static void
	dsl_dataset_evict_async(void *dbu)
	{
	dsl_dataset_t *ds = dbu;

	ASSERT(ds->ds_owner == NULL);

	ds->ds_dbuf = NULL;

	if (ds->ds_objset != NULL)
	dmu_objset_evict(ds->ds_objset);

	/* Drop the hold this dataset keeps on its previous snapshot. */
	if (ds->ds_prev) {
	dsl_dataset_rele(ds->ds_prev, ds);
	ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	/* Either deadlist may never have been opened; close only open ones. */
	- if (ds->ds_deadlist.dl_os != NULL)
	+ if (dsl_deadlist_is_open(&ds->ds_deadlist))
	dsl_deadlist_close(&ds->ds_deadlist);
	+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
	+ dsl_deadlist_close(&ds->ds_remap_deadlist);
	if (ds->ds_dir)
	dsl_dir_async_rele(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	list_destroy(&ds->ds_prop_cbs);
	/* Release each lock if this thread still owns it, then destroy it. */
	if (mutex_owned(&ds->ds_lock))
	mutex_exit(&ds->ds_lock);
	mutex_destroy(&ds->ds_lock);
	if (mutex_owned(&ds->ds_opening_lock))
	mutex_exit(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_sendstream_lock);
	+ mutex_destroy(&ds->ds_remap_deadlist_lock);
	refcount_destroy(&ds->ds_longholds);
	rrw_destroy(&ds->ds_bp_rwlock);

	kmem_free(ds, sizeof (dsl_dataset_t));
	}

	/*
	 * Fill in ds->ds_snapname if it is still empty and ds has a next
	 * snapshot object.  The name is found by searching the head dataset's
	 * snapshot-name ZAP for ds's object number.  Returns 0 on success (or
	 * when there is nothing to do), otherwise the error from the bonus
	 * hold or the ZAP search.
	 */
	int
	dsl_dataset_get_snapname(dsl_dataset_t *ds)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		dsl_dataset_phys_t *head_phys;
		dmu_buf_t *head_db;
		uint64_t head_obj;
		int error;

		if (ds->ds_snapname[0] ||
		    dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
			return (0);

		head_obj = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
		error = dmu_bonus_hold(mos, head_obj, FTAG, &head_db);
		if (error == 0) {
			head_phys = head_db->db_data;
			error = zap_value_search(mos,
			    head_phys->ds_snapnames_zapobj, ds->ds_object, 0,
			    ds->ds_snapname);
			dmu_buf_rele(head_db, FTAG);
		}
		return (error);
	}

	/*
	 * Look up snapshot "name" in ds's snapshot-name ZAP and store its
	 * object number in *value.  Datasets flagged DS_FLAG_CI_DATASET use a
	 * normalized match; if that returns ENOTSUP the lookup is retried
	 * without normalization.  Returns 0 or the ZAP error.
	 */
	int
	dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
		matchtype_t mt = 0;
		int err;

		if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
			mt = MT_NORMALIZE;

		err = zap_lookup_norm(mos, snapobj, name, 8, 1,
		    value, mt, NULL, 0, NULL);
		if (err == ENOTSUP && (mt & MT_NORMALIZE))
			err = zap_lookup(mos, snapobj, name, 8, 1, value);
		return (err);
	}

	/*
	 * Remove snapshot "name" from ds's snapshot-name ZAP.  Uses the same
	 * normalized-then-plain fallback as dsl_dataset_snap_lookup().  When
	 * the removal succeeds and adj_cnt is set, the directory's snapshot
	 * count is decremented.  Returns 0 or the ZAP error.
	 */
	int
	dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
	    boolean_t adj_cnt)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
		matchtype_t mt = 0;
		int err;

		dsl_dir_snap_cmtime_update(ds->ds_dir);

		if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
			mt = MT_NORMALIZE;

		err = zap_remove_norm(mos, snapobj, name, mt, tx);
		if (err == ENOTSUP && (mt & MT_NORMALIZE))
			err = zap_remove(mos, snapobj, name, tx);

		if (err == 0 && adj_cnt)
			dsl_fs_ss_count_adjust(ds->ds_dir, -1,
			    DD_FIELD_SNAPSHOT_COUNT, tx);

		return (err);
	}

	/*
	 * Try to take a hold (under tag) on ds's bonus dbuf.  Returns B_TRUE
	 * only if the hold was obtained and the dbuf's user is still ds;
	 * otherwise any hold that was taken is dropped and B_FALSE is returned.
	 */
	boolean_t
	dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
	{
		dmu_buf_t *dbuf = ds->ds_dbuf;
		boolean_t result = B_FALSE;

		if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
		    ds->ds_object, DMU_BONUS_BLKID, tag)) {

			if (ds == dmu_buf_get_user(dbuf))
				result = B_TRUE;
			else
				dmu_buf_rele(dbuf, tag);
		}

		return (result);
	}

	/*
	 * Hold the dataset whose MOS object number is dsobj and return it in
	 * *dsp.  The pool config lock must be held.  If no in-core
	 * dsl_dataset_t is attached to the bonus dbuf yet, one is constructed
	 * and installed with dmu_buf_set_user_ie(); if another thread wins that
	 * race, or construction fails, the local copy is torn down again.
	 * Returns 0 on success, EINVAL if dsobj is not a DSL dataset object, or
	 * the error from a failing step.
	 *
	 * The teardown path destroys every lock initialized above, including
	 * ds_remap_deadlist_lock and ds_bp_rwlock, to match
	 * dsl_dataset_evict_async().
	 */
	int
	dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
	dsl_dataset_t **dsp)
	{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err != 0)
	return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
	dmu_buf_rele(dbuf, tag);
	return (SET_ERROR(EINVAL));
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
	dsl_dataset_t *winner = NULL;

	ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
	ds->ds_dbuf = dbuf;
	ds->ds_object = dsobj;
	ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;

	+ err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
	+ NULL, ds, &ds->ds_dir);
	+ if (err != 0) {
	+ kmem_free(ds, sizeof (dsl_dataset_t));
	+ dmu_buf_rele(dbuf, tag);
	+ return (err);
	+ }
	+
	mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
	+ mutex_init(&ds->ds_remap_deadlist_lock,
	+ NULL, MUTEX_DEFAULT, NULL);
	rrw_init(&ds->ds_bp_rwlock, B_FALSE);
	refcount_create(&ds->ds_longholds);

	bplist_create(&ds->ds_pending_deadlist);
	- dsl_deadlist_open(&ds->ds_deadlist,
	- mos, dsl_dataset_phys(ds)->ds_deadlist_obj);

	list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
	offsetof(dmu_sendarg_t, dsa_link));

	list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
	offsetof(dsl_prop_cb_record_t, cbr_ds_node));

	if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
	if (!(spa_feature_table[f].fi_flags &
	ZFEATURE_FLAG_PER_DATASET))
	continue;
	err = zap_contains(mos, dsobj,
	spa_feature_table[f].fi_guid);
	if (err == 0) {
	ds->ds_feature_inuse[f] = B_TRUE;
	} else {
	ASSERT3U(err, ==, ENOENT);
	err = 0;
	}
	}
	}

	- err = dsl_dir_hold_obj(dp,
	- dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
	- if (err != 0) {
	- mutex_destroy(&ds->ds_lock);
	- mutex_destroy(&ds->ds_opening_lock);
	- mutex_destroy(&ds->ds_sendstream_lock);
	- refcount_destroy(&ds->ds_longholds);
	- bplist_destroy(&ds->ds_pending_deadlist);
	- dsl_deadlist_close(&ds->ds_deadlist);
	- kmem_free(ds, sizeof (dsl_dataset_t));
	- dmu_buf_rele(dbuf, tag);
	- return (err);
	- }
	-
	if (!ds->ds_is_snapshot) {
	ds->ds_snapname[0] = '\0';
	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
	err = dsl_dataset_hold_obj(dp,
	dsl_dataset_phys(ds)->ds_prev_snap_obj,
	ds, &ds->ds_prev);
	}
	if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
	int zaperr = zap_lookup(mos, ds->ds_object,
	DS_FIELD_BOOKMARK_NAMES,
	sizeof (ds->ds_bookmarks), 1,
	&ds->ds_bookmarks);
	if (zaperr != ENOENT)
	VERIFY0(zaperr);
	}
	} else {
	if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
	err = dsl_dataset_get_snapname(ds);
	if (err == 0 &&
	dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
	err = zap_count(
	ds->ds_dir->dd_pool->dp_meta_objset,
	dsl_dataset_phys(ds)->ds_userrefs_obj,
	&ds->ds_userrefs);
	}
	}

	if (err == 0 && !ds->ds_is_snapshot) {
	err = dsl_prop_get_int_ds(ds,
	zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	&ds->ds_reserved);
	if (err == 0) {
	err = dsl_prop_get_int_ds(ds,
	zfs_prop_to_name(ZFS_PROP_REFQUOTA),
	&ds->ds_quota);
	}
	} else {
	ds->ds_reserved = ds->ds_quota = 0;
	}

	+ dsl_deadlist_open(&ds->ds_deadlist,
	+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
	+ uint64_t remap_deadlist_obj =
	+ dsl_dataset_get_remap_deadlist_object(ds);
	+ if (remap_deadlist_obj != 0) {
	+ dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
	+ remap_deadlist_obj);
	+ }
	+
	dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
	dsl_dataset_evict_async, &ds->ds_dbuf);
	if (err == 0)
	winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);

	/* Construction failed or another thread installed its copy first. */
	if (err != 0 || winner != NULL) {
	bplist_destroy(&ds->ds_pending_deadlist);
	dsl_deadlist_close(&ds->ds_deadlist);
	+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
	+ dsl_deadlist_close(&ds->ds_remap_deadlist);
	if (ds->ds_prev)
	dsl_dataset_rele(ds->ds_prev, ds);
	dsl_dir_rele(ds->ds_dir, ds);
	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_sendstream_lock);
	+ mutex_destroy(&ds->ds_remap_deadlist_lock);
	refcount_destroy(&ds->ds_longholds);
	+ rrw_destroy(&ds->ds_bp_rwlock);
	kmem_free(ds, sizeof (dsl_dataset_t));
	if (err != 0) {
	dmu_buf_rele(dbuf, tag);
	return (err);
	}
	ds = winner;
	} else {
	ds->ds_fsid_guid =
	unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
	if (ds->ds_fsid_guid !=
	dsl_dataset_phys(ds)->ds_fsid_guid) {
	zfs_dbgmsg("ds_fsid_guid changed from "
	"%llx to %llx for pool %s dataset id %llu",
	(long long)
	dsl_dataset_phys(ds)->ds_fsid_guid,
	(long long)ds->ds_fsid_guid,
	spa_name(dp->dp_spa),
	dsobj);
	}
	}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
	spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	*dsp = ds;
	return (0);
	}

	/*
	 * Hold the dataset (or snapshot, for "fs@snap" names) called "name" and
	 * return it in *dsp.  *dsp is written only on success.  Returns ENOENT
	 * if the directory has no head dataset or the trailing component does
	 * not start with '@'; otherwise the error from the failing lookup.
	 */
	int
	dsl_dataset_hold(dsl_pool_t *dp, const char *name,
	    void *tag, dsl_dataset_t **dsp)
	{
		dsl_dir_t *dd;
		const char *snapname;
		uint64_t obj;
		int err = 0;
		dsl_dataset_t *ds;

		err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
		if (err != 0)
			return (err);

		ASSERT(dsl_pool_config_held(dp));
		obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
		if (obj != 0)
			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
		else
			err = SET_ERROR(ENOENT);

		/* we may be looking for a snapshot */
		if (err == 0 && snapname != NULL) {
			dsl_dataset_t *snap_ds;

			if (*snapname++ != '@') {
				dsl_dataset_rele(ds, tag);
				dsl_dir_rele(dd, FTAG);
				return (SET_ERROR(ENOENT));
			}

			dprintf("looking for snapshot '%s'\n", snapname);
			err = dsl_dataset_snap_lookup(ds, snapname, &obj);
			if (err == 0)
				err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
			/* The head was only needed to find the snapshot. */
			dsl_dataset_rele(ds, tag);

			if (err == 0) {
				mutex_enter(&snap_ds->ds_lock);
				if (snap_ds->ds_snapname[0] == 0)
					(void) strlcpy(snap_ds->ds_snapname, snapname,
					    sizeof (snap_ds->ds_snapname));
				mutex_exit(&snap_ds->ds_lock);
				ds = snap_ds;
			}
		}
		if (err == 0)
			*dsp = ds;
		dsl_dir_rele(dd, FTAG);
		return (err);
	}

	/*
	 * Hold dataset object dsobj and take ownership of it under tag.  If the
	 * dataset cannot be owned, the hold is dropped, *dsp is set to NULL and
	 * EBUSY is returned.
	 */
	int
	dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
	    void *tag, dsl_dataset_t **dsp)
	{
		int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
		if (err != 0)
			return (err);
		if (!dsl_dataset_tryown(*dsp, tag)) {
			dsl_dataset_rele(*dsp, tag);
			*dsp = NULL;
			return (SET_ERROR(EBUSY));
		}
		return (0);
	}

	/*
	 * Hold the dataset called "name" and take ownership of it under tag.
	 * If the dataset cannot be owned, the hold is dropped, *dsp is set to
	 * NULL (matching dsl_dataset_own_obj(), so the caller is not left with
	 * a pointer to a released dataset) and EBUSY is returned.
	 */
	int
	dsl_dataset_own(dsl_pool_t *dp, const char *name,
	    void *tag, dsl_dataset_t **dsp)
	{
		int err = dsl_dataset_hold(dp, name, tag, dsp);
		if (err != 0)
			return (err);
		if (!dsl_dataset_tryown(*dsp, tag)) {
			dsl_dataset_rele(*dsp, tag);
			*dsp = NULL;
			return (SET_ERROR(EBUSY));
		}
		return (0);
	}

	/*
	 * See the comment above dsl_pool_hold() for details.  In summary, a long
	 * hold is used to prevent destruction of a dataset while the pool hold
	 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
	 *
	 * The dataset and pool must be held when this function is called.  After
	 * it is called, the pool hold may be released while the dataset is still
	 * held and accessed.
	 */
	void
	dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
	{
		ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
		(void) refcount_add(&ds->ds_longholds, tag);
	}

	/* Drop the long hold on ds that was taken under tag. */
	void
	dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
	{
		(void) refcount_remove(&ds->ds_longholds, tag);
	}

	/* Report whether at least one long hold is outstanding on ds. */
	boolean_t
	dsl_dataset_long_held(dsl_dataset_t *ds)
	{
		if (refcount_is_zero(&ds->ds_longholds))
			return (B_FALSE);
		return (B_TRUE);
	}

	/*
	 * Write the full name of ds into name: "mos" when ds is NULL, otherwise
	 * the directory name followed by "@<snapname>" for snapshots.  Every
	 * append is VERIFY'd to fit in ZFS_MAX_DATASET_NAME_LEN.
	 */
	void
	dsl_dataset_name(dsl_dataset_t *ds, char *name)
	{
		if (ds == NULL) {
			(void) strcpy(name, "mos");
		} else {
			dsl_dir_name(ds->ds_dir, name);
			VERIFY0(dsl_dataset_get_snapname(ds));
			if (ds->ds_snapname[0]) {
				VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
				    <, ZFS_MAX_DATASET_NAME_LEN);
				/*
				 * We use a "recursive" mutex so that we
				 * can call dprintf_ds() with ds_lock held:
				 * only take the lock if we do not hold it yet.
				 */
				boolean_t need_lock = !MUTEX_HELD(&ds->ds_lock);

				if (need_lock)
					mutex_enter(&ds->ds_lock);
				VERIFY3U(strlcat(name, ds->ds_snapname,
				    ZFS_MAX_DATASET_NAME_LEN), <,
				    ZFS_MAX_DATASET_NAME_LEN);
				if (need_lock)
					mutex_exit(&ds->ds_lock);
			}
		}
	}

	/*
	 * Return the directory name length plus one plus the snapshot name
	 * length, with ds_snapname read under ds_lock.
	 */
	int
	dsl_dataset_namelen(dsl_dataset_t *ds)
	{
		int namelen;

		VERIFY0(dsl_dataset_get_snapname(ds));

		mutex_enter(&ds->ds_lock);
		namelen = dsl_dir_namelen(ds->ds_dir) + 1 +
		    strlen(ds->ds_snapname);
		mutex_exit(&ds->ds_lock);

		return (namelen);
	}

	/* Release the hold on ds that was taken under tag. */
	void
	dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
	{
		dmu_buf_rele(ds->ds_dbuf, tag);
	}

	/*
	 * Give up ownership of ds.  tag must be the current owner.  Clears
	 * ds_owner under ds_lock, then drops the long hold and the regular hold
	 * that were taken under tag.
	 */
	void
	dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
	{
		ASSERT3P(ds->ds_owner, ==, tag);
		ASSERT(ds->ds_dbuf != NULL);

		mutex_enter(&ds->ds_lock);
		ds->ds_owner = NULL;
		mutex_exit(&ds->ds_lock);
		dsl_dataset_long_rele(ds, tag);
		dsl_dataset_rele(ds, tag);
	}

	/*
	 * Try to make tag the owner of ds.  Succeeds only if ds has no owner
	 * and is not inconsistent; on success a long hold is also taken under
	 * tag.  The pool config lock must be held.  Returns TRUE if ownership
	 * was obtained.
	 */
	boolean_t
	dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
	{
		boolean_t gotit = FALSE;

		ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
		mutex_enter(&ds->ds_lock);
		if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
			ds->ds_owner = tag;
			dsl_dataset_long_hold(ds, tag);
			gotit = TRUE;
		}
		mutex_exit(&ds->ds_lock);
		return (gotit);
	}

	/* Report, under ds_lock, whether ds currently has an owner. */
	boolean_t
	dsl_dataset_has_owner(dsl_dataset_t *ds)
	{
		boolean_t owned;

		mutex_enter(&ds->ds_lock);
		owned = (ds->ds_owner != NULL);
		mutex_exit(&ds->ds_lock);

		return (owned);
	}

	/*
	 * Mark per-dataset feature f as in use by dataset object dsobj: bump
	 * the feature's pool-wide refcount, zapify the dataset object and add
	 * the feature's guid to it with a zero value.
	 */
	static void
	dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
	{
		objset_t *meta_os = dmu_tx_pool(tx)->dp_meta_objset;
		spa_t *pool_spa = dmu_tx_pool(tx)->dp_spa;
		uint64_t zero_val = 0;

		VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);

		spa_feature_incr(pool_spa, f, tx);
		dmu_object_zapify(meta_os, dsobj, DMU_OT_DSL_DATASET, tx);

		VERIFY0(zap_add(meta_os, dsobj, spa_feature_table[f].fi_guid,
		    sizeof (zero_val), 1, &zero_val, tx));
	}

	/*
	 * Undo dsl_dataset_activate_feature(): remove feature f's guid from
	 * dataset object dsobj and drop the feature's pool-wide refcount.
	 */
	void
	dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
	{
		objset_t *meta_os = dmu_tx_pool(tx)->dp_meta_objset;
		spa_t *pool_spa = dmu_tx_pool(tx)->dp_spa;

		VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);

		VERIFY0(zap_remove(meta_os, dsobj, spa_feature_table[f].fi_guid,
		    tx));
		spa_feature_decr(pool_spa, f, tx);
	}

	/*
	 * Allocate and initialize a new head dataset object for directory dd in
	 * syncing context and return its object number.  dd must not already
	 * have a head dataset.  If origin is NULL the pool's origin snapshot (if
	 * any) is used; with an origin the new dataset starts out as a clone of
	 * it, otherwise it gets a fresh, empty deadlist.
	 */
	uint64_t
	dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
	    uint64_t flags, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = dd->dd_pool;
		dmu_buf_t *dbuf;
		dsl_dataset_phys_t *dsphys;
		uint64_t dsobj;
		objset_t *mos = dp->dp_meta_objset;

		if (origin == NULL)
			origin = dp->dp_origin_snap;

		ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
		ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

		dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
		    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
		VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
		dmu_buf_will_dirty(dbuf, tx);
		dsphys = dbuf->db_data;
		bzero(dsphys, sizeof (dsl_dataset_phys_t));
		dsphys->ds_dir_obj = dd->dd_object;
		dsphys->ds_flags = flags;
		dsphys->ds_fsid_guid = unique_create();
		/* A guid of zero is not acceptable; draw again until non-zero. */
		do {
			(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
			    sizeof (dsphys->ds_guid));
		} while (dsphys->ds_guid == 0);
		dsphys->ds_snapnames_zapobj =
		    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
		    DMU_OT_NONE, 0, tx);
		dsphys->ds_creation_time = gethrestime_sec();
		dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

		if (origin == NULL) {
			dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
		} else {
			dsl_dataset_t *ohds; /* head of the origin snapshot */

			dsphys->ds_prev_snap_obj = origin->ds_object;
			dsphys->ds_prev_snap_txg =
			    dsl_dataset_phys(origin)->ds_creation_txg;
			dsphys->ds_referenced_bytes =
			    dsl_dataset_phys(origin)->ds_referenced_bytes;
			dsphys->ds_compressed_bytes =
			    dsl_dataset_phys(origin)->ds_compressed_bytes;
			dsphys->ds_uncompressed_bytes =
			    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
			rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
			dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
			rrw_exit(&origin->ds_bp_rwlock, FTAG);

			/*
			 * Inherit flags that describe the dataset's contents
			 * (INCONSISTENT) or properties (Case Insensitive).
			 */
			dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
			    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);

			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
				if (origin->ds_feature_inuse[f])
					dsl_dataset_activate_feature(dsobj, f, tx);
			}

			dmu_buf_will_dirty(origin->ds_dbuf, tx);
			dsl_dataset_phys(origin)->ds_num_children++;

			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
			    FTAG, &ohds));
			dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
			    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
			dsl_dataset_rele(ohds, FTAG);

			if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
				if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
					dsl_dataset_phys(origin)->ds_next_clones_obj =
					    zap_create(mos,
					    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
				}
				VERIFY0(zap_add_int(mos,
				    dsl_dataset_phys(origin)->ds_next_clones_obj,
				    dsobj, tx));
			}

			dmu_buf_will_dirty(dd->dd_dbuf, tx);
			dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
					dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
					dsl_dir_phys(origin->ds_dir)->dd_clones =
					    zap_create(mos,
					    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
				}
				VERIFY0(zap_add_int(mos,
				    dsl_dir_phys(origin->ds_dir)->dd_clones,
				    dsobj, tx));
			}
		}

		if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
			dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

		dmu_buf_rele(dbuf, FTAG);

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;

		return (dsobj);
	}

	/*
	 * If ds's objset carries a non-zero ZIL header, zero it and sync the
	 * dataset out immediately so the cleared header reaches disk.
	 */
	static void
	dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		objset_t *os;

		VERIFY0(dmu_objset_from_ds(ds, &os));
		if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
			dsl_pool_t *dp = ds->ds_dir->dd_pool;
			zio_t *zio;

			bzero(&os->os_zil_header, sizeof (os->os_zil_header));

			zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
			dsl_dataset_sync(ds, zio, tx);
			VERIFY0(zio_wait(zio));

			/* dsl_dataset_sync_done will drop this reference. */
			dmu_buf_add_ref(ds->ds_dbuf, ds);
			dsl_dataset_sync_done(ds, tx);
		}
	}

	/*
	 * Create a new dsl_dir named lastname under pdd together with its head
	 * dataset (optionally a clone of origin), in syncing context.  Returns
	 * the object number of the new dataset.  lastname must not begin with
	 * '@'.  DS_CREATE_FLAG_NODIRTY is stripped from the flags stored on
	 * disk and suppresses the ZIL-header clearing done for clones.
	 */
	uint64_t
	dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
	    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = pdd->dd_pool;
		uint64_t dsobj, ddobj;
		dsl_dir_t *dd;

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(lastname[0] != '@');

		ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
		VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));

		dsobj = dsl_dataset_create_sync_dd(dd, origin,
		    flags & ~DS_CREATE_FLAG_NODIRTY, tx);

		dsl_deleg_set_create_perms(dd, tx, cr);

		/*
		 * Since we're creating a new node we know it's a leaf, so we can
		 * initialize the counts if the limit feature is active.
		 */
		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
			uint64_t cnt = 0;
			objset_t *os = dd->dd_pool->dp_meta_objset;

			dsl_dir_zapify(dd, tx);
			VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
			    sizeof (cnt), 1, &cnt, tx));
			VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
			    sizeof (cnt), 1, &cnt, tx));
		}

		dsl_dir_rele(dd, FTAG);

		/*
		 * If we are creating a clone, make sure we zero out any stale
		 * data from the origin snapshot's zil header.
		 */
		if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
			dsl_dataset_t *ds;

			VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
			dsl_dataset_zero_zil(ds, tx);
			dsl_dataset_rele(ds, FTAG);
		}

		return (dsobj);
	}

	#ifdef __FreeBSD__
	/* FreeBSD ioctl compat begin */
	/*
	 * Callback state shared by dmu_get_recursive_snaps_nvl() and
	 * dsl_check_snap_cb().
	 */
	struct destroyarg {
	/* nvlist that receives one boolean entry per "<fs>@<snapname>" */
	nvlist_t *nvl;
	/* snapshot name appended after '@' to every visited dataset name */
	const char *snapname;
	};

	/*
	 * dmu_objset_find() callback: add "<name>@<snapname>" as a boolean
	 * entry to the nvlist carried in arg (a struct destroyarg).  Always
	 * returns 0.
	 */
	static int
	dsl_check_snap_cb(const char *name, void *arg)
	{
		struct destroyarg *da = arg;
		char *dsname;

		dsname = kmem_asprintf("%s@%s", name, da->snapname);
		fnvlist_add_boolean(da->nvl, dsname);
		kmem_free(dsname, strlen(dsname) + 1);

		return (0);
	}

	/*
	 * Walk fsname and its children (DS_FIND_CHILDREN) and add
	 * "<dataset>@<snapname>" to snaps for every dataset visited.  Returns
	 * the result of dmu_objset_find().
	 */
	int
	dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
	    nvlist_t *snaps)
	{
		/* The argument block is two pointers; keep it on the stack. */
		struct destroyarg da = {
			.nvl = snaps,
			.snapname = snapname
		};

		return (dmu_objset_find(fsname, dsl_check_snap_cb, &da,
		    DS_FIND_CHILDREN));
	}
	/* FreeBSD ioctl compat end */
	#endif /* __FreeBSD__ */

	/*
	 * Recompute the head dataset's unique bytes.  This is the space
	 * currently referenced minus whatever part of the most recent snapshot
	 * is still in use by this file system.  The latter is the snapshot's
	 * total referenced space less the space that has been freed since the
	 * snapshot was taken (i.e. what is on the deadlist).
	 */
	void
	dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
	{
		uint64_t dl_used, dl_comp, dl_uncomp;
		uint64_t snap_used;

		ASSERT(!ds->ds_is_snapshot);

		snap_used = (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) ?
		    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes : 0;

		dsl_deadlist_space(&ds->ds_deadlist, &dl_used, &dl_comp,
		    &dl_uncomp);

		ASSERT3U(dl_used, <=, snap_used);
		dsl_dataset_phys(ds)->ds_unique_bytes =
		    dsl_dataset_phys(ds)->ds_referenced_bytes -
		    (snap_used - dl_used);

		if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
		    SPA_VERSION_UNIQUE_ACCURATE) {
			dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
		}
	}

	/*
	 * Remove obj from ds's next_clones ZAP.  A missing entry (ENOENT) is
	 * tolerated; any other failure is fatal.
	 */
	void
	dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
	    dmu_tx_t *tx)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		uint64_t nclones;
		int error;

		ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);

		error = zap_remove_int(mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj, obj, tx);

		/*
		 * The error should not be ENOENT, but a bug in a previous
		 * version of the code could cause upgrade_clones_cb() to not
		 * set ds_next_snap_obj when it should, leading to a missing
		 * entry.  If we knew that the pool was created after
		 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
		 * ENOENT.  However, at least we can check that we don't have
		 * too many entries in the next_clones_obj even after failing
		 * to remove this one.
		 */
		if (error != ENOENT)
			VERIFY0(error);

		ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
		    &nclones));
		ASSERT3U(nclones, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
	}


	/* Return the address of the ds_bp field in ds's on-disk structure. */
	blkptr_t *
	dsl_dataset_get_blkptr(dsl_dataset_t *ds)
	{
		blkptr_t *root_bp = &dsl_dataset_phys(ds)->ds_bp;

		return (root_bp);
	}

	/* Return the spa of the pool that ds belongs to. */
	spa_t *
	dsl_dataset_get_spa(dsl_dataset_t *ds)
	{
		spa_t *pool_spa = ds->ds_dir->dd_pool->dp_spa;

		return (pool_spa);
	}

	/*
	 * Put ds on the pool's dirty-dataset list for tx's txg.  A NULL ds
	 * denotes the meta-objset and is ignored.  Dirtying a snapshot panics.
	 * The first time ds is added for a txg an extra dbuf reference is
	 * taken, to be dropped once the dataset has been written out.
	 */
	void
	dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		dsl_pool_t *dp;

		if (ds == NULL) /* this is the meta-objset */
			return;

		ASSERT(ds->ds_objset != NULL);

		if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
			panic("dirtying snapshot!");

		/* Must not dirty a dataset in the same txg where it got snapshotted. */
		ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);

		dp = ds->ds_dir->dd_pool;
		if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
			/* up the hold count until we can be written out */
			dmu_buf_add_ref(ds->ds_dbuf, ds);
		}
	}

	/*
	 * Report whether ds is on the pool's dirty-dataset list for any of the
	 * TXG_SIZE txg slots.
	 */
	boolean_t
	dsl_dataset_is_dirty(dsl_dataset_t *ds)
	{
		boolean_t dirty = B_FALSE;

		for (int slot = 0; slot < TXG_SIZE && !dirty; slot++) {
			if (txg_list_member(
			    &ds->ds_dir->dd_pool->dp_dirty_datasets, ds, slot))
				dirty = B_TRUE;
		}
		return (dirty);
	}

	/*
	 * In syncing context, check that snapshotting ds leaves room for its
	 * refreservation and record the space that will be used.  Does nothing
	 * and returns 0 in open context.  Returns ENOSPC when the required
	 * space is not available.
	 */
	static int
	dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		uint64_t asize;

		if (!dmu_tx_is_syncing(tx))
			return (0);

		/*
		 * If there's an fs-only reservation, any blocks that might become
		 * owned by the snapshot dataset must be accommodated by space
		 * outside of the reservation.
		 */
		ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
		asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
		if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
			return (SET_ERROR(ENOSPC));

		/*
		 * Propagate any reserved space for this snapshot to other
		 * snapshot checks in this sync group.
		 */
		if (asize > 0)
			dsl_dir_willuse_space(ds->ds_dir, asize, tx);

		return (0);
	}

	/*
	 * Check whether snapshot "snapname" of ds may be created in tx.  Always
	 * records tx's txg in ds_trysnap_txg; the remaining checks run only in
	 * syncing context.
	 *
	 * Returns 0 if the snapshot may be taken, EAGAIN if ds was already
	 * snapshotted in this txg (or later), EEXIST if the name is taken, EBUSY
	 * if ds is inconsistent and this is not a receive, or the error from the
	 * snapshot-limit or space-reservation checks.
	 */
	int
	dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
	    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
	{
		int error;
		uint64_t value;

		ds->ds_trysnap_txg = tx->tx_txg;

		if (!dmu_tx_is_syncing(tx))
			return (0);

		/*
		 * We don't allow multiple snapshots of the same txg.  If there
		 * is already one, try again.
		 */
		if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
			return (SET_ERROR(EAGAIN));

		/*
		 * Check for conflicting snapshot name.
		 */
		error = dsl_dataset_snap_lookup(ds, snapname, &value);
		if (error == 0)
			return (SET_ERROR(EEXIST));
		if (error != ENOENT)
			return (error);

		/*
		 * We don't allow taking snapshots of inconsistent datasets, such as
		 * those into which we are currently receiving.  However, if we are
		 * creating this snapshot as part of a receive, this check will be
		 * executed atomically with respect to the completion of the receive
		 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
		 * case we ignore this, knowing it will be fixed up for us shortly in
		 * dmu_recv_end_sync().
		 */
		if (!recv && DS_IS_INCONSISTENT(ds))
			return (SET_ERROR(EBUSY));

		/*
		 * Skip the check for temporary snapshots or if we have already
		 * checked the counts in dsl_dataset_snapshot_check.  This means we
		 * really only check the count here when we're receiving a stream.
		 */
		if (cnt != 0 && cr != NULL) {
			error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
			    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
			if (error != 0)
				return (error);
		}

		error = dsl_dataset_snapshot_reserve_space(ds, tx);
		if (error != 0)
			return (error);

		return (0);
	}

	int
	dsl_dataset_snapshot_check(void arg, dmu_tx_t tx)
	{
	dsl_dataset_snapshot_arg_t *ddsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	/*
	* Pre-compute how many total new snapshots will be created for each
	* level in the tree and below. This is needed for validating the
	* snapshot limit when either taking a recursive snapshot or when
	* taking multiple snapshots.
	*
	* The problem is that the counts are not actually adjusted when
	* we are checking, only when we finally sync. For a single snapshot,
	* this is easy, the count will increase by 1 at each node up the tree,
	* but it's more complicated for the recursive/multiple snapshot case.
	*
	* The dsl_fs_ss_limit_check function does recursively check the count
	* at each level up the tree but since it is validating each snapshot
	* independently we need to be sure that we are validating the complete
	* count for the entire set of snapshots. We do this by rolling up the
	* counts for each component of the name into an nvlist and then
	* checking each of those cases with the aggregated count.
	*
	* This approach properly handles not only the recursive snapshot
	* case (where we get all of those on the ddsa_snaps list) but also
	* the sibling case (e.g. snapshot a/b and a/c so that we will also
	* validate the limit on 'a' using a count of 2).
	*
	* We validate the snapshot names in the third loop and only report
	* name errors once.
	*/
	if (dmu_tx_is_syncing(tx)) {
	nvlist_t *cnt_track = NULL;
	cnt_track = fnvlist_alloc();

	/* Rollup aggregated counts into the cnt_track list */
	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	pair != NULL;
	pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
	char *pdelim;
	uint64_t val;
	char nm[MAXPATHLEN];

	(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
	pdelim = strchr(nm, '@');
	if (pdelim == NULL)
	continue;
	*pdelim = '\0';

	do {
	if (nvlist_lookup_uint64(cnt_track, nm,
	&val) == 0) {
	/* update existing entry */
	fnvlist_add_uint64(cnt_track, nm,
	val + 1);
	} else {
	/* add to list */
	fnvlist_add_uint64(cnt_track, nm, 1);
	}

	pdelim = strrchr(nm, '/');
	if (pdelim != NULL)
	*pdelim = '\0';
	} while (pdelim != NULL);
	}

	/* Check aggregated counts at each level */
	for (pair = nvlist_next_nvpair(cnt_track, NULL);
	pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
	int error = 0;
	char *name;
	uint64_t cnt = 0;
	dsl_dataset_t *ds;

	name = nvpair_name(pair);
	cnt = fnvpair_value_uint64(pair);
	ASSERT(cnt > 0);

	error = dsl_dataset_hold(dp, name, FTAG, &ds);
	if (error == 0) {
	error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
	ZFS_PROP_SNAPSHOT_LIMIT, NULL,
	ddsa->ddsa_cr);
	dsl_dataset_rele(ds, FTAG);
	}

	if (error != 0) {
	if (ddsa->ddsa_errors != NULL)
	fnvlist_add_int32(ddsa->ddsa_errors,
	name, error);
	rv = error;
	/* only report one error for this check */
	break;
	}
	}
	nvlist_free(cnt_track);
	}

	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
	int error = 0;
	dsl_dataset_t *ds;
	char name, atp;
	char dsname[ZFS_MAX_DATASET_NAME_LEN];

	name = nvpair_name(pair);
	if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
	error = SET_ERROR(ENAMETOOLONG);
	if (error == 0) {
	atp = strchr(name, '@');
	if (atp == NULL)
	error = SET_ERROR(EINVAL);
	if (error == 0)
	(void) strlcpy(dsname, name, atp - name + 1);
	}
	if (error == 0)
	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (error == 0) {
	/* passing 0/NULL skips dsl_fs_ss_limit_check */
	error = dsl_dataset_snapshot_check_impl(ds,
	atp + 1, tx, B_FALSE, 0, NULL);
	dsl_dataset_rele(ds, FTAG);
	}

	if (error != 0) {
	if (ddsa->ddsa_errors != NULL) {
	fnvlist_add_int32(ddsa->ddsa_errors,
	name, error);
	}
	rv = error;
	}
	}

	return (rv);
	}

	/*
	 * Create the snapshot 'snapname' of dataset 'ds' within transaction 'tx'.
	 *
	 * A new dataset object is allocated in the MOS and filled in from the
	 * current state of 'ds'; the previous snapshot (if any) is re-linked to
	 * it, the deadlist of 'ds' is replaced by a clone, and the name is added
	 * to the snapnames ZAP of 'ds'.  On return ds->ds_prev holds the new
	 * snapshot.  The caller must hold dp_config_rwlock as writer.
	 */
	void
	dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
	    dmu_tx_t *tx)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		dmu_buf_t *dbuf;
		dsl_dataset_phys_t *dsphys;
		uint64_t dsobj, crtxg;
		objset_t *mos = dp->dp_meta_objset;
		objset_t *os;

		ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

		/*
		 * If we are on an old pool, the zil must not be active, in which
		 * case it will be zeroed. Usually zil_suspend() accomplishes this.
		 */
		ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
		    dmu_objset_from_ds(ds, &os) != 0 ||
		    bcmp(&os->os_phys->os_zil_header, &zero_zil,
		    sizeof (zero_zil)) == 0);

		/* Should not snapshot a dirty dataset. */
		ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
		    ds, tx->tx_txg));

		dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);

		/*
		 * The origin's ds_creation_txg has to be < TXG_INITIAL
		 */
		if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
			crtxg = 1;
		else
			crtxg = tx->tx_txg;

		/* Allocate and populate the on-disk state of the new snapshot. */
		dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
		    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
		VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
		dmu_buf_will_dirty(dbuf, tx);
		dsphys = dbuf->db_data;
		bzero(dsphys, sizeof (dsl_dataset_phys_t));
		dsphys->ds_dir_obj = ds->ds_dir->dd_object;
		dsphys->ds_fsid_guid = unique_create();
		/* a guid of 0 is not acceptable; retry until non-zero */
		do {
			(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
			    sizeof (dsphys->ds_guid));
		} while (dsphys->ds_guid == 0);
		dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
		dsphys->ds_next_snap_obj = ds->ds_object;
		dsphys->ds_num_children = 1;
		dsphys->ds_creation_time = gethrestime_sec();
		dsphys->ds_creation_txg = crtxg;
		dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
		dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
		dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		dmu_buf_rele(dbuf, FTAG);

		/* The snapshot uses every feature the head currently uses. */
		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
			if (ds->ds_feature_inuse[f])
				dsl_dataset_activate_feature(dsobj, f, tx);
		}

		ASSERT3U(ds->ds_prev != 0, ==,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
		if (ds->ds_prev) {
			uint64_t next_clones_obj =
			    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
			ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
			    ds->ds_object ||
			    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
			if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
			    ds->ds_object) {
				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
				ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
				    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
				dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
			} else if (next_clones_obj != 0) {
				dsl_dataset_remove_from_next_clones(ds->ds_prev,
				    dsphys->ds_next_snap_obj, tx);
				VERIFY0(zap_add_int(mos,
				    next_clones_obj, dsobj, tx));
			}
		}

		/*
		 * If we have a reference-reservation on this dataset, we will
		 * need to increase the amount of refreservation being charged
		 * since our unique space is going to zero.
		 */
		if (ds->ds_reserved) {
			int64_t delta;
			ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
			delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
			    ds->ds_reserved);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
			    delta, 0, 0, tx);
		}

		/* Give the head a fresh clone of the deadlist and reopen it. */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_deadlist_obj =
		    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist, mos,
		    dsl_dataset_phys(ds)->ds_deadlist_obj);
		dsl_deadlist_add_key(&ds->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);

	+	if (dsl_dataset_remap_deadlist_exists(ds)) {
	+		uint64_t remap_deadlist_obj =
	+		    dsl_dataset_get_remap_deadlist_object(ds);
	+		/*
	+		 * Move the remap_deadlist to the snapshot. The head
	+		 * will create a new remap deadlist on demand, from
	+		 * dsl_dataset_block_remapped().
	+		 */
	+		dsl_dataset_unset_remap_deadlist_object(ds, tx);
	+		dsl_deadlist_close(&ds->ds_remap_deadlist);
	+
	+		dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
	+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
	+		    sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
	+	}
	+
		ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
		dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
		dsl_dataset_phys(ds)->ds_unique_bytes = 0;
	+
		if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
			dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

		VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
		    snapname, 8, 1, &dsobj, tx));

		/* Swap the hold on the old previous snapshot for the new one. */
		if (ds->ds_prev)
			dsl_dataset_rele(ds->ds_prev, ds);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));

		dsl_scan_ds_snapshotted(ds, tx);

		dsl_dir_snap_cmtime_update(ds->ds_dir);

		spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
	}

	/*
	 * Sync callback paired with dsl_dataset_snapshot_check(): creates every
	 * snapshot named in ddsa_snaps and, when ddsa_props is non-NULL, sets
	 * those properties on each new snapshot.
	 */
	void
	dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_snapshot_arg_t *ddsa = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		nvpair_t *pair;

		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
			dsl_dataset_t *ds;
			char *name, *atp;
			char dsname[ZFS_MAX_DATASET_NAME_LEN];

			/*
			 * Split "fs@snap"; the check callback has already
			 * rejected names without an '@'.
			 */
			name = nvpair_name(pair);
			atp = strchr(name, '@');
			(void) strlcpy(dsname, name, atp - name + 1);
			VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));

			dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
			if (ddsa->ddsa_props != NULL) {
				/* ds_prev is the snapshot that was just created */
				dsl_props_set_sync_impl(ds->ds_prev,
				    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
			}
			dsl_dataset_rele(ds, FTAG);
		}
	}

	/*
	 * Create all snapshots named in 'snaps' (pairs named "fs@snap"),
	 * optionally setting 'props' on each.  Per-snapshot errors are reported
	 * through 'errors' when it is non-NULL.
	 *
	 * The snapshots must all be in the same pool.
	 * All-or-nothing: if there are any failures, nothing will be modified.
	 *
	 * Returns 0 on success (including an empty 'snaps' list) or an errno.
	 */
	int
	dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
	{
		dsl_dataset_snapshot_arg_t ddsa;
		nvpair_t *pair;
		boolean_t needsuspend;
		int error;
		spa_t *spa;
		char *firstname;
		nvlist_t *suspended = NULL;

		pair = nvlist_next_nvpair(snaps, NULL);
		if (pair == NULL)
			return (0);
		firstname = nvpair_name(pair);

		/* The pool is opened only to learn its version. */
		error = spa_open(firstname, &spa, FTAG);
		if (error != 0)
			return (error);
		needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
		spa_close(spa, FTAG);

		if (needsuspend) {
			/*
			 * Old pools: suspend the ZIL of every file system first,
			 * remembering each cookie so it can be resumed below.
			 */
			suspended = fnvlist_alloc();
			for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
			    pair = nvlist_next_nvpair(snaps, pair)) {
				char fsname[ZFS_MAX_DATASET_NAME_LEN];
				char *snapname = nvpair_name(pair);
				char *atp;
				void *cookie;

				atp = strchr(snapname, '@');
				if (atp == NULL) {
					error = SET_ERROR(EINVAL);
					break;
				}
				(void) strlcpy(fsname, snapname, atp - snapname + 1);

				error = zil_suspend(fsname, &cookie);
				if (error != 0)
					break;
				fnvlist_add_uint64(suspended, fsname,
				    (uintptr_t)cookie);
			}
		}

		ddsa.ddsa_snaps = snaps;
		ddsa.ddsa_props = props;
		ddsa.ddsa_errors = errors;
		ddsa.ddsa_cr = CRED();

		if (error == 0) {
			error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
			    dsl_dataset_snapshot_sync, &ddsa,
			    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
		}

		/* Resume whatever was suspended, whether or not the task ran. */
		if (suspended != NULL) {
			for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
			    pair = nvlist_next_nvpair(suspended, pair)) {
				zil_resume((void *)(uintptr_t)
				    fnvpair_value_uint64(pair));
			}
			fnvlist_free(suspended);
		}

	#ifdef __FreeBSD__
	#ifdef _KERNEL
		if (error == 0) {
			for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
			    pair = nvlist_next_nvpair(snaps, pair)) {
				char *snapname = nvpair_name(pair);
				zvol_create_minors(snapname);
			}
		}
	#endif
	#endif
		return (error);
	}

	/*
	 * Argument bundle handed to dsl_dataset_snapshot_tmp_check() and
	 * dsl_dataset_snapshot_tmp_sync() through dsl_sync_task().
	 */
	typedef struct dsl_dataset_snapshot_tmp_arg {
	/* name of the dataset to snapshot (passed to dsl_dataset_hold()) */
	const char *ddsta_fsname;
	/* name of the snapshot to create */
	const char *ddsta_snapname;
	/* minor passed to dsl_dataset_user_hold_sync_one() */
	minor_t ddsta_cleanup_minor;
	/* user-hold tag placed on the new snapshot */
	const char *ddsta_htag;
	} dsl_dataset_snapshot_tmp_arg_t;

	/*
	 * Check callback for dsl_dataset_snapshot_tmp().
	 *
	 * Verifies that the snapshot can be created, that the pool supports
	 * user holds (ENOTSUP otherwise) and that the hold tag is acceptable.
	 * Returns 0 or the first error encountered.
	 */
	static int
	dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int error;

		error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
		if (error != 0)
			return (error);

		/* NULL cred means no limit check for tmp snapshot */
		error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
		    tx, B_FALSE, 0, NULL);

		if (error == 0 && spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
			error = SET_ERROR(ENOTSUP);

		if (error == 0) {
			error = dsl_dataset_user_hold_check_one(NULL,
			    ddsta->ddsta_htag, B_TRUE, tx);
		}

		/* single exit: the hold is dropped on every path */
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Sync callback for dsl_dataset_snapshot_tmp(): creates the snapshot,
	 * places the user hold 'ddsta_htag' on it, and then calls
	 * dsl_destroy_snapshot_sync_impl() on it with B_TRUE.
	 */
	static void
	dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));

		/* after this call ds->ds_prev is the newly created snapshot */
		dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
		dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
		    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
		/* NOTE(review): B_TRUE presumably requests deferred destroy - confirm */
		dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);

		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Create a temporary snapshot 'snapname' of 'fsname' that carries the
	 * user hold 'htag' (see dsl_dataset_snapshot_tmp_sync()).
	 *
	 * On pools older than SPA_VERSION_FAST_SNAP the ZIL is suspended around
	 * the sync task.  Returns 0 on success or an errno.
	 */
	int
	dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
	    minor_t cleanup_minor, const char *htag)
	{
		dsl_dataset_snapshot_tmp_arg_t ddsta;
		int error;
		spa_t *spa;
		boolean_t needsuspend;
		void *cookie = NULL;

		ddsta.ddsta_fsname = fsname;
		ddsta.ddsta_snapname = snapname;
		ddsta.ddsta_cleanup_minor = cleanup_minor;
		ddsta.ddsta_htag = htag;

		/* The pool is opened only to learn its version. */
		error = spa_open(fsname, &spa, FTAG);
		if (error != 0)
			return (error);
		needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
		spa_close(spa, FTAG);

		if (needsuspend) {
			error = zil_suspend(fsname, &cookie);
			if (error != 0)
				return (error);
		}

		error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
		    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);

		if (needsuspend)
			zil_resume(cookie);
		return (error);
	}

	/*
	 * Write out the dirty state of head dataset 'ds' for the txg of 'tx':
	 * the fsid guid, any pending receive-resume progress, the objset itself
	 * (under parent zio 'zio'), and any newly needed feature activations.
	 * Must be called in syncing context.
	 */
	void
	dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
	{
		/* slot of this txg in the per-txg ds_resume_* arrays */
		uint64_t txgoff = tx->tx_txg & TXG_MASK;

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(ds->ds_objset != NULL);
		ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);

		/*
		 * in case we had to change ds_fsid_guid when we opened it,
		 * sync it out now.
		 */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;

		/* Persist receive-resume progress recorded for this txg, then reset. */
		if (ds->ds_resume_bytes[txgoff] != 0) {
			VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
			    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
			    &ds->ds_resume_object[txgoff], tx));
			VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
			    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
			    &ds->ds_resume_offset[txgoff], tx));
			VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
			    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
			    &ds->ds_resume_bytes[txgoff], tx));
			ds->ds_resume_object[txgoff] = 0;
			ds->ds_resume_offset[txgoff] = 0;
			ds->ds_resume_bytes[txgoff] = 0;
		}

		dmu_objset_sync(ds->ds_objset, zio, tx);

		/* Activate features that were requested but are not yet in use. */
		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
			if (ds->ds_feature_activation_needed[f]) {
				if (ds->ds_feature_inuse[f])
					continue;
				dsl_dataset_activate_feature(ds->ds_object, f, tx);
				ds->ds_feature_inuse[f] = B_TRUE;
			}
		}
	}

	/*
	 * bplist_iterate() callback: inserts block pointer 'bp' into the
	 * deadlist passed as 'arg'.  Always returns 0.
	 */
	static int
	deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		dsl_deadlist_t *dl = arg;
		dsl_deadlist_insert(dl, bp, tx);
		return (0);
	}

	/*
	 * Post-sync cleanup for 'ds': moves the pending deadlist entries into
	 * the dataset's deadlist, destroys the objset's synced-dnodes multilist
	 * if present, and drops the 'ds'-tagged hold on ds_dbuf.
	 */
	void
	dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		objset_t *os = ds->ds_objset;

		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);

		if (os->os_synced_dnodes != NULL) {
			multilist_destroy(os->os_synced_dnodes);
			os->os_synced_dnodes = NULL;
		}

		ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));

		dmu_buf_rele(ds->ds_dbuf, ds);
	}

	/*
	 * Add the name of every clone of snapshot 'ds' to 'val' as a boolean
	 * entry.  The pool config lock must be held.
	 *
	 * Returns 0 on success, or ENOENT when ds_next_clones_obj does not hold
	 * exactly ds_num_children - 1 entries and therefore cannot be trusted.
	 */
	int
	get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
	{
		uint64_t count = 0;
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		zap_cursor_t zc;
		zap_attribute_t za;

		ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

		/*
		 * There may be missing entries in ds_next_clones_obj
		 * due to a bug in a previous version of the code.
		 * Only trust it if it has the right number of entries.
		 */
		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
			VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
			    &count));
		}
		if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
			return (SET_ERROR(ENOENT));
		}
		for (zap_cursor_init(&zc, mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			dsl_dataset_t *clone;
			char buf[ZFS_MAX_DATASET_NAME_LEN];
			/* each ZAP entry's integer is the clone's dataset object */
			VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
			    za.za_first_integer, FTAG, &clone));
			dsl_dir_name(clone->ds_dir, buf);
			fnvlist_add_boolean(val, buf);
			dsl_dataset_rele(clone, FTAG);
		}
		zap_cursor_fini(&zc);
		return (0);
	}

	/*
	 * Add the "clones" property of snapshot 'ds' to 'nv'.  Nothing is added
	 * when get_clones_stat_impl() reports that the clone list is unusable.
	 */
	void
	get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
	{
		nvlist_t *propval = fnvlist_alloc();
		nvlist_t *val;

		/*
		 * We use nvlist_alloc() instead of fnvlist_alloc() because the
		 * latter would allocate the list with NV_UNIQUE_NAME flag.
		 * As a result, every time a clone name is appended to the list
		 * it would be (linearly) searched for a duplicate name.
		 * We already know that all clone names must be unique and we
		 * want to avoid the quadratic complexity of double-checking that
		 * because we can have a large number of clones.
		 */
		VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));

		if (get_clones_stat_impl(ds, val) == 0) {
			fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
			fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
			    propval);
		}

		/* both lists are freed here on every path */
		nvlist_free(val);
		nvlist_free(propval);
	}

	/*
	 * Build the receive resume token of 'ds' and return it as a newly
	 * allocated string that should be freed with strfree().  When the
	 * dataset has no resume state, an empty string is returned.
	 *
	 * The token has the form "<version>-<checksum>-<packed size>-<hex>",
	 * where <hex> is the gzip-compressed packed nvlist of resume fields.
	 */
	char *
	get_receive_resume_stats_impl(dsl_dataset_t *ds)
	{
		/* uint64 resume fields, in the order they enter the token */
		static const struct {
			const char *zapname;
			const char *nvname;
		} u64_fields[] = {
			{ DS_FIELD_RESUME_FROMGUID,	"fromguid" },
			{ DS_FIELD_RESUME_OBJECT,	"object" },
			{ DS_FIELD_RESUME_OFFSET,	"offset" },
			{ DS_FIELD_RESUME_BYTES,	"bytes" },
			{ DS_FIELD_RESUME_TOGUID,	"toguid" },
		};
		/* presence-only flags, in the order they enter the token */
		static const struct {
			const char *zapname;
			const char *nvname;
		} flag_fields[] = {
			{ DS_FIELD_RESUME_LARGEBLOCK,	"largeblockok" },
			{ DS_FIELD_RESUME_EMBEDOK,	"embedok" },
			{ DS_FIELD_RESUME_COMPRESSOK,	"compressok" },
		};
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		objset_t *mos;
		nvlist_t *token_nv;
		uint64_t val;
		char toname[256];
		void *packed;
		size_t packed_size, compressed_size;
		uint8_t *compressed;
		zio_cksum_t cksum;
		char *hex, *cursor, *propval;

		if (!dsl_dataset_has_resume_receive_state(ds))
			return (spa_strdup(""));

		mos = dp->dp_meta_objset;
		token_nv = fnvlist_alloc();

		/* Collect whichever resume fields exist on the dataset. */
		for (int f = 0;
		    f < sizeof (u64_fields) / sizeof (u64_fields[0]); f++) {
			if (zap_lookup(mos, ds->ds_object, u64_fields[f].zapname,
			    sizeof (val), 1, &val) == 0) {
				fnvlist_add_uint64(token_nv,
				    u64_fields[f].nvname, val);
			}
		}
		if (zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_TONAME, 1, sizeof (toname), toname) == 0) {
			fnvlist_add_string(token_nv, "toname", toname);
		}
		for (int f = 0;
		    f < sizeof (flag_fields) / sizeof (flag_fields[0]); f++) {
			if (zap_contains(mos, ds->ds_object,
			    flag_fields[f].zapname) == 0) {
				fnvlist_add_boolean(token_nv,
				    flag_fields[f].nvname);
			}
		}

		/* Pack, compress and checksum the nvlist. */
		packed = fnvlist_pack(token_nv, &packed_size);
		fnvlist_free(token_nv);

		compressed = kmem_alloc(packed_size, KM_SLEEP);
		compressed_size = gzip_compress(packed, compressed,
		    packed_size, packed_size, 6);
		fletcher_4_native(compressed, compressed_size, NULL, &cksum);

		/* Hex-encode the compressed bytes, two characters per byte. */
		hex = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
		cursor = hex;
		for (int i = 0; i < compressed_size; i++) {
			(void) sprintf(cursor, "%02x", compressed[i]);
			cursor += 2;
		}
		hex[compressed_size * 2] = '\0';

		propval = kmem_asprintf("%u-%llx-%llx-%s",
		    ZFS_SEND_RESUME_TOKEN_VERSION,
		    (longlong_t)cksum.zc_word[0],
		    (longlong_t)packed_size, hex);

		kmem_free(packed, packed_size);
		kmem_free(hex, compressed_size * 2 + 1);
		/* 'compressed' was allocated with packed_size bytes */
		kmem_free(compressed, packed_size);
		return (propval);
	}

	/*
	 * Look up the child of 'ds' named by recv_clone_name and return its
	 * receive resume token.  An empty string is returned when the child
	 * name does not fit in the buffer or the child cannot be held.  The
	 * result should be freed with strfree().
	 */
	char *
	get_child_receive_stats(dsl_dataset_t *ds)
	{
		char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
		dsl_dataset_t *recv_ds;
		char *token;

		dsl_dataset_name(ds, recvname);

		/* strlcat() returning >= the buffer size means truncation */
		if (strlcat(recvname, "/", sizeof (recvname)) >=
		    sizeof (recvname) ||
		    strlcat(recvname, recv_clone_name, sizeof (recvname)) >=
		    sizeof (recvname) ||
		    dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
		    &recv_ds) != 0) {
			return (spa_strdup(""));
		}

		token = get_receive_resume_stats_impl(recv_ds);
		dsl_dataset_rele(recv_ds, FTAG);
		return (token);
	}

	/*
	 * Add the receive resume token property to 'nv': the token of 'ds'
	 * itself when it has one, otherwise the token of its receive child when
	 * that is non-empty.  Nothing is added when both are empty.
	 */
	static void
	get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
	{
		char *propval = get_receive_resume_stats_impl(ds);
		if (strcmp(propval, "") != 0) {
			dsl_prop_nvlist_add_string(nv,
			    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
		} else {
			char *childval = get_child_receive_stats(ds);
			if (strcmp(childval, "") != 0) {
				dsl_prop_nvlist_add_string(nv,
				    ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
			}
			strfree(childval);
		}
		strfree(propval);
	}

	/*
	 * Return uncompressed bytes as a percentage of compressed bytes for
	 * 'ds', or 100 when the dataset has no compressed bytes.
	 */
	uint64_t
	dsl_get_refratio(dsl_dataset_t *ds)
	{
		/* avoid dividing by zero */
		if (dsl_dataset_phys(ds)->ds_compressed_bytes == 0)
			return (100);

		return (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
		    dsl_dataset_phys(ds)->ds_compressed_bytes);
	}

	/* Return the on-disk ds_uncompressed_bytes of 'ds'. */
	uint64_t
	dsl_get_logicalreferenced(dsl_dataset_t *ds)
	{
		uint64_t logical = dsl_dataset_phys(ds)->ds_uncompressed_bytes;

		return (logical);
	}

	/*
	 * Return the compression ratio of 'ds': the refratio for a snapshot,
	 * otherwise the ratio of its dsl_dir, read under dd_lock.
	 */
	uint64_t
	dsl_get_compressratio(dsl_dataset_t *ds)
	{
		dsl_dir_t *dd;
		uint64_t ratio;

		if (ds->ds_is_snapshot)
			return (dsl_get_refratio(ds));

		dd = ds->ds_dir;
		mutex_enter(&dd->dd_lock);
		ratio = dsl_dir_get_compressratio(dd);
		mutex_exit(&dd->dd_lock);
		return (ratio);
	}

	/*
	 * Return the space used by 'ds': ds_unique_bytes for a snapshot,
	 * otherwise the used value of its dsl_dir, read under dd_lock.
	 */
	uint64_t
	dsl_get_used(dsl_dataset_t *ds)
	{
		dsl_dir_t *dd;
		uint64_t used;

		if (ds->ds_is_snapshot)
			return (dsl_dataset_phys(ds)->ds_unique_bytes);

		dd = ds->ds_dir;
		mutex_enter(&dd->dd_lock);
		used = dsl_dir_get_used(dd);
		mutex_exit(&dd->dd_lock);
		return (used);
	}

	/* Return the on-disk ds_creation_time of 'ds'. */
	uint64_t
	dsl_get_creation(dsl_dataset_t *ds)
	{
		uint64_t when = dsl_dataset_phys(ds)->ds_creation_time;

		return (when);
	}

	/* Return the on-disk ds_creation_txg of 'ds'. */
	uint64_t
	dsl_get_creationtxg(dsl_dataset_t *ds)
	{
		uint64_t txg = dsl_dataset_phys(ds)->ds_creation_txg;

		return (txg);
	}

	/* Return the in-core ds_quota of 'ds'. */
	uint64_t
	dsl_get_refquota(dsl_dataset_t *ds)
	{
		uint64_t quota = ds->ds_quota;

		return (quota);
	}

	/* Return the in-core ds_reserved of 'ds'. */
	uint64_t
	dsl_get_refreservation(dsl_dataset_t *ds)
	{
		uint64_t reserved = ds->ds_reserved;

		return (reserved);
	}

	/* Return the on-disk ds_guid of 'ds'. */
	uint64_t
	dsl_get_guid(dsl_dataset_t *ds)
	{
		uint64_t guid = dsl_dataset_phys(ds)->ds_guid;

		return (guid);
	}

	/* Return the on-disk ds_unique_bytes of 'ds'. */
	uint64_t
	dsl_get_unique(dsl_dataset_t *ds)
	{
		uint64_t unique = dsl_dataset_phys(ds)->ds_unique_bytes;

		return (unique);
	}

	/* Return ds_object of 'ds', which is reported as its objset id. */
	uint64_t
	dsl_get_objsetid(dsl_dataset_t *ds)
	{
		uint64_t objid = ds->ds_object;

		return (objid);
	}

	/* Return the in-core ds_userrefs of 'ds'. */
	uint64_t
	dsl_get_userrefs(dsl_dataset_t *ds)
	{
		uint64_t refs = ds->ds_userrefs;

		return (refs);
	}

	/* Return 1 when DS_IS_DEFER_DESTROY() holds for 'ds', otherwise 0. */
	uint64_t
	dsl_get_defer_destroy(dsl_dataset_t *ds)
	{
		if (DS_IS_DEFER_DESTROY(ds))
			return (1);

		return (0);
	}

	/* Return the on-disk ds_referenced_bytes of 'ds'. */
	uint64_t
	dsl_get_referenced(dsl_dataset_t *ds)
	{
		uint64_t referenced = dsl_dataset_phys(ds)->ds_referenced_bytes;

		return (referenced);
	}

	/*
	 * Return ds_num_children - 1 for snapshot 'ds', i.e. its clone count.
	 * Asserts that 'ds' is a snapshot.
	 */
	uint64_t
	dsl_get_numclones(dsl_dataset_t *ds)
	{
		uint64_t children;

		ASSERT(ds->ds_is_snapshot);
		children = dsl_dataset_phys(ds)->ds_num_children;
		return (children - 1);
	}

	/* Return 1 when DS_FLAG_INCONSISTENT is set on 'ds', otherwise 0. */
	uint64_t
	dsl_get_inconsistent(dsl_dataset_t *ds)
	{
		if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
			return (1);

		return (0);
	}

	/*
	 * Return the bytes available to 'ds': the space available in its
	 * dsl_dir, plus any refreservation not yet consumed by unique data,
	 * capped by whatever remains of the refquota when one is set.
	 */
	uint64_t
	dsl_get_available(dsl_dataset_t *ds)
	{
		uint64_t referenced = dsl_get_referenced(ds);
		uint64_t avail = dsl_dir_space_available(ds->ds_dir,
		    NULL, 0, TRUE);

		if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
			avail +=
			    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
		}

		/* no refquota: nothing further to apply */
		if (ds->ds_quota == 0)
			return (avail);

		/*
		 * Adjust available bytes according to refquota
		 */
		if (referenced >= ds->ds_quota)
			return (0);

		return (MIN(avail, ds->ds_quota - referenced));
	}

	/*
	 * Store in '*written' the space written to 'ds' since its previous
	 * snapshot, as computed by dsl_dataset_space_written().  Returns 0 on
	 * success, or the error from holding the previous snapshot or from the
	 * computation.
	 */
	int
	dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		dsl_dataset_t *prev;
		int err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err == 0) {
			/* the compressed/uncompressed results are not needed here */
			uint64_t comp, uncomp;
			err = dsl_dataset_space_written(prev, ds, written,
			    &comp, &uncomp);
			dsl_dataset_rele(prev, FTAG);
		}
		return (err);
	}

	/*
	 * Write the name of the previous snapshot of 'ds' into 'snap'.
	 * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
	 *
	 * Returns 0 on success, or ENOENT when there is no previous snapshot or
	 * it is the pool's origin snapshot.
	 */
	int
	dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
			dsl_dataset_name(ds->ds_prev, snap);
			return (0);
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	/*
	 * Returns the mountpoint property and source for the given dataset in the value
	 * and source buffers. The value buffer must be at least as large as MAXPATHLEN
	 * and the source buffer at least as large as ZFS_MAX_DATASET_NAME_LEN.
	 * Returns 0 on success and an error on failure.
	 *
	 * NOTE(review): the body reads and formats up to ZAP_MAXVALUELEN bytes into
	 * 'value'; confirm that callers' buffers are at least that large.
	 */
	int
	dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
	    char *source)
	{
		int error;
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		/* Retrieve the mountpoint value stored in the zap object */
		error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
		    ZAP_MAXVALUELEN, value, source);
		if (error != 0) {
			return (error);
		}

		/*
		 * Process the dsname and source to find the full mountpoint string.
		 * Values that do not start with '/' ('legacy' or 'none') are
		 * returned exactly as stored.
		 */
		if (value[0] == '/') {
			char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
			char *root = buf;
			const char *relpath;

			/*
			 * If we inherit the mountpoint, even from a dataset
			 * with a received value, the source will be the path of
			 * the dataset we inherit from. If source is
			 * ZPROP_SOURCE_VAL_RECVD, the received value is not
			 * inherited.
			 */
			if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
				relpath = "";
			} else {
				ASSERT0(strncmp(dsname, source, strlen(source)));
				relpath = dsname + strlen(source);
				if (relpath[0] == '/')
					relpath++;
			}

			spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);

			/*
			 * Special case an alternate root of '/'. This will
			 * avoid having multiple leading slashes in the
			 * mountpoint path.
			 */
			if (strcmp(root, "/") == 0)
				root++;

			/*
			 * If the mountpoint is '/' then skip over this
			 * if we are obtaining either an alternate root or
			 * an inherited mountpoint.
			 */
			char *mnt = value;
			if (value[1] == '\0' && (root[0] != '\0' ||
			    relpath[0] != '\0'))
				mnt = value + 1;

			/*
			 * 'mnt' points into 'value', which is also the output
			 * buffer below.  snprintf() has undefined behavior when
			 * source and destination overlap, so format from a
			 * private copy instead.
			 */
			mnt = spa_strdup(mnt);

			if (relpath[0] == '\0') {
				(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
				    root, mnt);
			} else {
				(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
				    root, mnt, relpath[0] == '@' ? "" : "/",
				    relpath);
			}
			strfree(mnt);
			/* free via 'buf': 'root' may have been advanced above */
			kmem_free(buf, ZAP_MAXVALUELEN);
		}
		return (0);
	}

	/*
	 * Add the statistics properties of 'ds' to 'nv'.  The pool config lock
	 * must be held.
	 *
	 * Snapshots additionally get their clone list; other datasets get the
	 * previous snapshot name, their dsl_dir statistics and any receive
	 * resume token.
	 */
	void
	dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		ASSERT(dsl_pool_config_held(dp));

		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
		    dsl_get_refratio(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
		    dsl_get_logicalreferenced(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
		    dsl_get_compressratio(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
		    dsl_get_used(ds));

		if (ds->ds_is_snapshot) {
			get_clones_stat(ds, nv);
		} else {
			char buf[ZFS_MAX_DATASET_NAME_LEN];
			if (dsl_get_prev_snap(ds, buf) == 0)
				dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
				    buf);
			dsl_dir_stats(ds->ds_dir, nv);
		}

		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
		    dsl_get_available(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
		    dsl_get_referenced(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
		    dsl_get_creation(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
		    dsl_get_creationtxg(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
		    dsl_get_refquota(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
		    dsl_get_refreservation(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
		    dsl_get_guid(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
		    dsl_get_unique(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
		    dsl_get_objsetid(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
		    dsl_get_userrefs(ds));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
		    dsl_get_defer_destroy(ds));

		/* "written" is only reported when a previous snapshot exists */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
			uint64_t written;
			if (dsl_get_written(ds, &written) == 0) {
				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
				    written);
			}
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			/*
			 * A failed "newfs" (e.g. full) resumable receive leaves
			 * the stats set on this dataset. Check here for the prop.
			 */
			get_receive_resume_stats(ds, nv);

			/*
			 * A failed incremental resumable receive leaves the
			 * stats set on our child named "%recv". Check the child
			 * for the prop.
			 */
			/* 6 extra bytes for /%recv */
			char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
			dsl_dataset_t *recv_ds;
			dsl_dataset_name(ds, recvname);
			if (strlcat(recvname, "/", sizeof (recvname)) <
			    sizeof (recvname) &&
			    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
			    sizeof (recvname) &&
			    dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
				get_receive_resume_stats(recv_ds, nv);
				dsl_dataset_rele(recv_ds, FTAG);
			}
		}
	}

	/*
	 * Fill 'stat' with the basic statistics of 'ds': creation txg,
	 * inconsistent flag, guid, snapshot flag, clone count (snapshots only)
	 * and origin name (clones only, otherwise the empty string).  The pool
	 * config lock must be held.
	 */
	void
	dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		ASSERT(dsl_pool_config_held(dp));

		stat->dds_creation_txg = dsl_get_creationtxg(ds);
		stat->dds_inconsistent = dsl_get_inconsistent(ds);
		stat->dds_guid = dsl_get_guid(ds);
		/* default to "no origin"; overwritten below for clones */
		stat->dds_origin[0] = '\0';
		if (ds->ds_is_snapshot) {
			stat->dds_is_snapshot = B_TRUE;
			stat->dds_num_clones = dsl_get_numclones(ds);
		} else {
			stat->dds_is_snapshot = B_FALSE;
			stat->dds_num_clones = 0;

			if (dsl_dir_is_clone(ds->ds_dir)) {
				dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
			}
		}
	}

	/* Return the in-core ds_fsid_guid of 'ds'. */
	uint64_t
	dsl_dataset_fsid_guid(dsl_dataset_t *ds)
	{
		uint64_t fsid_guid = ds->ds_fsid_guid;

		return (fsid_guid);
	}

	/*
	 * Report space and object usage of 'ds' through the four out-parameters:
	 *   *refdbytesp  - referenced bytes
	 *   *availbytesp - available bytes, including unconsumed refreservation
	 *                  and capped by the refquota when one is set
	 *   *usedobjsp   - fill count of the dataset's root block pointer
	 *   *availobjsp  - DN_MAX_OBJECT minus *usedobjsp
	 */
	void
	dsl_dataset_space(dsl_dataset_t *ds,
	    uint64_t *refdbytesp, uint64_t *availbytesp,
	    uint64_t *usedobjsp, uint64_t *availobjsp)
	{
		*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
		*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
		if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
			*availbytesp +=
			    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
		if (ds->ds_quota != 0) {
			/*
			 * Adjust available bytes according to refquota
			 */
			if (*refdbytesp < ds->ds_quota)
				*availbytesp = MIN(*availbytesp,
				    ds->ds_quota - *refdbytesp);
			else
				*availbytesp = 0;
		}
		/* ds_bp is read under ds_bp_rwlock */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		*availobjsp = DN_MAX_OBJECT - *usedobjsp;
	}

	/*
	 * Report whether 'ds' has been modified since snapshot 'snap' was
	 * taken.  The pool config lock must be held.
	 *
	 * Returns B_FALSE when 'snap' is NULL or when the root block of 'ds' is
	 * no newer than the snapshot.  Otherwise the meta dnodes of the two
	 * objsets are compared; failure to open either objset counts as
	 * modified.
	 */
	boolean_t
	dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		uint64_t birth;

		ASSERT(dsl_pool_config_held(dp));
		if (snap == NULL)
			return (B_FALSE);
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		birth = dsl_dataset_get_blkptr(ds)->blk_birth;
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
			objset_t *os, *os_snap;
			/*
			 * It may be that only the ZIL differs, because it was
			 * reset in the head. Don't count that as being
			 * modified.
			 */
			if (dmu_objset_from_ds(ds, &os) != 0)
				return (B_TRUE);
			if (dmu_objset_from_ds(snap, &os_snap) != 0)
				return (B_TRUE);
			return (bcmp(&os->os_phys->os_meta_dnode,
			    &os_snap->os_phys->os_meta_dnode,
			    sizeof (os->os_phys->os_meta_dnode)) != 0);
		}
		return (B_FALSE);
	}

	/*
	 * Argument bundle shared by the snapshot-rename check and sync
	 * callbacks.
	 */
	typedef struct dsl_dataset_rename_snapshot_arg {
	/* name of the dataset whose snapshot is renamed */
	const char *ddrsa_fsname;
	/* current snapshot name (the part after '@') */
	const char *ddrsa_oldsnapname;
	/* requested snapshot name (the part after '@') */
	const char *ddrsa_newsnapname;
	/* when set, the callbacks also visit child datasets (DS_FIND_CHILDREN) */
	boolean_t ddrsa_recursive;
	/* transaction, filled in by the sync callback for its per-dataset helper */
	dmu_tx_t *ddrsa_tx;
	} dsl_dataset_rename_snapshot_arg_t;

	/*
	 * Per-dataset check for a snapshot rename on head dataset 'hds'.
	 *
	 * Returns 0 when the old snapshot does not exist (nothing to do) or the
	 * rename is acceptable, EEXIST when the new name is already taken,
	 * ENAMETOOLONG when the resulting full name would not fit, or any other
	 * lookup error.
	 */
	/* ARGSUSED */
	static int
	dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
	    dsl_dataset_t *hds, void *arg)
	{
		dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
		int error;
		uint64_t val;

		error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
		if (error != 0) {
			/* ignore nonexistent snapshots */
			return (error == ENOENT ? 0 : error);
		}

		/* new name should not exist */
		error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
		if (error == 0)
			error = SET_ERROR(EEXIST);
		else if (error == ENOENT)
			error = 0;

		/* dataset name + 1 for the "@" + the new snapshot name must fit */
		if (dsl_dir_namelen(hds->ds_dir) + 1 +
		    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
			error = SET_ERROR(ENAMETOOLONG);

		return (error);
	}

	/*
	 * Check callback for a snapshot rename: validates the rename on
	 * ddrsa_fsname and, when ddrsa_recursive is set, on its children via
	 * dmu_objset_find_dp().  Returns 0 or the first error reported.
	 */
	static int
	dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *hds;
		int error;

		error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
		if (error != 0)
			return (error);

		if (ddrsa->ddrsa_recursive) {
			error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
			    dsl_dataset_rename_snapshot_check_impl, ddrsa,
			    DS_FIND_CHILDREN);
		} else {
			error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
		}
		dsl_dataset_rele(hds, FTAG);
		return (error);
	}

	/*
	 * Sync-phase worker: rename snapshot ddrsa_oldsnapname of head dataset
	 * "hds" to ddrsa_newsnapname using the transaction stored in ddrsa_tx.
	 * A head that has no such snapshot is skipped.  Always returns 0; any
	 * unexpected failure trips VERIFY0()/ASSERT().
	 */
	static int
	dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
	    dsl_dataset_t *hds, void *arg)
	{
	#if defined(__FreeBSD__) && defined(_KERNEL)
		char *oldname, *newname;
	#endif
		dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
		dsl_dataset_t *ds;
		uint64_t val;
		dmu_tx_t *tx = ddrsa->ddrsa_tx;
		int error;

		error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
		ASSERT(error == 0 || error == ENOENT);
		if (error == ENOENT) {
			/* ignore nonexistent snapshots */
			return (0);
		}

		VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));

		/* log before we change the name */
		spa_history_log_internal_ds(ds, "rename", tx,
		    "-> @%s", ddrsa->ddrsa_newsnapname);

		/* drop the old name entry, then re-add the object under the new one */
		VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
		    B_FALSE));
		mutex_enter(&ds->ds_lock);
		(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
		mutex_exit(&ds->ds_lock);
		VERIFY0(zap_add(dp->dp_meta_objset,
		    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
		    ds->ds_snapname, 8, 1, &ds->ds_object, tx));

	#if defined(__FreeBSD__) && defined(_KERNEL)
		/* tell mounted filesystems and zvol minors about the new name */
		oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
		    ddrsa->ddrsa_oldsnapname);
		snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
		    ddrsa->ddrsa_newsnapname);
		zfsvfs_update_fromname(oldname, newname);
		zvol_rename_minors(oldname, newname);
		kmem_free(newname, MAXPATHLEN);
		kmem_free(oldname, MAXPATHLEN);
	#endif
		dsl_dataset_rele(ds, FTAG);

		return (0);
	}

	/*
	 * Sync-task sync callback for snapshot rename.  "arg" is a
	 * dsl_dataset_rename_snapshot_arg_t.  Records the syncing transaction in
	 * ddrsa_tx for the per-dataset worker, then applies the worker to the
	 * head dataset (and to all children when ddrsa_recursive is set).
	 */
	static void
	dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *hds;

		VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
		ddrsa->ddrsa_tx = tx;
		if (ddrsa->ddrsa_recursive) {
			VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
			    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
			    DS_FIND_CHILDREN));
		} else {
			VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
		}
		dsl_dataset_rele(hds, FTAG);
	}

	/*
	 * Rename snapshot "oldsnapname" of dataset "fsname" to "newsnapname".
	 * With "recursive" set, the same rename is applied to the dataset's
	 * children as well.  Runs the check/sync pair as a sync task and returns
	 * its result.
	 */
	int
	dsl_dataset_rename_snapshot(const char *fsname,
	    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
	{
		dsl_dataset_rename_snapshot_arg_t ddrsa;

		ddrsa.ddrsa_fsname = fsname;
		ddrsa.ddrsa_oldsnapname = oldsnapname;
		ddrsa.ddrsa_newsnapname = newsnapname;
		ddrsa.ddrsa_recursive = recursive;
		/* set for real by the sync callback; don't leave it indeterminate */
		ddrsa.ddrsa_tx = NULL;

		return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
		    dsl_dataset_rename_snapshot_sync, &ddrsa,
		    1, ZFS_SPACE_CHECK_RESERVED));
	}

	/*
	 * If we're doing an ownership handoff, we need to make sure that there is
	 * only one long hold on the dataset.  We're not allowed to change anything
	 * here so we don't permanently release the long hold or regular hold here.
	 * We want to do this only when syncing to avoid the dataset unexpectedly
	 * going away when we release the long hold.
	 *
	 * Returns 0 when not in syncing context or when no long hold other than
	 * the owner's remains, EBUSY otherwise.
	 */
	static int
	dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
	{
		boolean_t held;

		if (!dmu_tx_is_syncing(tx))
			return (0);

		/* temporarily drop the owner's long hold so it is not counted */
		if (owner != NULL) {
			VERIFY3P(ds->ds_owner, ==, owner);
			dsl_dataset_long_rele(ds, owner);
		}

		held = dsl_dataset_long_held(ds);

		/* restore the owner's long hold before reporting the result */
		if (owner != NULL)
			dsl_dataset_long_hold(ds, owner);

		if (held)
			return (SET_ERROR(EBUSY));

		return (0);
	}

	/*
	 * Sync-task check callback for rollback.  "arg" is a
	 * dsl_dataset_rollback_arg_t.
	 *
	 * Returns 0 if the dataset named by ddra_fsname may be rolled back to its
	 * most recent snapshot, otherwise:
	 *	EINVAL	the dataset is itself a snapshot
	 *	ESRCH	there is no previous snapshot, or ddra_tosnap is missing
	 *		or unrelated to the dataset
	 *	EAGAIN	the previous snapshot was created in the syncing txg
	 *	EEXIST	ddra_tosnap is not the latest snapshot, or a bookmark is
	 *		newer than the latest snapshot
	 *	EBUSY	(via dsl_dataset_handoff_check) other long holds exist
	 *	EDQUOT	the snapshot references more than the refquota
	 *	ENOSPC	not enough space for the transient refreservation
	 * or any error from the underlying hold/lookup calls.
	 */
	int
	dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_rollback_arg_t *ddra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int64_t unused_refres_delta;
		int error;

		error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
		if (error != 0)
			return (error);

		/* must not be a snapshot */
		if (ds->ds_is_snapshot) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		/* must have a most recent snapshot */
		if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ESRCH));
		}

		/*
		 * No rollback to a snapshot created in the current txg, because
		 * the rollback may dirty the dataset and create blocks that are
		 * not reachable from the rootbp while having a birth txg that
		 * falls into the snapshot's range.
		 */
		if (dmu_tx_is_syncing(tx) &&
		    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EAGAIN));
		}

		/*
		 * If the expected target snapshot is specified, then check that
		 * the latest snapshot is it.
		 */
		if (ddra->ddra_tosnap != NULL) {
			dsl_dataset_t *snapds;

			/* Check if the target snapshot exists at all. */
			error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
			if (error != 0) {
				/*
				 * ESRCH is used to signal that the target snapshot
				 * does not exist, while ENOENT is used to report
				 * that the rolled back dataset does not exist.
				 * ESRCH is also used to cover other cases where the
				 * target snapshot is not related to the dataset
				 * being rolled back such as being in a different
				 * pool.
				 */
				if (error == ENOENT || error == EXDEV)
					error = SET_ERROR(ESRCH);
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			ASSERT(snapds->ds_is_snapshot);

			/* Check if the snapshot is the latest snapshot indeed. */
			if (snapds != ds->ds_prev) {
				/*
				 * Distinguish between the case where the only
				 * problem is intervening snapshots (EEXIST) vs the
				 * snapshot not being a valid target for rollback
				 * (ESRCH).
				 */
				if (snapds->ds_dir == ds->ds_dir ||
				    (dsl_dir_is_clone(ds->ds_dir) &&
				    dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
				    snapds->ds_object)) {
					error = SET_ERROR(EEXIST);
				} else {
					error = SET_ERROR(ESRCH);
				}
				dsl_dataset_rele(snapds, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			dsl_dataset_rele(snapds, FTAG);
		}

		/* must not have any bookmarks after the most recent snapshot */
		nvlist_t *proprequest = fnvlist_alloc();
		fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
		nvlist_t *bookmarks = fnvlist_alloc();
		error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
		fnvlist_free(proprequest);
		if (error != 0) {
			/* don't leak the result list on the failure path */
			fnvlist_free(bookmarks);
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
		for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
			nvlist_t *valuenv =
			    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
			    zfs_prop_to_name(ZFS_PROP_CREATETXG));
			uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
			if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
				fnvlist_free(bookmarks);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EEXIST));
			}
		}
		fnvlist_free(bookmarks);

		error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		/*
		 * Check if the snap we are rolling back to uses more than
		 * the refquota.
		 */
		if (ds->ds_quota != 0 &&
		    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EDQUOT));
		}

		/*
		 * When we do the clone swap, we will temporarily use more space
		 * due to the refreservation (the head will no longer have any
		 * unique space, so the entire amount of the refreservation will need
		 * to be free).  We will immediately destroy the clone, freeing
		 * this space, but the freeing happens over many txg's.
		 */
		unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
		    dsl_dataset_phys(ds)->ds_unique_bytes);

		if (unused_refres_delta > 0 &&
		    unused_refres_delta >
		    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	/*
	 * Sync-task sync callback for rollback.  "arg" is a
	 * dsl_dataset_rollback_arg_t.  Records the name of the target snapshot
	 * under "target" in ddra_result, creates a temporary "%rollback" clone of
	 * the previous snapshot, swaps it with the head, zeroes the head's ZIL
	 * and destroys the temporary clone.
	 */
	void
	dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_rollback_arg_t *ddra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds, *clone;
		uint64_t cloneobj;
		char namebuf[ZFS_MAX_DATASET_NAME_LEN];

		VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));

		dsl_dataset_name(ds->ds_prev, namebuf);
		fnvlist_add_string(ddra->ddra_result, "target", namebuf);

		cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
		    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);

		VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

		dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
		dsl_dataset_zero_zil(ds, tx);

		dsl_destroy_head_sync_impl(clone, tx);

		dsl_dataset_rele(clone, FTAG);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Rolls back the given filesystem or volume to the most recent snapshot.
	 * The name of the most recent snapshot will be returned under key "target"
	 * in the result nvlist.
	 *
	 * If "tosnap" is not NULL, the check phase additionally requires that the
	 * most recent snapshot is the one named by "tosnap".
	 *
	 * If owner != NULL:
	 * - The existing dataset MUST be owned by the specified owner at entry
	 * - Upon return, dataset will still be held by the same owner, whether we
	 *   succeed or not.
	 *
	 * This mode is required any time the existing filesystem is mounted.  See
	 * notes above zfs_suspend_fs() for further details.
	 */
	int
	dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
	    nvlist_t *result)
	{
		dsl_dataset_rollback_arg_t ddra;

		ddra.ddra_fsname = fsname;
		ddra.ddra_tosnap = tosnap;
		ddra.ddra_owner = owner;
		ddra.ddra_result = result;

		return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
		    dsl_dataset_rollback_sync, &ddra,
		    1, ZFS_SPACE_CHECK_RESERVED));
	}

	/*
	 * One element of the snapshot lists built by snaplist_make(): the list
	 * linkage plus the held dataset it refers to.
	 */
	struct promotenode {
	list_node_t link;	/* linkage; offset passed to list_create() */
	dsl_dataset_t *ds;	/* held in snaplist_make(), released in snaplist_destroy() */
	};

	/* Forward declarations for helpers defined after their first use. */
	static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
	static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
	    void *tag);
	static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);

	/*
	 * Sync-task check callback for promote.  "arg" is a
	 * dsl_dataset_promote_arg_t.
	 *
	 * Takes the holds described by promote_hold() and releases them again
	 * before returning.  In open context only the DS_FLAG_NOPROMOTE test is
	 * performed; in syncing context the space to be transferred is computed
	 * into ddpa (unique, used, comp, uncomp, cloneusedsnap, originusedsnap)
	 * for use by the sync callback.  Names of conflicting snapshots are added
	 * to ddpa->err_ds.
	 *
	 * Returns 0 on success, EXDEV if the clone must not be promoted, EBUSY if
	 * a snapshot to be moved has long holds, ENAMETOOLONG or EEXIST for
	 * snapshot name problems, or an error from the helpers called.
	 */
	int
	dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_promote_arg_t *ddpa = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *hds;
		struct promotenode *snap;
		dsl_dataset_t *origin_ds;
		int err;
		uint64_t unused;
		uint64_t ss_mv_cnt;
		size_t max_snap_len;
		boolean_t conflicting_snaps;

		err = promote_hold(ddpa, dp, FTAG);
		if (err != 0)
			return (err);

		hds = ddpa->ddpa_clone;
		snap = list_head(&ddpa->shared_snaps);
		origin_ds = snap->ds;
		max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;

		snap = list_head(&ddpa->origin_snaps);

		if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
			promote_rele(ddpa, FTAG);
			return (SET_ERROR(EXDEV));
		}

		/*
		 * Compute and check the amount of space to transfer.  Since this is
		 * so expensive, don't do the preliminary check.
		 */
		if (!dmu_tx_is_syncing(tx)) {
			promote_rele(ddpa, FTAG);
			return (0);
		}

		/* compute origin's new unique space */
		snap = list_tail(&ddpa->clone_snaps);
		ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
		    origin_ds->ds_object);
		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
		    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
		    &ddpa->unique, &unused, &unused);

		/*
		 * Walk the snapshots that we are moving
		 *
		 * Compute space to transfer.  Consider the incremental changes
		 * to used by each snapshot:
		 * (my used) = (prev's used) + (blocks born) - (blocks killed)
		 * So each snapshot gave birth to:
		 * (blocks born) = (my used) - (prev's used) + (blocks killed)
		 * So a sequence would look like:
		 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
		 * Which simplifies to:
		 * uN + kN + kN-1 + ... + k1 + k0
		 * Note however, if we stop before we reach the ORIGIN we get:
		 * uN + kN + kN-1 + ... + kM - uM-1
		 */
		conflicting_snaps = B_FALSE;
		ss_mv_cnt = 0;
		ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
		ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
		ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
		for (snap = list_head(&ddpa->shared_snaps); snap;
		    snap = list_next(&ddpa->shared_snaps, snap)) {
			uint64_t val, dlused, dlcomp, dluncomp;
			dsl_dataset_t *ds = snap->ds;

			ss_mv_cnt++;

			/*
			 * If there are long holds, we won't be able to evict
			 * the objset.
			 */
			if (dsl_dataset_long_held(ds)) {
				err = SET_ERROR(EBUSY);
				goto out;
			}

			/* Check that the snapshot name does not conflict */
			VERIFY0(dsl_dataset_get_snapname(ds));
			if (strlen(ds->ds_snapname) >= max_snap_len) {
				err = SET_ERROR(ENAMETOOLONG);
				goto out;
			}
			err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
			if (err == 0) {
				fnvlist_add_boolean(ddpa->err_ds,
				    snap->ds->ds_snapname);
				conflicting_snaps = B_TRUE;
			} else if (err != ENOENT) {
				goto out;
			}

			/* The very first snapshot does not have a deadlist */
			if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
				continue;

			dsl_deadlist_space(&ds->ds_deadlist,
			    &dlused, &dlcomp, &dluncomp);
			ddpa->used += dlused;
			ddpa->comp += dlcomp;
			ddpa->uncomp += dluncomp;
		}

		/*
		 * In order to return the full list of conflicting snapshots, we check
		 * whether there was a conflict after traversing all of them.
		 */
		if (conflicting_snaps) {
			err = SET_ERROR(EEXIST);
			goto out;
		}

		/*
		 * If we are a clone of a clone then we never reached ORIGIN,
		 * so we need to subtract out the clone origin's used space.
		 */
		if (ddpa->origin_origin) {
			ddpa->used -=
			    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
			ddpa->comp -=
			    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
			ddpa->uncomp -=
			    dsl_dataset_phys(ddpa->origin_origin)->
			    ds_uncompressed_bytes;
		}

		/* Check that there is enough space and limit headroom here */
		err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
		    0, ss_mv_cnt, ddpa->used, ddpa->cr);
		if (err != 0)
			goto out;

		/*
		 * Compute the amounts of space that will be used by snapshots
		 * after the promotion (for both origin and clone).  For each,
		 * it is the amount of space that will be on all of their
		 * deadlists (that was not born before their new origin).
		 */
		if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
			uint64_t space;

			/*
			 * Note, typically this will not be a clone of a clone,
			 * so dd_origin_txg will be < TXG_INITIAL, so
			 * these snaplist_space() -> dsl_deadlist_space_range()
			 * calls will be fast because they do not have to
			 * iterate over all bps.
			 */
			snap = list_head(&ddpa->origin_snaps);
			err = snaplist_space(&ddpa->shared_snaps,
			    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
			if (err != 0)
				goto out;

			err = snaplist_space(&ddpa->clone_snaps,
			    snap->ds->ds_dir->dd_origin_txg, &space);
			if (err != 0)
				goto out;
			ddpa->cloneusedsnap += space;
		}
		if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
		    DD_FLAG_USED_BREAKDOWN) {
			err = snaplist_space(&ddpa->origin_snaps,
			    dsl_dataset_phys(origin_ds)->ds_creation_txg,
			    &ddpa->originusedsnap);
			if (err != 0)
				goto out;
		}

	out:
		promote_rele(ddpa, FTAG);
		return (err);
	}

	/*
	 * Sync-task sync callback for promote.  "arg" is a
	 * dsl_dataset_promote_arg_t whose space fields were filled in by
	 * dsl_dataset_promote_check().
	 *
	 * Re-links the origin snapshot's next-snapshot/next-clones references,
	 * exchanges the origin relationship between the clone's dir ("dd") and
	 * the origin's dir ("odd"), moves every shared snapshot into the clone's
	 * dir, and finally adjusts the space accounting of both dirs.  Holds
	 * taken by promote_hold() are released before returning.
	 */
	void
	dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_promote_arg_t *ddpa = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *hds;
		struct promotenode *snap;
		dsl_dataset_t *origin_ds;
		dsl_dataset_t *origin_head;
		dsl_dir_t *dd;
		dsl_dir_t *odd = NULL;
		uint64_t oldnext_obj;
		int64_t delta;
	#if defined(__FreeBSD__) && defined(_KERNEL)
		char *oldname, *newname;
	#endif

		VERIFY0(promote_hold(ddpa, dp, FTAG));
		hds = ddpa->ddpa_clone;

		ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);

		snap = list_head(&ddpa->shared_snaps);
		origin_ds = snap->ds;
		dd = hds->ds_dir;

		snap = list_head(&ddpa->origin_snaps);
		origin_head = snap->ds;

		/*
		 * We need to explicitly open odd, since origin_ds's dd will be
		 * changing.
		 */
		VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
		    NULL, FTAG, &odd));

		/* change origin's next snap */
		dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
		oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
		snap = list_tail(&ddpa->clone_snaps);
		ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
		    origin_ds->ds_object);
		dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;

		/* change the origin's next clone */
		if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
			dsl_dataset_remove_from_next_clones(origin_ds,
			    snap->ds->ds_object, tx);
			VERIFY0(zap_add_int(dp->dp_meta_objset,
			    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
			    oldnext_obj, tx));
		}

		/* change origin */
		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
		dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
		dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
		dmu_buf_will_dirty(odd->dd_dbuf, tx);
		dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
		origin_head->ds_dir->dd_origin_txg =
		    dsl_dataset_phys(origin_ds)->ds_creation_txg;

		/* change dd_clone entries */
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			VERIFY0(zap_remove_int(dp->dp_meta_objset,
			    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
			VERIFY0(zap_add_int(dp->dp_meta_objset,
			    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
			    hds->ds_object, tx));

			VERIFY0(zap_remove_int(dp->dp_meta_objset,
			    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
			    origin_head->ds_object, tx));
			if (dsl_dir_phys(dd)->dd_clones == 0) {
				dsl_dir_phys(dd)->dd_clones =
				    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
				    DMU_OT_NONE, 0, tx);
			}
			VERIFY0(zap_add_int(dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
		}

	#if defined(__FreeBSD__) && defined(_KERNEL)
		/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
		mutex_enter(&spa_namespace_lock);

		oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	#endif

		/* move snapshots to this dir */
		for (snap = list_head(&ddpa->shared_snaps); snap;
		    snap = list_next(&ddpa->shared_snaps, snap)) {
			dsl_dataset_t *ds = snap->ds;

			/*
			 * Property callbacks are registered to a particular
			 * dsl_dir.  Since ours is changing, evict the objset
			 * so that they will be unregistered from the old dsl_dir.
			 */
			if (ds->ds_objset) {
				dmu_objset_evict(ds->ds_objset);
				ds->ds_objset = NULL;
			}

	#if defined(__FreeBSD__) && defined(_KERNEL)
			/*
			 * Record the snapshot's full name while it still lives
			 * in the origin's dir; the buffer comes from kmem_alloc()
			 * and is otherwise never written before it is used as
			 * the "from" name below.
			 */
			dsl_dataset_name(ds, oldname);
	#endif

			/* move snap name entry */
			VERIFY0(dsl_dataset_get_snapname(ds));
			VERIFY0(dsl_dataset_snap_remove(origin_head,
			    ds->ds_snapname, tx, B_TRUE));
			VERIFY0(zap_add(dp->dp_meta_objset,
			    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
			    8, 1, &ds->ds_object, tx));
			dsl_fs_ss_count_adjust(hds->ds_dir, 1,
			    DD_FIELD_SNAPSHOT_COUNT, tx);

			/* change containing dsl_dir */
			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
			dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
			ASSERT3P(ds->ds_dir, ==, odd);
			dsl_dir_rele(ds->ds_dir, ds);
			VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
			    NULL, ds, &ds->ds_dir));

	#if defined(__FreeBSD__) && defined(_KERNEL)
			dsl_dataset_name(ds, newname);
			zfsvfs_update_fromname(oldname, newname);
			zvol_rename_minors(oldname, newname);
	#endif

			/* move any clone references */
			if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
			    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				zap_cursor_t zc;
				zap_attribute_t za;

				for (zap_cursor_init(&zc, dp->dp_meta_objset,
				    dsl_dataset_phys(ds)->ds_next_clones_obj);
				    zap_cursor_retrieve(&zc, &za) == 0;
				    zap_cursor_advance(&zc)) {
					dsl_dataset_t *cnds;
					uint64_t o;

					if (za.za_first_integer == oldnext_obj) {
						/*
						 * We've already moved the
						 * origin's reference.
						 */
						continue;
					}

					VERIFY0(dsl_dataset_hold_obj(dp,
					    za.za_first_integer, FTAG, &cnds));
					o = dsl_dir_phys(cnds->ds_dir)->
					    dd_head_dataset_obj;

					VERIFY0(zap_remove_int(dp->dp_meta_objset,
					    dsl_dir_phys(odd)->dd_clones, o, tx));
					VERIFY0(zap_add_int(dp->dp_meta_objset,
					    dsl_dir_phys(dd)->dd_clones, o, tx));
					dsl_dataset_rele(cnds, FTAG);
				}
				zap_cursor_fini(&zc);
			}

			ASSERT(!dsl_prop_hascb(ds));
		}

	#if defined(__FreeBSD__) && defined(_KERNEL)
		mutex_exit(&spa_namespace_lock);

		kmem_free(newname, MAXPATHLEN);
		kmem_free(oldname, MAXPATHLEN);
	#endif
		/*
		 * Change space accounting.
		 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
		 * both be valid, or both be 0 (resulting in delta == 0).  This
		 * is true for each of {clone,origin} independently.
		 */

		delta = ddpa->cloneusedsnap -
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
		ASSERT3S(delta, >=, 0);
		ASSERT3U(ddpa->used, >=, delta);
		dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
		dsl_dir_diduse_space(dd, DD_USED_HEAD,
		    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);

		delta = ddpa->originusedsnap -
		    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
		ASSERT3S(delta, <=, 0);
		ASSERT3U(ddpa->used, >=, -delta);
		dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
		dsl_dir_diduse_space(odd, DD_USED_HEAD,
		    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);

		dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;

		/* log history record */
		spa_history_log_internal_ds(hds, "promote", tx, "");

		dsl_dir_rele(odd, FTAG);
		promote_rele(ddpa, FTAG);
	}

	/*
	 * Make a list of dsl_dataset_t's for the snapshots between first_obj
	 * (exclusive) and last_obj (inclusive).  The list will be in reverse
	 * order (last_obj will be the list_head()).  If first_obj == 0, do all
	 * snapshots back to this dataset's origin.
	 *
	 * Each dataset on the list is held with "tag".  On failure the error from
	 * dsl_dataset_hold_obj() is returned and the list keeps whatever entries
	 * were already added; the caller is expected to tear it down (see
	 * snaplist_destroy()).
	 */
	static int
	snaplist_make(dsl_pool_t *dp,
	    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
	{
		uint64_t obj = last_obj;

		list_create(l, sizeof (struct promotenode),
		    offsetof(struct promotenode, link));

		while (obj != first_obj) {
			dsl_dataset_t *ds;
			struct promotenode *snap;
			int err;

			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
			ASSERT(err != ENOENT);
			if (err != 0)
				return (err);

			/* resolve "back to the origin" once the dir is known */
			if (first_obj == 0)
				first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;

			snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
			snap->ds = ds;
			list_insert_tail(l, snap);
			obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		}

		return (0);
	}

	/*
	 * Sum, over every snapshot on list "l", the "used" space reported by
	 * dsl_deadlist_space_range() for the txg range (mintxg, UINT64_MAX) of
	 * that snapshot's deadlist.  The total is stored in *spacep.
	 * Always returns 0.
	 */
	static int
	snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
	{
		struct promotenode *snap;

		*spacep = 0;
		for (snap = list_head(l); snap; snap = list_next(l, snap)) {
			uint64_t used, comp, uncomp;

			dsl_deadlist_space_range(&snap->ds->ds_deadlist,
			    mintxg, UINT64_MAX, &used, &comp, &uncomp);
			*spacep += used;
		}
		return (0);
	}

	/*
	 * Tear down a list built by snaplist_make(): release every dataset hold
	 * taken with "tag", free the nodes and destroy the list.  A NULL list or
	 * one whose head is not linked (never created) is left untouched.
	 */
	static void
	snaplist_destroy(list_t *l, void *tag)
	{
		struct promotenode *snap;

		if (l == NULL || !list_link_active(&l->list_head))
			return;

		while ((snap = list_tail(l)) != NULL) {
			list_remove(l, snap);
			dsl_dataset_rele(snap->ds, tag);
			kmem_free(snap, sizeof (*snap));
		}
		list_destroy(l);
	}

	/*
	 * Take every hold a promote needs and record it in "ddpa":
	 *	ddpa_clone	the dataset named by ddpa_clonename
	 *	shared_snaps	snapshots from the clone's origin back to the
	 *			origin's own origin
	 *	clone_snaps	the clone and its snapshots back to its origin
	 *	origin_snaps	the origin dir's head back to (but excluding)
	 *			the clone's origin snapshot
	 *	origin_origin	the origin dir's own origin, if it has one
	 *
	 * Returns EINVAL if the named dataset is a snapshot or is not a clone,
	 * or the error from a failed hold.  On failure after the clone has been
	 * held, everything taken so far is released via promote_rele().
	 */
	static int
	promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
	{
		int error;
		dsl_dir_t *dd;
		struct promotenode *snap;

		error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
		    &ddpa->ddpa_clone);
		if (error != 0)
			return (error);
		dd = ddpa->ddpa_clone->ds_dir;

		if (ddpa->ddpa_clone->ds_is_snapshot ||
		    !dsl_dir_is_clone(dd)) {
			dsl_dataset_rele(ddpa->ddpa_clone, tag);
			return (SET_ERROR(EINVAL));
		}

		error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
		    &ddpa->shared_snaps, tag);
		if (error != 0)
			goto out;

		error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
		    &ddpa->clone_snaps, tag);
		if (error != 0)
			goto out;

		snap = list_head(&ddpa->shared_snaps);
		ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
		error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
		    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
		    &ddpa->origin_snaps, tag);
		if (error != 0)
			goto out;

		if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
			error = dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
			    tag, &ddpa->origin_origin);
			if (error != 0)
				goto out;
		}
	out:
		if (error != 0)
			promote_rele(ddpa, tag);
		return (error);
	}

	/*
	 * Release everything promote_hold() recorded in "ddpa": the three
	 * snapshot lists, the origin's origin (when present) and the clone
	 * itself.  Lists that were never created are skipped by
	 * snaplist_destroy().
	 */
	static void
	promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
	{
		snaplist_destroy(&ddpa->shared_snaps, tag);
		snaplist_destroy(&ddpa->clone_snaps, tag);
		snaplist_destroy(&ddpa->origin_snaps, tag);
		if (ddpa->origin_origin != NULL)
			dsl_dataset_rele(ddpa->origin_origin, tag);
		dsl_dataset_rele(ddpa->ddpa_clone, tag);
	}

	/*
	 * Promote a clone.
	 *
	 * If it fails due to a conflicting snapshot name, "conflsnap" will be
	 * filled in with the name.  (It must be at least ZFS_MAX_DATASET_NAME_LEN
	 * bytes long.)  "conflsnap" may be NULL if the caller is not interested.
	 *
	 * Returns 0 on success or the error from the objset hold, the snapshot
	 * count lookup, or the promote sync task.
	 */
	int
	dsl_dataset_promote(const char *name, char *conflsnap)
	{
		dsl_dataset_promote_arg_t ddpa = { 0 };
		uint64_t numsnaps;
		int error;
		nvpair_t *snap_pair;
		objset_t *os;

		/*
		 * We will modify space proportional to the number of
		 * snapshots.  Compute numsnaps.
		 */
		error = dmu_objset_hold(name, FTAG, &os);
		if (error != 0)
			return (error);
		error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
		    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
		    &numsnaps);
		dmu_objset_rele(os, FTAG);
		if (error != 0)
			return (error);

		ddpa.ddpa_clonename = name;
		ddpa.err_ds = fnvlist_alloc();
		ddpa.cr = CRED();

		error = dsl_sync_task(name, dsl_dataset_promote_check,
		    dsl_dataset_promote_sync, &ddpa,
		    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);

		/*
		 * Return the first conflicting snapshot found.
		 */
		snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
		if (snap_pair != NULL && conflsnap != NULL)
			(void) strcpy(conflsnap, nvpair_name(snap_pair));

		fnvlist_free(ddpa.err_ds);
		return (error);
	}

	/*
	 * Check whether the contents of "clone" and "origin_head" may be swapped.
	 *
	 * With "force" unset, both must branch from the same snapshot and
	 * origin_head must be unmodified since that snapshot.  "owner" and "tx"
	 * are passed on to dsl_dataset_handoff_check().
	 *
	 * Returns 0 if the swap is allowed, otherwise EINVAL (not heads, wrong
	 * branch point, or not parent/child), ETXTBSY (origin_head modified),
	 * EBUSY (long holds), ENOSPC (refreservation cannot be satisfied) or
	 * EDQUOT (clone exceeds origin_head's refquota plus slack).
	 */
	int
	dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
	    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
	{
		/*
		 * "slack" factor for received datasets with refquota set on them.
		 * See the bottom of this function for details on its use.
		 */
		uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
		int64_t unused_refres_delta;

		/* they should both be heads */
		if (clone->ds_is_snapshot ||
		    origin_head->ds_is_snapshot)
			return (SET_ERROR(EINVAL));

		/* if we are not forcing, the branch point should be just before them */
		if (!force && clone->ds_prev != origin_head->ds_prev)
			return (SET_ERROR(EINVAL));

		/* clone should be the clone (unless they are unrelated) */
		if (clone->ds_prev != NULL &&
		    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
		    origin_head->ds_dir != clone->ds_prev->ds_dir)
			return (SET_ERROR(EINVAL));

		/* the clone should be a child of the origin */
		if (clone->ds_dir->dd_parent != origin_head->ds_dir)
			return (SET_ERROR(EINVAL));

		/* origin_head shouldn't be modified unless 'force' */
		if (!force &&
		    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
			return (SET_ERROR(ETXTBSY));

		/* origin_head should have no long holds (e.g. is not mounted) */
		if (dsl_dataset_handoff_check(origin_head, owner, tx))
			return (SET_ERROR(EBUSY));

		/* check amount of any unconsumed refreservation */
		unused_refres_delta =
		    (int64_t)MIN(origin_head->ds_reserved,
		    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
		    (int64_t)MIN(origin_head->ds_reserved,
		    dsl_dataset_phys(clone)->ds_unique_bytes);

		if (unused_refres_delta > 0 &&
		    unused_refres_delta >
		    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
			return (SET_ERROR(ENOSPC));

		/*
		 * The clone can't be too much over the head's refquota.
		 *
		 * To ensure that the entire refquota can be used, we allow one
		 * transaction to exceed the refquota.  Therefore, this check
		 * needs to also allow for the space referenced to be more than the
		 * refquota.  The maximum amount of space that one transaction can use
		 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.  Allowing this
		 * overage ensures that we are able to receive a filesystem that
		 * exceeds the refquota on the source system.
		 *
		 * So that overage is the refquota_slack we use below.
		 */
		if (origin_head->ds_quota != 0 &&
		    dsl_dataset_phys(clone)->ds_referenced_bytes >
		    origin_head->ds_quota + refquota_slack)
			return (SET_ERROR(EDQUOT));

		return (0);
	}

	+/*
	+ * Exchange the remap deadlist objects of "clone" and "origin".  Must be
	+ * called in pool sync context.  Both open deadlists are closed and their
	+ * object references cleared first; afterwards each dataset is pointed at
	+ * (and opens) the object the other one had, if any.
	+ */
	+static void
	+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
	+    dsl_dataset_t *origin, dmu_tx_t *tx)
	+{
	+	uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
	+	dsl_pool_t *dp = dmu_tx_pool(tx);
	+
	+	ASSERT(dsl_pool_sync_context(dp));
	+
	+	clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
	+	origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
	+
	+	/* detach both sides before attaching either */
	+	if (clone_remap_dl_obj != 0) {
	+		dsl_deadlist_close(&clone->ds_remap_deadlist);
	+		dsl_dataset_unset_remap_deadlist_object(clone, tx);
	+	}
	+	if (origin_remap_dl_obj != 0) {
	+		dsl_deadlist_close(&origin->ds_remap_deadlist);
	+		dsl_dataset_unset_remap_deadlist_object(origin, tx);
	+	}
	+
	+	if (clone_remap_dl_obj != 0) {
	+		dsl_dataset_set_remap_deadlist_object(origin,
	+		    clone_remap_dl_obj, tx);
	+		dsl_deadlist_open(&origin->ds_remap_deadlist,
	+		    dp->dp_meta_objset, clone_remap_dl_obj);
	+	}
	+	if (origin_remap_dl_obj != 0) {
	+		dsl_dataset_set_remap_deadlist_object(clone,
	+		    origin_remap_dl_obj, tx);
	+		dsl_deadlist_open(&clone->ds_remap_deadlist,
	+		    dp->dp_meta_objset, origin_remap_dl_obj);
	+	}
	+}
	+
	void
	dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
	dsl_dataset_t origin_head, dmu_tx_t tx)
	{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int64_t unused_refres_delta;

	ASSERT(clone->ds_reserved == 0);
	/*
	* NOTE: On DEBUG kernels there could be a race between this and
	* the check function if spa_asize_inflation is adjusted...
	*/
	ASSERT(origin_head->ds_quota == 0 \|\|
	dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
	DMU_MAX_ACCESS * spa_asize_inflation);
	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);

	/*
	* Swap per-dataset feature flags.
	*/
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
	if (!(spa_feature_table[f].fi_flags &
	ZFEATURE_FLAG_PER_DATASET)) {
	ASSERT(!clone->ds_feature_inuse[f]);
	ASSERT(!origin_head->ds_feature_inuse[f]);
	continue;
	}

	boolean_t clone_inuse = clone->ds_feature_inuse[f];
	boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];

	if (clone_inuse) {
	dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
	clone->ds_feature_inuse[f] = B_FALSE;
	}
	if (origin_head_inuse) {
	dsl_dataset_deactivate_feature(origin_head->ds_object,
	f, tx);
	origin_head->ds_feature_inuse[f] = B_FALSE;
	}
	if (clone_inuse) {
	dsl_dataset_activate_feature(origin_head->ds_object,
	f, tx);
	origin_head->ds_feature_inuse[f] = B_TRUE;
	}
	if (origin_head_inuse) {
	dsl_dataset_activate_feature(clone->ds_object, f, tx);
	clone->ds_feature_inuse[f] = B_TRUE;
	}
	}

	dmu_buf_will_dirty(clone->ds_dbuf, tx);
	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);

	if (clone->ds_objset != NULL) {
	dmu_objset_evict(clone->ds_objset);
	clone->ds_objset = NULL;
	}

	if (origin_head->ds_objset != NULL) {
	dmu_objset_evict(origin_head->ds_objset);
	origin_head->ds_objset = NULL;
	}

	unused_refres_delta =
	(int64_t)MIN(origin_head->ds_reserved,
	dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	(int64_t)MIN(origin_head->ds_reserved,
	dsl_dataset_phys(clone)->ds_unique_bytes);

	/*
	* Reset origin's unique bytes, if it exists.
	*/
	if (clone->ds_prev) {
	dsl_dataset_t *origin = clone->ds_prev;
	uint64_t comp, uncomp;

	dmu_buf_will_dirty(origin->ds_dbuf, tx);
	dsl_deadlist_space_range(&clone->ds_deadlist,
	dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
	&dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs */
	{
	rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
	rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
	blkptr_t tmp;
	tmp = dsl_dataset_phys(origin_head)->ds_bp;
	dsl_dataset_phys(origin_head)->ds_bp =
	dsl_dataset_phys(clone)->ds_bp;
	dsl_dataset_phys(clone)->ds_bp = tmp;
	rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
	rrw_exit(&clone->ds_bp_rwlock, FTAG);
	}

	/* set dd__bytes /
	{
	int64_t dused, dcomp, duncomp;
	uint64_t cdl_used, cdl_comp, cdl_uncomp;
	uint64_t odl_used, odl_comp, odl_uncomp;

	ASSERT3U(dsl_dir_phys(clone->ds_dir)->
	dd_used_breakdown[DD_USED_SNAP], ==, 0);

	dsl_deadlist_space(&clone->ds_deadlist,
	&cdl_used, &cdl_comp, &cdl_uncomp);
	dsl_deadlist_space(&origin_head->ds_deadlist,
	&odl_used, &odl_comp, &odl_uncomp);

	dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
	cdl_used -
	(dsl_dataset_phys(origin_head)->ds_referenced_bytes +
	odl_used);
	dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
	cdl_comp -
	(dsl_dataset_phys(origin_head)->ds_compressed_bytes +
	odl_comp);
	duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
	cdl_uncomp -
	(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
	odl_uncomp);

	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
	dused, dcomp, duncomp, tx);
	dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
	-dused, -dcomp, -duncomp, tx);

	/*
	* The difference in the space used by snapshots is the
	* difference in snapshot space due to the head's
	* deadlist (since that's the only thing that's
	* changing that affects the snapused).
	*/
	dsl_deadlist_space_range(&clone->ds_deadlist,
	origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
	&cdl_used, &cdl_comp, &cdl_uncomp);
	dsl_deadlist_space_range(&origin_head->ds_deadlist,
	origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
	&odl_used, &odl_comp, &odl_uncomp);
	dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
	DD_USED_HEAD, DD_USED_SNAP, NULL);
	}

	/* swap ds_*_bytes */
	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
	dsl_dataset_phys(clone)->ds_referenced_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
	dsl_dataset_phys(clone)->ds_compressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
	dsl_dataset_phys(clone)->ds_uncompressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
	dsl_dataset_phys(clone)->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
	unused_refres_delta, 0, 0, tx);

	/*
	* Swap deadlists.
	*/
	dsl_deadlist_close(&clone->ds_deadlist);
	dsl_deadlist_close(&origin_head->ds_deadlist);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
	dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
	dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
	dsl_dataset_phys(origin_head)->ds_deadlist_obj);
	+ dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);

	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

	spa_history_log_internal_ds(clone, "clone swap", tx,
	"parent=%s", origin_head->ds_dir->dd_myname);
	}

	/*
	 * Given a pool name and a dataset object number in that pool,
	 * return the name of that dataset.
	 *
	 * pname - name of the pool, passed to dsl_pool_hold().
	 * obj   - object number of the dataset to look up.
	 * buf   - output buffer filled in by dsl_dataset_name() on success.
	 *
	 * Returns 0 on success, otherwise the error reported by
	 * dsl_pool_hold() or dsl_dataset_hold_obj(); buf is left untouched
	 * on failure.
	 */
	int
	dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int error;

		error = dsl_pool_hold(pname, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
		if (error == 0) {
			dsl_dataset_name(ds, buf);
			dsl_dataset_rele(ds, FTAG);
		}
		/* The pool hold is dropped whether or not the dataset was found. */
		dsl_pool_rele(dp, FTAG);

		return (error);
	}

	/*
	 * Adjust a space estimate for this dataset's refreservation and,
	 * optionally, check it against the dataset's quota (ds_quota).
	 *
	 * ds          - dataset being written to.
	 * check_quota - if false, only the reservation adjustment is done.
	 * asize       - space being requested; must be > 0.
	 * inflight    - space already pending for this dataset.
	 * used        - in/out; reduced by the unconsumed refreservation.
	 * ref_rsrv    - out; portion of asize that will come from unconsumed
	 *               refreservation space.
	 *
	 * Returns 0 if the request may proceed, ERESTART if the caller should
	 * retry, or EDQUOT if the on-disk usage is over quota with nothing
	 * in flight.
	 */
	int
	dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
	    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
	{
		int error = 0;

		ASSERT3S(asize, >, 0);

		/*
		 * *ref_rsrv is the portion of asize that will come from any
		 * unconsumed refreservation space.
		 */
		*ref_rsrv = 0;

		mutex_enter(&ds->ds_lock);
		/*
		 * Make a space adjustment for reserved bytes.
		 */
		if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
			ASSERT3U(*used, >=,
			    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
			*used -=
			    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
			*ref_rsrv =
			    asize - MIN(asize, parent_delta(ds, asize + inflight));
		}

		if (!check_quota || ds->ds_quota == 0) {
			mutex_exit(&ds->ds_lock);
			return (0);
		}
		/*
		 * If they are requesting more space, and our current estimate
		 * is over quota, they get to try again unless the actual
		 * on-disk is over quota and there are no pending changes (which
		 * may free up space for us).
		 */
		if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
		    ds->ds_quota) {
			if (inflight > 0 ||
			    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
				error = SET_ERROR(ERESTART);
			else
				error = SET_ERROR(EDQUOT);
		}
		mutex_exit(&ds->ds_lock);

		return (error);
	}

	/*
	 * Argument bundle handed to the check and sync callbacks of the
	 * sync tasks that set a dataset's refquota or refreservation.
	 */
	typedef struct dsl_dataset_set_qr_arg {
	const char *ddsqra_name;	/* name of the dataset to modify */
	zprop_source_t ddsqra_source;	/* property source of the new value */
	uint64_t ddsqra_value;	/* requested property value */
	} dsl_dataset_set_qr_arg_t;


	/*
	 * Sync-task check callback for setting refquota.
	 *
	 * arg - dsl_dataset_set_qr_arg_t describing the request.
	 * tx  - transaction the task runs in.
	 *
	 * Returns ENOTSUP if the pool version predates refquota, EINVAL for a
	 * snapshot, ENOSPC if the predicted non-zero value is below the bytes
	 * currently referenced or below the refreservation, any error from
	 * dsl_dataset_hold() or dsl_prop_predict(), and 0 otherwise.
	 */
	/* ARGSUSED */
	static int
	dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int error;
		uint64_t newval;

		if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
			return (SET_ERROR(ENOTSUP));

		error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
		if (error != 0)
			return (error);

		if (ds->ds_is_snapshot) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_prop_predict(ds->ds_dir,
		    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
		    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		/* A value of zero is accepted without any space comparison. */
		if (newval == 0) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
		    newval < ds->ds_reserved) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	/*
	 * Sync-task sync callback for setting refquota: stores the property,
	 * reads back the resulting effective value, and updates the in-core
	 * ds_quota if it changed.
	 *
	 * arg - dsl_dataset_set_qr_arg_t describing the request.
	 * tx  - syncing transaction.
	 */
	static void
	dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		uint64_t newval;

		VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));

		if (ds->ds_quota != newval) {
			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			ds->ds_quota = newval;
		}
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Set the refquota of the dataset named 'dsname' by dispatching a
	 * sync task with the refquota check and sync callbacks.
	 *
	 * Returns whatever dsl_sync_task() returns.
	 */
	int
	dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
	    uint64_t refquota)
	{
		dsl_dataset_set_qr_arg_t args;
		int error;

		args.ddsqra_value = refquota;
		args.ddsqra_source = source;
		args.ddsqra_name = dsname;

		error = dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
		    dsl_dataset_set_refquota_sync, &args, 0, ZFS_SPACE_CHECK_NONE);
		return (error);
	}

	/*
	 * Sync-task check callback for setting refreservation.
	 *
	 * arg - dsl_dataset_set_qr_arg_t describing the request.
	 * tx  - transaction the task runs in.
	 *
	 * Returns ENOTSUP if the pool version predates refreservation, EINVAL
	 * for a snapshot, ENOSPC (syncing context only) if the additional
	 * reservation exceeds the available space or the new value exceeds a
	 * non-zero ds_quota, any error from dsl_dataset_hold() or
	 * dsl_prop_predict(), and 0 otherwise.
	 */
	static int
	dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int error;
		uint64_t newval, unique;

		if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
			return (SET_ERROR(ENOTSUP));

		error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
		if (error != 0)
			return (error);

		if (ds->ds_is_snapshot) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_prop_predict(ds->ds_dir,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
		    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		/*
		 * If we are doing the preliminary check in open context, the
		 * space estimates may be inaccurate.
		 */
		if (!dmu_tx_is_syncing(tx)) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		mutex_enter(&ds->ds_lock);
		if (!DS_UNIQUE_IS_ACCURATE(ds))
			dsl_dataset_recalc_head_uniq(ds);
		unique = dsl_dataset_phys(ds)->ds_unique_bytes;
		mutex_exit(&ds->ds_lock);

		/* Only an increase in the effective reservation needs space. */
		if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
			uint64_t delta = MAX(unique, newval) -
			    MAX(unique, ds->ds_reserved);

			if (delta >
			    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
			    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENOSPC));
			}
		}

		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	/*
	 * Store a new refreservation for 'ds' and account for the change.
	 *
	 * The property is written with dsl_prop_set_sync_impl() and the
	 * resulting effective value is read back into newval.  ds_reserved is
	 * then updated and the change in unconsumed reservation is charged to
	 * the dataset's dsl_dir under DD_USED_REFRSRV.
	 */
	void
	dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
	zprop_source_t source, uint64_t value, dmu_tx_t *tx)
	{
	uint64_t newval;
	uint64_t unique;
	int64_t delta;

	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	source, sizeof (value), 1, &value, tx);

	VERIFY0(dsl_prop_get_int_ds(ds,
	zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	/* dd_lock is taken before ds_lock and released after it. */
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
	/*
	 * delta = (new reservation beyond unique, clamped at 0) -
	 * (old reservation beyond unique, clamped at 0).
	 */
	delta = MAX(0, (int64_t)(newval - unique)) -
	MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = newval;
	mutex_exit(&ds->ds_lock);

	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
	}

	/*
	 * Sync-task sync callback for setting refreservation: holds the named
	 * dataset and applies the requested source/value through
	 * dsl_dataset_set_refreservation_sync_impl().
	 *
	 * arg - dsl_dataset_set_qr_arg_t describing the request.
	 * tx  - syncing transaction.
	 */
	static void
	dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dataset_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
		dsl_dataset_set_refreservation_sync_impl(ds,
		    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Set the refreservation of the dataset named 'dsname' by dispatching
	 * a sync task with the refreservation check and sync callbacks.
	 *
	 * Returns whatever dsl_sync_task() returns.
	 */
	int
	dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
	    uint64_t refreservation)
	{
		dsl_dataset_set_qr_arg_t args;
		int error;

		args.ddsqra_value = refreservation;
		args.ddsqra_source = source;
		args.ddsqra_name = dsname;

		error = dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
		    dsl_dataset_set_refreservation_sync, &args,
		    0, ZFS_SPACE_CHECK_NONE);
		return (error);
	}

	/*
	 * Return (in *usedp) the amount of space written in new that is not
	 * present in oldsnap.  New may be a snapshot or the head.  Old must be
	 * a snapshot before new, in new's filesystem (or its origin).  If not then
	 * fail and return EINVAL.
	 *
	 * The written space is calculated by considering two components:  First, we
	 * ignore any freed space, and calculate the written as new's used space
	 * minus old's used space.  Next, we add in the amount of space that was freed
	 * between the two snapshots, thus reducing new's used space relative to old's.
	 * Specifically, this is the space that was born before old->ds_creation_txg,
	 * and freed before new (ie. on new's deadlist or a previous deadlist).
	 *
	 * space freed                         [---------------------]
	 * snapshots                       ---O-------O--------O-------O------
	 *                                         oldsnap            new
	 *
	 * *compp and *uncompp receive the matching compressed and uncompressed
	 * totals.  The caller must hold the pool configuration (asserted).
	 * Besides EINVAL, an error from dsl_dataset_hold_obj() may be returned.
	 */
	int
	dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
	    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
	{
		int err = 0;
		uint64_t snapobj;
		dsl_pool_t *dp = new->ds_dir->dd_pool;

		ASSERT(dsl_pool_config_held(dp));

		*usedp = 0;
		*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
		*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;

		*compp = 0;
		*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
		*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;

		*uncompp = 0;
		*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
		*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;

		snapobj = new->ds_object;
		while (snapobj != oldsnap->ds_object) {
			dsl_dataset_t *snap;
			uint64_t used, comp, uncomp;

			/* 'new' is already held by the caller; hold the others. */
			if (snapobj == new->ds_object) {
				snap = new;
			} else {
				err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
				if (err != 0)
					break;
			}

			if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
			    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
				/*
				 * The blocks in the deadlist can not be born after
				 * ds_prev_snap_txg, so get the whole deadlist space,
				 * which is more efficient (especially for old-format
				 * deadlists).  Unfortunately the deadlist code
				 * doesn't have enough information to make this
				 * optimization itself.
				 */
				dsl_deadlist_space(&snap->ds_deadlist,
				    &used, &comp, &uncomp);
			} else {
				dsl_deadlist_space_range(&snap->ds_deadlist,
				    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
				    &used, &comp, &uncomp);
			}
			*usedp += used;
			*compp += comp;
			*uncompp += uncomp;

			/*
			 * If we get to the beginning of the chain of snapshots
			 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
			 * was not a snapshot of/before new.
			 */
			snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			if (snap != new)
				dsl_dataset_rele(snap, FTAG);
			if (snapobj == 0) {
				err = SET_ERROR(EINVAL);
				break;
			}

		}
		return (err);
	}

	/*
	 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
	 * lastsnap, and all snapshots in between are deleted.
	 *
	 * blocks that would be freed            [---------------------------]
	 * snapshots                       ---O-------O--------O-------O--------O
	 *                                        firstsnap        lastsnap
	 *
	 * This is the set of blocks that were born after the snap before firstsnap,
	 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
	 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
	 * We calculate this by iterating over the relevant deadlists (from the snap
	 * after lastsnap, backward to the snap after firstsnap), summing up the
	 * space on the deadlist that was born after the snap before firstsnap.
	 *
	 * *compp and *uncompp receive the matching compressed and uncompressed
	 * totals.  Returns EINVAL if the snapshots are in different dsl_dirs or
	 * firstsnap was created after lastsnap, or an error from
	 * dsl_dataset_hold_obj().
	 */
	int
	dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
	    dsl_dataset_t *lastsnap,
	    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
	{
		int err = 0;
		uint64_t snapobj;
		dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

		ASSERT(firstsnap->ds_is_snapshot);
		ASSERT(lastsnap->ds_is_snapshot);

		/*
		 * Check that the snapshots are in the same dsl_dir, and firstsnap
		 * is before lastsnap.
		 */
		if (firstsnap->ds_dir != lastsnap->ds_dir ||
		    dsl_dataset_phys(firstsnap)->ds_creation_txg >
		    dsl_dataset_phys(lastsnap)->ds_creation_txg)
			return (SET_ERROR(EINVAL));

		/* Zero the caller's accumulators (not the local pointers). */
		*usedp = *compp = *uncompp = 0;

		snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
		while (snapobj != firstsnap->ds_object) {
			dsl_dataset_t *ds;
			uint64_t used, comp, uncomp;

			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
			if (err != 0)
				break;

			dsl_deadlist_space_range(&ds->ds_deadlist,
			    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			*usedp += used;
			*compp += comp;
			*uncompp += uncomp;

			snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
			ASSERT3U(snapobj, !=, 0);
			dsl_dataset_rele(ds, FTAG);
		}
		return (err);
	}

	/*
	 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
	 * For example, they could both be snapshots of the same filesystem, and
	 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
	 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
	 * filesystem.  Or 'earlier' could be the origin's origin.
	 *
	 * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
	 *
	 * The caller must hold the pool configuration (asserted).  If the
	 * origin of 'later' cannot be held, B_FALSE is returned.
	 */
	boolean_t
	dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
	    uint64_t earlier_txg)
	{
		dsl_pool_t *dp = later->ds_dir->dd_pool;
		int error;
		boolean_t ret;

		ASSERT(dsl_pool_config_held(dp));
		ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);

		if (earlier_txg == 0)
			earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;

		if (later->ds_is_snapshot &&
		    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
			return (B_FALSE);

		if (later->ds_dir == earlier->ds_dir)
			return (B_TRUE);
		if (!dsl_dir_is_clone(later->ds_dir))
			return (B_FALSE);

		if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
			return (B_TRUE);
		/* Otherwise recurse through the origin of later's dsl_dir. */
		dsl_dataset_t *origin;
		error = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
		if (error != 0)
			return (B_FALSE);
		ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
		dsl_dataset_rele(origin, FTAG);
		return (ret);
	}

	/*
	 * Zapify the dataset's object in the pool's meta objset by calling
	 * dmu_object_zapify() with type DMU_OT_DSL_DATASET.
	 */
	void
	dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
	}

	/*
	 * Report whether the dataset's object currently has the
	 * DMU_OTN_ZAP_METADATA object type, as read from its dbuf.
	 */
	boolean_t
	dsl_dataset_is_zapified(dsl_dataset_t *ds)
	{
		dmu_object_info_t info;

		dmu_object_info_from_db(ds->ds_dbuf, &info);
		if (info.doi_type == DMU_OTN_ZAP_METADATA)
			return (1);
		return (0);
	}

	/*
	 * Return true when the dataset is zapified and its ZAP contains the
	 * DS_FIELD_RESUME_TOGUID entry (zap_contains() returned 0).
	 */
	boolean_t
	dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
	{
	return (dsl_dataset_is_zapified(ds) &&
	zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
	ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
	+}
	+
	+/*
	+ * Look up the object number stored under DS_FIELD_REMAP_DEADLIST in
	+ * the dataset's ZAP.  Returns 0 if the dataset is not zapified or the
	+ * entry does not exist; any lookup error other than ENOENT trips the
	+ * VERIFY.
	+ */
	+uint64_t
	+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
	+{
	+ uint64_t remap_deadlist_obj;
	+ int err;
	+
	+ if (!dsl_dataset_is_zapified(ds))
	+ return (0);
	+
	+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
	+ DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
	+ &remap_deadlist_obj);
	+
	+ if (err != 0) {
	+ VERIFY3S(err, ==, ENOENT);
	+ return (0);
	+ }
	+
	+ ASSERT(remap_deadlist_obj != 0);
	+ return (remap_deadlist_obj);
	+}
	+
	+/*
	+ * Return whether the dataset's in-core remap deadlist is open.  The
	+ * EQUIV checks that this agrees with whether a remap deadlist object
	+ * is recorded for the dataset.
	+ */
	+boolean_t
	+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
	+{
	+ EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
	+ dsl_dataset_get_remap_deadlist_object(ds) != 0);
	+ return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
	+}
	+
	+/*
	+ * Record 'obj' (which must be non-zero) under DS_FIELD_REMAP_DEADLIST
	+ * in the dataset's ZAP, zapifying the dataset first.  zap_add() must
	+ * succeed.
	+ */
	+static void
	+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
	+ dmu_tx_t *tx)
	+{
	+ ASSERT(obj != 0);
	+ dsl_dataset_zapify(ds, tx);
	+ VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
	+ DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
	+}
	+
	+/*
	+ * Remove the DS_FIELD_REMAP_DEADLIST entry from the dataset's ZAP.
	+ * The removal must succeed.
	+ */
	+static void
	+dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
	+{
	+	VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
	+	    ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
	+}
	+
	+/*
	+ * Close and free the dataset's remap deadlist, remove its ZAP entry,
	+ * and decrement the SPA_FEATURE_OBSOLETE_COUNTS refcount.  Must be
	+ * called in syncing context on a dataset whose remap deadlist exists.
	+ */
	+void
	+dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
	+{
	+	uint64_t remap_deadlist_object;
	+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	+
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT(dsl_dataset_remap_deadlist_exists(ds));
	+
	+	/* Save the object number first; closing resets dl_object to 0. */
	+	remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
	+	dsl_deadlist_close(&ds->ds_remap_deadlist);
	+	dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
	+	dsl_dataset_unset_remap_deadlist_object(ds, tx);
	+	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+}
	+
	+/*
	+ * Create a remap deadlist for the dataset by cloning the key layout
	+ * of its regular deadlist, record it in the dataset's ZAP, open it,
	+ * and increment the SPA_FEATURE_OBSOLETE_COUNTS refcount.  Must be
	+ * called in syncing context with ds_remap_deadlist_lock held.
	+ */
	+void
	+dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
	+{
	+	uint64_t remap_deadlist_obj;
	+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	+
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
	+	/*
	+	 * Currently we only create remap deadlists when there are indirect
	+	 * vdevs with referenced mappings.
	+	 */
	+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
	+
	+	remap_deadlist_obj = dsl_deadlist_clone(
	+	    &ds->ds_deadlist, UINT64_MAX,
	+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
	+	dsl_dataset_set_remap_deadlist_object(ds,
	+	    remap_deadlist_obj, tx);
	+	dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
	+	    remap_deadlist_obj);
	+	spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 332525)
	@@ -1,553 +1,566 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2012 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/dsl_dataset.h>
	#include <sys/dmu.h>
	#include <sys/refcount.h>
	#include <sys/zap.h>
	#include <sys/zfs_context.h>
	#include <sys/dsl_pool.h>

	/*
	* Deadlist concurrency:
	*
	* Deadlists can only be modified from the syncing thread.
	*
	* Except for dsl_deadlist_insert(), it can only be modified with the
	* dp_config_rwlock held with RW_WRITER.
	*
	* The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
	* be called concurrently, from open context, with the dp_config_rwlock held
	* with RW_READER.
	*
	* Therefore, we only need to provide locking between dsl_deadlist_insert() and
	* the accessors, protecting:
	* dl_phys->dl_used,comp,uncomp
	* and protecting the dl_tree from being loaded.
	* The locking is provided by dl_lock. Note that locking on the bpobj_t
	* provides its own locking, and dl_oldfmt is immutable.
	*/

	/*
	 * AVL comparator for deadlist entries: orders by dle_mintxg.
	 * Returns -1, 0 or +1.
	 */
	static int
	dsl_deadlist_compare(const void *arg1, const void *arg2)
	{
		const dsl_deadlist_entry_t *dle1 = arg1;
		const dsl_deadlist_entry_t *dle2 = arg2;

		if (dle1->dle_mintxg < dle2->dle_mintxg)
			return (-1);
		else if (dle1->dle_mintxg > dle2->dle_mintxg)
			return (+1);
		else
			return (0);
	}

	/*
	 * Build the in-core AVL tree for a new-format deadlist from its ZAP
	 * object, opening one bpobj per ZAP entry.  A no-op if the tree is
	 * already loaded.  The caller must hold dl_lock.
	 */
	static void
	dsl_deadlist_load_tree(dsl_deadlist_t *dl)
	{
		zap_cursor_t zc;
		zap_attribute_t za;

		ASSERT(MUTEX_HELD(&dl->dl_lock));

		ASSERT(!dl->dl_oldfmt);
		if (dl->dl_havetree)
			return;

		avl_create(&dl->dl_tree, dsl_deadlist_compare,
		    sizeof (dsl_deadlist_entry_t),
		    offsetof(dsl_deadlist_entry_t, dle_node));
		for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			/* Allocate a whole entry: sizeof (*dle), not a pointer. */
			dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
			/* The ZAP key is the mintxg; the value is the bpobj. */
			dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
			VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
			    za.za_first_integer));
			avl_add(&dl->dl_tree, dle);
		}
		zap_cursor_fini(&zc);
		dl->dl_havetree = B_TRUE;
	}

	/*
	 * Open the deadlist stored in 'object' of objset 'os' into 'dl',
	 * which must not already be open.  An object of type DMU_OT_BPOBJ is
	 * treated as an old-format deadlist and opened as a plain bpobj;
	 * otherwise the bonus buffer is kept held as dl_phys and the AVL tree
	 * is left unloaded.
	 */
	void
	dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
	{
		dmu_object_info_t doi;

	+	ASSERT(!dsl_deadlist_is_open(dl));
	+
		mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
		dl->dl_os = os;
		dl->dl_object = object;
		VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
		dmu_object_info_from_db(dl->dl_dbuf, &doi);
		if (doi.doi_type == DMU_OT_BPOBJ) {
			dmu_buf_rele(dl->dl_dbuf, dl);
			dl->dl_dbuf = NULL;
			dl->dl_oldfmt = B_TRUE;
			VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
			return;
		}

		dl->dl_oldfmt = B_FALSE;
		dl->dl_phys = dl->dl_dbuf->db_data;
		dl->dl_havetree = B_FALSE;
	}

	+/*
	+ * A deadlist counts as open while its dl_os pointer is non-NULL.
	+ */
	+boolean_t
	+dsl_deadlist_is_open(dsl_deadlist_t *dl)
	+{
	+ return (dl->dl_os != NULL);
	+}
	+
	/*
	 * Close an open deadlist.  For an old-format deadlist only the bpobj
	 * is closed; otherwise any loaded AVL entries are closed and freed,
	 * the bonus buffer hold is dropped and the lock destroyed.  In both
	 * cases dl_os and dl_object are reset last, so the deadlist reports
	 * as open until teardown has finished.
	 */
	void
	dsl_deadlist_close(dsl_deadlist_t *dl)
	{
	void *cookie = NULL;
	dsl_deadlist_entry_t *dle;

	- dl->dl_os = NULL;
	+ ASSERT(dsl_deadlist_is_open(dl));

	if (dl->dl_oldfmt) {
	dl->dl_oldfmt = B_FALSE;
	bpobj_close(&dl->dl_bpobj);
	+ dl->dl_os = NULL;
	+ dl->dl_object = 0;
	return;
	}

	if (dl->dl_havetree) {
	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
	!= NULL) {
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));
	}
	avl_destroy(&dl->dl_tree);
	}
	dmu_buf_rele(dl->dl_dbuf, dl);
	mutex_destroy(&dl->dl_lock);
	dl->dl_dbuf = NULL;
	dl->dl_phys = NULL;
	+ dl->dl_os = NULL;
	+ dl->dl_object = 0;
	}

	/*
	 * Allocate a new deadlist object in 'os' and return its object
	 * number.  Pools older than SPA_VERSION_DEADLISTS get a plain bpobj;
	 * otherwise a DMU_OT_DEADLIST ZAP with a dsl_deadlist_phys_t bonus
	 * is created.
	 */
	uint64_t
	dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
	{
		if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
			return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
		return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
		    sizeof (dsl_deadlist_phys_t), tx));
	}

	/*
	 * Free the deadlist object 'dlobj' in 'os'.  An old-format deadlist
	 * (DMU_OT_BPOBJ) is freed as a bpobj.  Otherwise every bpobj named in
	 * the ZAP is freed, or the empty-bpobj count is decremented for
	 * entries pointing at the pool's shared empty bpobj, and then the
	 * ZAP object itself is freed.
	 */
	void
	dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
	{
		dmu_object_info_t doi;
		zap_cursor_t zc;
		zap_attribute_t za;

		VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
		if (doi.doi_type == DMU_OT_BPOBJ) {
			bpobj_free(os, dlobj, tx);
			return;
		}

		for (zap_cursor_init(&zc, os, dlobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t obj = za.za_first_integer;
			if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
				bpobj_decr_empty(os, tx);
			else
				bpobj_free(os, obj, tx);
		}
		zap_cursor_fini(&zc);
		VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
	}

	/*
	 * Enqueue block pointer 'bp' on deadlist entry 'dle'.  If the entry
	 * still refers to the pool's shared empty bpobj, a private bpobj is
	 * allocated first and the ZAP entry for dle_mintxg is updated to
	 * point at it.  The caller must hold dl_lock.
	 */
	static void
	dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
	    const blkptr_t *bp, dmu_tx_t *tx)
	{
		ASSERT(MUTEX_HELD(&dl->dl_lock));
		if (dle->dle_bpobj.bpo_object ==
		    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
			uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
			bpobj_close(&dle->dle_bpobj);
			bpobj_decr_empty(dl->dl_os, tx);
			VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
			VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
			    dle->dle_mintxg, obj, tx));
		}
		bpobj_enqueue(&dle->dle_bpobj, bp, tx);
	}

	/*
	 * Attach bpobj 'obj' to deadlist entry 'dle'.  If the entry already
	 * has its own bpobj, 'obj' is enqueued as a sub-object; if it still
	 * refers to the pool's shared empty bpobj, 'obj' simply replaces it
	 * and the ZAP entry for dle_mintxg is updated.  The caller must hold
	 * dl_lock.
	 */
	static void
	dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
	    uint64_t obj, dmu_tx_t *tx)
	{
		ASSERT(MUTEX_HELD(&dl->dl_lock));
		if (dle->dle_bpobj.bpo_object !=
		    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
			bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
		} else {
			bpobj_close(&dle->dle_bpobj);
			bpobj_decr_empty(dl->dl_os, tx);
			VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
			VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
			    dle->dle_mintxg, obj, tx));
		}
	}

	/*
	 * Insert block pointer 'bp' into the deadlist.  Old-format deadlists
	 * enqueue straight onto their single bpobj.  Otherwise the deadlist's
	 * used/comp/uncomp totals are bumped and the bp is enqueued on the
	 * entry immediately preceding bp->blk_birth in the tree (keys are
	 * exclusive lower bounds, so an exact match also steps back one
	 * entry).
	 */
	void
	dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
	{
		dsl_deadlist_entry_t dle_tofind;
		dsl_deadlist_entry_t *dle;
		avl_index_t where;

		if (dl->dl_oldfmt) {
			bpobj_enqueue(&dl->dl_bpobj, bp, tx);
			return;
		}

		mutex_enter(&dl->dl_lock);
		dsl_deadlist_load_tree(dl);

		dmu_buf_will_dirty(dl->dl_dbuf, tx);
		dl->dl_phys->dl_used +=
		    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
		dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
		dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);

		dle_tofind.dle_mintxg = bp->blk_birth;
		dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
		if (dle == NULL)
			dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
		else
			dle = AVL_PREV(&dl->dl_tree, dle);
		dle_enqueue(dl, dle, bp, tx);
		mutex_exit(&dl->dl_lock);
	}

	/*
	 * Insert new key in deadlist, which must be > all current entries.
	 * mintxg is not inclusive.
	 *
	 * The new entry starts out backed by an empty bpobj and is recorded
	 * both in the AVL tree and in the deadlist's ZAP.  A no-op for
	 * old-format deadlists.
	 */
	void
	dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
	{
		uint64_t obj;
		dsl_deadlist_entry_t *dle;

		if (dl->dl_oldfmt)
			return;

		dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
		dle->dle_mintxg = mintxg;

		mutex_enter(&dl->dl_lock);
		dsl_deadlist_load_tree(dl);

		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		avl_add(&dl->dl_tree, dle);

		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
		    mintxg, obj, tx));
		mutex_exit(&dl->dl_lock);
	}

	/*
	 * Remove this key, merging its entries into the previous key.
	 *
	 * The removed entry's bpobj is attached to the preceding entry, then
	 * the entry is dropped from the AVL tree and from the deadlist's ZAP.
	 * A no-op for old-format deadlists.
	 */
	void
	dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
	{
		dsl_deadlist_entry_t dle_tofind;
		dsl_deadlist_entry_t *dle, *dle_prev;

		if (dl->dl_oldfmt)
			return;

		mutex_enter(&dl->dl_lock);
		dsl_deadlist_load_tree(dl);

		dle_tofind.dle_mintxg = mintxg;
		dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
		dle_prev = AVL_PREV(&dl->dl_tree, dle);

		dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

		avl_remove(&dl->dl_tree, dle);
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));

		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
		mutex_exit(&dl->dl_lock);
	}

	/*
	* Walk ds's snapshots to regenerate the deadlist's ZAP entries and
	* AVL tree: starting at mrs_obj and following ds_prev_snap_obj, add
	* one key per snapshot using its ds_prev_snap_txg.  Nothing is added
	* when the deadlist at dlobj is old-format.
	*/
	static void
	dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
	uint64_t mrs_obj, dmu_tx_t *tx)
	{
	- dsl_deadlist_t dl;
	+ dsl_deadlist_t dl = { 0 };
	dsl_pool_t *dp = dmu_objset_pool(os);

	dsl_deadlist_open(&dl, os, dlobj);
	if (dl.dl_oldfmt) {
	dsl_deadlist_close(&dl);
	return;
	}

	while (mrs_obj != 0) {
	dsl_dataset_t *ds;
	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
	dsl_deadlist_add_key(&dl,
	dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
	mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	dsl_dataset_rele(ds, FTAG);
	}
	dsl_deadlist_close(&dl);
	}

	/*
	 * Allocate a new deadlist with the same keys as 'dl' that are below
	 * 'maxtxg', each backed by an empty bpobj, and return the new object
	 * number.  For an old-format source the keys are instead regenerated
	 * from the snapshot chain starting at mrs_obj.
	 */
	uint64_t
	dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
	    uint64_t mrs_obj, dmu_tx_t *tx)
	{
		dsl_deadlist_entry_t *entry;
		uint64_t cloneobj = dsl_deadlist_alloc(dl->dl_os, tx);

		if (dl->dl_oldfmt) {
			dsl_deadlist_regenerate(dl->dl_os, cloneobj, mrs_obj, tx);
			return (cloneobj);
		}

		mutex_enter(&dl->dl_lock);
		dsl_deadlist_load_tree(dl);

		/* Entries are visited in key order; stop at the first >= maxtxg. */
		for (entry = avl_first(&dl->dl_tree);
		    entry != NULL && entry->dle_mintxg < maxtxg;
		    entry = AVL_NEXT(&dl->dl_tree, entry)) {
			uint64_t emptyobj = bpobj_alloc_empty(dl->dl_os,
			    SPA_OLD_MAXBLOCKSIZE, tx);

			VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, cloneobj,
			    entry->dle_mintxg, emptyobj, tx));
		}
		mutex_exit(&dl->dl_lock);
		return (cloneobj);
	}

	/*
	 * Return the total used, compressed and uncompressed space on an open
	 * deadlist.  Old-format deadlists ask the bpobj; otherwise the totals
	 * are read from dl_phys under dl_lock.
	 */
	void
	dsl_deadlist_space(dsl_deadlist_t *dl,
	    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
	{
	+	ASSERT(dsl_deadlist_is_open(dl));
		if (dl->dl_oldfmt) {
			VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
			    usedp, compp, uncompp));
			return;
		}

		mutex_enter(&dl->dl_lock);
		*usedp = dl->dl_phys->dl_used;
		*compp = dl->dl_phys->dl_comp;
		*uncompp = dl->dl_phys->dl_uncomp;
		mutex_exit(&dl->dl_lock);
	}

	/*
	 * return space used in the range (mintxg, maxtxg].
	 * Includes maxtxg, does not include mintxg.
	 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
	 * larger than any bp in the deadlist (eg. UINT64_MAX)).
	 *
	 * The totals are returned through *usedp, *compp and *uncompp.
	 */
	void
	dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
	    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
	{
		dsl_deadlist_entry_t *dle;
		dsl_deadlist_entry_t dle_tofind;
		avl_index_t where;

		if (dl->dl_oldfmt) {
			VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
			    mintxg, maxtxg, usedp, compp, uncompp));
			return;
		}

		/* Zero the caller's accumulators (not the local pointers). */
		*usedp = *compp = *uncompp = 0;

		mutex_enter(&dl->dl_lock);
		dsl_deadlist_load_tree(dl);
		dle_tofind.dle_mintxg = mintxg;
		dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
		/*
		 * If we don't find this mintxg, there shouldn't be anything
		 * after it either.
		 */
		ASSERT(dle != NULL ||
		    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);

		for (; dle && dle->dle_mintxg < maxtxg;
		    dle = AVL_NEXT(&dl->dl_tree, dle)) {
			uint64_t used, comp, uncomp;

			VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
			    &used, &comp, &uncomp));

			*usedp += used;
			*compp += comp;
			*uncompp += uncomp;
		}
		mutex_exit(&dl->dl_lock);
	}

	/*
	 * Add the whole bpobj 'obj' to deadlist 'dl': its space totals are
	 * added to dl_phys and the object is attached to the entry whose key
	 * equals 'birth', or failing that the nearest entry before it.
	 * The caller must hold dl_lock.
	 */
	static void
	dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
	dmu_tx_t *tx)
	{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;
	uint64_t used, comp, uncomp;
	bpobj_t bpo;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	/* Open obj only long enough to read its space totals. */
	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
	bpobj_close(&bpo);

	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used += used;
	dl->dl_phys->dl_comp += comp;
	dl->dl_phys->dl_uncomp += uncomp;

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
	dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_enqueue_subobj(dl, dle, obj, tx);
	}

	/*
	 * bpobj_iterate() callback: insert 'bp' into the deadlist passed as
	 * 'arg'.  Always returns 0.
	 */
	static int
	dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		dsl_deadlist_t *dl = arg;
		dsl_deadlist_insert(dl, bp, tx);
		return (0);
	}

	/*
	 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
	 * an empty deadlist.
	 *
	 * When 'obj' is an old-format deadlist (DMU_OT_BPOBJ) its block
	 * pointers are inserted into dl one at a time.  Otherwise each bpobj
	 * listed in obj's ZAP is moved into dl, its ZAP entry removed, and
	 * finally obj's bonus buffer is zeroed.
	 */
	void
	dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
	{
		zap_cursor_t zc;
		zap_attribute_t za;
		dmu_buf_t *bonus;
		dsl_deadlist_phys_t *dlp;
		dmu_object_info_t doi;

		VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
		if (doi.doi_type == DMU_OT_BPOBJ) {
			bpobj_t bpo;
			VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
			VERIFY3U(0, ==, bpobj_iterate(&bpo,
			    dsl_deadlist_insert_cb, dl, tx));
			bpobj_close(&bpo);
			return;
		}

		mutex_enter(&dl->dl_lock);
		for (zap_cursor_init(&zc, dl->dl_os, obj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
			dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
			VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
		}
		zap_cursor_fini(&zc);

		VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
		dlp = bonus->db_data;
		dmu_buf_will_dirty(bonus, tx);
		bzero(dlp, sizeof (*dlp));
		dmu_buf_rele(bonus, FTAG);
		mutex_exit(&dl->dl_lock);
	}

	/*
	 * Remove entries on dl that are >= mintxg, and put them on the bpobj.
	 *
	 * Each moved entry's bpobj becomes a sub-object of 'bpo'; its space
	 * is subtracted from dl_phys and the entry is removed from both the
	 * ZAP and the AVL tree.  Only valid for new-format deadlists.
	 */
	void
	dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
	    dmu_tx_t *tx)
	{
		dsl_deadlist_entry_t dle_tofind;
		dsl_deadlist_entry_t *dle;
		avl_index_t where;

		ASSERT(!dl->dl_oldfmt);

		mutex_enter(&dl->dl_lock);
		dmu_buf_will_dirty(dl->dl_dbuf, tx);
		dsl_deadlist_load_tree(dl);

		dle_tofind.dle_mintxg = mintxg;
		dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
		if (dle == NULL)
			dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
		while (dle) {
			uint64_t used, comp, uncomp;
			dsl_deadlist_entry_t *dle_next;

			bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);

			VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
			    &used, &comp, &uncomp));
			ASSERT3U(dl->dl_phys->dl_used, >=, used);
			ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
			ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
			dl->dl_phys->dl_used -= used;
			dl->dl_phys->dl_comp -= comp;
			dl->dl_phys->dl_uncomp -= uncomp;

			VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
			    dle->dle_mintxg, tx));

			/* Fetch the successor before this node is unlinked and freed. */
			dle_next = AVL_NEXT(&dl->dl_tree, dle);
			avl_remove(&dl->dl_tree, dle);
			bpobj_close(&dle->dle_bpobj);
			kmem_free(dle, sizeof (*dle));
			dle = dle_next;
		}
		mutex_exit(&dl->dl_lock);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 332525)
	@@ -1,1032 +1,1080 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2013 by Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/dsl_userhold.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_destroy.h>
	#include <sys/dmu_tx.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_dir.h>
	#include <sys/dmu_traverse.h>
	#include <sys/dsl_scan.h>
	#include <sys/dmu_objset.h>
	#include <sys/zap.h>
	#include <sys/zfeature.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/dsl_deleg.h>
	#include <sys/dmu_impl.h>
	#include <sys/zcp.h>

	/*
	 * Decide whether the snapshot 'ds' may be destroyed.
	 *
	 * Returns 0 when the destroy may proceed, otherwise:
	 *   EINVAL  - 'ds' is not a snapshot
	 *   EBUSY   - 'ds' is long-held, or (non-deferred) has user references
	 *   ENOTSUP - deferred destroy requested on a pool older than
	 *             SPA_VERSION_USERREFS
	 *   EEXIST  - (non-deferred) 'ds' is a branch point
	 */
	int
	dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
	{
		int error = 0;

		if (!ds->ds_is_snapshot) {
			error = SET_ERROR(EINVAL);
		} else if (dsl_dataset_long_held(ds)) {
			error = SET_ERROR(EBUSY);
		} else if (defer) {
			/*
			 * Only allow deferred destroy on pools that support it.
			 * NOTE: deferred destroy is only supported on snapshots.
			 * No further checks apply to a deferred destroy.
			 */
			if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
			    SPA_VERSION_USERREFS)
				error = SET_ERROR(ENOTSUP);
		} else if (ds->ds_userrefs > 0) {
			/* An elevated user reference count blocks the destroy. */
			error = SET_ERROR(EBUSY);
		} else if (dsl_dataset_phys(ds)->ds_num_children > 1) {
			/* Can't delete a branch point. */
			error = SET_ERROR(EEXIST);
		}

		return (error);
	}

	/*
	 * Check phase for destroying a single snapshot by name.
	 *
	 * arg - a dsl_destroy_snapshot_arg_t naming the snapshot and carrying the
	 *       'defer' flag
	 * tx  - transaction; used to find the pool
	 *
	 * Returns 0 if the snapshot does not exist or may be destroyed, otherwise
	 * the error from dsl_dataset_hold() or dsl_destroy_snapshot_check_impl().
	 */
	int
	dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
	{
		dsl_destroy_snapshot_arg_t *ddsa = arg;
		const char *dsname = ddsa->ddsa_name;
		boolean_t defer = ddsa->ddsa_defer;

		dsl_pool_t *dp = dmu_tx_pool(tx);
		int error = 0;
		dsl_dataset_t *ds;

		error = dsl_dataset_hold(dp, dsname, FTAG, &ds);

		/*
		 * If the snapshot does not exist, silently ignore it, and
		 * dsl_destroy_snapshot_sync() will be a no-op
		 * (it's "already destroyed").
		 */
		if (error == ENOENT)
			return (0);

		if (error == 0) {
			error = dsl_destroy_snapshot_check_impl(ds, defer);
			dsl_dataset_rele(ds, FTAG);
		}

		return (error);
	}

	/*
	 * State handed from process_old_deadlist() to process_old_cb() for each
	 * block pointer on the next snapshot's old-format deadlist.
	 */
	struct process_old_arg {
		/* snapshot being destroyed */
		dsl_dataset_t *ds;
		/* its previous snapshot, or NULL */
		dsl_dataset_t *ds_prev;
		/* copied from the caller's after_branch_point flag */
		boolean_t after_branch_point;
		/* root zio under which the frees are issued */
		zio_t *pio;
		/* running totals of the space freed by the callback */
		uint64_t used;
		uint64_t comp;
		uint64_t uncomp;
	};

	/*
	 * bpobj_iterate() callback for process_old_deadlist().
	 *
	 * arg - struct process_old_arg shared with the caller
	 * bp  - block pointer taken from the next snapshot's deadlist
	 * tx  - transaction the work is performed in
	 *
	 * Blocks born at or before the destroyed snapshot's previous snapshot are
	 * moved onto the destroyed snapshot's own deadlist (and may be credited
	 * to ds_prev's unique bytes); all other blocks are freed and counted in
	 * the used/comp/uncomp totals.  Always returns 0.
	 */
	static int
	process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		struct process_old_arg *poa = arg;
		dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

		ASSERT(!BP_IS_HOLE(bp));

		if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
			dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
			if (poa->ds_prev && !poa->after_branch_point &&
			    bp->blk_birth >
			    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
				dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
				    bp_get_dsize_sync(dp->dp_spa, bp);
			}
		} else {
			poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
			poa->comp += BP_GET_PSIZE(bp);
			poa->uncomp += BP_GET_UCSIZE(bp);
			dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
		}
		return (0);
	}

	/*
	 * Old-format deadlist handling for a snapshot destroy.
	 *
	 * ds                 - snapshot being destroyed
	 * ds_prev            - its previous snapshot, or NULL
	 * ds_next            - its next snapshot
	 * after_branch_point - forwarded to process_old_cb()
	 * tx                 - transaction the work is performed in
	 *
	 * Walks ds_next's deadlist with process_old_cb(), charges the freed
	 * space against DD_USED_SNAP, then swaps the deadlist objects of ds and
	 * ds_next.
	 */
	static void
	process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
	    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
	{
		struct process_old_arg poa = { 0 };
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		objset_t *mos = dp->dp_meta_objset;
		uint64_t deadlist_obj;

		ASSERT(ds->ds_deadlist.dl_oldfmt);
		ASSERT(ds_next->ds_deadlist.dl_oldfmt);

		poa.ds = ds;
		poa.ds_prev = ds_prev;
		poa.after_branch_point = after_branch_point;
		poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
		    process_old_cb, &poa, tx));
		VERIFY0(zio_wait(poa.pio));
		ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);

		/* change snapused */
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
		    -poa.used, -poa.comp, -poa.uncomp, tx);

		/* swap next's deadlist to our deadlist */
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_close(&ds_next->ds_deadlist);
		deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
		dsl_dataset_phys(ds)->ds_deadlist_obj =
		    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
		dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
		dsl_deadlist_open(&ds->ds_deadlist, mos,
		    dsl_dataset_phys(ds)->ds_deadlist_obj);
		dsl_deadlist_open(&ds_next->ds_deadlist, mos,
		    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
	}

	/*
	 * Recursively remove the key 'mintxg' from the deadlist (and, when one
	 * exists, the remap deadlist) of every clone under ds's dsl_dir whose
	 * origin txg is greater than 'mintxg'.
	 *
	 * ds     - dataset whose dd_clones ZAP is walked
	 * mintxg - deadlist key to remove
	 * tx     - transaction the removal is performed in
	 */
	static void
	dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
	{
		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
		zap_cursor_t zc;
		zap_attribute_t za;

		/*
		 * If it is the old version, dd_clones doesn't exist so we can't
		 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
		 * doesn't matter.
		 */
		if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
			return;

		for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			dsl_dataset_t *clone;

			VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
			    za.za_first_integer, FTAG, &clone));
			if (clone->ds_dir->dd_origin_txg > mintxg) {
				dsl_deadlist_remove_key(&clone->ds_deadlist,
				    mintxg, tx);
	+			if (dsl_dataset_remap_deadlist_exists(clone)) {
	+				dsl_deadlist_remove_key(
	+				    &clone->ds_remap_deadlist, mintxg, tx);
	+			}
				dsl_dataset_remove_clones_key(clone, mintxg, tx);
			}
			dsl_dataset_rele(clone, FTAG);
		}
		zap_cursor_fini(&zc);
	}

	+/*
	+ * Remap-deadlist handling for the destroy of snapshot 'ds'.
	+ *
	+ * ds      - snapshot being destroyed
	+ * ds_next - its next snapshot
	+ * tx      - transaction the work is performed in
	+ *
	+ * Entries of ds_next's remap deadlist at or after ds's previous-snapshot
	+ * txg are moved to the pool's obsolete bpobj (created on demand), and
	+ * ds's own remap deadlist, if any, is merged into ds_next's and destroyed.
	+ */
	+static void
	+dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
	+    dmu_tx_t *tx)
	+{
	+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	+
	+	/* Move blocks to be obsoleted to pool's obsolete list. */
	+	if (dsl_dataset_remap_deadlist_exists(ds_next)) {
	+		if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
	+			dsl_pool_create_obsolete_bpobj(dp, tx);
	+
	+		dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
	+		    &dp->dp_obsolete_bpobj,
	+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
	+	}
	+
	+	/* Merge our deadlist into next's and free it. */
	+	if (dsl_dataset_remap_deadlist_exists(ds)) {
	+		uint64_t remap_deadlist_object =
	+		    dsl_dataset_get_remap_deadlist_object(ds);
	+		ASSERT(remap_deadlist_object != 0);
	+
	+		/* Create ds_next's remap deadlist under its lock if missing. */
	+		mutex_enter(&ds_next->ds_remap_deadlist_lock);
	+		if (!dsl_dataset_remap_deadlist_exists(ds_next))
	+			dsl_dataset_create_remap_deadlist(ds_next, tx);
	+		mutex_exit(&ds_next->ds_remap_deadlist_lock);
	+
	+		dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
	+		    remap_deadlist_object, tx);
	+		dsl_dataset_destroy_remap_deadlist(ds, tx);
	+	}
	+}
	+
	/*
	 * Sync-context implementation of a snapshot destroy.
	 *
	 * ds    - snapshot to destroy; must have no long holds
	 * defer - if set and the snapshot still has user references or more than
	 *         one child, only mark it DS_FLAG_DEFER_DESTROY and return
	 * tx    - transaction the destroy is performed in
	 *
	 * The caller must hold the pool's dp_config_rwlock as writer.  On a real
	 * destroy the snapshot is unlinked from its previous and next snapshots,
	 * its deadlist (and remap deadlist) is folded into the next snapshot's,
	 * space accounting is adjusted, the snapshot name is removed from the
	 * head dataset's namespace, and the dataset object and its auxiliary
	 * objects are freed.
	 */
	void
	dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
	{
		int err;
		int after_branch_point = FALSE;
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		objset_t *mos = dp->dp_meta_objset;
		dsl_dataset_t *ds_prev = NULL;
		uint64_t obj;

		ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		ASSERT(refcount_is_zero(&ds->ds_longholds));

		if (defer &&
		    (ds->ds_userrefs > 0 ||
		    dsl_dataset_phys(ds)->ds_num_children > 1)) {
			ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
			spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
			return;
		}

		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);

		/* We need to log before removing it from the namespace. */
		spa_history_log_internal_ds(ds, "destroy", tx, "");

		dsl_scan_ds_destroyed(ds, tx);

		obj = ds->ds_object;

		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
			if (ds->ds_feature_inuse[f]) {
				dsl_dataset_deactivate_feature(obj, f, tx);
				ds->ds_feature_inuse[f] = B_FALSE;
			}
		}
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
			ASSERT3P(ds->ds_prev, ==, NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
			/* ds_prev's "next" is someone else: we hang off a branch. */
			after_branch_point =
			    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);

			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
			if (after_branch_point &&
			    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
				dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
				if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
					VERIFY0(zap_add_int(mos,
					    dsl_dataset_phys(ds_prev)->
					    ds_next_clones_obj,
					    dsl_dataset_phys(ds)->ds_next_snap_obj,
					    tx));
				}
			}
			if (!after_branch_point) {
				dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
				    dsl_dataset_phys(ds)->ds_next_snap_obj;
			}
		}

		dsl_dataset_t *ds_next;
		uint64_t old_unique;
		uint64_t used = 0, comp = 0, uncomp = 0;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);

		old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;

		/* Splice ds out of the chain: next now points back at our prev. */
		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
		    dsl_dataset_phys(ds)->ds_prev_snap_txg;
		ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
		    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);

		if (ds_next->ds_deadlist.dl_oldfmt) {
			process_old_deadlist(ds, ds_prev, ds_next,
			    after_branch_point, tx);
		} else {
			/* Adjust prev's unique space. */
			if (ds_prev && !after_branch_point) {
				dsl_deadlist_space_range(&ds_next->ds_deadlist,
				    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
				    dsl_dataset_phys(ds)->ds_prev_snap_txg,
				    &used, &comp, &uncomp);
				dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
			}

			/* Adjust snapused. */
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
			    -used, -comp, -uncomp, tx);

			/* Move blocks to be freed to pool's free list. */
			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
			    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
			    tx);
			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
			    DD_USED_HEAD, used, comp, uncomp, tx);

			/* Merge our deadlist into next's and free it. */
			dsl_deadlist_merge(&ds_next->ds_deadlist,
			    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
		}
	+
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_deadlist_obj = 0;

	+	dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
	+
		/* Collapse range in clone heads */
		dsl_dataset_remove_clones_key(ds,
		    dsl_dataset_phys(ds)->ds_creation_txg, tx);

		if (ds_next->ds_is_snapshot) {
			dsl_dataset_t *ds_nextnext;

			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it. Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (ie. be on the snap after next's
			 * deadlist).
			 */
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
			    FTAG, &ds_nextnext));
			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
			    dsl_dataset_phys(ds)->ds_creation_txg,
			    &used, &comp, &uncomp);
			dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
			dsl_dataset_rele(ds_nextnext, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);

			/* Collapse range in this head. */
			dsl_dataset_t *hds;
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
	+		if (dsl_dataset_remap_deadlist_exists(hds)) {
	+			dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
	+			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
	+		}
			dsl_dataset_rele(hds, FTAG);

		} else {
			/* ds_next is the head: re-point its ds_prev hold at our prev. */
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_rele(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY0(dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
			 * being charged to our parent by the amount of
			 * new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    dsl_dataset_phys(ds_next)->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir,
				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);

		/*
		 * This must be done after the dsl_traverse(), because it will
		 * re-open the objset.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY0(dsl_dataset_get_snapname(ds));
	#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT0(err);
			ASSERT3U(val, ==, obj);
		}
	#endif
		VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
		dsl_dataset_rele(ds_head, FTAG);

		if (ds_prev != NULL)
			dsl_dataset_rele(ds_prev, FTAG);

		spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
			uint64_t count;
			ASSERT0(zap_count(mos,
			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
			    count == 0);
			VERIFY0(dmu_object_free(mos,
			    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
		}
		if (dsl_dataset_phys(ds)->ds_props_obj != 0)
			VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
			    tx));
		if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
			VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
			    tx));
		dsl_dir_rele(ds->ds_dir, ds);
		ds->ds_dir = NULL;
		dmu_object_free_zapified(mos, obj, tx);
	}

	/*
	 * Sync phase for destroying a single snapshot by name.
	 *
	 * arg - a dsl_destroy_snapshot_arg_t naming the snapshot and carrying the
	 *       'defer' flag
	 * tx  - transaction the destroy is performed in
	 *
	 * A snapshot that no longer exists (ENOENT) is silently skipped.
	 */
	void
	dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_destroy_snapshot_arg_t *ddsa = arg;
		const char *dsname = ddsa->ddsa_name;
		boolean_t defer = ddsa->ddsa_defer;

		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;

		int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
		if (error == ENOENT)
			return;
		ASSERT0(error);
		dsl_destroy_snapshot_sync_impl(ds, defer, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * The semantics of this function are described in the comment above
	 * lzc_destroy_snaps(). To summarize:
	 *
	 * The snapshots must all be in the same pool.
	 *
	 * Snapshots that don't exist will be silently ignored (considered to be
	 * "already deleted").
	 *
	 * On success, all snaps will be destroyed and this will return 0.
	 * On failure, no snaps will be destroyed, the errlist will be filled in,
	 * and this will return an errno.
	 *
	 * snaps   - nvlist whose pair names are the snapshots to destroy
	 * defer   - passed through to the per-snapshot check and destroy
	 * errlist - receives one int32 errno per snapshot that failed its check
	 */
	int
	dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
	    nvlist_t *errlist)
	{
		if (nvlist_next_nvpair(snaps, NULL) == NULL)
			return (0);

		/*
		 * lzc_destroy_snaps() is documented to take an nvlist whose
		 * values "don't matter". We need to convert that nvlist to
		 * one that we know can be converted to LUA. We also don't
		 * care about any duplicate entries because the nvlist will
		 * be converted to a LUA table which should take care of this.
		 */
		nvlist_t *snaps_normalized;
		VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP));
		for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
			fnvlist_add_boolean_value(snaps_normalized,
			    nvpair_name(pair), B_TRUE);
		}

		nvlist_t *arg;
		VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP));
		fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
		fnvlist_free(snaps_normalized);
		fnvlist_add_boolean_value(arg, "defer", defer);

		nvlist_t *wrapper;
		VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP));
		fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
		fnvlist_free(arg);

		/*
		 * Channel program: check every snapshot first (dropping those
		 * that no longer exist), bail out with the error table if any
		 * check failed, and only then destroy them all.
		 */
		const char *program =
		    "arg = ...\n"
		    "snaps = arg['snaps']\n"
		    "defer = arg['defer']\n"
		    "errors = { }\n"
		    "has_errors = false\n"
		    "for snap, v in pairs(snaps) do\n"
		    " errno = zfs.check.destroy{snap, defer=defer}\n"
		    " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
		    " if errno == ENOENT then\n"
		    " snaps[snap] = nil\n"
		    " elseif errno ~= 0 then\n"
		    " errors[snap] = errno\n"
		    " has_errors = true\n"
		    " end\n"
		    "end\n"
		    "if has_errors then\n"
		    " return errors\n"
		    "end\n"
		    "for snap, v in pairs(snaps) do\n"
		    " errno = zfs.sync.destroy{snap, defer=defer}\n"
		    " assert(errno == 0)\n"
		    "end\n"
		    "return { }\n";

		nvlist_t *result = fnvlist_alloc();
		int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
		    program,
		    B_TRUE,
		    0,
		    zfs_lua_max_memlimit,
		    nvlist_next_nvpair(wrapper, NULL), result);
		if (error != 0) {
			char *errorstr = NULL;
			(void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
			if (errorstr != NULL) {
				/*
				 * Log through an explicit "%s": the message comes
				 * from the channel program and must not be
				 * interpreted as a format string.
				 */
				zfs_dbgmsg("%s", errorstr);
			}
			/*
			 * errorstr was looked up from 'result', so it is logged
			 * before 'result' is freed.  Both lists are released
			 * here so the error path does not leak them.
			 */
			fnvlist_free(wrapper);
			fnvlist_free(result);
			return (error);
		}
		fnvlist_free(wrapper);

		/*
		 * lzc_destroy_snaps() is documented to fill the errlist with
		 * int32 values, so we need to convert the int64 values that are
		 * returned from LUA.
		 */
		int rv = 0;
		nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
		for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
			int32_t val = (int32_t)fnvpair_value_int64(pair);
			/* The first error encountered becomes the return value. */
			if (rv == 0)
				rv = val;
			fnvlist_add_int32(errlist, nvpair_name(pair), val);
		}
		fnvlist_free(result);
		return (rv);
	}

	/*
	 * Destroy the single snapshot 'name' by wrapping it in a one-entry
	 * nvlist and calling dsl_destroy_snapshots_nvl().  The per-snapshot
	 * error list is discarded; only the overall error is returned.
	 */
	int
	dsl_destroy_snapshot(const char *name, boolean_t defer)
	{
		nvlist_t *snaps = fnvlist_alloc();
		nvlist_t *errors = fnvlist_alloc();
		int rc;

		fnvlist_add_boolean(snaps, name);
		rc = dsl_destroy_snapshots_nvl(snaps, defer, errors);

		fnvlist_free(errors);
		fnvlist_free(snaps);

		return (rc);
	}

	/*
	 * Argument bundle passed to kill_blkptr() through traverse_dataset() by
	 * old_synchronous_dataset_destroy().
	 */
	struct killarg {
	/* dataset whose blocks are being killed */
	dsl_dataset_t *ds;
	/* transaction the kills/frees are performed in */
	dmu_tx_t *tx;
	};

	/*
	 * traverse_dataset() callback used by old_synchronous_dataset_destroy().
	 *
	 * Intent-log blocks are freed directly; every other non-hole,
	 * non-embedded block is killed through dsl_dataset_block_kill() against
	 * the dataset in 'arg' (a struct killarg).  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
	{
		struct killarg *ka = arg;
		dmu_tx_t *tx = ka->tx;

		if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
			return (0);

		if (zb->zb_level == ZB_ZIL_LEVEL) {
			ASSERT(zilog != NULL);
			/*
			 * It's a block in the intent log. It has no
			 * accounting, so just free it.
			 */
			dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
		} else {
			ASSERT(zilog == NULL);
			ASSERT3U(bp->blk_birth, >,
			    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
			(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
		}

		return (0);
	}

	/*
	 * Synchronously free every block of 'ds' born after its previous
	 * snapshot by traversing the dataset with kill_blkptr().  Called from
	 * dsl_destroy_head_sync_impl() when the async_destroy feature is not
	 * enabled.
	 *
	 * ds - head dataset being destroyed
	 * tx - transaction the frees are performed in
	 */
	static void
	old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		struct killarg ka;

		/*
		 * Free everything that we point to (that's born after
		 * the previous snapshot, if we are a clone)
		 *
		 * NB: this should be very quick, because we already
		 * freed all the objects in open context.
		 */
		ka.ds = ds;
		ka.tx = tx;
		VERIFY0(traverse_dataset(ds,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
		    kill_blkptr, &ka));
		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
	}

	/*
	 * Decide whether the head dataset 'ds' may be destroyed.
	 *
	 * ds             - head dataset to check
	 * expected_holds - number of long holds the caller expects to exist
	 *
	 * Returns 0 when the destroy may proceed, otherwise:
	 *   EINVAL - 'ds' is a snapshot
	 *   EBUSY  - unexpected long holds, 'ds' has snapshots of its own, or
	 *            the origin snapshot that would be removed with it is
	 *            long-held
	 *   EEXIST - the filesystem has children
	 *   other  - error from zap_count()
	 */
	int
	dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
	{
		objset_t *mos;
		uint64_t nchildren;
		boolean_t rmorigin;
		int err;

		ASSERT(!ds->ds_is_snapshot);
		if (ds->ds_is_snapshot)
			return (SET_ERROR(EINVAL));

		if (refcount_count(&ds->ds_longholds) != expected_holds)
			return (SET_ERROR(EBUSY));

		mos = ds->ds_dir->dd_pool->dp_meta_objset;

		/*
		 * Can't delete a head dataset if there are snapshots of it.
		 * (Except if the only snapshots are from the branch we cloned
		 * from.)
		 */
		if (ds->ds_prev != NULL &&
		    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
			return (SET_ERROR(EBUSY));

		/*
		 * Can't delete if there are children of this fs.
		 */
		err = zap_count(mos,
		    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &nchildren);
		if (err != 0)
			return (err);
		if (nchildren != 0)
			return (SET_ERROR(EEXIST));

		/*
		 * A clone that is the last user of a defer-destroyed origin
		 * takes the origin snapshot down with it, so the origin must
		 * not be long-held either.
		 */
		rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
		    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
		    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
		    ds->ds_prev->ds_userrefs == 0);
		if (rmorigin && !refcount_is_zero(&ds->ds_prev->ds_longholds))
			return (SET_ERROR(EBUSY));

		return (0);
	}

	/*
	 * Check phase for destroying a head dataset by name.
	 *
	 * arg - a dsl_destroy_head_arg_t naming the dataset
	 * tx  - transaction; used to find the pool
	 *
	 * Returns the error from dsl_dataset_hold(), or the verdict of
	 * dsl_destroy_head_check_impl() with zero expected long holds.
	 */
	int
	dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
	{
		dsl_destroy_head_arg_t *ddha = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int error;

		error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(ds, 0);
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Destroy the dsl_dir object 'ddobj' in sync context.  The dir's head
	 * dataset must already be gone and its space accounting must be zero.
	 * The caller must hold the pool's dp_config_rwlock as writer.
	 */
	static void
	dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = dmu_tx_pool(tx);
		objset_t *mos = dp->dp_meta_objset;
		dsl_dir_t *dd;

		ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

		VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));

		ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);

		/*
		 * Decrement the filesystem count for all parent filesystems.
		 *
		 * When we receive an incremental stream into a filesystem that
		 * already exists, a temporary clone is created. We never count
		 * this temporary clone, whose name begins with a '%'.
		 */
		if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) {
			dsl_fs_ss_count_adjust(dd->dd_parent, -1,
			    DD_FIELD_FILESYSTEM_COUNT, tx);
		}

		/*
		 * Remove our reservation. The impl() routine avoids setting the
		 * actual property, which would require the (already destroyed)
		 * ds.
		 */
		dsl_dir_set_reservation_sync_impl(dd, 0, tx);

		/* Nothing may still be charged to this dir. */
		ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
		ASSERT0(dsl_dir_phys(dd)->dd_reserved);
		for (dd_used_t kind = 0; kind < DD_USED_NUM; kind++)
			ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[kind]);

		/* Tear down the dir's ZAP objects, then unlink it from its parent. */
		VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
		VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
		VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
		VERIFY0(zap_remove(mos,
		    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
		    dd->dd_myname, tx));

		dsl_dir_rele(dd, FTAG);
		dmu_object_free_zapified(mos, ddobj, tx);
	}

	/*
	 * Sync-context implementation of a head dataset destroy.
	 *
	 * ds - head dataset to destroy; must have at most one child and no
	 *      snapshots of its own
	 * tx - transaction the destroy is performed in
	 *
	 * The caller must hold the pool's dp_config_rwlock as writer.  The
	 * dataset's reservation, features, deadlists and blocks are released
	 * (blocks either synchronously or by handing them to the pool's bptree
	 * when async_destroy is enabled), its dsl_dir is destroyed, and, if it
	 * was the last clone of a defer-destroyed origin snapshot, that origin
	 * is destroyed as well.
	 */
	void
	dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = dmu_tx_pool(tx);
		objset_t *mos = dp->dp_meta_objset;
		uint64_t obj, ddobj, prevobj = 0;
		boolean_t rmorigin;

		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
		ASSERT(ds->ds_prev == NULL ||
		    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

		/* We need to log before removing it from the namespace. */
		spa_history_log_internal_ds(ds, "destroy", tx, "");

		/* Same condition dsl_destroy_head_check_impl() tests. */
		rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
		    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
		    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
		    ds->ds_prev->ds_userrefs == 0);

		/* Remove our reservation. */
		if (ds->ds_reserved != 0) {
			dsl_dataset_set_refreservation_sync_impl(ds,
			    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
			    0, tx);
			ASSERT0(ds->ds_reserved);
		}

		obj = ds->ds_object;

		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
			if (ds->ds_feature_inuse[f]) {
				dsl_dataset_deactivate_feature(obj, f, tx);
				ds->ds_feature_inuse[f] = B_FALSE;
			}
		}

		dsl_scan_ds_destroyed(ds, tx);

		if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
			/* This is a clone */
			ASSERT(ds->ds_prev != NULL);
			ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
			    obj);
			ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);

			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
				dsl_dataset_remove_from_next_clones(ds->ds_prev,
				    obj, tx);
			}

			ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
			dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
		}

		/*
		 * Destroy the deadlist. Unless it's a clone, the
	-	 * deadlist should be empty. (If it's a clone, it's
	-	 * safe to ignore the deadlist contents.)
	+	 * deadlist should be empty since the dataset has no snapshots.
	+	 * (If it's a clone, it's safe to ignore the deadlist contents
	+	 * since they are still referenced by the origin snapshot.)
		 */
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
	+
	+	if (dsl_dataset_remap_deadlist_exists(ds))
	+		dsl_dataset_destroy_remap_deadlist(ds, tx);

		objset_t *os;
		VERIFY0(dmu_objset_from_ds(ds, &os));

		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
			old_synchronous_dataset_destroy(ds, tx);
		} else {
			/*
			 * Move the bptree into the pool's list of trees to
			 * clean up and update space accounting information.
			 */
			uint64_t used, comp, uncomp;

			zil_destroy_sync(dmu_objset_zil(os), tx);

			/* First async destroy on this pool: create the bptree. */
			if (!spa_feature_is_active(dp->dp_spa,
			    SPA_FEATURE_ASYNC_DESTROY)) {
				dsl_scan_t *scn = dp->dp_scan;
				spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
				    tx);
				dp->dp_bptree_obj = bptree_alloc(mos, tx);
				VERIFY0(zap_add(mos,
				    DMU_POOL_DIRECTORY_OBJECT,
				    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
				    &dp->dp_bptree_obj, tx));
				ASSERT(!scn->scn_async_destroying);
				scn->scn_async_destroying = B_TRUE;
			}

			used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
			comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
			uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;

			ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
			    dsl_dataset_phys(ds)->ds_unique_bytes == used);

			rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
			bptree_add(mos, dp->dp_bptree_obj,
			    &dsl_dataset_phys(ds)->ds_bp,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
			    used, comp, uncomp, tx);
			rrw_exit(&ds->ds_bp_rwlock, FTAG);
			/* Transfer the space charge from this dir to the free dir. */
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
			    -used, -comp, -uncomp, tx);
			dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
			    used, comp, uncomp, tx);
		}

		if (ds->ds_prev != NULL) {
			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				VERIFY0(zap_remove_int(mos,
				    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
				    ds->ds_object, tx));
			}
			/* Remember the origin's object for the rmorigin step. */
			prevobj = ds->ds_prev->ds_object;
			dsl_dataset_rele(ds->ds_prev, ds);
			ds->ds_prev = NULL;
		}

		/*
		 * This must be done after the dsl_traverse(), because it will
		 * re-open the objset.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
		ddobj = ds->ds_dir->dd_object;
		ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
		VERIFY0(zap_destroy(mos,
		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));

		if (ds->ds_bookmarks != 0) {
			VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
			spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
		}

		spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

		ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
		ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
		ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
		dsl_dir_rele(ds->ds_dir, ds);
		ds->ds_dir = NULL;
		dmu_object_free_zapified(mos, obj, tx);

		dsl_dir_destroy_sync(ddobj, tx);

		if (rmorigin) {
			dsl_dataset_t *prev;
			VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
			dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
			dsl_dataset_rele(prev, FTAG);
		}
	}

	/*
	 * Sync phase for destroying a head dataset by name: holds the dataset
	 * named in 'arg' (a dsl_destroy_head_arg_t), destroys it with
	 * dsl_destroy_head_sync_impl(), and drops the hold.
	 */
	void
	dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_destroy_head_arg_t *ddha = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
		dsl_destroy_head_sync_impl(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * First sync task of the two-step head destroy that dsl_destroy_head()
	 * uses when async_destroy is not enabled: flags the dataset named in
	 * 'arg' (a dsl_destroy_head_arg_t) as inconsistent and logs
	 * "destroy begin".
	 */
	static void
	dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_destroy_head_arg_t *ddha = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));

		/* Mark it as inconsistent on-disk, in case we crash */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;

		spa_history_log_internal_ds(ds, "destroy begin", tx, "");
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Destroy the head dataset 'name'.
	 *
	 * When the pool does not have async_destroy enabled, the dataset is
	 * first marked inconsistent and its objects are freed from open context
	 * before the final destroy sync task runs.  Returns 0 or an errno.
	 */
	int
	dsl_destroy_head(const char *name)
	{
		dsl_destroy_head_arg_t ddha;
		spa_t *spa;
		boolean_t async_destroy;
		int error;

	#ifdef _KERNEL
		zfs_destroy_unmount_origin(name);
	#endif

		error = spa_open(name, &spa, FTAG);
		if (error != 0)
			return (error);
		async_destroy = spa_feature_is_enabled(spa,
		    SPA_FEATURE_ASYNC_DESTROY);
		spa_close(spa, FTAG);

		ddha.ddha_name = name;

		if (!async_destroy) {
			objset_t *os;

			error = dsl_sync_task(name, dsl_destroy_head_check,
			    dsl_destroy_head_begin_sync, &ddha,
			    0, ZFS_SPACE_CHECK_NONE);
			if (error != 0)
				return (error);

			/*
			 * Head deletion is processed in one txg on old pools;
			 * remove the objects from open context so that the txg
			 * sync is not too long.
			 */
			error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
			if (error == 0) {
				uint64_t prev_snap_txg =
				    dsl_dataset_phys(dmu_objset_ds(os))->
				    ds_prev_snap_txg;
				uint64_t obj = 0;

				/* Free object 0 first, then each one after it. */
				while (error == 0) {
					(void) dmu_free_long_object(os, obj);
					error = dmu_object_next(os, &obj, FALSE,
					    prev_snap_txg);
				}
				/* sync out all frees */
				txg_wait_synced(dmu_objset_pool(os), 0);
				dmu_objset_disown(os, FTAG);
			}
		}

		return (dsl_sync_task(name, dsl_destroy_head_check,
		    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
	}

	/*
	 * Note, this function is used as the callback for dmu_objset_find(). We
	 * always return 0 so that we will continue to find and process
	 * inconsistent datasets, even if we encounter an error trying to
	 * process one of them.
	 *
	 * dsname - name of the dataset to examine
	 * arg    - unused
	 */
	/* ARGSUSED */
	int
	dsl_destroy_inconsistent(const char *dsname, void *arg)
	{
		objset_t *os;

		if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
			boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));

			/*
			 * If the dataset is inconsistent because a resumable receive
			 * has failed, then do not destroy it.
			 */
			if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
				need_destroy = B_FALSE;

			/* Drop the hold before destroying; errors are ignored. */
			dmu_objset_rele(os, FTAG);
			if (need_destroy)
				(void) dsl_destroy_head(dsname);
		}
		return (0);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 332525)
	@@ -1,2076 +1,2126 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
	* All rights reserved.
	* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2014 Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
	*/

	#include <sys/dmu.h>
	#include <sys/dmu_objset.h>
	#include <sys/dmu_tx.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_deleg.h>
	#include <sys/dmu_impl.h>
	#include <sys/spa.h>
	#include <sys/metaslab.h>
	#include <sys/zap.h>
	#include <sys/zio.h>
	#include <sys/arc.h>
	#include <sys/sunddi.h>
	#include <sys/zvol.h>
	#ifdef _KERNEL
	#include <sys/zfs_vfsops.h>
	#endif
	#include <sys/zfeature.h>
	#include <sys/policy.h>
	#include <sys/zfs_znode.h>
	#include "zfs_namecheck.h"
	#include "zfs_prop.h"

	/*
	* Filesystem and Snapshot Limits
	* ------------------------------
	*
	* These limits are used to restrict the number of filesystems and/or snapshots
	* that can be created at a given level in the tree or below. A typical
	* use-case is with a delegated dataset where the administrator wants to ensure
	* that a user within the zone is not creating too many additional filesystems
	* or snapshots, even though they're not exceeding their space quota.
	*
	* The filesystem and snapshot counts are stored as extensible properties. This
	* capability is controlled by a feature flag and must be enabled to be used.
	* Once enabled, the feature is not active until the first limit is set. At
	* that point, future operations to create/destroy filesystems or snapshots
	* will validate and update the counts.
	*
	* Because the count properties will not exist before the feature is active,
	* the counts are updated when a limit is first set on an uninitialized
	* dsl_dir node in the tree (The filesystem/snapshot count on a node includes
	* all of the nested filesystems/snapshots. Thus, a new leaf node has a
	* filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
	* snapshot count properties on a node indicate uninitialized counts on that
	* node.) When first setting a limit on an uninitialized node, the code starts
	* at the filesystem with the new limit and descends into all sub-filesystems
	* to add the count properties.
	*
	* In practice this is lightweight since a limit is typically set when the
	* filesystem is created and thus has no children. Once valid, changing the
	* limit value won't require a re-traversal since the counts are already valid.
	* When recursively fixing the counts, if a node with a limit is encountered
	* during the descent, the counts are known to be valid and there is no need to
	* descend into that filesystem's children. The counts on filesystems above the
	* one with the new limit will still be uninitialized, unless a limit is
	* eventually set on one of those filesystems. The counts are always recursively
	* updated when a limit is set on a dataset, unless there is already a limit.
	* When a new limit value is set on a filesystem with an existing limit, it is
	* possible for the new limit to be less than the current count at that level
	* since a user who can change the limit is also allowed to exceed the limit.
	*
	* Once the feature is active, then whenever a filesystem or snapshot is
	* created, the code recurses up the tree, validating the new count against the
	* limit at each initialized level. In practice, most levels will not have a
	* limit set. If there is a limit at any initialized level up the tree, the
	* check must pass or the creation will fail. Likewise, when a filesystem or
	* snapshot is destroyed, the counts are recursively adjusted all the way up
	* the initialized nodes in the tree. Renaming a filesystem into different point
	* in the tree will first validate, then update the counts on each branch up to
	* the common ancestor. A receive will also validate the counts and then update
	* them.
	*
	* An exception to the above behavior is that the limit is not enforced if the
	* user has permission to modify the limit. This is primarily so that
	* recursive snapshots in the global zone always work. We want to prevent a
	* denial-of-service in which a lower level delegated dataset could max out its
	* limit and thus block recursive snapshots from being taken in the global zone.
	* Because of this, it is possible for the snapshot count to be over the limit
	* and snapshots taken in the global zone could cause a lower level dataset to
	* hit or exceed its limit. The administrator taking the global zone recursive
	* snapshot should be aware of this side-effect and behave accordingly.
	* For consistency, the filesystem limit is also not enforced if the user can
	* modify the limit.
	*
	* The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
	* and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
	* dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
	* dsl_dir_init_fs_ss_count().
	*
	* There is a special case when we receive a filesystem that already exists. In
	* this case a temporary clone name of %X is created (see dmu_recv_begin). We
	* never update the filesystem counts for temporary clones.
	*
	* Likewise, we do not update the snapshot counts for temporary snapshots,
	* such as those created by zfs diff.
	*/

	extern inline dsl_dir_phys_t dsl_dir_phys(dsl_dir_t dd);

	static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);

	+/*
	+ * Argument bundle handed to dsl_dir_update_last_remap_txg_sync() by
	+ * dsl_dir_update_last_remap_txg().
	+ *
	+ * NOTE(review): the two members use different prefixes ("ddulrta_" vs
	+ * "ddlrta_"); renaming would have to touch every user of the struct.
	+ */
	+typedef struct ddulrt_arg {
	+ dsl_dir_t *ddulrta_dd;	/* dir whose last-remap txg is updated */
	+ uint64_t ddlrta_txg;	/* candidate txg value to record */
	+} ddulrt_arg_t;
	+
	static void
	dsl_dir_evict_async(void *dbu)
	{
	dsl_dir_t *dd = dbu;
	dsl_pool_t *dp = dd->dd_pool;
	int t;

	dd->dd_dbuf = NULL;

	for (t = 0; t < TXG_SIZE; t++) {
	ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
	ASSERT(dd->dd_tempreserved[t] == 0);
	ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
	dsl_dir_async_rele(dd->dd_parent, dd);

	spa_async_close(dd->dd_pool->dp_spa, dd);

	dsl_prop_fini(dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	}

	/*
	* Take a hold, tagged with tag, on the dsl_dir_t backed by MOS object
	* ddobj and return it through *ddp.
	*
	* If the bonus buffer has no dsl_dir_t attached yet, one is built here:
	* the parent (if any) is held with the new dir as the tag, dd_myname is
	* filled in from tail when tail is non-NULL (otherwise by searching the
	* parent's child-dir ZAP for ddobj, or from the pool name for the root),
	* and dd_origin_txg is read for clones.
	*
	* Returns 0 on success. On failure returns the error from the failing
	* call after undoing the partial construction and releasing the bonus
	* buffer hold; *ddp is not written in that case.
	*
	* The caller must hold the pool config lock (asserted).
	*/
	int
	dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
	const char tail, void tag, dsl_dir_t **ddp)
	{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	int err;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err != 0)
	return (err);
	/* Non-NULL when a dsl_dir_t is already attached to this buffer. */
	dd = dmu_buf_get_user(dbuf);
	#ifdef ZFS_DEBUG
	{
	dmu_object_info_t doi;
	dmu_object_info_from_db(dbuf, &doi);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
	}
	#endif
	if (dd == NULL) {
	dsl_dir_t *winner;

	dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
	dd->dd_object = ddobj;
	dd->dd_dbuf = dbuf;
	dd->dd_pool = dp;
	mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
	dsl_prop_init(dd);

	dsl_dir_snap_cmtime_update(dd);

	if (dsl_dir_phys(dd)->dd_parent_obj) {
	/* The new dir is the tag for the hold on its parent. */
	err = dsl_dir_hold_obj(dp,
	dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
	&dd->dd_parent);
	if (err != 0)
	goto errout;
	if (tail) {
	#ifdef ZFS_DEBUG
	uint64_t foundobj;

	/* Debug only: check that tail really names ddobj. */
	err = zap_lookup(dp->dp_meta_objset,
	dsl_dir_phys(dd->dd_parent)->
	dd_child_dir_zapobj, tail,
	sizeof (foundobj), 1, &foundobj);
	ASSERT(err \|\| foundobj == ddobj);
	#endif
	(void) strcpy(dd->dd_myname, tail);
	} else {
	/* No name supplied: find it by value in the parent. */
	err = zap_value_search(dp->dp_meta_objset,
	dsl_dir_phys(dd->dd_parent)->
	dd_child_dir_zapobj,
	ddobj, 0, dd->dd_myname);
	}
	if (err != 0)
	goto errout;
	} else {
	/* No parent object: the root dir is named after the pool. */
	(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
	}

	if (dsl_dir_is_clone(dd)) {
	dmu_buf_t *origin_bonus;
	dsl_dataset_phys_t *origin_phys;

	/*
	* We can't open the origin dataset, because
	* that would require opening this dsl_dir.
	* Just look at its phys directly instead.
	*/
	err = dmu_bonus_hold(dp->dp_meta_objset,
	dsl_dir_phys(dd)->dd_origin_obj, FTAG,
	&origin_bonus);
	if (err != 0)
	goto errout;
	origin_phys = origin_bonus->db_data;
	dd->dd_origin_txg =
	origin_phys->ds_creation_txg;
	dmu_buf_rele(origin_bonus, FTAG);
	}

	dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
	&dd->dd_dbuf);
	/*
	* Try to attach our dsl_dir_t. A non-NULL result is the
	* dsl_dir_t that was already attached (presumably by a racing
	* holder); in that case ours is discarded and theirs is used.
	*/
	winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
	if (winner != NULL) {
	if (dd->dd_parent)
	dsl_dir_rele(dd->dd_parent, dd);
	dsl_prop_fini(dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dd = winner;
	} else {
	/* Instantiate-to-evict hold; dropped in the evict callback. */
	spa_open_ref(dp->dp_spa, dd);
	}
	}

	/*
	* The dsl_dir_t has both open-to-close and instantiate-to-evict
	* holds on the spa. We need the open-to-close holds because
	* otherwise the spa_refcnt wouldn't change when we open a
	* dir which the spa also has open, so we could incorrectly
	* think it was OK to unload/export/destroy the pool. We need
	* the instantiate-to-evict hold because the dsl_dir_t has a
	* pointer to the dd_pool, which has a pointer to the spa_t.
	*/
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

	errout:
	/* Only reached while building a new dsl_dir_t; undo what was done. */
	if (dd->dd_parent)
	dsl_dir_rele(dd->dd_parent, dd);
	dsl_prop_fini(dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
	}

	/*
	* Release a hold on dd that was taken with the same tag: drops the spa
	* reference and the bonus-buffer hold that dsl_dir_hold_obj() acquired
	* under that tag.
	*/
	void
	dsl_dir_rele(dsl_dir_t dd, void tag)
	{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
	}

	/*
	* Remove a reference to the given dsl dir that is being asynchronously
	* released. Async releases occur from a taskq performing eviction of
	* dsl datasets and dirs. This process is identical to a normal release
	* with the exception of using the async API for releasing the reference on
	* the spa.
	*
	* tag must be the tag the hold was taken with (dsl_dir_evict_async()
	* passes the child dir that holds its parent).
	*/
	void
	dsl_dir_async_rele(dsl_dir_t dd, void tag)
	{
	dprintf_dd(dd, "%s\n", "");
	/* spa_async_close() instead of the spa_close() used by dsl_dir_rele(). */
	spa_async_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
	}

	/*
	* Build the full name of dd into buf by recursing to the root and then
	* appending "/" and each dd_myname on the way back down.
	*
	* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes; every append is
	* checked with VERIFY3U against that size.
	*/
	void
	dsl_dir_name(dsl_dir_t dd, char buf)
	{
	if (dd->dd_parent) {
	/* Parent's name first, then the separator. */
	dsl_dir_name(dd->dd_parent, buf);
	VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
	ZFS_MAX_DATASET_NAME_LEN);
	} else {
	/* Root of the recursion: start from an empty string. */
	buf[0] = '\0';
	}
	if (!MUTEX_HELD(&dd->dd_lock)) {
	/*
	* recursive mutex so that we can use
	* dprintf_dd() with dd_lock held
	*/
	mutex_enter(&dd->dd_lock);
	VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
	<, ZFS_MAX_DATASET_NAME_LEN);
	mutex_exit(&dd->dd_lock);
	} else {
	/* Caller already holds dd_lock; append without re-entering. */
	VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
	<, ZFS_MAX_DATASET_NAME_LEN);
	}
	}

	/*
	* Return the length of the full name of dd (the string dsl_dir_name()
	* would build), computed without doing any string concatenation.
	*/
	int
	dsl_dir_namelen(dsl_dir_t *dd)
	{
		int len;
		boolean_t take_lock;

		/* Ancestors' names, plus one for the "/" that follows them. */
		len = (dd->dd_parent != NULL) ?
		    dsl_dir_namelen(dd->dd_parent) + 1 : 0;

		/* As in dsl_dir_name(): only lock if we don't already hold it. */
		take_lock = !MUTEX_HELD(&dd->dd_lock);
		if (take_lock)
			mutex_enter(&dd->dd_lock);
		len += strlen(dd->dd_myname);
		if (take_lock)
			mutex_exit(&dd->dd_lock);

		return (len);
	}

	/*
	* Split the leading component off path.
	*
	* The component (the text before the first '/' or '@', or the whole of
	* path when there is no further separator) is copied into component,
	* and *nextp is set to the remainder:
	*  - the character after a '/' separator,
	*  - the '@' itself for a snapshot separator (the '@' is kept), or
	*  - NULL when the whole of path was consumed.
	*
	* Returns 0 on success, ENOENT for a NULL or empty path, EINVAL for a
	* malformed path, and ENAMETOOLONG when the component would not fit in
	* ZFS_MAX_DATASET_NAME_LEN bytes. *nextp is only written on success.
	*/
	static int
	getcomponent(const char path, char component, const char **nextp)
	{
	char *p;

	if ((path == NULL) \|\| (path[0] == '\0'))
	return (SET_ERROR(ENOENT));
	/* This would be a good place to reserve some namespace... */
	p = strpbrk(path, "/@");
	if (p && (p[1] == '/' \|\| p[1] == '@')) {
	/* two separators in a row */
	return (SET_ERROR(EINVAL));
	}
	if (p == NULL \|\| p == path) {
	/*
	* if the first thing is an @ or /, it had better be an
	* @ and it had better not have any more ats or slashes,
	* and it had better have something after the @.
	*/
	if (p != NULL &&
	(p[0] != '@' \|\| strpbrk(path+1, "/@") \|\| p[1] == '\0'))
	return (SET_ERROR(EINVAL));
	if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
	return (SET_ERROR(ENAMETOOLONG));
	/* Last component: copy it whole (including a leading '@'). */
	(void) strcpy(component, path);
	p = NULL;
	} else if (p[0] == '/') {
	if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
	return (SET_ERROR(ENAMETOOLONG));
	/* strncpy does not terminate here, so do it by hand. */
	(void) strncpy(component, path, p - path);
	component[p - path] = '\0';
	/* Step over the '/' so the remainder starts at the next name. */
	p++;
	} else if (p[0] == '@') {
	/*
	* if the next separator is an @, there better not be
	* any more slashes.
	*/
	if (strchr(path, '/'))
	return (SET_ERROR(EINVAL));
	if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
	return (SET_ERROR(ENAMETOOLONG));
	/* p is left pointing at the '@' for the caller to see. */
	(void) strncpy(component, path, p - path);
	component[p - path] = '\0';
	} else {
	panic("invalid p=%p", (void *)p);
	}
	*nextp = p;
	return (0);
	}

	/*
	* Look up the dsl_dir_t for name and return it, held with tag, in *ddp.
	* If part of the name could not be resolved to a directory, the
	* unresolved last component is returned in *tailp (when tailp is
	* non-NULL). The name must be in the specified dsl_pool_t. This
	* thread must hold the dp_config_rwlock for the pool.
	*
	* Returns 0 on success. Returns an error if the path is bogus, EXDEV if
	* the name is not in this pool, or ENOENT if more than one component is
	* left over or tailp==NULL and we couldn't parse the whole name. On any
	* error no hold is retained and *ddp is left untouched.
	* (*tail)[0] == '@' means that the last component is a snapshot.
	*/
	int
	dsl_dir_hold(dsl_pool_t dp, const char name, void *tag,
	dsl_dir_t ddp, const char tailp)
	{
	char buf[ZFS_MAX_DATASET_NAME_LEN];
	const char spaname, next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	uint64_t ddobj;

	/* The first component must be the pool name. */
	err = getcomponent(name, buf, &next);
	if (err != 0)
	return (err);

	/* Make sure the name is in the specified pool. */
	spaname = spa_name(dp->dp_spa);
	if (strcmp(buf, spaname) != 0)
	return (SET_ERROR(EXDEV));

	ASSERT(dsl_pool_config_held(dp));

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err != 0) {
	return (err);
	}

	/* Walk down one component at a time, trading parent hold for child. */
	while (next != NULL) {
	dsl_dir_t *child_dd;
	err = getcomponent(next, buf, &nextnext);
	if (err != 0)
	break;
	ASSERT(next[0] != '\0');
	/* A snapshot component is never a directory; stop here. */
	if (next[0] == '@')
	break;
	dprintf("looking up %s in obj%lld\n",
	buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);

	err = zap_lookup(dp->dp_meta_objset,
	dsl_dir_phys(dd)->dd_child_dir_zapobj,
	buf, sizeof (ddobj), 1, &ddobj);
	if (err != 0) {
	/* A missing child is not an error yet; it becomes the tail. */
	if (err == ENOENT)
	err = 0;
	break;
	}

	err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
	if (err != 0)
	break;
	dsl_dir_rele(dd, tag);
	dd = child_dd;
	next = nextnext;
	}

	if (err != 0) {
	dsl_dir_rele(dd, tag);
	return (err);
	}

	/*
	* It's an error if there's more than one component left, or
	* tailp==NULL and there's any component left.
	*/
	if (next != NULL &&
	(tailp == NULL \|\| (nextnext && nextnext[0] != '\0'))) {
	/* bad path name */
	dsl_dir_rele(dd, tag);
	dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
	err = SET_ERROR(ENOENT);
	}
	if (tailp != NULL)
	*tailp = next;
	/*
	* Only hand dd back when we still hold it; on the bad-path error
	* above it has already been released.
	*/
	if (err == 0)
	*ddp = dd;
	return (err);
	}

	/*
	* If the counts are already initialized for this filesystem and its
	* descendants then do nothing, otherwise initialize the counts.
	*
	* The counts on this filesystem, and those below, may be uninitialized due to
	* either the use of a pre-existing pool which did not support the
	* filesystem/snapshot limit feature, or one in which the feature had not yet
	* been enabled.
	*
	* Recursively descend the filesystem tree and update the filesystem/snapshot
	* counts on each filesystem below, then update the cumulative count on the
	* current filesystem. If the filesystem already has a count set on it,
	* then we know that its counts, and the counts on the filesystems below it,
	* are already correct, so we don't have to update this filesystem.
	*
	* Must be called in syncing context with the pool config lock held and
	* the FS_SS_LIMIT feature active (all three are asserted).
	*/
	static void
	dsl_dir_init_fs_ss_count(dsl_dir_t dd, dmu_tx_t tx)
	{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *os = dp->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	dsl_dataset_t *ds;

	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dmu_tx_is_syncing(tx));

	dsl_dir_zapify(dd, tx);

	/*
	* If the filesystem count has already been initialized then we
	* don't need to recurse down any further.
	*/
	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
	return;

	/*
	* NOTE(review): the cursor and attribute are heap-allocated,
	* presumably to keep this recursive function's stack frame small.
	*/
	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/* Iterate my child dirs */
	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
	zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
	dsl_dir_t *chld_dd;
	uint64_t count;

	VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
	&chld_dd));

	/*
	* Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
	* temporary datasets.
	*/
	if (chld_dd->dd_myname[0] == '$' \|\|
	chld_dd->dd_myname[0] == '%') {
	dsl_dir_rele(chld_dd, FTAG);
	continue;
	}

	my_fs_cnt++; /* count this child */

	/* Make sure the child's own counts exist before reading them. */
	dsl_dir_init_fs_ss_count(chld_dd, tx);

	/* Add the child's cumulative counts to ours. */
	VERIFY0(zap_lookup(os, chld_dd->dd_object,
	DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
	my_fs_cnt += count;
	VERIFY0(zap_lookup(os, chld_dd->dd_object,
	DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
	my_ss_cnt += count;

	dsl_dir_rele(chld_dd, FTAG);
	}
	zap_cursor_fini(zc);
	/* Count my snapshots (we counted children's snapshots above) */
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));

	/* The same cursor storage is re-initialized for the snapshot ZAP. */
	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	zap_cursor_retrieve(zc, za) == 0;
	zap_cursor_advance(zc)) {
	/* Don't count temporary snapshots */
	if (za->za_name[0] != '%')
	my_ss_cnt++;
	}
	zap_cursor_fini(zc);

	dsl_dataset_rele(ds, FTAG);

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

	/* we're in a sync task, update counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
	}

	/*
	* Check function for the sync task issued by
	* dsl_dir_activate_fs_ss_limit(). arg is the dataset name.
	*
	* Returns the hold error if the dataset cannot be held, ENOTSUP if the
	* FS_SS_LIMIT feature is not enabled on the pool, EALREADY if the dir
	* already carries a filesystem count (nothing for the sync function to
	* do), and 0 when the sync function should run.
	*/
	static int
	dsl_dir_actv_fs_ss_limit_check(void arg, dmu_tx_t tx)
	{
	char ddname = (char )arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	int error;

	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
	if (error != 0)
	return (error);

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
	dsl_dataset_rele(ds, FTAG);
	return (SET_ERROR(ENOTSUP));
	}

	dd = ds->ds_dir;
	/* zap_contains() == 0 means the count entry is present. */
	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
	dsl_dir_is_zapified(dd) &&
	zap_contains(dp->dp_meta_objset, dd->dd_object,
	DD_FIELD_FILESYSTEM_COUNT) == 0) {
	dsl_dataset_rele(ds, FTAG);
	return (SET_ERROR(EALREADY));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
	}

	/*
	* Sync function for the sync task issued by
	* dsl_dir_activate_fs_ss_limit(). arg is the dataset name. Activates the
	* FS_SS_LIMIT feature if it is not active yet, then initializes the
	* filesystem/snapshot counts from the dataset's dir downwards.
	*/
	static void
	dsl_dir_actv_fs_ss_limit_sync(void arg, dmu_tx_t tx)
	{
	char ddname = (char )arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	spa_t *spa;

	/* The check function already held this dataset successfully. */
	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));

	spa = dsl_dataset_get_spa(ds);

	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
	/*
	* Since the feature was not active and we're now setting a
	* limit, increment the feature-active counter so that the
	* feature becomes active for the first time.
	*
	* We are already in a sync task so we can update the MOS.
	*/
	spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
	}

	/*
	* Since we are now setting a non-UINT64_MAX limit on the filesystem,
	* we need to ensure the counts are correct. Descend down the tree from
	* this point and update all of the counts to be accurate.
	*/
	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);

	dsl_dataset_rele(ds, FTAG);
	}

	/*
	* Prepare ddname for having a filesystem or snapshot limit set on it:
	* verify that the limit feature is enabled, activate it if this is the
	* first limit in the pool, and make sure the on-disk counts below ddname
	* are valid. Only the ioctl path calls this, when a limit is being set.
	*
	* The new limit value itself is not validated here, because anyone who is
	* allowed to change the limit is also allowed to exceed it.
	*
	* Returns 0 on success (including when the counts were already
	* initialized) or the error from the sync task.
	*/
	int
	dsl_dir_activate_fs_ss_limit(const char *ddname)
	{
		int rc = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
		    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
		    ZFS_SPACE_CHECK_RESERVED);

		/* EALREADY from the check just means there was nothing to do. */
		return (rc == EALREADY ? 0 : rc);
	}

	/*
	* Used to determine if the filesystem_limit or snapshot_limit should be
	* enforced. We allow the limit to be exceeded if the user has permission to
	* write the property value. We pass in the creds that we got in the open
	* context since we will always be the GZ root in syncing context. We also have
	* to handle the case where we are allowed to change the limit on the current
	* dataset, but there may be another limit in the tree above.
	*
	* We can never modify these two properties within a non-global zone. In
	* addition, the other checks are modeled on zfs_secpolicy_write_perms. We
	* can't use that function since we are already holding the dp_config_rwlock.
	* In addition, we already have the dd and dealing with snapshots is simplified
	* in this code.
	*/

	/*
	* Result of dsl_enforce_ds_ss_limits(), consumed by
	* dsl_fs_ss_limit_check().
	*/
	typedef enum {
	ENFORCE_ALWAYS,	/* compare count against the limit at this dir */
	ENFORCE_NEVER,	/* skip the limit check entirely */
	ENFORCE_ABOVE	/* don't fail at this dir; ancestors are still checked */
	} enforce_res_t;

	/*
	* Decide how the limit named by prop (filesystem_limit or
	* snapshot_limit) should be enforced at dd for the credentials cr.
	*
	* Returns:
	*  ENFORCE_ALWAYS - the default; also when cr is jailed / not in the
	*	global zone (kernel only), when dd has no head dataset, when the
	*	head dataset cannot be held, or when the dataset is zoned or its
	*	"zoned" property cannot be read.
	*  ENFORCE_NEVER  - secpolicy_zfs(cr) succeeds (kernel only).
	*  ENFORCE_ABOVE  - cr has been delegated permission to set prop on
	*	this dataset.
	*/
	static enforce_res_t
	dsl_enforce_ds_ss_limits(dsl_dir_t dd, zfs_prop_t prop, cred_t cr)
	{
	enforce_res_t enforce = ENFORCE_ALWAYS;
	uint64_t obj;
	dsl_dataset_t *ds;
	uint64_t zoned;

	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT \|\|
	prop == ZFS_PROP_SNAPSHOT_LIMIT);

	#ifdef _KERNEL
	/* The return below is the body of whichever "if" is compiled in. */
	#ifdef __FreeBSD__
	if (jailed(cr))
	#else
	if (crgetzoneid(cr) != GLOBAL_ZONEID)
	#endif
	return (ENFORCE_ALWAYS);

	if (secpolicy_zfs(cr) == 0)
	return (ENFORCE_NEVER);
	#endif

	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
	return (ENFORCE_ALWAYS);

	ASSERT(dsl_pool_config_held(dd->dd_pool));

	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
	return (ENFORCE_ALWAYS);

	/* A failed property lookup is treated the same as "zoned". */
	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) \|\| zoned) {
	/* Only root can access zoned fs's from the GZ */
	enforce = ENFORCE_ALWAYS;
	} else {
	if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
	enforce = ENFORCE_ABOVE;
	}

	dsl_dataset_rele(ds, FTAG);
	return (enforce);
	}

	+/*
	+ * Sync function for dsl_dir_update_last_remap_txg(). varg is a
	+ * ddulrt_arg_t. Records the requested txg in the dir's
	+ * DD_FIELD_LAST_REMAP_TXG entry, but only when no value can be read
	+ * back or the stored value is smaller, so the entry never decreases.
	+ */
	+static void
	+dsl_dir_update_last_remap_txg_sync(void varg, dmu_tx_t tx)
	+{
	+ ddulrt_arg_t *arg = varg;
	+ uint64_t last_remap_txg;
	+ dsl_dir_t *dd = arg->ddulrta_dd;
	+ objset_t *mos = dd->dd_pool->dp_meta_objset;
	+
	+ /* The entry lives in the dir's ZAP, so the dir must be zapified. */
	+ dsl_dir_zapify(dd, tx);
	+ /* last_remap_txg is only read when the lookup succeeded. */
	+ if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
	+ sizeof (last_remap_txg), 1, &last_remap_txg) != 0 \|\|
	+ last_remap_txg < arg->ddlrta_txg) {
	+ VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
	+ sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx));
	+ }
	+}
	+
	+/*
	+ * Record txg as the last-remap txg of dd by running
	+ * dsl_dir_update_last_remap_txg_sync() as a sync task on dd's pool.
	+ * Returns the result of dsl_sync_task().
	+ */
	+int
	+dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg)
	+{
	+	ddulrt_arg_t arg = {
	+		.ddulrta_dd = dd,
	+		.ddlrta_txg = txg
	+	};
	+
	+	/* No check function is needed; the sync function decides. */
	+	return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa),
	+	    NULL, dsl_dir_update_last_remap_txg_sync, &arg,
	+	    1, ZFS_SPACE_CHECK_RESERVED));
	+}
	+
	/*
	* Check if adding additional child filesystem(s) would exceed any filesystem
	* limits or adding additional snapshot(s) would exceed any snapshot limits.
	* The prop argument indicates which limit to check.
	*
	* Note that all filesystem limits up to the root (or the highest
	* initialized) filesystem or the given ancestor must be satisfied.
	*
	* Returns 0 if the addition is allowed, EDQUOT if it would exceed an
	* enforced limit, or the error from reading the count or the limit.
	*/
	int
	dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
	dsl_dir_t ancestor, cred_t cr)
	{
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t limit, count;
	char *count_prop;
	enforce_res_t enforce;
	int err = 0;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT \|\|
	prop == ZFS_PROP_SNAPSHOT_LIMIT);

	/*
	* If we're allowed to change the limit, don't enforce the limit
	* e.g. this can happen if a snapshot is taken by an administrative
	* user in the global zone (i.e. a recursive snapshot by root).
	* However, we must handle the case of delegated permissions where we
	* are allowed to change the limit on the current dataset, but there
	* is another limit in the tree above.
	*/
	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
	if (enforce == ENFORCE_NEVER)
	return (0);

	/*
	* e.g. if renaming a dataset with no snapshots, count adjustment
	* is 0.
	*/
	if (delta == 0)
	return (0);

	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
	/*
	* We don't enforce the limit for temporary snapshots. This is
	* indicated by a NULL cred_t argument.
	*/
	if (cr == NULL)
	return (0);

	count_prop = DD_FIELD_SNAPSHOT_COUNT;
	} else {
	count_prop = DD_FIELD_FILESYSTEM_COUNT;
	}

	/*
	* If an ancestor has been provided, stop checking the limit once we
	* hit that dir. We need this during rename so that we don't overcount
	* the check once we recurse up to the common ancestor.
	*/
	if (ancestor == dd)
	return (0);

	/*
	* If we hit an uninitialized node while recursing up the tree, we can
	* stop since we know there is no limit here (or above). The counts are
	* not valid on this node and we know we won't touch this node's counts.
	*/
	if (!dsl_dir_is_zapified(dd))
	return (0);
	err = zap_lookup(os, dd->dd_object,
	count_prop, sizeof (count), 1, &count);
	if (err == ENOENT)
	return (0);
	/*
	* Any other lookup failure leaves count unset; report it instead of
	* comparing garbage against the limit.
	*/
	if (err != 0)
	return (err);

	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
	B_FALSE);
	if (err != 0)
	return (err);

	/* Is there a limit which we've hit? */
	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
	return (SET_ERROR(EDQUOT));

	/* Enforcement is re-evaluated for each ancestor in turn. */
	if (dd->dd_parent != NULL)
	err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
	ancestor, cr);

	return (err);
	}

	/*
	* Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
	* parents. When a new filesystem/snapshot is created, increment the count on
	* all parents, and when a filesystem/snapshot is destroyed, decrement the
	* count.
	*
	* prop must be DD_FIELD_FILESYSTEM_COUNT or DD_FIELD_SNAPSHOT_COUNT, and
	* delta is the signed change to apply. Must be called in syncing context
	* with the pool config lock held (asserted).
	*/
	void
	dsl_fs_ss_count_adjust(dsl_dir_t dd, int64_t delta, const char prop,
	dmu_tx_t *tx)
	{
	int err;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t count;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 \|\|
	strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);

	/*
	* When we receive an incremental stream into a filesystem that already
	* exists, a temporary clone is created. We don't count this temporary
	* clone, whose name begins with a '%'. We also ignore hidden ($FREE,
	* $MOS & $ORIGIN) objsets.
	*/
	if ((dd->dd_myname[0] == '%' \|\| dd->dd_myname[0] == '$') &&
	strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
	return;

	/*
	* e.g. if renaming a dataset with no snapshots, count adjustment is 0
	*/
	if (delta == 0)
	return;

	/*
	* If we hit an uninitialized node while recursing up the tree, we can
	* stop since we know the counts are not valid on this node and we
	* know we shouldn't touch this node's counts. An uninitialized count
	* on the node indicates that either the feature has not yet been
	* activated or there are no limits on this part of the tree.
	*/
	if (!dsl_dir_is_zapified(dd) \|\| (err = zap_lookup(os, dd->dd_object,
	prop, sizeof (count), 1, &count)) == ENOENT)
	return;
	/* Any lookup error other than ENOENT is fatal here. */
	VERIFY0(err);

	count += delta;
	/* Use a signed verify to make sure we're not neg. */
	VERIFY3S(count, >=, 0);

	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
	tx));

	/* Roll up this additional count into our ancestors */
	if (dd->dd_parent != NULL)
	dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
	}

	/*
	* Create a new dsl_dir object in the MOS and return its object number.
	*
	* With a parent (pds) the new dir is entered under name in the parent's
	* child-dir ZAP and the parent's filesystem counts are incremented;
	* without one it is registered as the pool's root dataset. The new
	* dir's props and child-dir ZAP objects are created here as well.
	*/
	uint64_t
	dsl_dir_create_sync(dsl_pool_t dp, dsl_dir_t pds, const char *name,
	dmu_tx_t *tx)
	{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
	VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
	name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
	/* it's the root dir */
	VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	/* Fill in the on-disk dsl_dir_phys_t in the new object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds) {
	ddphys->dd_parent_obj = pds->dd_object;

	/* update the filesystem counts */
	dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
	}
	ddphys->dd_props_zapobj = zap_create(mos,
	DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	/* Pools at or past this version track the used-space breakdown. */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
	ddphys->dd_flags \|= DD_FLAG_USED_BREAKDOWN;
	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
	}

	/*
	* Report whether dd is a clone: it must record an origin object, and
	* that origin must not be the pool's dp_origin_snap (a pool without a
	* dp_origin_snap treats any recorded origin as a real one).
	*/
	boolean_t
	dsl_dir_is_clone(dsl_dir_t *dd)
	{
		dsl_dataset_t *pool_origin = dd->dd_pool->dp_origin_snap;

		if (dsl_dir_phys(dd)->dd_origin_obj == 0)
			return (B_FALSE);

		if (pool_origin == NULL)
			return (B_TRUE);

		return (dsl_dir_phys(dd)->dd_origin_obj != pool_origin->ds_object);
	}


	/* Return the dd_used_bytes field of dd's on-disk state. */
	uint64_t
	dsl_dir_get_used(dsl_dir_t *dd)
	{
		uint64_t used = dsl_dir_phys(dd)->dd_used_bytes;

		return (used);
	}

	/* Return the dd_quota field of dd's on-disk state. */
	uint64_t
	dsl_dir_get_quota(dsl_dir_t *dd)
	{
		uint64_t quota = dsl_dir_phys(dd)->dd_quota;

		return (quota);
	}

	/* Return the dd_reserved field of dd's on-disk state. */
	uint64_t
	dsl_dir_get_reservation(dsl_dir_t *dd)
	{
		uint64_t reserved = dsl_dir_phys(dd)->dd_reserved;

		return (reserved);
	}

	/*
	* Return dd's compression ratio as a fixed point number, 100x the
	* ratio of uncompressed to compressed bytes. A dir with no compressed
	* bytes reports 100 (i.e. 1.00x).
	*/
	uint64_t
	dsl_dir_get_compressratio(dsl_dir_t *dd)
	{
		if (dsl_dir_phys(dd)->dd_compressed_bytes == 0)
			return (100);

		return (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
		    dsl_dir_phys(dd)->dd_compressed_bytes);
	}

	/* Return the dd_uncompressed_bytes field of dd's on-disk state. */
	uint64_t
	dsl_dir_get_logicalused(dsl_dir_t *dd)
	{
		uint64_t logical = dsl_dir_phys(dd)->dd_uncompressed_bytes;

		return (logical);
	}

	/* Return the DD_USED_SNAP entry of dd's used-space breakdown. */
	uint64_t
	dsl_dir_get_usedsnap(dsl_dir_t *dd)
	{
		uint64_t by_snaps = dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];

		return (by_snaps);
	}

	/* Return the DD_USED_HEAD entry of dd's used-space breakdown. */
	uint64_t
	dsl_dir_get_usedds(dsl_dir_t *dd)
	{
		uint64_t by_head = dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD];

		return (by_head);
	}

	/* Return the DD_USED_REFRSRV entry of dd's used-space breakdown. */
	uint64_t
	dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
	{
		uint64_t by_refrsrv =
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV];

		return (by_refrsrv);
	}

	/*
	* Return the space attributed to dd's children: the sum of the
	* DD_USED_CHILD and DD_USED_CHILD_RSRV entries of the used-space
	* breakdown.
	*/
	uint64_t
	dsl_dir_get_usedchild(dsl_dir_t *dd)
	{
		uint64_t by_children =
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD];
		uint64_t by_child_rsrv =
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV];

		return (by_children + by_child_rsrv);
	}

	/*
	* Write the name of dd's origin dataset (dd_origin_obj) into buf.
	*
	* NOTE(review): dd_origin_obj is used unconditionally and the hold is
	* VERIFY'd, so callers presumably check dsl_dir_is_clone() first, as
	* dsl_dir_stats() does. buf is presumably ZFS_MAX_DATASET_NAME_LEN
	* bytes, as in that caller - confirm.
	*/
	void
	dsl_dir_get_origin(dsl_dir_t dd, char buf)
	{
	dsl_dataset_t *ds;
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));

	dsl_dataset_name(ds, buf);

	dsl_dataset_rele(ds, FTAG);
	}

	/*
	* Read dd's DD_FIELD_FILESYSTEM_COUNT entry into *count.
	*
	* Returns ENOENT if dd is not zapified, otherwise the result of the
	* zap_lookup().
	*/
	int
	dsl_dir_get_filesystem_count(dsl_dir_t dd, uint64_t count)
	{
	if (dsl_dir_is_zapified(dd)) {
	objset_t *os = dd->dd_pool->dp_meta_objset;
	return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	sizeof (*count), 1, count));
	} else {
	/* A non-zapified dir cannot carry the entry at all. */
	return (ENOENT);
	}
	}

	/*
	* Read dd's DD_FIELD_SNAPSHOT_COUNT entry into *count.
	*
	* Returns ENOENT if dd is not zapified, otherwise the result of the
	* zap_lookup().
	*/
	int
	dsl_dir_get_snapshot_count(dsl_dir_t dd, uint64_t count)
	{
	if (dsl_dir_is_zapified(dd)) {
	objset_t *os = dd->dd_pool->dp_meta_objset;
	return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	sizeof (*count), 1, count));
	} else {
	/* A non-zapified dir cannot carry the entry at all. */
	return (ENOENT);
	}
	}

	+/*
	+ * Read dd's DD_FIELD_LAST_REMAP_TXG entry into *count (the parameter
	+ * holds a txg here despite its name).
	+ *
	+ * Returns ENOENT if dd is not zapified, otherwise the result of the
	+ * zap_lookup().
	+ */
	+int
	+dsl_dir_get_remaptxg(dsl_dir_t dd, uint64_t count)
	+{
	+ if (dsl_dir_is_zapified(dd)) {
	+ objset_t *os = dd->dd_pool->dp_meta_objset;
	+ return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
	+ sizeof (*count), 1, count));
	+ } else {
	+ return (ENOENT);
	+ }
	+}
	+
	/*
	* Add dd's statistics to the nvlist nv: quota, reservation and logical
	* used always; the used-space breakdown when the dir has
	* DD_FLAG_USED_BREAKDOWN set; the filesystem count, snapshot count and
	* remap txg when each can be read; and the origin name for clones.
	*/
	void
	dsl_dir_stats(dsl_dir_t dd, nvlist_t nv)
	{
	/* The dd_phys-based values are read under dd_lock. */
	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
	dsl_dir_get_quota(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	dsl_dir_get_reservation(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
	dsl_dir_get_logicalused(dd));
	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
	dsl_dir_get_usedsnap(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
	dsl_dir_get_usedds(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
	dsl_dir_get_usedrefreserv(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
	dsl_dir_get_usedchild(dd));
	}
	mutex_exit(&dd->dd_lock);

	/* ZAP-backed values: each is reported only if its lookup succeeds. */
	uint64_t count;
	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
	count);
	}
	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
	+ count);
	+ }
	+ if (dsl_dir_get_remaptxg(dd, &count) == 0) {
	+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG,
	count);
	}

	if (dsl_dir_is_clone(dd)) {
	char buf[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dir_get_origin(dd, buf);
	dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}

	}

	/*
	 * Put dd on its pool's dirty-dirs list for tx's txg.
	 *
	 * When txg_list_add() returns non-zero, an extra reference is taken on
	 * dd's dbuf; dsl_dir_sync() drops it once the dir has been synced.
	 */
	void
	dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = dd->dd_pool;

		ASSERT(dsl_dir_phys(dd));

		if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
			/* up the hold count until we can be written out */
			dmu_buf_add_ref(dd->dd_dbuf, dd);
		}
	}

	/*
	 * Compute how much of a change of 'delta' bytes in this dir's usage is
	 * visible to its parent. The amount accounted is MAX(used, dd_reserved),
	 * so the result is the difference between that quantity after and
	 * before the delta is applied to 'used'.
	 */
	static int64_t
	parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
	{
		uint64_t accounted_before =
		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
		uint64_t accounted_after =
		    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);

		return (accounted_after - accounted_before);
	}

	/*
	 * Per-txg sync hook for a dirty dsl_dir; must run in syncing context.
	 *
	 * Resets the space-to-write estimate for tx's txg slot (asserting that
	 * no temporary reservation remains in that slot) and then drops the
	 * dbuf reference that dsl_dir_dirty() took.
	 */
	void
	dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
	{
		ASSERT(dmu_tx_is_syncing(tx));

		mutex_enter(&dd->dd_lock);
		ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
		dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
		    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
		dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
		mutex_exit(&dd->dd_lock);

		/* release the hold from dsl_dir_dirty */
		dmu_buf_rele(dd->dd_dbuf, dd);
	}

	/*
	 * Sum the space-to-write estimates of every txg slot of this dir.
	 * The caller must hold dd_lock.
	 */
	static uint64_t
	dsl_dir_space_towrite(dsl_dir_t *dd)
	{
		uint64_t total = 0;
		int slot;

		ASSERT(MUTEX_HELD(&dd->dd_lock));

		for (slot = 0; slot < TXG_SIZE; slot++) {
			total += dd->dd_space_towrite[slot & TXG_MASK];
			ASSERT3U(dd->dd_space_towrite[slot & TXG_MASK], >=, 0);
		}
		return (total);
	}

	/*
	* How much space would dd have available if ancestor had delta applied
	* to it? If ondiskonly is set, we're only interested in what's
	* on-disk, not estimated pending changes.
	*
	* The answer is built recursively: the parent's available space is
	* computed first, then this dir returns the lesser of that and what is
	* left under its own quota (0 if it is already over quota). UINT64_MAX
	* stands for "no limit". dd_lock is held while this dir's values are
	* read, but not across the recursive call.
	*/
	uint64_t
	dsl_dir_space_available(dsl_dir_t *dd,
	dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
	{
	uint64_t parentspace, myspace, quota, used;

	/*
	* If there are no restrictions otherwise, assume we have
	* unlimited space available.
	*/
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	if (dd->dd_parent != NULL) {
	parentspace = dsl_dir_space_available(dd->dd_parent,
	ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	/* A dd_quota of 0 means no quota is set; keep the unlimited default. */
	if (dsl_dir_phys(dd)->dd_quota != 0)
	quota = dsl_dir_phys(dd)->dd_quota;
	used = dsl_dir_phys(dd)->dd_used_bytes;
	if (!ondiskonly)
	used += dsl_dir_space_towrite(dd);

	/* The root dir is additionally limited by the adjusted pool size. */
	if (dd->dd_parent == NULL) {
	uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
	quota = MIN(quota, poolsize);
	}

	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
	/*
	* We have some space reserved, in addition to what our
	* parent gave us.
	*/
	parentspace += dsl_dir_phys(dd)->dd_reserved - used;
	}

	/*
	* At the ancestor itself, apply the (non-positive) delta: usage
	* shrinks by |delta| and the space from the parent grows by it.
	*/
	if (dd == ancestor) {
	ASSERT(delta <= 0);
	ASSERT(used >= -delta);
	used += delta;
	if (parentspace != UINT64_MAX)
	parentspace -= delta;
	}

	if (used > quota) {
	/* over quota */
	myspace = 0;
	} else {
	/*
	* the lesser of the space provided by our parent and
	* the space left in our quota
	*/
	myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
	}

	/*
	* One entry on a temporary-reservation list (the "cookie" handed out by
	* dsl_dir_tempreserve_space()). Each entry records either a reservation
	* against a dsl_dir or, when tr_ds is NULL, a reservation made in the ARC.
	*/
	struct tempreserve {
	list_node_t tr_node;	/* linkage on the reservation list */
	dsl_dir_t *tr_ds;	/* dir charged, or NULL for an ARC reservation */
	uint64_t tr_size;	/* number of bytes reserved */
	};

	/*
	 * Temporarily reserve asize bytes in dd for tx's txg, then recurse into
	 * the parent for whatever part of the reservation the parent would see.
	 *
	 *	netfree		the transaction is expected to free space overall;
	 *			the dir quota is not enforced for it
	 *	ignorequota	do not enforce this dir's quota
	 *	tr_list		list to which a tempreserve entry is appended for
	 *			every dir that was charged
	 *	first		B_TRUE only for the dir the caller started from;
	 *			the dataset-level quota check is done only there
	 *
	 * Returns 0 on success. On failure returns the dataset quota check's
	 * error, or EDQUOT / ENOSPC / ERESTART; entries already appended to
	 * tr_list are left for the caller to clear.
	 */
	static int
	dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
	    boolean_t ignorequota, list_t *tr_list,
	    dmu_tx_t *tx, boolean_t first)
	{
		uint64_t txg = tx->tx_txg;
		uint64_t quota;
		struct tempreserve *tr;
		int retval = EDQUOT;
		uint64_t ref_rsrv = 0;

		ASSERT3U(txg, !=, 0);
		ASSERT3S(asize, >, 0);

		mutex_enter(&dd->dd_lock);

		/*
		 * Check against the dsl_dir's quota.  We don't add in the delta
		 * when checking for over-quota because they get one free hit.
		 */
		uint64_t est_inflight = dsl_dir_space_towrite(dd);
		for (int i = 0; i < TXG_SIZE; i++)
			est_inflight += dd->dd_tempreserved[i];
		uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

		/*
		 * On the first iteration, fetch the dataset's used-on-disk and
		 * refreservation values. Also, if checkrefquota is set, test if
		 * allocating this space would exceed the dataset's refquota.
		 */
		if (first && tx->tx_objset) {
			int error;
			dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

			error = dsl_dataset_check_quota(ds, !netfree,
			    asize, est_inflight, &used_on_disk, &ref_rsrv);
			if (error != 0) {
				mutex_exit(&dd->dd_lock);
				return (error);
			}
		}

		/*
		 * If this transaction will result in a net free of space,
		 * we want to let it through.
		 */
		if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
			quota = UINT64_MAX;
		else
			quota = dsl_dir_phys(dd)->dd_quota;

		/*
		 * Adjust the quota against the actual pool size at the root
		 * minus any outstanding deferred frees.
		 * To ensure that it's possible to remove files from a full
		 * pool without inducing transient overcommits, we throttle
		 * netfree transactions against a quota that is slightly larger,
		 * but still within the pool's allocation slop.  In cases where
		 * we're very close to full, this will allow a steady trickle of
		 * removes to get through.
		 */
		uint64_t deferred = 0;
		if (dd->dd_parent == NULL) {
			spa_t *spa = dd->dd_pool->dp_spa;
			uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
			deferred = metaslab_class_get_deferred(spa_normal_class(spa));
			if (poolsize - deferred < quota) {
				quota = poolsize - deferred;
				retval = ENOSPC;
			}
		}

		/*
		 * If they are requesting more space, and our current estimate
		 * is over quota, they get to try again unless the actual
		 * on-disk is over quota and there are no pending changes (which
		 * may free up space for us).
		 */
		if (used_on_disk + est_inflight >= quota) {
			if (est_inflight > 0 || used_on_disk < quota ||
			    (retval == ENOSPC && used_on_disk < quota + deferred))
				retval = ERESTART;
			dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
			    "quota=%lluK tr=%lluK err=%d\n",
			    used_on_disk>>10, est_inflight>>10,
			    quota>>10, asize>>10, retval);
			mutex_exit(&dd->dd_lock);
			return (SET_ERROR(retval));
		}

		/* We need to up our estimated delta before dropping dd_lock */
		dd->dd_tempreserved[txg & TXG_MASK] += asize;

		uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
		    asize - ref_rsrv);
		mutex_exit(&dd->dd_lock);

		/* Record the charge so dsl_dir_tempreserve_clear() can undo it. */
		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_ds = dd;
		tr->tr_size = asize;
		list_insert_tail(tr_list, tr);

		/* see if it's OK with our parent */
		if (dd->dd_parent != NULL && parent_rsrv != 0) {
			boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

			return (dsl_dir_tempreserve_impl(dd->dd_parent,
			    parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
		} else {
			return (0);
		}
	}

	/*
	 * Reserve space in this dsl_dir, to be used in this tx's txg.
	 * After the space has been dirtied (and dsl_dir_willuse_space()
	 * has been called), the reservation should be canceled, using
	 * dsl_dir_tempreserve_clear().
	 *
	 * lsize is reserved in the ARC and asize against the dsl_dir hierarchy.
	 * On success, *tr_cookiep is set to the list of reservations made (or to
	 * NULL when asize is 0) and 0 is returned. On failure everything
	 * reserved so far is cleared, *tr_cookiep is left untouched and the
	 * error is returned; an EAGAIN from the ARC is turned into ERESTART
	 * after a short txg_delay().
	 */
	int
	dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
	    boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
	{
		int err;
		list_t *tr_list;

		if (asize == 0) {
			*tr_cookiep = NULL;
			return (0);
		}

		tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(tr_list, sizeof (struct tempreserve),
		    offsetof(struct tempreserve, tr_node));
		ASSERT3S(asize, >, 0);

		err = arc_tempreserve_space(lsize, tx->tx_txg);
		if (err == 0) {
			struct tempreserve *tr;

			/* tr_ds stays NULL: this entry is the ARC reservation. */
			tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
			tr->tr_size = lsize;
			list_insert_tail(tr_list, tr);
		} else {
			if (err == EAGAIN) {
				/*
				 * If arc_memory_throttle() detected that pageout
				 * is running and we are low on memory, we delay new
				 * non-pageout transactions to give pageout an
				 * advantage.
				 *
				 * It is unfortunate to be delaying while the caller's
				 * locks are held.
				 */
				txg_delay(dd->dd_pool, tx->tx_txg,
				    MSEC2NSEC(10), MSEC2NSEC(10));
				err = SET_ERROR(ERESTART);
			}
		}

		if (err == 0) {
			err = dsl_dir_tempreserve_impl(dd, asize, netfree,
			    B_FALSE, tr_list, tx, B_TRUE);
		}

		if (err != 0)
			dsl_dir_tempreserve_clear(tr_list, tx);
		else
			*tr_cookiep = tr_list;

		return (err);
	}

	/*
	 * Clear a temporary reservation that we previously made with
	 * dsl_dir_tempreserve_space().
	 *
	 * tr_cookie is the list handed back by dsl_dir_tempreserve_space(); a
	 * NULL cookie is a no-op. Every entry is undone (dir entries under that
	 * dir's dd_lock, the entry without a dir via arc_tempreserve_clear())
	 * and freed, and finally the list itself is freed.
	 */
	void
	dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
	{
		int txgidx = tx->tx_txg & TXG_MASK;
		list_t *tr_list = tr_cookie;
		struct tempreserve *tr;

		ASSERT3U(tx->tx_txg, !=, 0);

		if (tr_cookie == NULL)
			return;

		while ((tr = list_head(tr_list)) != NULL) {
			if (tr->tr_ds) {
				mutex_enter(&tr->tr_ds->dd_lock);
				ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
				    tr->tr_size);
				tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
				mutex_exit(&tr->tr_ds->dd_lock);
			} else {
				arc_tempreserve_clear(tr->tr_size);
			}
			list_remove(tr_list, tr);
			kmem_free(tr, sizeof (struct tempreserve));
		}

		kmem_free(tr_list, sizeof (list_t));
	}

	/*
	 * This should be called from open context when we think we're going to write
	 * or free space, for example when dirtying data. Be conservative; it's okay
	 * to write less space or free more, but we don't want to write more or free
	 * less than the amount specified.
	 *
	 * Only a positive 'space' is added to this dir's estimate for tx's txg.
	 * The dir is marked dirty, and the part of the change that the parent
	 * would see (per parent_delta()) is passed up the hierarchy recursively.
	 */
	void
	dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
	{
		int64_t parent_space;
		uint64_t est_used;

		mutex_enter(&dd->dd_lock);
		if (space > 0)
			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

		est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
		parent_space = parent_delta(dd, est_used, space);
		mutex_exit(&dd->dd_lock);

		/* Make sure that we clean up dd_space_to* */
		dsl_dir_dirty(dd, tx);

		/* XXX this is potentially expensive and unnecessary... */
		if (parent_space && dd->dd_parent)
			dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
	}

	/*
	 * call from syncing context when we actually write/free space for this dd
	 *
	 * Applies the used/compressed/uncompressed deltas to the dir's on-disk
	 * accounting (and to the 'type' slot of the used breakdown when the dir
	 * has one), then propagates the change to the parent: the part visible
	 * to the parent is charged as DD_USED_CHILD, and the remainder is moved
	 * between the parent's DD_USED_CHILD_RSRV and DD_USED_CHILD slots.
	 */
	void
	dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
	    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
	{
		int64_t accounted_delta;

		/*
		 * dsl_dataset_set_refreservation_sync_impl() calls this with
		 * dd_lock held, so that it can atomically update
		 * ds->ds_reserved and the dsl_dir accounting, so that
		 * dsl_dataset_check_quota() can see dataset and dir accounting
		 * consistently.
		 */
		boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(type < DD_USED_NUM);

		dmu_buf_will_dirty(dd->dd_dbuf, tx);

		if (needlock)
			mutex_enter(&dd->dd_lock);
		/* Must be computed before dd_used_bytes is modified below. */
		accounted_delta =
		    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
		ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
		ASSERT(compressed >= 0 ||
		    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
		ASSERT(uncompressed >= 0 ||
		    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
		dsl_dir_phys(dd)->dd_used_bytes += used;
		dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
		dsl_dir_phys(dd)->dd_compressed_bytes += compressed;

		if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
			ASSERT(used > 0 ||
			    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
			dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
	#ifdef DEBUG
			/* The breakdown slots must always add up to the total. */
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += dsl_dir_phys(dd)->dd_used_breakdown[t];
			ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
	#endif
		}
		if (needlock)
			mutex_exit(&dd->dd_lock);

		if (dd->dd_parent != NULL) {
			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
			    accounted_delta, compressed, uncompressed, tx);
			dsl_dir_transfer_space(dd->dd_parent,
			    used - accounted_delta,
			    DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
		}
	}

	/*
	 * Move 'delta' bytes from the 'oldtype' slot to the 'newtype' slot of
	 * this dir's used-space breakdown (a negative delta moves space the
	 * other way). The total dd_used_bytes is not changed.
	 *
	 * Does nothing when delta is 0 or the dir has no used breakdown. tx may
	 * be NULL, in which case the dir's dbuf is not dirtied by this call.
	 */
	void
	dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
	    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
	{
		ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
		ASSERT(oldtype < DD_USED_NUM);
		ASSERT(newtype < DD_USED_NUM);

		if (delta == 0 ||
		    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
			return;

		if (tx != NULL)
			dmu_buf_will_dirty(dd->dd_dbuf, tx);
		mutex_enter(&dd->dd_lock);
		/* The slot being drained must hold at least |delta| bytes. */
		ASSERT(delta > 0 ?
		    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
		    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
		ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
		dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
		dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
		mutex_exit(&dd->dd_lock);
	}

	/*
	* Argument bundle shared by the quota and reservation sync tasks
	* (dsl_dir_set_quota() and dsl_dir_set_reservation() fill it in).
	*/
	typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;	/* name of the dataset being changed */
	zprop_source_t ddsqra_source;	/* source of the property setting */
	uint64_t ddsqra_value;	/* requested quota or reservation value */
	} dsl_dir_set_qr_arg_t;

	/*
	 * Check function of the "set quota" sync task; arg is a
	 * dsl_dir_set_qr_arg_t.
	 *
	 * Returns the error from holding the dataset or predicting the new
	 * property value, 0 if the effective new quota is 0, and ENOSPC if the
	 * new quota would be below the dir's reservation or its current usage.
	 */
	static int
	dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		int error;
		uint64_t towrite, newval;

		error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
		if (error != 0)
			return (error);

		error = dsl_prop_predict(ds->ds_dir, "quota",
		    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		/* A new value of 0 is always accepted; nothing more to check. */
		if (newval == 0) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		mutex_enter(&ds->ds_dir->dd_lock);
		/*
		 * If we are doing the preliminary check in open context, and
		 * there are pending changes, then don't fail it, since the
		 * pending changes could under-estimate the amount of space to be
		 * freed up.
		 */
		towrite = dsl_dir_space_towrite(ds->ds_dir);
		if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
		    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
		    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
			error = SET_ERROR(ENOSPC);
		}
		mutex_exit(&ds->ds_dir->dd_lock);
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Sync function of the "set quota" sync task; arg is a
	 * dsl_dir_set_qr_arg_t.
	 *
	 * On pools at SPA_VERSION_RECVD_PROPS or later the property is set
	 * through dsl_prop_set_sync_impl() and the effective value is read
	 * back; on older pools the requested value is used directly and only a
	 * history record is logged. The resulting value is then stored in the
	 * dir's dd_quota under dd_lock.
	 */
	static void
	dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		uint64_t newval;

		VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

		if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
			dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
			    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
			    &ddsqra->ddsqra_value, tx);

			VERIFY0(dsl_prop_get_int_ds(ds,
			    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
		} else {
			newval = ddsqra->ddsqra_value;
			spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
			    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
		}

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		mutex_enter(&ds->ds_dir->dd_lock);
		dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
		mutex_exit(&ds->ds_dir->dd_lock);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Set the quota of the dataset named ddname to 'quota', recording
	 * 'source' as the property source. The work is done by a sync task
	 * (dsl_dir_set_quota_check / dsl_dir_set_quota_sync); its result is
	 * returned.
	 */
	int
	dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
	{
		dsl_dir_set_qr_arg_t ddsqra = {
			.ddsqra_name = ddname,
			.ddsqra_source = source,
			.ddsqra_value = quota
		};

		return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
		    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
	}

	/*
	 * Check function of the "set reservation" sync task; arg is a
	 * dsl_dir_set_qr_arg_t.
	 *
	 * In open context the check always passes (once the dataset can be
	 * held). In syncing context it returns ENOSPC when the reservation
	 * would grow the accounted space by more than is available, or when
	 * the new reservation exceeds a non-zero quota on the dir.
	 */
	int
	dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		dsl_dir_t *dd;
		uint64_t newval, used, avail;
		int error;

		error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
		if (error != 0)
			return (error);
		dd = ds->ds_dir;

		/*
		 * If we are doing the preliminary check in open context, the
		 * space estimates may be inaccurate.
		 */
		if (!dmu_tx_is_syncing(tx)) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		error = dsl_prop_predict(ds->ds_dir,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		mutex_enter(&dd->dd_lock);
		used = dsl_dir_phys(dd)->dd_used_bytes;
		mutex_exit(&dd->dd_lock);

		if (dd->dd_parent) {
			avail = dsl_dir_space_available(dd->dd_parent,
			    NULL, 0, FALSE);
		} else {
			avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
		}

		/* Only a reservation that raises MAX(used, reserved) needs space. */
		if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
			uint64_t delta = MAX(used, newval) -
			    MAX(used, dsl_dir_phys(dd)->dd_reserved);

			if (delta > avail ||
			    (dsl_dir_phys(dd)->dd_quota > 0 &&
			    newval > dsl_dir_phys(dd)->dd_quota))
				error = SET_ERROR(ENOSPC);
		}

		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Store 'value' as dd's reservation and charge the resulting change in
	 * accounted space, MAX(used, reserved), to the parent dir as
	 * DD_USED_CHILD_RSRV. The update and the roll-up both happen with
	 * dd_lock held.
	 */
	void
	dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
	{
		uint64_t used;
		int64_t delta;

		dmu_buf_will_dirty(dd->dd_dbuf, tx);

		mutex_enter(&dd->dd_lock);
		used = dsl_dir_phys(dd)->dd_used_bytes;
		/* Computed against the old reservation, before it is overwritten. */
		delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
		dsl_dir_phys(dd)->dd_reserved = value;

		if (dd->dd_parent != NULL) {
			/* Roll up this additional usage into our ancestors */
			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    delta, 0, 0, tx);
		}
		mutex_exit(&dd->dd_lock);
	}

	/*
	 * Sync function of the "set reservation" sync task; arg is a
	 * dsl_dir_set_qr_arg_t.
	 *
	 * On pools at SPA_VERSION_RECVD_PROPS or later the property is set
	 * through dsl_prop_set_sync_impl() and the effective value is read
	 * back; on older pools the requested value is used directly and only a
	 * history record is logged. The resulting value is applied with
	 * dsl_dir_set_reservation_sync_impl().
	 */
	static void
	dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_set_qr_arg_t *ddsqra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dataset_t *ds;
		uint64_t newval;

		VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

		if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
			dsl_prop_set_sync_impl(ds,
			    zfs_prop_to_name(ZFS_PROP_RESERVATION),
			    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
			    &ddsqra->ddsqra_value, tx);

			VERIFY0(dsl_prop_get_int_ds(ds,
			    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
		} else {
			newval = ddsqra->ddsqra_value;
			spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
			    zfs_prop_to_name(ZFS_PROP_RESERVATION),
			    (longlong_t)newval);
		}

		dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Set the reservation of the dataset named ddname to 'reservation',
	 * recording 'source' as the property source. The work is done by a sync
	 * task (dsl_dir_set_reservation_check / dsl_dir_set_reservation_sync);
	 * its result is returned.
	 */
	int
	dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
	    uint64_t reservation)
	{
		dsl_dir_set_qr_arg_t ddsqra = {
			.ddsqra_name = ddname,
			.ddsqra_source = source,
			.ddsqra_value = reservation
		};

		return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
		    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
	}

	/*
	 * Return the nearest dir that lies on both ds1's and ds2's chain of
	 * parents (either argument itself counts), or NULL if the two chains
	 * share no dir.
	 */
	static dsl_dir_t *
	closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
	{
		for (; ds1; ds1 = ds1->dd_parent) {
			dsl_dir_t *dd;
			for (dd = ds2; dd; dd = dd->dd_parent) {
				if (ds1 == dd)
					return (dd);
			}
		}
		return (NULL);
	}

	/*
	 * If delta is applied to dd, how much of that delta would be applied to
	 * ancestor?  Syncing context only.
	 *
	 * Walks up from dd, reducing delta at each level to the part its parent
	 * would see (parent_delta()), until ancestor is reached. ancestor must
	 * be dd or one of its parents: the walk does not check for a NULL
	 * dd_parent.
	 */
	static int64_t
	would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
	{
		if (dd == ancestor)
			return (delta);

		mutex_enter(&dd->dd_lock);
		delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
		mutex_exit(&dd->dd_lock);
		return (would_change(dd->dd_parent, delta, ancestor));
	}

	/* Argument bundle for the rename sync task (see dsl_dir_rename()). */
	typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;	/* current full name */
	const char *ddra_newname;	/* requested full name */
	cred_t *ddra_cred;	/* credentials passed on to the limit checks */
	} dsl_dir_rename_arg_t;

	/*
	 * Per-dataset callback used by dsl_dir_rename_check(): arg points to the
	 * int by which names will grow. Returns ENAMETOOLONG if ds's current
	 * name plus that growth would not fit in ZFS_MAX_DATASET_NAME_LEN
	 * (including the terminating NUL), otherwise 0.
	 */
	/* ARGSUSED */
	static int
	dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
	{
		int *deltap = arg;
		char namebuf[ZFS_MAX_DATASET_NAME_LEN];

		dsl_dataset_name(ds, namebuf);

		if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		return (0);
	}

	/*
	 * Check function of the rename sync task; arg is a dsl_dir_rename_arg_t.
	 *
	 * Verifies that the dir being renamed and the new parent both exist and
	 * are in the same pool, that the new name is not already taken, that no
	 * child or snapshot name would become too long, that the dir is not
	 * being moved into its own descendant, and (when the parent changes)
	 * that the new parent has room for the space and the
	 * filesystem/snapshot counts being moved.
	 *
	 * Returns 0 if the rename may proceed, otherwise an errno value.
	 */
	static int
	dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_rename_arg_t *ddra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *dd, *newparent;
		const char *mynewname;
		int error;
		int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);

		/* target dir should exist */
		error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
		if (error != 0)
			return (error);

		/* new parent should exist */
		error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
		    &newparent, &mynewname);
		if (error != 0) {
			dsl_dir_rele(dd, FTAG);
			return (error);
		}

		/*
		 * From here on both holds are owned; every exit goes through
		 * 'out', which releases them.
		 */

		/* can't rename to different pool */
		if (dd->dd_pool != newparent->dd_pool) {
			error = SET_ERROR(EXDEV);
			goto out;
		}

		/* new name should not already exist */
		if (mynewname == NULL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}

		/* if the name length is growing, validate child name lengths */
		if (delta > 0) {
			error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
			if (error != 0)
				goto out;
		}

		if (dmu_tx_is_syncing(tx)) {
			if (spa_feature_is_active(dp->dp_spa,
			    SPA_FEATURE_FS_SS_LIMIT)) {
				/*
				 * Although this is the check function and we don't
				 * normally make on-disk changes in check functions,
				 * we need to do that here.
				 *
				 * Ensure this portion of the tree's counts have been
				 * initialized in case the new parent has limits set.
				 */
				dsl_dir_init_fs_ss_count(dd, tx);
			}
		}

		if (newparent != dd->dd_parent) {
			/* is there enough space? */
			uint64_t myspace =
			    MAX(dsl_dir_phys(dd)->dd_used_bytes,
			    dsl_dir_phys(dd)->dd_reserved);
			objset_t *os = dd->dd_pool->dp_meta_objset;
			uint64_t fs_cnt = 0;
			uint64_t ss_cnt = 0;

			if (dsl_dir_is_zapified(dd)) {
				int err;

				/* A missing count (ENOENT) is treated as zero. */
				err = zap_lookup(os, dd->dd_object,
				    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
				    &fs_cnt);
				if (err != ENOENT && err != 0) {
					error = err;
					goto out;
				}

				/*
				 * have to add 1 for the filesystem itself that we're
				 * moving
				 */
				fs_cnt++;

				err = zap_lookup(os, dd->dd_object,
				    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
				    &ss_cnt);
				if (err != ENOENT && err != 0) {
					error = err;
					goto out;
				}
			}

			/* no rename into our descendant */
			if (closest_common_ancestor(dd, newparent) == dd) {
				error = SET_ERROR(EINVAL);
				goto out;
			}

			error = dsl_dir_transfer_possible(dd->dd_parent,
			    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
			if (error != 0)
				goto out;
		}

		/* All checks passed; error is 0 here. */
	out:
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}

	/*
	 * Sync function of the rename sync task; arg is a dsl_dir_rename_arg_t.
	 *
	 * When the parent changes, the filesystem/snapshot counts and the space
	 * accounting (used bytes and any unused reservation) are moved from the
	 * old parent to the new one. The dir is then removed from the old
	 * parent's child ZAP, renamed in memory, re-parented and added to the
	 * new parent's child ZAP. On FreeBSD kernels the mounted-filesystem
	 * names and zvol minors are updated as well.
	 */
	static void
	dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_dir_rename_arg_t *ddra = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *dd, *newparent;
		const char *mynewname;
		int error;
		objset_t *mos = dp->dp_meta_objset;

		VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
		VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
		    &mynewname));

		/* Log this before we change the name. */
		spa_history_log_internal_dd(dd, "rename", tx,
		    "-> %s", ddra->ddra_newname);

		if (newparent != dd->dd_parent) {
			objset_t *os = dd->dd_pool->dp_meta_objset;
			uint64_t fs_cnt = 0;
			uint64_t ss_cnt = 0;

			/*
			 * We already made sure the dd counts were initialized in the
			 * check function.
			 */
			if (spa_feature_is_active(dp->dp_spa,
			    SPA_FEATURE_FS_SS_LIMIT)) {
				VERIFY0(zap_lookup(os, dd->dd_object,
				    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
				    &fs_cnt));
				/* add 1 for the filesystem itself that we're moving */
				fs_cnt++;

				VERIFY0(zap_lookup(os, dd->dd_object,
				    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
				    &ss_cnt));
			}

			dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
			    DD_FIELD_FILESYSTEM_COUNT, tx);
			dsl_fs_ss_count_adjust(newparent, fs_cnt,
			    DD_FIELD_FILESYSTEM_COUNT, tx);

			dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
			    DD_FIELD_SNAPSHOT_COUNT, tx);
			dsl_fs_ss_count_adjust(newparent, ss_cnt,
			    DD_FIELD_SNAPSHOT_COUNT, tx);

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
			    -dsl_dir_phys(dd)->dd_used_bytes,
			    -dsl_dir_phys(dd)->dd_compressed_bytes,
			    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
			dsl_dir_diduse_space(newparent, DD_USED_CHILD,
			    dsl_dir_phys(dd)->dd_used_bytes,
			    dsl_dir_phys(dd)->dd_compressed_bytes,
			    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);

			/* Reserved-but-unused space moves with the dir as well. */
			if (dsl_dir_phys(dd)->dd_reserved >
			    dsl_dir_phys(dd)->dd_used_bytes) {
				uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
				    dsl_dir_phys(dd)->dd_used_bytes;

				dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
				    -unused_rsrv, 0, 0, tx);
				dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
				    unused_rsrv, 0, 0, tx);
			}
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);

		/* remove from old parent zapobj */
		error = zap_remove(mos,
		    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
		    dd->dd_myname, tx);
		ASSERT0(error);

		(void) strcpy(dd->dd_myname, mynewname);
		/* Swap dd's hold on the old parent for one on the new parent. */
		dsl_dir_rele(dd->dd_parent, dd);
		dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
		VERIFY0(dsl_dir_hold_obj(dp,
		    newparent->dd_object, NULL, dd, &dd->dd_parent));

		/* add to new parent zapobj */
		VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
		    dd->dd_myname, 8, 1, &dd->dd_object, tx));

	#ifdef __FreeBSD__
	#ifdef _KERNEL
		zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
		zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
	#endif
	#endif

		dsl_prop_notify_all(dd);

		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
	}

	/*
	 * Rename the dsl_dir called oldname to newname. The caller's
	 * credentials (CRED()) are captured for the limit checks, and the work
	 * is done by a sync task (dsl_dir_rename_check / dsl_dir_rename_sync);
	 * its result is returned.
	 */
	int
	dsl_dir_rename(const char *oldname, const char *newname)
	{
		dsl_dir_rename_arg_t ddra;

		ddra.ddra_oldname = oldname;
		ddra.ddra_newname = newname;
		ddra.ddra_cred = CRED();

		return (dsl_sync_task(oldname,
		    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
		    3, ZFS_SPACE_CHECK_RESERVED));
	}

	/*
	 * Decide whether 'space' bytes, fs_cnt filesystems and ss_cnt snapshots
	 * can be moved from under sdd to under tdd.
	 *
	 * Returns ENOSPC if tdd would not have 'space' bytes available once the
	 * space is removed from sdd's side of the closest common ancestor,
	 * the error from dsl_fs_ss_limit_check() if a filesystem or snapshot
	 * limit check fails, and 0 otherwise.
	 */
	int
	dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
	    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
	{
		dsl_dir_t *ancestor;
		int64_t adelta;
		uint64_t avail;
		int err;

		ancestor = closest_common_ancestor(sdd, tdd);
		adelta = would_change(sdd, -space, ancestor);
		avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
		if (avail < space)
			return (SET_ERROR(ENOSPC));

		err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
		    ancestor, cr);
		if (err != 0)
			return (err);
		err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
		    ancestor, cr);
		if (err != 0)
			return (err);

		return (0);
	}

	/*
	 * Return a copy of the dir's dd_snap_cmtime timestamp, read with
	 * dd_lock held.
	 */
	timestruc_t
	dsl_dir_snap_cmtime(dsl_dir_t *dd)
	{
		timestruc_t cmtime;

		mutex_enter(&dd->dd_lock);
		cmtime = dd->dd_snap_cmtime;
		mutex_exit(&dd->dd_lock);

		return (cmtime);
	}

	/*
	 * Set the dir's dd_snap_cmtime to the current time. The time is sampled
	 * with gethrestime() before dd_lock is taken and stored under the lock.
	 */
	void
	dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
	{
		timestruc_t now;

		gethrestime(&now);

		mutex_enter(&dd->dd_lock);
		dd->dd_snap_cmtime = now;
		mutex_exit(&dd->dd_lock);
	}

	/*
	 * Zapify this dir's object in the pool's meta objset (via
	 * dmu_object_zapify() with DMU_OT_DSL_DIR), as part of tx.
	 */
	void
	dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
	{
		objset_t *mos = dd->dd_pool->dp_meta_objset;
		dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
	}

	/*
	 * Report whether this dir's object has been zapified, i.e. whether the
	 * object type reported for its dbuf is DMU_OTN_ZAP_METADATA.
	 */
	boolean_t
	dsl_dir_is_zapified(dsl_dir_t *dd)
	{
		dmu_object_info_t info;

		dmu_object_info_from_db(dd->dd_dbuf, &info);
		if (info.doi_type == DMU_OTN_ZAP_METADATA)
			return (B_TRUE);
		return (B_FALSE);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 332525)
	@@ -1,1231 +1,1277 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
	*/

	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_scan.h>
	#include <sys/dnode.h>
	#include <sys/dmu_tx.h>
	#include <sys/dmu_objset.h>
	#include <sys/arc.h>
	#include <sys/zap.h>
	#include <sys/zio.h>
	#include <sys/zfs_context.h>
	#include <sys/fs/zfs.h>
	#include <sys/zfs_znode.h>
	#include <sys/spa_impl.h>
	#include <sys/dsl_deadlist.h>
	#include <sys/bptree.h>
	#include <sys/zfeature.h>
	#include <sys/zil_impl.h>
	#include <sys/dsl_userhold.h>

	#if defined(__FreeBSD__) && defined(_KERNEL)
	#include <sys/types.h>
	#include <sys/sysctl.h>
	#endif

	/*
	* ZFS Write Throttle
	* ------------------
	*
	* ZFS must limit the rate of incoming writes to the rate at which it is able
	* to sync data modifications to the backend storage. Throttling by too much
	* creates an artificial limit; throttling by too little can only be sustained
	* for short periods and would lead to highly lumpy performance. On a per-pool
	* basis, ZFS tracks the amount of modified (dirty) data. As operations change
	* data, the amount of dirty data increases; as ZFS syncs out data, the amount
	* of dirty data decreases. When the amount of dirty data exceeds a
	* predetermined threshold further modifications are blocked until the amount
	* of dirty data decreases (as data is synced out).
	*
	* The limit on dirty data is tunable, and should be adjusted according to
	* both the IO capacity and available memory of the system. The larger the
	* window, the more ZFS is able to aggregate and amortize metadata (and data)
	* changes. However, memory is a limited resource, and allowing for more dirty
	* data comes at the cost of keeping other useful data in memory (for example
	* ZFS data cached by the ARC).
	*
	* Implementation
	*
	* As buffers are modified dsl_pool_willuse_space() increments both the per-
	* txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
	* dirty space used; dsl_pool_dirty_space() decrements those values as data
	* is synced out from dsl_pool_sync(). While only the poolwide value is
	* relevant, the per-txg value is useful for debugging. The tunable
	* zfs_dirty_data_max determines the dirty space limit. Once that value is
	* exceeded, new writes are halted until space frees up.
	*
	* The zfs_dirty_data_sync tunable dictates the threshold at which we
	* ensure that there is a txg syncing (see the comment in txg.c for a full
	* description of transaction group stages).
	*
	* The IO scheduler uses both the dirty space limit and current amount of
	* dirty data as inputs. Those values affect the number of concurrent IOs ZFS
	* issues. See the comment in vdev_queue.c for details of the IO scheduler.
	*
	* The delay is also calculated based on the amount of dirty data. See the
	* comment above dmu_tx_delay() for details.
	*/

	/*
	* zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
	* capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
	*/
	uint64_t zfs_dirty_data_max;	/* no initializer here; derived as described above */
	uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;	/* 4 GiB cap */
	int zfs_dirty_data_max_percent = 10;	/* percent of all memory */

	/*
	* If there is at least this much dirty data, push out a txg.
	*/
	uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;	/* 64 MiB */

	/*
	* Once there is this amount of dirty data, the dmu_tx_delay() will kick in
	* and delay each transaction.
	* This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
	*/
	int zfs_delay_min_dirty_percent = 60;	/* a percentage; presumably of zfs_dirty_data_max -- confirm */

	/*
	* This controls how quickly the delay approaches infinity.
	* Larger values cause it to delay more for a given amount of dirty data.
	* Therefore larger values will cause there to be less dirty data for a
	* given throughput.
	*
	* For the smoothest delay, this value should be about 1 billion divided
	* by the maximum number of operations per second. This will smoothly
	* handle between 10x and 1/10th this number.
	*
	* Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
	* multiply in dmu_tx_delay().
	*/
	uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;	/* 1 billion / 2000 = 500,000 */

	/*
	* This determines the number of threads used by the dp_sync_taskq.
	*/
	int zfs_sync_taskq_batch_pct = 75;

	/*
	* These tunables determine the behavior of how zil_itxg_clean() is
	* called via zil_clean() in the context of spa_sync(). When an itxg
	* list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
	* If the dispatch fails, the call to zil_itxg_clean() will occur
	* synchronously in the context of spa_sync(), which can negatively
	* impact the performance of spa_sync() (e.g. in the case of the itxg
	* list having a large number of itxs that needs to be cleaned).
	*
	* Thus, these tunables can be used to manipulate the behavior of the
	* taskq used by zil_clean(); they determine the number of taskq entries
	* that are pre-populated when the taskq is first created (via the
	* "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
	* taskq entries that are cached after an on-demand allocation (via the
	* "zfs_zil_clean_taskq_maxalloc").
	*
	* The idea being, we want to try reasonably hard to ensure there will
	* already be a taskq entry pre-allocated by the time that it is needed
	* by zil_clean(). This way, we can avoid the possibility of an
	* on-demand allocation of a new taskq entry from failing, which would
	* result in zil_itxg_clean() being called synchronously from zil_clean()
	* (which can adversely affect performance of spa_sync()).
	*
	* Additionally, the number of threads used by the taskq can be
	* configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
	*/
	int zfs_zil_clean_taskq_nthr_pct = 100;
	int zfs_zil_clean_taskq_minalloc = 1024;
	int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;

	#if defined(__FreeBSD__) && defined(_KERNEL)

	extern int zfs_vdev_async_write_active_max_dirty_percent;

	/*
	 * sysctl nodes under _vfs_zfs exposing the dirty-data tunables.
	 * Plain values are exported directly with SYSCTL_UQUAD; values that
	 * need range validation go through the SYSCTL_PROC handlers declared
	 * here and defined further down.
	 */
	SYSCTL_DECL(_vfs_zfs);

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
	    &zfs_dirty_data_max, 0,
	    "The maximum amount of dirty data in bytes after which new writes are "
	    "halted until space becomes available");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
	    &zfs_dirty_data_max_max, 0,
	    "The absolute cap on dirty_data_max when auto calculating");

	static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
	    sysctl_zfs_dirty_data_max_percent, "I",
	    "The percent of physical memory used to auto calculate dirty_data_max");

	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
	    &zfs_dirty_data_sync, 0,
	    "Force a txg if the number of dirty buffer bytes exceed this value");

	static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
	/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
	    sysctl_zfs_delay_min_dirty_percent, "I",
	    "The limit of outstanding dirty data before transactions are delayed");

	static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
	/* No zfs_delay_scale tunable due to limit requirements */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
	    sysctl_zfs_delay_scale, "QU",
	    "Controls how quickly the delay approaches infinity");

	static int
	sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
	{
	int val, err;

	val = zfs_dirty_data_max_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val < 0 \|\| val > 100)
	return (EINVAL);

	zfs_dirty_data_max_percent = val;

	return (0);
	}

	static int
	sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
	{
	int val, err;

	val = zfs_delay_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val < zfs_vdev_async_write_active_max_dirty_percent)
	return (EINVAL);

	zfs_delay_min_dirty_percent = val;

	return (0);
	}

	static int
	sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = zfs_delay_scale;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val > UINT64_MAX / zfs_dirty_data_max)
	return (EINVAL);

	zfs_delay_scale = val;

	return (0);
	}
	#endif

	/*
	 * Look up the child directory called 'name' in the root dir's child-dir
	 * ZAP and hold it, tagged with the pool itself.
	 *
	 * On success the held dsl_dir is stored in *ddp and 0 is returned;
	 * otherwise the error from zap_lookup() or dsl_dir_hold_obj() is
	 * returned.
	 */
	int
	dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
	{
		uint64_t obj;
		int err;

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
		    name, sizeof (obj), 1, &obj);
		if (err)
			return (err);

		return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
	}

	/*
	 * Allocate a zeroed dsl_pool_t for 'spa' and set up its in-core state:
	 * the root block pointer copy, config rwlock, txg state, per-txg dirty
	 * lists, taskqs, mutex and condition variable.
	 *
	 * Returns the new pool structure.
	 */
	static dsl_pool_t *
	dsl_pool_open_impl(spa_t *spa, uint64_t txg)
	{
		dsl_pool_t *dp;
		blkptr_t *bp = spa_get_rootblkptr(spa);

		dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
		dp->dp_spa = spa;
		dp->dp_meta_rootbp = *bp;
		rrw_init(&dp->dp_config_rwlock, B_TRUE);
		txg_init(dp, txg);

		txg_list_create(&dp->dp_dirty_datasets, spa,
		    offsetof(dsl_dataset_t, ds_dirty_link));
		txg_list_create(&dp->dp_dirty_zilogs, spa,
		    offsetof(zilog_t, zl_dirty_link));
		txg_list_create(&dp->dp_dirty_dirs, spa,
		    offsetof(dsl_dir_t, dd_dirty_link));
		txg_list_create(&dp->dp_sync_tasks, spa,
		    offsetof(dsl_sync_task_t, dst_node));

		dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
		    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
		    TASKQ_THREADS_CPU_PCT);

		/* Sized by the zfs_zil_clean_taskq_* tunables. */
		dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
		    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
		    zfs_zil_clean_taskq_minalloc,
		    zfs_zil_clean_taskq_maxalloc,
		    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

		mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

		dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
		    1, 4, 0);

		return (dp);
	}

	/*
	 * Create the in-core pool structure for 'spa' and open its meta-objset
	 * from the root block pointer.
	 *
	 * On success the new pool is stored in *dpp and 0 is returned.  On
	 * failure the partially built pool is torn down with dsl_pool_close(),
	 * *dpp is left untouched, and the error is returned.
	 */
	int
	dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
	{
		int err;
		dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

		err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
		    &dp->dp_meta_objset);
		if (err != 0)
			dsl_pool_close(dp);
		else
			*dpp = dp;

		return (err);
	}

	/*
	 * Populate 'dp' from the on-disk pool directory: hold the root dir and
	 * the special dirs, open the free and obsolete bpobjs, read the
	 * feature-dependent object numbers, and initialize scan state.
	 *
	 * The config rwlock is held as writer for the duration.  Returns 0 on
	 * success or the first error encountered; anything already held is left
	 * in 'dp' for the caller to release.
	 */
	int
	dsl_pool_open(dsl_pool_t *dp)
	{
		int err;
		dsl_dir_t *dd;
		dsl_dataset_t *ds;
		uint64_t obj;

		rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
		    &dp->dp_root_dir_obj);
		if (err)
			goto out;

		err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
		    NULL, dp, &dp->dp_root_dir);
		if (err)
			goto out;

		err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
		if (err)
			goto out;

		if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
			err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
			if (err)
				goto out;
			err = dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
			if (err == 0) {
				/* The origin snapshot is the head's previous snap. */
				err = dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
				    &dp->dp_origin_snap);
				dsl_dataset_rele(ds, FTAG);
			}
			dsl_dir_rele(dd, dp);
			if (err)
				goto out;
		}

		if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
			err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
			    &dp->dp_free_dir);
			if (err)
				goto out;

			err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
			if (err)
				goto out;
			VERIFY0(bpobj_open(&dp->dp_free_bpobj,
			    dp->dp_meta_objset, obj));
		}

		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
			err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
			if (err == 0) {
				VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
				    dp->dp_meta_objset, obj));
			} else if (err == ENOENT) {
				/*
				 * We might not have created the remap bpobj yet.
				 */
				err = 0;
			} else {
				goto out;
			}
		}

		/*
		 * Note: errors ignored, because these special dirs, used for
		 * space accounting, are only created on demand.
		 */
		(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
		    &dp->dp_leak_dir);

		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
			err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
			    &dp->dp_bptree_obj);
			if (err != 0)
				goto out;
		}

		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj);
			if (err != 0)
				goto out;
		}

		/* The temporary user-holds object is optional. */
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
		    &dp->dp_tmp_userrefs_obj);
		if (err == ENOENT)
			err = 0;
		if (err)
			goto out;

		err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

	out:
		rrw_exit(&dp->dp_config_rwlock, FTAG);
		return (err);
	}

	void
	dsl_pool_close(dsl_pool_t *dp)
	{
	/*
	* Drop our references from dsl_pool_open().
	*
	* Since we held the origin_snap from "syncing" context (which
	* includes pool-opening context), it actually only got a "ref"
	* and not a hold, so just drop that here.
	*/
	- if (dp->dp_origin_snap)
	+ if (dp->dp_origin_snap != NULL)
	dsl_dataset_rele(dp->dp_origin_snap, dp);
	- if (dp->dp_mos_dir)
	+ if (dp->dp_mos_dir != NULL)
	dsl_dir_rele(dp->dp_mos_dir, dp);
	- if (dp->dp_free_dir)
	+ if (dp->dp_free_dir != NULL)
	dsl_dir_rele(dp->dp_free_dir, dp);
	- if (dp->dp_leak_dir)
	+ if (dp->dp_leak_dir != NULL)
	dsl_dir_rele(dp->dp_leak_dir, dp);
	- if (dp->dp_root_dir)
	+ if (dp->dp_root_dir != NULL)
	dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);
	+ bpobj_close(&dp->dp_obsolete_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	- if (dp->dp_meta_objset)
	+ if (dp->dp_meta_objset != NULL)
	dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	taskq_destroy(dp->dp_zil_clean_taskq);
	taskq_destroy(dp->dp_sync_taskq);

	/*
	* We can't set retry to TRUE since we're explicitly specifying
	* a spa to flush. This is good enough; any missed buffers for
	* this spa won't cause trouble, and they'll eventually fall
	* out of the ARC just like any other unused buffer.
	*/
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	- if (dp->dp_blkstats)
	+ if (dp->dp_blkstats != NULL)
	kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
	+}
	+
	+void
	+dsl_pool_create_obsolete_bpobj(dsl_pool_t dp, dmu_tx_t tx)
	+{
	+ uint64_t obj;
	+ /*
	+ * Currently, we only create the obsolete_bpobj where there are
	+ * indirect vdevs with referenced mappings.
	+ */
	+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
	+ /* create and open the obsolete_bpobj */
	+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
	+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
	+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+}
	+
	+void
	+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t dp, dmu_tx_t tx)
	+{
	+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+ VERIFY0(zap_remove(dp->dp_meta_objset,
	+ DMU_POOL_DIRECTORY_OBJECT,
	+ DMU_POOL_OBSOLETE_BPOBJ, tx));
	+ bpobj_free(dp->dp_meta_objset,
	+ dp->dp_obsolete_bpobj.bpo_object, tx);
	+ bpobj_close(&dp->dp_obsolete_bpobj);
	}

	/*
	 * Create a brand-new pool in transaction group 'txg': the meta-objset,
	 * the pool directory, the root/MOS/free dirs, the free bpobj, the
	 * origin dataset (for new enough versions), and the root dataset.
	 * In kernel builds the root filesystem is also created from 'zplprops'.
	 *
	 * The config rwlock is held as writer while the pool is built.
	 * Returns the new pool structure.
	 */
	dsl_pool_t *
	dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
	{
		int err;
		dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
		dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
		dsl_dataset_t *ds;
		uint64_t obj;

		rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

		/* create and open the MOS (meta-objset) */
		dp->dp_meta_objset = dmu_objset_create_impl(spa,
		    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

		/* create the pool directory */
		err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
		ASSERT0(err);

		/* Initialize scan structures */
		VERIFY0(dsl_scan_init(dp, txg));

		/* create and open the root dir */
		dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
		VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
		    NULL, dp, &dp->dp_root_dir));

		/* create and open the meta-objset dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    MOS_DIR_NAME, &dp->dp_mos_dir));

		if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
			/* create and open the free dir */
			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
			    FREE_DIR_NAME, tx);
			VERIFY0(dsl_pool_open_special_dir(dp,
			    FREE_DIR_NAME, &dp->dp_free_dir));

			/* create and open the free_bplist */
			obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
			VERIFY0(bpobj_open(&dp->dp_free_bpobj,
			    dp->dp_meta_objset, obj));
		}

		if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
			dsl_pool_create_origin(dp, tx);

		/* create the root dataset */
		obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

		/* create the root objset */
		VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	#ifdef _KERNEL
		{
			objset_t *os;
			rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
			os = dmu_objset_create_impl(dp->dp_spa, ds,
			    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
			rrw_exit(&ds->ds_bp_rwlock, FTAG);
			zfs_create_fs(os, kcred, zplprops, tx);
		}
	#endif
		dsl_dataset_rele(ds, FTAG);

		dmu_tx_commit(tx);

		rrw_exit(&dp->dp_config_rwlock, FTAG);

		return (dp);
	}

	/*
	 * Record a change in the space consumed by the meta-objset.  The deltas
	 * are only accumulated here, under dp_lock, for later application to
	 * the MOS placeholder dsl_dir.
	 */
	void
	dsl_pool_mos_diduse_space(dsl_pool_t *dp,
	    int64_t used, int64_t comp, int64_t uncomp)
	{
		/* it's all metadata */
		ASSERT3U(comp, ==, uncomp);

		mutex_enter(&dp->dp_lock);
		dp->dp_mos_uncompressed_delta += uncomp;
		dp->dp_mos_compressed_delta += comp;
		dp->dp_mos_used_delta += used;
		mutex_exit(&dp->dp_lock);
	}

	/*
	 * Write out the meta-objset for 'tx', wait for the I/O to finish, and
	 * publish the resulting root block pointer to the spa.
	 */
	static void
	dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(dp->dp_meta_objset, zio, tx);
		VERIFY0(zio_wait(zio));
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	/*
	 * Adjust the pool-wide dirty total by 'delta' (which may be negative)
	 * and wake one waiter on dp_spaceavail_cv when the total is below
	 * zfs_dirty_data_max.  The caller must hold dp_lock.
	 */
	static void
	dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
	{
		ASSERT(MUTEX_HELD(&dp->dp_lock));

		/* A decrease must never exceed what is currently accounted. */
		if (delta < 0) {
			ASSERT3U(-delta, <=, dp->dp_dirty_total);
		}

		dp->dp_dirty_total += delta;

		/*
		 * The signal is sent on increases too, not only on decreases:
		 * each woken thread in turn wakes the next waiter, which
		 * guarantees forward progress.
		 */
		if (zfs_dirty_data_max > dp->dp_dirty_total) {
			cv_signal(&dp->dp_spaceavail_cv);
		}
	}

	/*
	 * Sync one pass of transaction group 'txg' for the pool: write dirty
	 * datasets, update user/group accounting, re-sync the datasets, sync
	 * dirty dirs, apply MOS space deltas, sync the MOS if dirty, and finally
	 * run any queued sync tasks.
	 */
	void
	dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
	{
		zio_t *zio;
		dmu_tx_t *tx;
		dsl_dir_t *dd;
		dsl_dataset_t *ds;
		objset_t *mos = dp->dp_meta_objset;
		list_t synced_datasets;

		list_create(&synced_datasets, sizeof (dsl_dataset_t),
		    offsetof(dsl_dataset_t, ds_synced_link));

		tx = dmu_tx_create_assigned(dp, txg);

		/*
		 * Write out all dirty blocks of dirty datasets.
		 */
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
			/*
			 * We must not sync any non-MOS datasets twice, because
			 * we may have taken a snapshot of them. However, we
			 * may sync newly-created datasets on pass 2.
			 */
			ASSERT(!list_link_active(&ds->ds_synced_link));
			list_insert_tail(&synced_datasets, ds);
			dsl_dataset_sync(ds, zio, tx);
		}
		VERIFY0(zio_wait(zio));

		/*
		 * We have written all of the accounted dirty data, so our
		 * dp_space_towrite should now be zero. However, some seldom-used
		 * code paths do not adhere to this (e.g. dbuf_undirty(), also
		 * rounding error in dbuf_write_physdone).
		 * Shore up the accounting of any dirtied space now.
		 */
		dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

		/*
		 * Update the long range free counter after
		 * we're done syncing user data
		 */
		mutex_enter(&dp->dp_lock);
		ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
		mutex_exit(&dp->dp_lock);

		/*
		 * After the data blocks have been written (ensured by the zio_wait()
		 * above), update the user/group space accounting. This happens
		 * in tasks dispatched to dp_sync_taskq, so wait for them before
		 * continuing.
		 */
		for (ds = list_head(&synced_datasets); ds != NULL;
		    ds = list_next(&synced_datasets, ds)) {
			dmu_objset_do_userquota_updates(ds->ds_objset, tx);
		}
		taskq_wait(dp->dp_sync_taskq);

		/*
		 * Sync the datasets again to push out the changes due to
		 * userspace updates. This must be done before we process the
		 * sync tasks, so that any snapshots will have the correct
		 * user accounting information (and we won't get confused
		 * about which blocks are part of the snapshot).
		 */
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
			ASSERT(list_link_active(&ds->ds_synced_link));
			dmu_buf_rele(ds->ds_dbuf, ds);
			dsl_dataset_sync(ds, zio, tx);
		}
		VERIFY0(zio_wait(zio));

		/*
		 * Now that the datasets have been completely synced, we can
		 * clean up our in-memory structures accumulated while syncing:
		 *
		 * - move dead blocks from the pending deadlist to the on-disk deadlist
		 * - release hold from dsl_dataset_dirty()
		 */
		while ((ds = list_remove_head(&synced_datasets)) != NULL) {
			dsl_dataset_sync_done(ds, tx);
		}
		while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
			dsl_dir_sync(dd, tx);
		}

		/*
		 * The MOS's space is accounted for in the pool/$MOS
		 * (dp_mos_dir). We can't modify the mos while we're syncing
		 * it, so we remember the deltas and apply them here.
		 */
		if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
		    dp->dp_mos_uncompressed_delta != 0) {
			dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
			    dp->dp_mos_used_delta,
			    dp->dp_mos_compressed_delta,
			    dp->dp_mos_uncompressed_delta, tx);
			dp->dp_mos_used_delta = 0;
			dp->dp_mos_compressed_delta = 0;
			dp->dp_mos_uncompressed_delta = 0;
		}

		if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
			dsl_pool_sync_mos(dp, tx);
		}

		/*
		 * If we modify a dataset in the same txg that we want to destroy it,
		 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
		 * dsl_dir_destroy_check() will fail if there are unexpected holds.
		 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
		 * and clearing the hold on it) before we process the sync_tasks.
		 * The MOS data dirtied by the sync_tasks will be synced on the next
		 * pass.
		 */
		if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
			dsl_sync_task_t *dst;
			/*
			 * No more sync tasks should have been added while we
			 * were syncing.
			 */
			ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
			while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
				dsl_sync_task_sync(dst, tx);
		}

		dmu_tx_commit(tx);

		DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
	}

	/*
	 * Post-sync cleanup for 'txg': clean every zilog that was dirty in this
	 * txg and drop the dataset dbuf hold tagged with that zilog.
	 */
	void
	dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
	{
		zilog_t *zilog;

		while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) != NULL) {
			dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

			/*
			 * Clean first, unlink second.  Keeping the zilog on
			 * dp_dirty_zilogs until it has been cleaned means that
			 * zilog_is_dirty() callers racing with the spa sync
			 * thread still get an accurate answer.
			 */
			zil_clean(zilog, txg);
			(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
			ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
			dmu_buf_rele(ds->ds_dbuf, zilog);
		}

		ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
	}

	/*
	 * TRUE if the current thread is the tx_sync_thread, if we are being
	 * called from SPA context during pool initialization, or if the current
	 * thread is a member of dp_sync_taskq.
	 */
	int
	dsl_pool_sync_context(dsl_pool_t *dp)
	{
		return (curthread == dp->dp_tx.tx_sync_thread ||
		    spa_is_initializing(dp->dp_spa) ||
		    taskq_member(dp->dp_sync_taskq, curthread));
	}

	/*
	 * Return the pool's dspace minus its slop-space reservation.  When
	 * 'netfree' is set only half of the reservation is withheld.
	 */
	uint64_t
	dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
	{
		uint64_t dspace = spa_get_dspace(dp->dp_spa);
		uint64_t slop = spa_get_slop_space(dp->dp_spa);

		/*
		 * When the caller is deciding whether a free may proceed, halve
		 * the reservation so that forward progress remains possible
		 * (e.g. rm(1) still works on a full pool).
		 */
		if (netfree) {
			slop >>= 1;
		}

		return (dspace - slop);
	}

	/*
	 * Report whether the pool's dirty total exceeds the delay threshold
	 * (zfs_delay_min_dirty_percent of zfs_dirty_data_max).  As a side
	 * effect, kick the txg machinery when the dirty total is above
	 * zfs_dirty_data_sync.
	 */
	boolean_t
	dsl_pool_need_dirty_delay(dsl_pool_t *dp)
	{
		boolean_t over_threshold;
		uint64_t threshold =
		    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

		mutex_enter(&dp->dp_lock);
		if (dp->dp_dirty_total > zfs_dirty_data_sync) {
			txg_kick(dp);
		}
		over_threshold = (dp->dp_dirty_total > threshold);
		mutex_exit(&dp->dp_lock);

		return (over_threshold);
	}

	/*
	 * Charge 'space' bytes of newly dirtied data to the txg of 'tx' and to
	 * the pool-wide dirty total.  Non-positive amounts are ignored.
	 */
	void
	dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
	{
		if (space > 0) {
			mutex_enter(&dp->dp_lock);
			dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
			dsl_pool_dirty_delta(dp, space);
			mutex_exit(&dp->dp_lock);
		}
	}

	/*
	 * Remove up to 'space' bytes from the dirty accounting of 'txg' and from
	 * the pool-wide dirty total.  The amount is clamped to what is actually
	 * recorded for that txg.
	 */
	void
	dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
	{
		ASSERT3S(space, >=, 0);

		if (space != 0) {
			mutex_enter(&dp->dp_lock);

			/* XXX writing something we didn't dirty? */
			if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
				space = dp->dp_dirty_pertxg[txg & TXG_MASK];
			}

			ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
			dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;

			ASSERT3U(dp->dp_dirty_total, >=, space);
			dsl_pool_dirty_delta(dp, -space);

			mutex_exit(&dp->dp_lock);
		}
	}

	/*
	 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones().
	 *
	 * Walks back from head dataset 'hds' along its ds_prev_snap_obj chain.
	 * If the chain ends without finding a snapshot whose next pointer
	 * disagrees, the oldest dataset is attached to the pool's $ORIGIN
	 * snapshot; in all non-self cases the dataset is then registered in the
	 * previous snapshot's next-clones ZAP.  'arg' is the syncing dmu_tx_t.
	 *
	 * Returns 0 on success or the error from dsl_dataset_hold_obj().
	 */
	/* ARGSUSED */
	static int
	upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
	{
		dmu_tx_t *tx = arg;
		dsl_dataset_t *ds, *prev = NULL;
		int err;

		err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
		if (err)
			return (err);

		while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
			if (err) {
				dsl_dataset_rele(ds, FTAG);
				return (err);
			}

			/* 'prev' belongs to a different lineage: ds is a clone. */
			if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
				break;
			dsl_dataset_rele(ds, FTAG);
			ds = prev;
			prev = NULL;
		}

		if (prev == NULL) {
			prev = dp->dp_origin_snap;

			/*
			 * The $ORIGIN can't have any data, or the accounting
			 * will be wrong.
			 */
			rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
			ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
			rrw_exit(&ds->ds_bp_rwlock, FTAG);

			/* The origin doesn't get attached to itself */
			if (ds->ds_object == prev->ds_object) {
				dsl_dataset_rele(ds, FTAG);
				return (0);
			}

			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
			dsl_dataset_phys(ds)->ds_prev_snap_txg =
			    dsl_dataset_phys(prev)->ds_creation_txg;

			dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

			dmu_buf_will_dirty(prev->ds_dbuf, tx);
			dsl_dataset_phys(prev)->ds_num_children++;

			if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
				ASSERT(ds->ds_prev == NULL);
				VERIFY0(dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
				    ds, &ds->ds_prev));
			}
		}

		ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
		ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

		if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
			dmu_buf_will_dirty(prev->ds_dbuf, tx);
			dsl_dataset_phys(prev)->ds_next_clones_obj =
			    zap_create(dp->dp_meta_objset,
			    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

		dsl_dataset_rele(ds, FTAG);
		/* The origin snapshot is held by the pool, not by us. */
		if (prev != dp->dp_origin_snap)
			dsl_dataset_rele(prev, FTAG);
		return (0);
	}

	/*
	 * Run upgrade_clones_cb() over every dataset under the root dir, in
	 * syncing context.  The pool's origin snapshot must already exist.
	 */
	void
	dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(dp->dp_origin_snap != NULL);

		VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
		    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
	}

	/*
	 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones().
	 *
	 * If the dataset's dir has an origin, add the dataset's object number
	 * to the origin dir's dd_clones ZAP, creating that ZAP first when it
	 * does not exist.  'arg' is the syncing dmu_tx_t.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
	{
		dmu_tx_t *tx = arg;
		objset_t *mos = dp->dp_meta_objset;

		if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
			dsl_dataset_t *origin;

			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				dsl_dir_phys(origin->ds_dir)->dd_clones =
				    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
				    0, tx);
			}

			VERIFY0(zap_add_int(dp->dp_meta_objset,
			    dsl_dir_phys(origin->ds_dir)->dd_clones,
			    ds->ds_object, tx));

			dsl_dataset_rele(origin, FTAG);
		}
		return (0);
	}

	/*
	 * Upgrade step run in syncing context: create and open the free dir and
	 * the free bpobj, then populate each origin dir's clones ZAP via
	 * upgrade_dir_clones_cb().
	 */
	void
	dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		uint64_t obj;

		ASSERT(dmu_tx_is_syncing(tx));

		(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/*
		 * We can't use bpobj_alloc(), because spa_version() still
		 * returns the old version, and we need a new-version bpobj with
		 * subobj support. So call dmu_object_alloc() directly.
		 */
		obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
		VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
		VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

		VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
	}

	/*
	 * Create the $ORIGIN dir, dataset and snapshot, and hold the snapshot
	 * in dp->dp_origin_snap (tagged with the pool).  Must be called in
	 * syncing context with the config rwlock held as writer, and only when
	 * no origin snapshot exists yet.
	 */
	void
	dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		uint64_t dsobj;
		dsl_dataset_t *ds;

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(dp->dp_origin_snap == NULL);
		ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

		/* create the origin dir, ds, & snap-ds */
		dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
		    NULL, 0, kcred, tx);
		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
		VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
		    dp, &dp->dp_origin_snap));
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Accessor for the pool's vnode-release taskq.
	 */
	taskq_t *
	dsl_pool_vnrele_taskq(dsl_pool_t *dp)
	{
		taskq_t *tq = dp->dp_vnrele_taskq;

		return (tq);
	}

	/*
	 * Walk through the pool-wide zap object of temporary snapshot user holds
	 * and release them.
	 *
	 * Each ZAP entry name has the form "<dsobj-hex>-<tag>".  Entries are
	 * grouped into an nvlist keyed by the part before the '-', whose value
	 * is an nvlist of tags, and the result is handed to
	 * dsl_dataset_user_release_tmp().  Does nothing if the pool has no
	 * temporary-holds object.
	 */
	void
	dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
	{
		zap_attribute_t za;
		zap_cursor_t zc;
		objset_t *mos = dp->dp_meta_objset;
		uint64_t zapobj = dp->dp_tmp_userrefs_obj;
		nvlist_t *holds;

		if (zapobj == 0)
			return;
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

		holds = fnvlist_alloc();

		for (zap_cursor_init(&zc, mos, zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			char *htag;
			nvlist_t *tags;

			/*
			 * Split the name in place at the first '-'.  An entry
			 * without a separator is malformed; skip it rather than
			 * dereference the NULL that strchr() returns.
			 */
			htag = strchr(za.za_name, '-');
			if (htag == NULL)
				continue;
			*htag = '\0';
			++htag;
			if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
				/* First tag seen for this dataset. */
				tags = fnvlist_alloc();
				fnvlist_add_boolean(tags, htag);
				fnvlist_add_nvlist(holds, za.za_name, tags);
				fnvlist_free(tags);
			} else {
				fnvlist_add_boolean(tags, htag);
			}
		}
		dsl_dataset_user_release_tmp(dp, holds);
		fnvlist_free(holds);
		zap_cursor_fini(&zc);
	}

	/*
	 * Create the pool-wide zap object for storing temporary snapshot holds
	 * and link it into the pool directory.  Must be called in syncing
	 * context, and only when the object does not exist yet.
	 */
	void
	dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		objset_t *mos = dp->dp_meta_objset;

		ASSERT(dp->dp_tmp_userrefs_obj == 0);
		ASSERT(dmu_tx_is_syncing(tx));

		dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
	}

	/*
	 * Common implementation for adding ('holding' set) or removing a
	 * temporary user hold entry named "<dsobj-hex>-<tag>" in the pool-wide
	 * temporary-holds ZAP.  When adding, 'now' is stored as the entry's
	 * value and the ZAP object is created on demand.
	 *
	 * Returns 0 on success, ENOENT when releasing and no holds object
	 * exists, or the error from zap_add()/zap_remove().
	 */
	static int
	dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
	    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
	{
		objset_t *mos = dp->dp_meta_objset;
		uint64_t zapobj = dp->dp_tmp_userrefs_obj;
		char *name;
		int error;

		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		ASSERT(dmu_tx_is_syncing(tx));

		/*
		 * If the pool was created prior to SPA_VERSION_USERREFS, the
		 * zap object for temporary holds might not exist yet.
		 */
		if (zapobj == 0) {
			if (holding) {
				dsl_pool_user_hold_create_obj(dp, tx);
				zapobj = dp->dp_tmp_userrefs_obj;
			} else {
				return (SET_ERROR(ENOENT));
			}
		}

		name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
		if (holding)
			error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
		else
			error = zap_remove(mos, zapobj, name, tx);
		strfree(name);

		return (error);
	}

	/*
	 * Add a temporary hold for the given dataset object and tag.
	 * Returns the result of dsl_pool_user_hold_rele_impl().
	 */
	int
	dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
	    uint64_t now, dmu_tx_t *tx)
	{
		return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
	}

	/*
	 * Release a temporary hold for the given dataset object and tag.
	 * Returns the result of dsl_pool_user_hold_rele_impl().
	 */
	int
	dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
	    dmu_tx_t *tx)
	{
		return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
		    tx, B_FALSE));
	}

	/*
	* DSL Pool Configuration Lock
	*
	* The dp_config_rwlock protects against changes to DSL state (e.g. dataset
	* creation / destruction / rename / property setting). It must be held for
	* read to hold a dataset or dsl_dir. I.e. you must call
	* dsl_pool_config_enter() or dsl_pool_hold() before calling
	* dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
	* must be held continuously until all datasets and dsl_dirs are released.
	*
	* The only exception to this rule is that if a "long hold" is placed on
	* a dataset, then the dp_config_rwlock may be dropped while the dataset
	* is still held. The long hold will prevent the dataset from being
	* destroyed -- the destroy will fail with EBUSY. A long hold can be
	* obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
	* (by calling dsl_{dataset,objset}_{try}own{_obj}).
	*
	* Legitimate long-holders (including owners) should be long-running, cancelable
	* tasks that should cause "zfs destroy" to fail. This includes DMU
	* consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
	* "zfs send", and "zfs diff". There are several other long-holders whose
	* uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
	*
	* The usual formula for long-holding would be:
	* dsl_pool_hold()
	* dsl_dataset_hold()
	* ... perform checks ...
	* dsl_dataset_long_hold()
	* dsl_pool_rele()
	* ... perform long-running task ...
	* dsl_dataset_long_rele()
	* dsl_dataset_rele()
	*
	* Note that when the long hold is released, the dataset is still held but
	* the pool is not held. The dataset may change arbitrarily during this time
	* (e.g. it could be destroyed). Therefore you shouldn't do anything to the
	* dataset except release it.
	*
	* User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
	* or modifying operations.
	*
	* Modifying operations should generally use dsl_sync_task(). The synctask
	* infrastructure enforces proper locking strategy with respect to the
	* dp_config_rwlock. See the comment above dsl_sync_task() for details.
	*
	* Read-only operations will manually hold the pool, then the dataset, obtain
	* information from the dataset, then release the pool and dataset.
	* dmu_objset_{hold,rele}() are convenience routines that also do the pool
	* hold/rele.
	*/

	/*
	 * Open the pool called 'name' with 'tag' and, on success, enter its
	 * config lock as reader and store the dsl_pool_t in *dp.
	 *
	 * Returns 0 on success or the error from spa_open(), in which case *dp
	 * is not written.
	 */
	int
	dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
	{
		spa_t *spa;
		int error;

		error = spa_open(name, &spa, tag);
		if (error == 0) {
			*dp = spa_get_dsl(spa);
			dsl_pool_config_enter(*dp, tag);
		}
		return (error);
	}

	/*
	 * Undo dsl_pool_hold(): leave the config lock, then close the spa.
	 * 'tag' must match the one used for the hold.
	 */
	void
	dsl_pool_rele(dsl_pool_t *dp, void *tag)
	{
		dsl_pool_config_exit(dp, tag);
		spa_close(dp->dp_spa, tag);
	}

	/*
	 * Enter the pool's config lock as reader, tagged with 'tag'.  The
	 * calling thread must not already hold it for read.
	 */
	void
	dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
	{
		/*
		 * We use a "reentrant" reader-writer lock, but not reentrantly.
		 *
		 * The rrwlock can (with the track_all flag) track all reading threads,
		 * which is very useful for debugging which code path failed to release
		 * the lock, and for verifying that the current thread does hold
		 * the lock.
		 *
		 * (Unlike a rwlock, which knows that N threads hold it for
		 * read, but not which threads, so rw_held(RW_READER) returns TRUE
		 * if any thread holds it for read, even if this thread doesn't).
		 */
		ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
		rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
	}

	/*
	 * Like dsl_pool_config_enter(), but takes the read lock through
	 * rrw_enter_read_prio().  The caller must not already hold the lock as
	 * a reader (asserted).
	 */
	void
	dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
	{
		ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
		rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
	}

	/*
	 * Release the pool's dp_config_rwlock previously entered with `tag`.
	 */
	void
	dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
	{
		rrw_exit(&dp->dp_config_rwlock, tag);
	}

	/*
	 * Report whether the pool's config lock is held, as evaluated by
	 * RRW_LOCK_HELD().
	 */
	boolean_t
	dsl_pool_config_held(dsl_pool_t *dp)
	{
		boolean_t is_held = RRW_LOCK_HELD(&dp->dp_config_rwlock);

		return (is_held);
	}

	/*
	 * Report whether the pool's config lock is held for write, as evaluated
	 * by RRW_WRITE_HELD().
	 */
	boolean_t
	dsl_pool_config_held_writer(dsl_pool_t *dp)
	{
		boolean_t write_held = RRW_WRITE_HELD(&dp->dp_config_rwlock);

		return (write_held);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 332525)
	@@ -1,2037 +1,2078 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2016 Gary Mills
	- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright 2017 Joyent, Inc.
	* Copyright (c) 2017 Datto Inc.
	*/

	#include <sys/dsl_scan.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dnode.h>
	#include <sys/dmu_tx.h>
	#include <sys/dmu_objset.h>
	#include <sys/arc.h>
	#include <sys/zap.h>
	#include <sys/zio.h>
	#include <sys/zfs_context.h>
	#include <sys/fs/zfs.h>
	#include <sys/zfs_znode.h>
	#include <sys/spa_impl.h>
	#include <sys/vdev_impl.h>
	#include <sys/zil_impl.h>
	#include <sys/zio_checksum.h>
	#include <sys/ddt.h>
	#include <sys/sa.h>
	#include <sys/sa_impl.h>
	#include <sys/zfeature.h>
	#include <sys/abd.h>
	#ifdef _KERNEL
	#include <sys/zfs_vfsops.h>
	#endif

	typedef int (scan_cb_t)(dsl_pool_t , const blkptr_t ,
	const zbookmark_phys_t *);

	static scan_cb_t dsl_scan_scrub_cb;
	static void dsl_scan_cancel_sync(void , dmu_tx_t );
	static void dsl_scan_sync_state(dsl_scan_t , dmu_tx_t );
	static boolean_t dsl_scan_restarting(dsl_scan_t , dmu_tx_t );

	/*
	 * Scan tunables.  The *_delay and scan_idle values are expressed in
	 * clock ticks; the *_min_time_ms values are in milliseconds per txg.
	 */
	unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
	unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
	unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
	unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */

	unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
	unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
	+unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
	unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
	per txg */
	boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
	boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */

	/*
	 * Expose the scan tunables under the vfs.zfs sysctl tree.
	 * NOTE(review): no entry for zfs_obsolete_min_time_ms appears in this
	 * list -- confirm whether that is intentional.
	 */
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
	&zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
	&zfs_resilver_delay, 0, "Number of ticks to delay resilver");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
	&zfs_scrub_delay, 0, "Number of ticks to delay scrub");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
	&zfs_scan_idle, 0, "Idle scan window in clock ticks");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
	&zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
	&zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
	SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
	&zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
	&zfs_no_scrub_io, 0, "Disable scrub I/O");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
	&zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");

	enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
	/* max number of blocks to free in a single TXG */
	-uint64_t zfs_free_max_blocks = UINT64_MAX;
	+uint64_t zfs_async_block_max_blocks = UINT64_MAX;
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
	- &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
	+ &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG");


	/* True when the scan's function is a scrub or a resilver. */
	#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
		((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
		(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

	extern int zfs_txg_timeout;

	/*
	 * Enable/disable the processing of the free_bpobj object.
	 * Defaults to enabled and is exported as vfs.zfs.free_bpobj_enabled.
	 */
	boolean_t zfs_free_bpobj_enabled = B_TRUE;

	SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
	&zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");

	/*
	 * Per-scan-function block callbacks, indexed by scn_phys.scn_func.
	 * The order has to match pool_scan_type.  Slot 0 has no callback
	 * (presumably POOL_SCAN_NONE -- confirm against the enum); scrub and
	 * resilver share dsl_scan_scrub_cb.
	 */
	static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
	};

	/*
	 * Allocate the pool's dsl_scan_t and load any persisted scan state.
	 *
	 * dp  - pool whose dp_scan is being set up
	 * txg - txg in which a scan restart is scheduled, if one is needed
	 *
	 * Returns 0 on success (including when no scan state exists on disk),
	 * or the zap_lookup() error encountered while reading DMU_POOL_SCAN.
	 */
	int
	dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
	{
		int err;
		dsl_scan_t *scn;
		spa_t *spa = dp->dp_spa;
		uint64_t f;

		scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
		scn->scn_dp = dp;

		/*
		 * It's possible that we're resuming a scan after a reboot so
		 * make sure that the scan_async_destroying flag is initialized
		 * appropriately.
		 */
		ASSERT(!scn->scn_async_destroying);
		scn->scn_async_destroying = spa_feature_is_active(spa,
		    SPA_FEATURE_ASYNC_DESTROY);

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_func", sizeof (uint64_t), 1, &f);
		if (err == 0) {
			/*
			 * There was an old-style scrub in progress.  Restart a
			 * new-style scrub from the beginning.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("old-style scrub was in progress; "
			    "restarting new-style scrub in txg %llu",
			    (u_longlong_t)scn->scn_restart_txg);

			/*
			 * Load the queue obj from the old location so that it
			 * can be freed by dsl_scan_done().
			 */
			(void) zap_lookup(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT,
			    "scrub_queue", sizeof (uint64_t), 1,
			    &scn->scn_phys.scn_queue_obj);
		} else {
			err = zap_lookup(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
			    &scn->scn_phys);
			if (err == ENOENT)
				return (0);
			else if (err)
				return (err);

			if (scn->scn_phys.scn_state == DSS_SCANNING &&
			    spa_prev_software_version(spa) < SPA_VERSION_SCAN) {
				/*
				 * A new-type scrub was in progress on an old
				 * pool, and the pool was accessed by old
				 * software.  Restart from the beginning, since
				 * the old software may have changed the pool in
				 * the meantime.
				 */
				scn->scn_restart_txg = txg;
				zfs_dbgmsg("new-style scrub was modified "
				    "by old software; restarting in txg %llu",
				    (u_longlong_t)scn->scn_restart_txg);
			}
		}

		spa_scan_stat_init(spa);
		return (0);
	}

	/*
	 * Free the pool's dsl_scan_t, if one was allocated, and clear the
	 * pointer so a second call does nothing.
	 */
	void
	dsl_scan_fini(dsl_pool_t *dp)
	{
		dsl_scan_t *scan = dp->dp_scan;

		if (scan == NULL)
			return;

		kmem_free(scan, sizeof (dsl_scan_t));
		dp->dp_scan = NULL;
	}

	/*
	 * Sync-task check for starting a scan: fails with EBUSY if a scan is
	 * already in the DSS_SCANNING state.  `arg` is unused.
	 */
	/* ARGSUSED */
	static int
	dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
	{
		dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

		if (scn->scn_phys.scn_state == DSS_SCANNING)
			return (SET_ERROR(EBUSY));

		return (0);
	}

	/*
	 * Sync-task body that starts a new scan.
	 *
	 * arg - pointer to the pool_scan_func_t to run
	 * tx  - transaction in which the scan state is initialized and persisted
	 *
	 * Resets scn_phys, selects the txg range and DDT class limit, creates
	 * the scan queue ZAP object, writes the state to disk and logs the
	 * setup to the pool history.
	 */
	static void
	dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
		pool_scan_func_t *funcp = arg;
		dmu_object_type_t ot = 0;
		dsl_pool_t *dp = scn->scn_dp;
		spa_t *spa = dp->dp_spa;

		ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
		ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
		bzero(&scn->scn_phys, sizeof (scn->scn_phys));
		scn->scn_phys.scn_func = *funcp;
		scn->scn_phys.scn_state = DSS_SCANNING;
		scn->scn_phys.scn_min_txg = 0;
		scn->scn_phys.scn_max_txg = tx->tx_txg;
		scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
		scn->scn_phys.scn_start_time = gethrestime_sec();
		scn->scn_phys.scn_errors = 0;
		scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
		scn->scn_restart_txg = 0;
		scn->scn_done_txg = 0;
		spa_scan_stat_init(spa);

		if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
			scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;

			/* rewrite all disk labels */
			vdev_config_dirty(spa->spa_root_vdev);

			/*
			 * vdev_resilver_needed() may narrow the txg range; its
			 * result also selects which start event is posted.
			 */
			if (vdev_resilver_needed(spa->spa_root_vdev,
			    &scn->scn_phys.scn_min_txg,
			    &scn->scn_phys.scn_max_txg)) {
				spa_event_notify(spa, NULL, NULL,
				    ESC_ZFS_RESILVER_START);
			} else {
				spa_event_notify(spa, NULL, NULL,
				    ESC_ZFS_SCRUB_START);
			}

			spa->spa_scrub_started = B_TRUE;
			/*
			 * If this is an incremental scrub, limit the DDT scrub
			 * phase to just the auto-ditto class (for correctness);
			 * the rest of the scrub should go faster using top-down
			 * pruning.
			 */
			if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
				scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;

		}

		/* back to the generic stuff */

		if (dp->dp_blkstats == NULL) {
			dp->dp_blkstats =
			    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
		}
		bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

		/* Pools older than SPA_VERSION_DSL_SCRUB get a generic ZAP type. */
		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
			ot = DMU_OT_ZAP_OTHER;

		scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
		    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);

		dsl_scan_sync_state(scn, tx);

		spa_history_log_internal(spa, "scan setup", tx,
		    "func=%u mintxg=%llu maxtxg=%llu",
		    *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
		    (u_longlong_t)scn->scn_phys.scn_max_txg);
	}

	/*
	 * Finish the current scan.
	 *
	 * scn      - scan being finished
	 * complete - B_TRUE if the scan ran to completion, B_FALSE if it was
	 *            cancelled
	 * tx       - transaction in which on-disk remnants are removed
	 *
	 * Removes old-style scrub entries and the scan queue object, clears the
	 * paused flag, and -- only if a scan was actually in progress -- records
	 * the final state, logs it, and performs scrub/resilver teardown.
	 * The updated scn_phys is not written out here.
	 */
	/* ARGSUSED */
	static void
	dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
	{
		static const char *old_names[] = {
			"scrub_bookmark",
			"scrub_ddt_bookmark",
			"scrub_ddt_class_max",
			"scrub_queue",
			"scrub_min_txg",
			"scrub_max_txg",
			"scrub_func",
			"scrub_errors",
			NULL
		};

		dsl_pool_t *dp = scn->scn_dp;
		spa_t *spa = dp->dp_spa;
		int i;

		/* Remove any remnants of an old-style scrub. */
		for (i = 0; old_names[i]; i++) {
			(void) zap_remove(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
		}

		if (scn->scn_phys.scn_queue_obj != 0) {
			VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, tx));
			scn->scn_phys.scn_queue_obj = 0;
		}

		scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;

		/*
		 * If we were "restarted" from a stopped state, don't bother
		 * with anything else.
		 */
		if (scn->scn_phys.scn_state != DSS_SCANNING)
			return;

		if (complete)
			scn->scn_phys.scn_state = DSS_FINISHED;
		else
			scn->scn_phys.scn_state = DSS_CANCELED;

		if (dsl_scan_restarting(scn, tx))
			spa_history_log_internal(spa, "scan aborted, restarting", tx,
			    "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
		else if (!complete)
			spa_history_log_internal(spa, "scan cancelled", tx,
			    "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
		else
			spa_history_log_internal(spa, "scan done", tx,
			    "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));

		if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
			/* Wait for all in-flight scrub I/O to drain. */
			mutex_enter(&spa->spa_scrub_lock);
			while (spa->spa_scrub_inflight > 0) {
				cv_wait(&spa->spa_scrub_io_cv,
				    &spa->spa_scrub_lock);
			}
			mutex_exit(&spa->spa_scrub_lock);
			spa->spa_scrub_started = B_FALSE;
			spa->spa_scrub_active = B_FALSE;

			/*
			 * If the scrub/resilver completed, update all DTLs to
			 * reflect this.  Whether it succeeded or not, vacate
			 * all temporary scrub DTLs.
			 */
			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
			    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
			if (complete) {
				spa_event_notify(spa, NULL, NULL,
				    scn->scn_phys.scn_min_txg ?
				    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
			}
			spa_errlog_rotate(spa);

			/*
			 * We may have finished replacing a device.
			 * Let the async thread assess this and handle the detach.
			 */
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		}

		scn->scn_phys.scn_end_time = gethrestime_sec();
	}

	/*
	 * Sync-task check for cancelling a scan: fails with ENOENT unless a scan
	 * is currently in the DSS_SCANNING state.  `arg` is unused.
	 */
	/* ARGSUSED */
	static int
	dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
	{
		dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

		if (scn->scn_phys.scn_state != DSS_SCANNING)
			return (SET_ERROR(ENOENT));
		return (0);
	}

	/*
	 * Sync-task body for cancelling a scan: finishes the scan as incomplete,
	 * persists the resulting state, and posts ESC_ZFS_SCRUB_ABORT.
	 * `arg` is unused.
	 */
	/* ARGSUSED */
	static void
	dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
	{
		dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

		dsl_scan_done(scn, B_FALSE, tx);
		dsl_scan_sync_state(scn, tx);
		spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
	}

	/*
	 * Cancel the pool's in-progress scan by running the cancel check/sync
	 * pair as a sync task.  Returns the dsl_sync_task() result.
	 */
	int
	dsl_scan_cancel(dsl_pool_t *dp)
	{
		int error;

		error = dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
		    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED);
		return (error);
	}

	/*
	 * Returns B_TRUE only when a scrub is in progress on the scan's pool and
	 * the DSF_SCRUB_PAUSED flag is set.
	 */
	boolean_t
	dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
	{
		if (!dsl_scan_scrubbing(scn->scn_dp))
			return (B_FALSE);

		if ((scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) == 0)
			return (B_FALSE);

		return (B_TRUE);
	}

	/*
	 * Sync-task check for pausing/resuming a scrub.
	 *
	 * arg - pointer to the pool_scrub_cmd_t being requested
	 *
	 * POOL_SCRUB_PAUSE fails with ENOENT when no scrub is running and with
	 * EBUSY when the scrub is already paused.  POOL_SCRUB_NORMAL is always
	 * accepted.  Any other command fails with ENOTSUP.
	 */
	static int
	dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
	{
		pool_scrub_cmd_t *cmd = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_scan_t *scn = dp->dp_scan;

		if (*cmd == POOL_SCRUB_PAUSE) {
			/* can't pause a scrub when there is no in-progress scrub */
			if (!dsl_scan_scrubbing(dp))
				return (SET_ERROR(ENOENT));

			/* can't pause a paused scrub */
			if (dsl_scan_is_paused_scrub(scn))
				return (SET_ERROR(EBUSY));
		} else if (*cmd != POOL_SCRUB_NORMAL) {
			return (SET_ERROR(ENOTSUP));
		}

		return (0);
	}

	/*
	 * Sync-task body for pausing/resuming a scrub.
	 *
	 * arg - pointer to the pool_scrub_cmd_t being applied
	 *
	 * Pausing records the pause start time, sets DSF_SCRUB_PAUSED, persists
	 * the state and posts ESC_ZFS_SCRUB_PAUSED.  Resuming (POOL_SCRUB_NORMAL)
	 * only acts if the scrub is currently paused: it accumulates the time
	 * spent paused, clears the flag and persists the state.
	 */
	static void
	dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
	{
		pool_scrub_cmd_t *cmd = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		spa_t *spa = dp->dp_spa;
		dsl_scan_t *scn = dp->dp_scan;

		if (*cmd == POOL_SCRUB_PAUSE) {
			/*
			 * Remember when the pause began so that a later resume
			 * can account for the time spent paused.
			 */
			spa->spa_scan_pass_scrub_pause = gethrestime_sec();
			scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
			dsl_scan_sync_state(scn, tx);
			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
		} else {
			ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
			if (dsl_scan_is_paused_scrub(scn)) {
				/*
				 * We need to keep track of how much time we spend
				 * paused per pass so that we can adjust the scrub
				 * rate shown in the output of 'zpool status'
				 */
				spa->spa_scan_pass_scrub_spent_paused +=
				    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
				spa->spa_scan_pass_scrub_pause = 0;
				scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
				dsl_scan_sync_state(scn, tx);
			}
		}
	}

	/*
	 * Request a scrub pause or resume for the pool.  The command is
	 * validated and applied by the pause/resume check and sync callbacks,
	 * which run as a sync task; the sync task's result is returned.
	 */
	int
	dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
	{
		int error;

		error = dsl_sync_task(spa_name(dp->dp_spa),
		    dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync,
		    &cmd, 3, ZFS_SPACE_CHECK_RESERVED);
		return (error);
	}

	/*
	 * Returns B_TRUE when the pool's scan is in the DSS_SCANNING state and
	 * its function is POOL_SCAN_SCRUB; B_FALSE otherwise.
	 */
	boolean_t
	dsl_scan_scrubbing(const dsl_pool_t *dp)
	{
		dsl_scan_t *scan = dp->dp_scan;

		if (scan->scn_phys.scn_state != DSS_SCANNING)
			return (B_FALSE);

		if (scan->scn_phys.scn_func != POOL_SCAN_SCRUB)
			return (B_FALSE);

		return (B_TRUE);
	}

	static void dsl_scan_visitbp(blkptr_t bp, const zbookmark_phys_t zb,
	dnode_phys_t dnp, dsl_dataset_t ds, dsl_scan_t *scn,
	dmu_objset_type_t ostype, dmu_tx_t *tx);
	static void dsl_scan_visitdnode(dsl_scan_t , dsl_dataset_t ds,
	dmu_objset_type_t ostype,
	dnode_phys_t dnp, uint64_t object, dmu_tx_t tx);

	/*
	 * Free block `bp` in `txg` by handing it to zio_free() on the pool's spa.
	 */
	void
	dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
	{
		zio_free(dp->dp_spa, txg, bp);
	}

	/*
	 * Issue a free of block `bpp` as a child of `pio` without waiting for
	 * it.  Must be called from the pool's sync context (asserted).  The
	 * child zio inherits pio's io_flags.
	 */
	void
	dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
	{
		ASSERT(dsl_pool_sync_context(dp));
		zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
		    pio->io_flags));
	}

	/*
	 * Upper txg bound to use when scanning `ds`: the scan's scn_max_txg,
	 * further capped at the dataset's creation txg when it is a snapshot.
	 */
	static uint64_t
	dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
	{
		dsl_scan_t *scan = ds->ds_dir->dd_pool->dp_scan;
		uint64_t limit = scan->scn_phys.scn_max_txg;

		if (!ds->ds_is_snapshot)
			return (limit);

		return (MIN(limit, dsl_dataset_phys(ds)->ds_creation_txg));
	}

	/*
	 * Persist scn_phys under the DMU_POOL_SCAN key of the pool directory
	 * object, as SCAN_PHYS_NUMINTS 64-bit integers.  A failed update is
	 * fatal (VERIFY0).
	 */
	static void
	dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
	{
		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys, tx));
	}

	extern int zfs_vdev_async_write_active_min_dirty_percent;

	/*
	 * Decide whether the scan should stop for this txg.
	 *
	 * scn - the scan
	 * zb  - bookmark of the block about to be visited, or NULL
	 *
	 * Returns B_TRUE if the scan is (now) suspending.  When suspension is
	 * newly triggered and zb is non-NULL, zb is saved in scn_bookmark as the
	 * resume point.
	 */
	static boolean_t
	dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
	{
		/* we never skip user/group accounting objects */
		if (zb && (int64_t)zb->zb_object < 0)
			return (B_FALSE);

		if (scn->scn_suspending)
			return (B_TRUE); /* we're already suspending */

		if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
			return (B_FALSE); /* we're resuming */

		/* We only know how to resume from level-0 blocks. */
		if (zb && zb->zb_level != 0)
			return (B_FALSE);

		/*
		 * We suspend if:
		 *  - we have scanned for the maximum time: an entire txg
		 *    timeout (default 5 sec)
		 *  or
		 *  - we have scanned for at least the minimum time (default 1 sec
		 *    for scrub, 3 sec for resilver), and either we have sufficient
		 *    dirty data that we are starting to write more quickly
		 *    (default 30%), or someone is explicitly waiting for this txg
		 *    to complete.
		 *  or
		 *  - the spa is shutting down because this pool is being exported
		 *    or the machine is rebooting.
		 */
		int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
		    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
		uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
		int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
		if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
		    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
		    (txg_sync_waiting(scn->scn_dp) ||
		    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
		    spa_shutting_down(scn->scn_dp->dp_spa)) {
			if (zb) {
				dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
				    (longlong_t)zb->zb_objset,
				    (longlong_t)zb->zb_object,
				    (longlong_t)zb->zb_level,
				    (longlong_t)zb->zb_blkid);
				scn->scn_phys.scn_bookmark = *zb;
			}
			dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
			    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
			    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
			    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
			scn->scn_suspending = B_TRUE;
			return (B_TRUE);
		}
		return (B_FALSE);
	}

	typedef struct zil_scan_arg {
	dsl_pool_t *zsa_dp;
	zil_header_t *zsa_zh;
	} zil_scan_arg_t;

	/*
	 * zil_parse() block callback: run the active scan function on a ZIL log
	 * block.
	 *
	 * arg       - zil_scan_arg_t carrying the pool and the ZIL header
	 * claim_txg - txg in which the log was claimed, or 0 if unclaimed
	 *
	 * Holes and blocks born at or before scn_cur_min_txg are skipped.
	 * Always returns 0; a failing scan callback is fatal (VERIFY).
	 */
	/* ARGSUSED */
	static int
	dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
	{
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * One block ("stubby") can be allocated a long time ago; we
		 * want to visit that one because it has been allocated
		 * (on-disk) even if it hasn't been claimed (even though for
		 * scrub there's nothing to do to it).
		 */
		if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
		return (0);
	}

	/*
	 * zil_parse() record callback: for TX_WRITE records, run the active scan
	 * function on the block pointer embedded in the record.
	 *
	 * arg       - zil_scan_arg_t carrying the pool and the ZIL header
	 * claim_txg - txg in which the log was claimed, or 0 if unclaimed
	 *
	 * Other record types are ignored.  Always returns 0; a failing scan
	 * callback is fatal (VERIFY).
	 */
	/* ARGSUSED */
	static int
	dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
	{
		if (lrc->lrc_txtype == TX_WRITE) {
			zil_scan_arg_t *zsa = arg;
			dsl_pool_t *dp = zsa->zsa_dp;
			dsl_scan_t *scn = dp->dp_scan;
			zil_header_t *zh = zsa->zsa_zh;
			lr_write_t *lr = (lr_write_t *)lrc;
			blkptr_t *bp = &lr->lr_blkptr;
			zbookmark_phys_t zb;

			if (BP_IS_HOLE(bp) ||
			    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
				return (0);

			/*
			 * birth can be < claim_txg if this record's txg is
			 * already txg sync'ed (but this log block contains
			 * other records that are not synced)
			 */
			if (claim_txg == 0 || bp->blk_birth < claim_txg)
				return (0);

			SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
			    lr->lr_foid, ZB_ZIL_LEVEL,
			    lr->lr_offset / BP_GET_LSIZE(bp));

			VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
		}
		return (0);
	}

	/*
	 * Scan the intent log described by `zh`: parse it with a temporary
	 * zilog and feed its blocks and records to the scan callbacks.
	 * The zil_parse() result is ignored.
	 */
	static void
	dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
	{
		uint64_t claim_txg = zh->zh_claim_txg;
		zil_scan_arg_t zsa = { dp, zh };
		zilog_t *zilog;

		/*
		 * We only want to visit blocks that have been claimed but not yet
		 * replayed (or, in read-only mode, blocks that would be claimed).
		 */
		if (claim_txg == 0 && spa_writeable(dp->dp_spa))
			return;

		zilog = zil_alloc(dp->dp_meta_objset, zh);

		(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
		    claim_txg);

		zil_free(zilog);
	}

	/*
	 * Issue an asynchronous prefetch read for `bp` under the scan's root
	 * zio.
	 *
	 * Nothing is issued when prefetch is disabled via zfs_no_scrub_prefetch,
	 * when bp is a hole or was born at or before scn_min_txg, or when bp is
	 * a level-0 block that is not a dnode block.  `buf` is unused.
	 */
	/* ARGSUSED */
	static void
	dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
	    uint64_t objset, uint64_t object, uint64_t blkid)
	{
		zbookmark_phys_t czb;
		arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;

		if (zfs_no_scrub_prefetch)
			return;

		if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
		    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
			return;

		SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

		/* Fire and forget: no done callback, result ignored. */
		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
	}

	/*
	 * While resuming from a saved bookmark, decide whether the block at `zb`
	 * can be skipped.
	 *
	 * Returns B_TRUE if the subtree at zb was already completed in a prior
	 * txg sync.  Otherwise returns B_FALSE, and clears the saved bookmark
	 * once the resume point has been reached or passed.
	 */
	static boolean_t
	dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
	    const zbookmark_phys_t *zb)
	{
		/*
		 * We never skip over user/group accounting objects (obj<0)
		 */
		if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
		    (int64_t)zb->zb_object >= 0) {
			/*
			 * If we already visited this bp & everything below (in
			 * a prior txg sync), don't bother doing it again.
			 */
			if (zbookmark_subtree_completed(dnp, zb,
			    &scn->scn_phys.scn_bookmark))
				return (B_TRUE);

			/*
			 * If we found the block we're trying to resume from, or
			 * we went past it to a different object, zero it out to
			 * indicate that it's OK to start checking for suspending
			 * again.
			 */
			if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
			    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
				dprintf("resuming at %llx/%llx/%llx/%llx\n",
				    (longlong_t)zb->zb_objset,
				    (longlong_t)zb->zb_object,
				    (longlong_t)zb->zb_level,
				    (longlong_t)zb->zb_blkid);
				bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
			}
		}
		return (B_FALSE);
	}

	/*
	 * Read the block at `bp` and visit everything it points to.
	 *
	 * Indirect blocks have each child block pointer visited; dnode blocks
	 * have each dnode visited; objset blocks have their meta dnode (and,
	 * when present, the user/group accounting dnodes) visited.  Any other
	 * level-0 block has no children and is not read here.
	 *
	 * Return nonzero on i/o error reading `bp` itself (scn_errors is also
	 * incremented); otherwise return 0.
	 */
	static int
	dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
	    dnode_phys_t *dnp, const blkptr_t *bp,
	    const zbookmark_phys_t *zb, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = scn->scn_dp;
		int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
		int err;

		if (BP_GET_LEVEL(bp) > 0) {
			arc_flags_t flags = ARC_FLAG_WAIT;
			int i;
			blkptr_t *cbp;
			int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
			arc_buf_t *buf;

			err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
			    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
			if (err) {
				scn->scn_phys.scn_errors++;
				return (err);
			}
			/* First pass: prefetch every child; second pass: visit. */
			for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
				dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
				    zb->zb_object, zb->zb_blkid * epb + i);
			}
			for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
				zbookmark_phys_t czb;

				SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
				    zb->zb_level - 1,
				    zb->zb_blkid * epb + i);
				dsl_scan_visitbp(cbp, &czb, dnp,
				    ds, scn, ostype, tx);
			}
			arc_buf_destroy(buf, &buf);
		} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
			arc_flags_t flags = ARC_FLAG_WAIT;
			dnode_phys_t *cdnp;
			int i, j;
			int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
			arc_buf_t *buf;

			err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
			    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
			if (err) {
				scn->scn_phys.scn_errors++;
				return (err);
			}
			/* Prefetch the block pointers of every dnode first. */
			for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
				for (j = 0; j < cdnp->dn_nblkptr; j++) {
					blkptr_t *cbp = &cdnp->dn_blkptr[j];
					dsl_scan_prefetch(scn, buf, cbp,
					    zb->zb_objset, zb->zb_blkid * epb + i, j);
				}
			}
			for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
				dsl_scan_visitdnode(scn, ds, ostype,
				    cdnp, zb->zb_blkid * epb + i, tx);
			}

			arc_buf_destroy(buf, &buf);
		} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
			arc_flags_t flags = ARC_FLAG_WAIT;
			objset_phys_t *osp;
			arc_buf_t *buf;

			err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
			    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
			if (err) {
				scn->scn_phys.scn_errors++;
				return (err);
			}

			osp = buf->b_data;

			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);

			if (OBJSET_BUF_HAS_USERUSED(buf)) {
				/*
				 * We also always visit user/group accounting
				 * objects, and never skip them, even if we are
				 * suspending.  This is necessary so that the space
				 * deltas from this txg get integrated.
				 */
				dsl_scan_visitdnode(scn, ds, osp->os_type,
				    &osp->os_groupused_dnode,
				    DMU_GROUPUSED_OBJECT, tx);
				dsl_scan_visitdnode(scn, ds, osp->os_type,
				    &osp->os_userused_dnode,
				    DMU_USERUSED_OBJECT, tx);
			}
			arc_buf_destroy(buf, &buf);
		}

		return (0);
	}

	/*
	 * Visit every block pointer of dnode `dnp` (object number `object`),
	 * plus its spill block pointer when DNODE_FLAG_SPILL_BLKPTR is set.
	 * Bookmarks use the dataset's object number, or 0 when ds is NULL.
	 */
	static void
	dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
	    dmu_objset_type_t ostype, dnode_phys_t *dnp,
	    uint64_t object, dmu_tx_t *tx)
	{
		int j;

		for (j = 0; j < dnp->dn_nblkptr; j++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
			    dnp->dn_nlevels - 1, j);
			dsl_scan_visitbp(&dnp->dn_blkptr[j],
			    &czb, dnp, ds, scn, ostype, tx);
		}

		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
			zbookmark_phys_t czb;
			SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
			    0, DMU_SPILL_BLKID);
			dsl_scan_visitbp(&dnp->dn_spill,
			    &czb, dnp, ds, scn, ostype, tx);
		}
	}

	/*
	 * Visit one block pointer: honor suspend/resume, recurse into its
	 * children, then hand the block itself to the active scan function.
	 *
	 * The arguments are in this order because mdb can only print the
	 * first 5; we want them to be useful.
	 */
	static void
	dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
	    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
	    dmu_objset_type_t ostype, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = scn->scn_dp;
		arc_buf_t *buf = NULL;
		/* Recurse on a private copy of the block pointer. */
		blkptr_t bp_toread = *bp;

		if (dsl_scan_check_suspend(scn, zb))
			return;

		if (dsl_scan_check_resume(scn, dnp, zb))
			return;

		if (BP_IS_HOLE(bp))
			return;

		scn->scn_visited_this_txg++;

		dprintf_bp(bp,
		    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
		    ds, ds ? ds->ds_object : 0,
		    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
		    bp);

		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return;

		if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
			return;

		/*
		 * If dsl_scan_ddt() has already visited this block, it will have
		 * already done any translations or scrubbing, so don't call the
		 * callback again.
		 */
		if (ddt_class_contains(dp->dp_spa,
		    scn->scn_phys.scn_ddt_class_max, bp)) {
			ASSERT(buf == NULL);
			return;
		}

		/*
		 * If this block is from the future (after cur_max_txg), then we
		 * are doing this on behalf of a deleted snapshot, and we will
		 * revisit the future block on the next pass of this dataset.
		 * Don't scan it now unless we need to because something
		 * under it was modified.
		 */
		if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
			scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
		}
	}

	/*
	 * Start a traversal at root block pointer `bp`.  The root bookmark uses
	 * the dataset's object number, or DMU_META_OBJSET when ds is NULL.
	 */
	static void
	dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
	    dmu_tx_t *tx)
	{
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		dsl_scan_visitbp(bp, &zb, NULL,
		    ds, scn, DMU_OST_NONE, tx);

		dprintf_ds(ds, "finished scan%s", "");
	}

	/*
	 * Fix up scan state when dataset `ds` is destroyed during a scan.
	 *
	 * If ds is the dataset currently being traversed, the bookmark is moved
	 * to the next snapshot (for a snapshot) or marked as destroyed.  If ds
	 * is instead in the scan queue, its entry is removed and, for a
	 * snapshot, replaced by the next snapshot with the same mintxg.
	 * Does nothing when no scan is in progress.
	 */
	void
	dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		dsl_scan_t *scn = dp->dp_scan;
		uint64_t mintxg;

		if (scn->scn_phys.scn_state != DSS_SCANNING)
			return;

		if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
			if (ds->ds_is_snapshot) {
				/*
				 * Note:
				 *  - scn_cur_{min,max}_txg stays the same.
				 *  - Setting the flag is not really necessary if
				 *    scn_cur_max_txg == scn_max_txg, because there
				 *    is nothing after this snapshot that we care
				 *    about.  However, we set it anyway and then
				 *    ignore it when we retraverse it in
				 *    dsl_scan_visitds().
				 */
				scn->scn_phys.scn_bookmark.zb_objset =
				    dsl_dataset_phys(ds)->ds_next_snap_obj;
				zfs_dbgmsg("destroying ds %llu; currently traversing; "
				    "reset zb_objset to %llu",
				    (u_longlong_t)ds->ds_object,
				    (u_longlong_t)dsl_dataset_phys(ds)->
				    ds_next_snap_obj);
				scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
			} else {
				SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
				    ZB_DESTROYED_OBJSET, 0, 0, 0);
				zfs_dbgmsg("destroying ds %llu; currently traversing; "
				    "reset bookmark to -1,0,0,0",
				    (u_longlong_t)ds->ds_object);
			}
		} else if (zap_lookup_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
			ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
			VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
			if (ds->ds_is_snapshot) {
				/*
				 * We keep the same mintxg; it could be >
				 * ds_creation_txg if the previous snapshot was
				 * deleted too.
				 */
				VERIFY(zap_add_int_key(dp->dp_meta_objset,
				    scn->scn_phys.scn_queue_obj,
				    dsl_dataset_phys(ds)->ds_next_snap_obj,
				    mintxg, tx) == 0);
				zfs_dbgmsg("destroying ds %llu; in queue; "
				    "replacing with %llu",
				    (u_longlong_t)ds->ds_object,
				    (u_longlong_t)dsl_dataset_phys(ds)->
				    ds_next_snap_obj);
			} else {
				zfs_dbgmsg("destroying ds %llu; in queue; removing",
				    (u_longlong_t)ds->ds_object);
			}
		}

		/*
		 * dsl_scan_sync() should be called after this, and should sync
		 * out our changed state, but just to be safe, do it here.
		 */
		dsl_scan_sync_state(scn, tx);
	}

	/*
	 * Fix up scan state when dataset `ds` is snapshotted during a scan.
	 *
	 * Whether ds is the dataset currently being traversed or an entry in the
	 * scan queue, the reference is redirected to ds's previous snapshot
	 * (ds_prev_snap_obj, asserted non-zero).  The scan state is then
	 * persisted.  Does nothing when no scan is in progress.
	 */
	void
	dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		dsl_scan_t *scn = dp->dp_scan;
		uint64_t mintxg;

		if (scn->scn_phys.scn_state != DSS_SCANNING)
			return;

		ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);

		if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
			scn->scn_phys.scn_bookmark.zb_objset =
			    dsl_dataset_phys(ds)->ds_prev_snap_obj;
			zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
		} else if (zap_lookup_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
			VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
			VERIFY(zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
			zfs_dbgmsg("snapshotting ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
		}
		dsl_scan_sync_state(scn, tx);
	}

	/*
	 * Fix up scan state when datasets `ds1` and `ds2` are clone-swapped
	 * during a scan.
	 *
	 * If the traversal bookmark refers to one of the two, it is switched to
	 * the other.  Likewise a scan-queue entry for one is replaced by an
	 * entry for the other; if both were queued, both remain queued.  The
	 * scan state is then persisted.  Does nothing when no scan is in
	 * progress.
	 */
	void
	dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = ds1->ds_dir->dd_pool;
		dsl_scan_t *scn = dp->dp_scan;
		uint64_t mintxg;

		if (scn->scn_phys.scn_state != DSS_SCANNING)
			return;

		if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
			scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
			zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds1->ds_object,
			    (u_longlong_t)ds2->ds_object);
		} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
			scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
			zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds2->ds_object,
			    (u_longlong_t)ds1->ds_object);
		}

		if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
		    ds1->ds_object, &mintxg) == 0) {
			int err;

			ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
			ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
			VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
			err = zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
			VERIFY(err == 0 || err == EEXIST);
			if (err == EEXIST) {
				/* Both were there to begin with */
				VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
				    scn->scn_phys.scn_queue_obj,
				    ds1->ds_object, mintxg, tx));
			}
			zfs_dbgmsg("clone_swap ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds1->ds_object,
			    (u_longlong_t)ds2->ds_object);
		} else if (zap_lookup_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
			ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
			ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
			VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
			zfs_dbgmsg("clone_swap ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds2->ds_object,
			    (u_longlong_t)ds1->ds_object);
		}

		dsl_scan_sync_state(scn, tx);
	}

	/* Argument bundle handed to enqueue_clones_cb() through its void *arg. */
	struct enqueue_clones_arg {
	/* transaction used for the scan-queue ZAP insertion */
	dmu_tx_t *tx;
	/* object number compared against each candidate's dd_origin_obj */
	uint64_t originobj;
	};

	/*
	* dmu_objset_find_dp() callback that queues clones of one origin.
	*
	* arg is a struct enqueue_clones_arg. Head datasets whose dir's
	* dd_origin_obj differs from eca->originobj are ignored. For a match,
	* walk back along ds_prev_snap_obj until reaching the dataset whose
	* previous snapshot is the origin, and add that dataset to the scan
	* queue ZAP with its ds_prev_snap_txg as the value.
	*
	* Returns 0, or the error from dsl_dataset_hold_obj().
	*/
	/* ARGSUSED */
	static int
	enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
	{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
	return (0);

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
	return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
	dsl_dataset_t *prev;
	err = dsl_dataset_hold_obj(dp,
	dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);

	/* ds is released before the error check, so no hold leaks. */
	dsl_dataset_rele(ds, FTAG);
	if (err)
	return (err);
	ds = prev;
	}
	VERIFY(zap_add_int_key(dp->dp_meta_objset,
	scn->scn_phys.scn_queue_obj, ds->ds_object,
	dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
	}

	/*
	* Visit one dataset (object number dsobj) on behalf of the scan.
	*
	* The dataset is held for the duration of the call and released at
	* "out". It is skipped entirely when scn_cur_min_txg >= scn_max_txg.
	* Otherwise the ZIL is walked where applicable, the dataset's root
	* block pointer is traversed, and -- unless the scan is suspending or
	* another pass over this dataset was requested -- the next snapshot
	* and any clones are added to the scan queue ZAP.
	*/
	static void
	dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
	{
	dsl_pool_t *dp = scn->scn_dp;
	dsl_dataset_t *ds;
	- objset_t *os;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (scn->scn_phys.scn_cur_min_txg >=
	scn->scn_phys.scn_max_txg) {
	/*
	* This can happen if this snapshot was created after the
	* scan started, and we already completed a previous snapshot
	* that was created after the scan started. This snapshot
	* only references blocks with:
	*
	* birth < our ds_creation_txg
	* cur_min_txg is no less than ds_creation_txg.
	* We have already visited these blocks.
	* or
	* birth > scn_max_txg
	* The scan requested not to visit these blocks.
	*
	* Subsequent snapshots (and clones) can reference our
	* blocks, or blocks with even higher birth times.
	* Therefore we do not need to visit them either,
	* so we do not add them to the work queue.
	*
	* Note that checking for cur_min_txg >= cur_max_txg
	* is not sufficient, because in that case we may need to
	* visit subsequent snapshots. This happens when min_txg > 0,
	* which raises cur_min_txg. In this case we will visit
	* this dataset but skip all of its blocks, because the
	* rootbp's birth time is < cur_min_txg. Then we will
	* add the next snapshots/clones to the work queue.
	*/
	/*
	* NOTE(review): this name buffer is sized with MAXNAMELEN while
	* the one further down uses ZFS_MAX_DATASET_NAME_LEN, and the
	* %llu arguments here are passed without the casts used by the
	* other zfs_dbgmsg() calls in this function.
	*/
	char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
	"cur_min_txg (%llu) >= max_txg (%llu)",
	dsobj, dsname,
	scn->scn_phys.scn_cur_min_txg,
	scn->scn_phys.scn_max_txg);
	kmem_free(dsname, MAXNAMELEN);

	goto out;
	}

	- if (dmu_objset_from_ds(ds, &os))
	- goto out;
	-
	/*
	- * Only the ZIL in the head (non-snapshot) is valid. Even though
	+ * Only the ZIL in the head (non-snapshot) is valid. Even though
	* snapshots can have ZIL block pointers (which may be the same
	- * BP as in the head), they must be ignored. So we traverse the
	- * ZIL here, rather than in scan_recurse(), because the regular
	- * snapshot block-sharing rules don't apply to it.
	+ * BP as in the head), they must be ignored. In addition, $ORIGIN
	+ * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
	+ * need to look for a ZIL in it either. So we traverse the ZIL here,
	+ * rather than in scan_recurse(), because the regular snapshot
	+ * block-sharing rules don't apply to it.
	*/
	- if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
	+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
	+ ds->ds_dir != dp->dp_origin_snap->ds_dir) {
	+ objset_t *os;
	+ if (dmu_objset_from_ds(ds, &os) != 0) {
	+ goto out;
	+ }
	dsl_scan_zil(dp, &os->os_zil_header);
	+ }

	/*
	* Iterate over the bps in this ds.
	*/
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
	"suspending=%u",
	(longlong_t)dsobj, dsname,
	(longlong_t)scn->scn_phys.scn_cur_min_txg,
	(longlong_t)scn->scn_phys.scn_cur_max_txg,
	(int)scn->scn_suspending);
	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);

	if (scn->scn_suspending)
	goto out;

	/*
	* We've finished this pass over this dataset.
	*/

	/*
	* If we did not completely visit this dataset, do another pass.
	*/
	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
	zfs_dbgmsg("incomplete pass; visiting again");
	scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
	VERIFY(zap_add_int_key(dp->dp_meta_objset,
	scn->scn_phys.scn_queue_obj, ds->ds_object,
	scn->scn_phys.scn_cur_max_txg, tx) == 0);
	goto out;
	}

	/*
	* Add descendent datasets to work queue.
	*/
	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
	VERIFY(zap_add_int_key(dp->dp_meta_objset,
	scn->scn_phys.scn_queue_obj,
	dsl_dataset_phys(ds)->ds_next_snap_obj,
	dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
	}
	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
	boolean_t usenext = B_FALSE;
	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
	uint64_t count;
	/*
	* A bug in a previous version of the code could
	* cause upgrade_clones_cb() to not set
	* ds_next_snap_obj when it should, leading to a
	* missing entry. Therefore we can only use the
	* next_clones_obj when its count is correct.
	*/
	int err = zap_count(dp->dp_meta_objset,
	dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
	if (err == 0 &&
	count == dsl_dataset_phys(ds)->ds_num_children - 1)
	usenext = B_TRUE;
	}

	if (usenext) {
	VERIFY0(zap_join_key(dp->dp_meta_objset,
	dsl_dataset_phys(ds)->ds_next_clones_obj,
	scn->scn_phys.scn_queue_obj,
	dsl_dataset_phys(ds)->ds_creation_txg, tx));
	} else {
	/* Fall back to searching every dataset for our clones. */
	struct enqueue_clones_arg eca;
	eca.tx = tx;
	eca.originobj = ds->ds_object;

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
	}
	}

	out:
	dsl_dataset_rele(ds, FTAG);
	}

	/*
	* dmu_objset_find_dp() callback that seeds the scan queue; arg is the
	* dmu_tx_t to use.
	*
	* Starting from head dataset hds, walk back along ds_prev_snap_obj.
	* If a step reaches a previous dataset whose ds_next_snap_obj is not
	* the current dataset, the current dataset is a clone and nothing is
	* queued. Otherwise the oldest dataset reached (ds_prev_snap_obj == 0)
	* is added to the scan queue ZAP with its ds_prev_snap_txg.
	*
	* Returns 0, or the error from dsl_dataset_hold_obj().
	*/
	/* ARGSUSED */
	static int
	enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
	{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
	return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
	dsl_dataset_t *prev;
	err = dsl_dataset_hold_obj(dp,
	dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
	if (err) {
	dsl_dataset_rele(ds, FTAG);
	return (err);
	}

	/*
	* If this is a clone, we don't need to worry about it for now.
	*/
	if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
	dsl_dataset_rele(ds, FTAG);
	dsl_dataset_rele(prev, FTAG);
	return (0);
	}
	dsl_dataset_rele(ds, FTAG);
	ds = prev;
	}

	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
	}

	/*
	* Scrub/dedup interaction.
	*
	* If there are N references to a deduped block, we don't want to scrub it
	* N times -- ideally, we should scrub it exactly once.
	*
	* We leverage the fact that the dde's replication class (enum ddt_class)
	* is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
	* (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
	*
	* To prevent excess scrubbing, the scrub begins by walking the DDT
	* to find all blocks with refcnt > 1, and scrubs each of these once.
	* Since there are two replication classes which contain blocks with
	* refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
	* Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
	*
	* There would be nothing more to say if a block's refcnt couldn't change
	* during a scrub, but of course it can so we must account for changes
	* in a block's replication class.
	*
	* Here's an example of what can occur:
	*
	* If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
	* when visited during the top-down scrub phase, it will be scrubbed twice.
	* This negates our scrub optimization, but is otherwise harmless.
	*
	* If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
	* on each visit during the top-down scrub phase, it will never be scrubbed.
	* To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
	* reference class transitions to a higher level (i.e., DDT_CLASS_UNIQUE to
	* DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
	* while a scrub is in progress, it scrubs the block right then.
	*/
	/*
	* Walk the dedup table starting at the persisted scn_ddt_bookmark and
	* hand every entry to dsl_scan_ddt_entry().
	*
	* The walk stops when ddt_walk() returns non-zero, when the bookmark's
	* class moves past scn_ddt_class_max, or when dsl_scan_check_suspend()
	* reports that the scan should suspend.
	*/
	static void
	dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
	{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = { 0 };
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
	ddt_t *ddt;

	if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
	break;
	dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
	(longlong_t)ddb->ddb_class,
	(longlong_t)ddb->ddb_type,
	(longlong_t)ddb->ddb_checksum,
	(longlong_t)ddb->ddb_cursor);

	/* There should be no pending changes to the dedup table */
	ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
	ASSERT(avl_first(&ddt->ddt_tree) == NULL);

	dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
	n++;

	if (dsl_scan_check_suspend(scn, NULL))
	break;
	}

	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
	"suspending=%u", (longlong_t)n,
	(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);

	/* ENOENT is only expected once the walk is past class_max. */
	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
	}

	/*
	* Feed one dedup-table entry to the active scan function.
	*
	* No-op unless a scan is in progress. For each of the entry's
	* DDT_PHYS_TYPES phys slots with a non-zero birth that is not newer
	* than scn_max_txg, a block pointer is built with ddt_bp_create() and
	* passed to scan_funcs[scn_func] with a zeroed bookmark.
	*/
	/* ARGSUSED */
	void
	dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
	ddt_entry_t *dde, dmu_tx_t *tx)
	{
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	blkptr_t bp;
	zbookmark_phys_t zb = { 0 };

	if (scn->scn_phys.scn_state != DSS_SCANNING)
	return;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
	if (ddp->ddp_phys_birth == 0 ||
	ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
	continue;
	ddt_bp_create(checksum, ddk, ddp, &bp);

	scn->scn_visited_this_txg++;
	scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
	}
	}

	/*
	* Do one txg's worth of scan traversal.
	*
	* In order: (1) continue the DDT walk while its bookmark class is
	* within scn_ddt_class_max; (2) if the bookmark is at DMU_META_OBJSET,
	* visit the MOS root bp and then either enqueue every dataset (pools
	* older than SPA_VERSION_DSL_SCRUB) or visit the origin snapshot;
	* otherwise resume the dataset named by the bookmark unless it is
	* ZB_DESTROYED_OBJSET; (3) clear the bookmark and drain the scan queue
	* ZAP one dataset at a time. Returns early whenever scn_suspending is
	* set.
	*/
	static void
	dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
	{
	dsl_pool_t *dp = scn->scn_dp;
	zap_cursor_t zc;
	zap_attribute_t za;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	scn->scn_phys.scn_ddt_class_max) {
	scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
	scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
	dsl_scan_ddt(scn, tx);
	if (scn->scn_suspending)
	return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
	/* First do the MOS & ORIGIN */

	scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
	scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
	dsl_scan_visit_rootbp(scn, NULL,
	&dp->dp_meta_rootbp, tx);
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	if (scn->scn_suspending)
	return;

	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	enqueue_cb, tx, DS_FIND_CHILDREN));
	} else {
	dsl_scan_visitds(scn,
	dp->dp_origin_snap->ds_object, tx);
	}
	ASSERT(!scn->scn_suspending);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	ZB_DESTROYED_OBJSET) {
	/*
	* If we were suspended, continue from here. Note if the
	* ds we were suspended on was deleted, the zb_objset may
	* be -1, so we will skip this and find a new objset
	* below.
	*/
	dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
	if (scn->scn_suspending)
	return;
	}

	/*
	* In case we were suspended right at the end of the ds, zero the
	* bookmark so we don't think that we're still trying to resume.
	*/
	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));

	/*
	* keep pulling things out of the zap-object-as-queue
	*
	* The cursor is re-initialised on every iteration (comma operator in
	* the loop condition) because each pass removes the entry it just
	* retrieved and dsl_scan_visitds() may add new ones.
	*/
	while (zap_cursor_init(&zc, dp->dp_meta_objset,
	scn->scn_phys.scn_queue_obj),
	zap_cursor_retrieve(&zc, &za) == 0) {
	dsl_dataset_t *ds;
	uint64_t dsobj;

	dsobj = zfs_strtonum(za.za_name, NULL);
	VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
	scn->scn_phys.scn_queue_obj, dsobj, tx));

	/* Set up min/max txg */
	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	if (za.za_first_integer != 0) {
	scn->scn_phys.scn_cur_min_txg =
	MAX(scn->scn_phys.scn_min_txg,
	za.za_first_integer);
	} else {
	scn->scn_phys.scn_cur_min_txg =
	MAX(scn->scn_phys.scn_min_txg,
	dsl_dataset_phys(ds)->ds_prev_snap_txg);
	}
	scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
	dsl_dataset_rele(ds, FTAG);

	dsl_scan_visitds(scn, dsobj, tx);
	zap_cursor_fini(&zc);
	if (scn->scn_suspending)
	return;
	}
	zap_cursor_fini(&zc);
	}

	/*
	* Decide whether async block processing should stop for this txg.
	*
	* Never pauses when zfs_recover is set. Otherwise pauses once the
	* per-txg visited-block count reaches the configured maximum, or when
	* more than zfs_txg_timeout seconds have elapsed since
	* scn_sync_start_time, or when the minimum-time threshold (in ms) has
	* passed and a txg sync is waiting, or when the spa is shutting down.
	*/
	static boolean_t
	-dsl_scan_free_should_suspend(dsl_scan_t *scn)
	+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
	{
	uint64_t elapsed_nanosecs;

	if (zfs_recover)
	return (B_FALSE);

	- if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
	+ if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
	return (B_TRUE);

	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	- (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
	+ (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
	txg_sync_waiting(scn->scn_dp)) ||
	spa_shutting_down(scn->scn_dp->dp_spa));
	}

	/*
	* Iteration callback that frees one block; arg is the dsl_scan_t.
	*
	* The pause check is made for every block when not iterating a bptree,
	* and only for level-0 non-objset blocks when scn_is_bptree is set.
	* When it fires, ERESTART is returned and the block is not freed.
	* Otherwise the free zio is issued under scn_zio_root, the block's
	* space is subtracted from dp_free_dir, and the visited count is
	* bumped.
	*/
	static int
	dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
	dsl_scan_t *scn = arg;

	if (!scn->scn_is_bptree ||
	(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
	- if (dsl_scan_free_should_suspend(scn))
	+ if (dsl_scan_async_block_should_pause(scn))
	return (SET_ERROR(ERESTART));
	}

	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
	/*
	* The leading '-' on the next two lines is arithmetic negation of the
	* space deltas, not a diff removal marker.
	*/
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
	}

	+/*
	+ * bpobj iteration callback for obsolete blocks; arg is the dsl_scan_t.
	+ *
	+ * Returns ERESTART when dsl_scan_async_block_should_pause() says to
	+ * stop. Otherwise marks the range described by the block pointer's
	+ * first DVA (vdev, offset, asize) obsolete via
	+ * spa_vdev_indirect_mark_obsolete() and bumps the visited count.
	+ */
	+static int
	+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	+{
	+ dsl_scan_t *scn = arg;
	+ const dva_t *dva = &bp->blk_dva[0];
	+
	+ if (dsl_scan_async_block_should_pause(scn))
	+ return (SET_ERROR(ERESTART));
	+
	+ spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
	+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
	+ DVA_GET_ASIZE(dva), tx);
	+ scn->scn_visited_this_txg++;
	+ return (0);
	+}
	+
	/*
	* Report whether the scan code has work to do.
	*
	* Returns B_FALSE while the pool is loading or shutting down. Returns
	* B_TRUE when a scan is running and is not a paused scrub, or when an
	* async destroy is in progress and not stalled. Otherwise, on pools at
	* SPA_VERSION_DEADLISTS or later, returns whether dp_free_bpobj still
	* accounts for any used space.
	*/
	boolean_t
	dsl_scan_active(dsl_scan_t *scn)
	{
	spa_t *spa = scn->scn_dp->dp_spa;
	uint64_t used = 0, comp, uncomp;

	if (spa->spa_load_state != SPA_LOAD_NONE)
	return (B_FALSE);
	if (spa_shutting_down(spa))
	return (B_FALSE);
	if ((scn->scn_phys.scn_state == DSS_SCANNING &&
	!dsl_scan_is_paused_scrub(scn)) ||
	(scn->scn_async_destroying && !scn->scn_async_stalled))
	return (B_TRUE);

	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
	/* used stays 0 if bpobj_space() is skipped or fails to set it */
	(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
	&used, &comp, &uncomp);
	}
	return (used != 0);
	}

	/*
	* Called whenever a txg syncs.
	*
	* Order of work, each step gated as shown in the code below:
	* 1. Restart the scan if a restart was requested for this txg.
	* 2. Return unless this is sync pass 1, the spa is not shutting down,
	* and there is something to do.
	* 3. Process async frees from dp_free_bpobj, then from the bptree.
	* 4. Move any leaked space from dp_free_dir to dp_leak_dir.
	* 5. Process the obsolete bpobj.
	* 6. If a scan is running and not paused, either finish it (when
	* scn_done_txg has been reached) or run dsl_scan_visit() and wait
	* for in-flight scrub I/O, then persist the scan state.
	*/
	void
	dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
	{
	dsl_scan_t *scn = dp->dp_scan;
	spa_t *spa = dp->dp_spa;
	int err = 0;

	/*
	* Check for scn_restart_txg before checking spa_load_state, so
	* that we can restart an old-style scan while the pool is being
	* imported (see dsl_scan_init).
	*/
	if (dsl_scan_restarting(scn, tx)) {
	pool_scan_func_t func = POOL_SCAN_SCRUB;
	dsl_scan_done(scn, B_FALSE, tx);
	if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
	func = POOL_SCAN_RESILVER;
	zfs_dbgmsg("restarting scan func=%u txg=%llu",
	func, tx->tx_txg);
	dsl_scan_setup_sync(&func, tx);
	}

	/*
	* Only process scans in sync pass 1.
	*/
	if (spa_sync_pass(dp->dp_spa) > 1)
	return;

	/*
	* If the spa is shutting down, then stop scanning. This will
	* ensure that the scan does not dirty any new data during the
	* shutdown phase.
	*/
	if (spa_shutting_down(spa))
	return;

	/*
	* If the scan is inactive due to a stalled async destroy, try again.
	*/
	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
	return;

	scn->scn_visited_this_txg = 0;
	scn->scn_suspending = B_FALSE;
	scn->scn_sync_start_time = gethrtime();
	spa->spa_scrub_active = B_TRUE;

	/*
	* First process the async destroys. If we suspend, don't do
	* any scrubbing or resilvering. This ensures that there are no
	* async destroys while we are scanning, so the scan code doesn't
	* have to worry about traversing it. It is also faster to free the
	* blocks than to scrub them.
	*/
	if (zfs_free_bpobj_enabled &&
	spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
	scn->scn_is_bptree = B_FALSE;
	+ scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	NULL, ZIO_FLAG_MUSTSUCCEED);
	err = bpobj_iterate(&dp->dp_free_bpobj,
	dsl_scan_free_block_cb, scn, tx);
	VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));

	if (err != 0 && err != ERESTART)
	zfs_panic_recover("error %u from bpobj_iterate()", err);
	}

	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
	ASSERT(scn->scn_async_destroying);
	scn->scn_is_bptree = B_TRUE;
	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	NULL, ZIO_FLAG_MUSTSUCCEED);
	err = bptree_iterate(dp->dp_meta_objset,
	dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
	VERIFY0(zio_wait(scn->scn_zio_root));

	/* EIO and ECKSUM from the bptree walk are tolerated here. */
	if (err == EIO || err == ECKSUM) {
	err = 0;
	} else if (err != 0 && err != ERESTART) {
	zfs_panic_recover("error %u from "
	"traverse_dataset_destroyed()", err);
	}

	if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
	/* finished; deactivate async destroy feature */
	spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
	ASSERT(!spa_feature_is_active(spa,
	SPA_FEATURE_ASYNC_DESTROY));
	VERIFY0(zap_remove(dp->dp_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_BPTREE_OBJ, tx));
	VERIFY0(bptree_free(dp->dp_meta_objset,
	dp->dp_bptree_obj, tx));
	dp->dp_bptree_obj = 0;
	scn->scn_async_destroying = B_FALSE;
	scn->scn_async_stalled = B_FALSE;
	} else {
	/*
	* If we didn't make progress, mark the async
	* destroy as stalled, so that we will not initiate
	* a spa_sync() on its behalf. Note that we only
	* check this if we are not finished, because if the
	* bptree had no blocks for us to visit, we can
	* finish without "making progress".
	*/
	scn->scn_async_stalled =
	(scn->scn_visited_this_txg == 0);
	}
	}
	if (scn->scn_visited_this_txg) {
	zfs_dbgmsg("freed %llu blocks in %llums from "
	"free_bpobj/bptree txg %llu; err=%d",
	(longlong_t)scn->scn_visited_this_txg,
	(longlong_t)
	NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
	(longlong_t)tx->tx_txg, err);
	scn->scn_visited_this_txg = 0;

	/*
	* Write out changes to the DDT that may be required as a
	* result of the blocks freed. This ensures that the DDT
	* is clean when a scrub/resilver runs.
	*/
	ddt_sync(spa, tx->tx_txg);
	}
	if (err != 0)
	return;
	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
	zfs_free_leak_on_eio &&
	(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
	dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
	dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
	/*
	* We have finished background destroying, but there is still
	* some space left in the dp_free_dir. Transfer this leaked
	* space to the dp_leak_dir.
	*/
	if (dp->dp_leak_dir == NULL) {
	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
	LEAK_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	LEAK_DIR_NAME, &dp->dp_leak_dir));
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	}
	dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
	dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
	dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
	dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
	/*
	* The leading '-' on the next three lines is arithmetic negation
	* (subtracting the same amounts from dp_free_dir), not a diff
	* removal marker.
	*/
	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
	-dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
	-dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
	-dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
	}
	+
	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
	/* finished; verify that space accounting went to zero */
	ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
	ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
	ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
	+ }
	+
	+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
	+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	+ DMU_POOL_OBSOLETE_BPOBJ));
	+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
	+ ASSERT(spa_feature_is_active(dp->dp_spa,
	+ SPA_FEATURE_OBSOLETE_COUNTS));
	+
	+ scn->scn_is_bptree = B_FALSE;
	+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
	+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
	+ dsl_scan_obsolete_block_cb, scn, tx);
	+ if (err != 0 && err != ERESTART)
	+ zfs_panic_recover("error %u from bpobj_iterate()", err);
	+
	+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
	+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
	}

	if (scn->scn_phys.scn_state != DSS_SCANNING)
	return;

	if (scn->scn_done_txg == tx->tx_txg) {
	ASSERT(!scn->scn_suspending);
	/* finished with scan. */
	zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
	dsl_scan_done(scn, B_TRUE, tx);
	ASSERT3U(spa->spa_scrub_inflight, ==, 0);
	dsl_scan_sync_state(scn, tx);
	return;
	}

	if (dsl_scan_is_paused_scrub(scn))
	return;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	scn->scn_phys.scn_ddt_class_max) {
	zfs_dbgmsg("doing scan sync txg %llu; "
	"ddt bm=%llu/%llu/%llu/%llx",
	(longlong_t)tx->tx_txg,
	(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
	(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
	(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
	(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
	ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
	ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
	ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
	ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
	} else {
	zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
	(longlong_t)tx->tx_txg,
	(longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
	(longlong_t)scn->scn_phys.scn_bookmark.zb_object,
	(longlong_t)scn->scn_phys.scn_bookmark.zb_level,
	(longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
	}

	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	NULL, ZIO_FLAG_CANFAIL);
	dsl_pool_config_enter(dp, FTAG);
	dsl_scan_visit(scn, tx);
	dsl_pool_config_exit(dp, FTAG);
	(void) zio_wait(scn->scn_zio_root);
	scn->scn_zio_root = NULL;

	zfs_dbgmsg("visited %llu blocks in %llums",
	(longlong_t)scn->scn_visited_this_txg,
	(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));

	if (!scn->scn_suspending) {
	/* Traversal is complete; the scan is finished in the next txg. */
	scn->scn_done_txg = tx->tx_txg + 1;
	zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
	tx->tx_txg, scn->scn_done_txg);
	}

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
	/* Wait for every scrub I/O issued this txg to complete. */
	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight > 0) {
	cv_wait(&spa->spa_scrub_io_cv,
	&spa->spa_scrub_lock);
	}
	mutex_exit(&spa->spa_scrub_lock);
	}

	dsl_scan_sync_state(scn, tx);
	}

	/*
	* Request that a scan be started, or that the current one be restarted.
	*
	* A non-zero txg is recorded directly in scn_restart_txg. When txg is
	* 0, a transaction is assigned (waiting if necessary), its txg is
	* recorded instead, and the transaction is committed.
	*/
	void
	dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
	{
	if (txg != 0) {
	dp->dp_scan->scn_restart_txg = txg;
	} else {
	dmu_tx_t *restart_tx = dmu_tx_create_dd(dp->dp_mos_dir);

	VERIFY(0 == dmu_tx_assign(restart_tx, TXG_WAIT));

	/* Use the txg this transaction was assigned to. */
	txg = dmu_tx_get_txg(restart_tx);
	dp->dp_scan->scn_restart_txg = txg;
	dmu_tx_commit(restart_tx);
	}
	zfs_dbgmsg("restarting resilver txg=%llu", txg);
	}

	/*
	* Return whether the pool's scan is currently running (DSS_SCANNING)
	* with the resilver function selected.
	*/
	boolean_t
	dsl_scan_resilvering(dsl_pool_t *dp)
	{
	dsl_scan_t *scn = dp->dp_scan;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
	return (B_FALSE);

	return (scn->scn_phys.scn_func == POOL_SCAN_RESILVER);
	}

	/*
	* scrub consumers
	*/

	/*
	* Accumulate per-level / per-type statistics for one block pointer.
	*
	* Each block is counted in four buckets of zab->zab_type[level][type]:
	* the level index is the bp's own level for the first two passes and
	* DN_MAX_LEVELS for the last two; the type index is DMU_OT_TOTAL on
	* even passes and the bp's own type on odd passes (types with the
	* DMU_OT_NEWTYPE bit set are folded into DMU_OT_OTHER). Counts of
	* ditto copies that share a vdev are also recorded.
	*/
	static void
	count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
	{
	int i;

	/*
	* If we resume after a reboot, zab will be NULL; don't record
	* incomplete stats in that case.
	*/
	if (zab == NULL)
	return;

	for (i = 0; i < 4; i++) {
	int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
	int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
	if (t & DMU_OT_NEWTYPE)
	t = DMU_OT_OTHER;
	zfs_blkstat_t *zb = &zab->zab_type[l][t];
	int equal;

	zb->zb_count++;
	zb->zb_asize += BP_GET_ASIZE(bp);
	zb->zb_lsize += BP_GET_LSIZE(bp);
	zb->zb_psize += BP_GET_PSIZE(bp);
	zb->zb_gangs += BP_COUNT_GANG(bp);

	switch (BP_GET_NDVAS(bp)) {
	case 2:
	if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
	DVA_GET_VDEV(&bp->blk_dva[1]))
	zb->zb_ditto_2_of_2_samevdev++;
	break;
	case 3:
	/* Number of DVA pairs on the same vdev: 0, 1 or 3. */
	equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
	DVA_GET_VDEV(&bp->blk_dva[1])) +
	(DVA_GET_VDEV(&bp->blk_dva[0]) ==
	DVA_GET_VDEV(&bp->blk_dva[2])) +
	(DVA_GET_VDEV(&bp->blk_dva[1]) ==
	DVA_GET_VDEV(&bp->blk_dva[2]));
	if (equal == 1)
	zb->zb_ditto_2_of_3_samevdev++;
	else if (equal == 3)
	zb->zb_ditto_3_of_3_samevdev++;
	break;
	}
	}
	}

	/*
	* Completion callback for a scrub read issued by dsl_scan_scrub_cb().
	*
	* Frees the I/O buffer, decrements spa_scrub_inflight and wakes any
	* waiters on spa_scrub_io_cv. An I/O error is counted in scn_errors
	* unless it is an ECKSUM on a ZIO_FLAG_SPECULATIVE read.
	*/
	static void
	dsl_scan_scrub_done(zio_t *zio)
	{
	spa_t *spa = zio->io_spa;

	abd_free(zio->io_abd);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error && (zio->io_error != ECKSUM ||
	!(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
	spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
	}
	mutex_exit(&spa->spa_scrub_lock);
	}

	/*
	* Scan function for scrub and resilver: examine one block pointer.
	*
	* Blocks whose physical birth is <= scn_min_txg or >= scn_max_txg are
	* ignored. The block is counted in the pool's block statistics, and
	* embedded block pointers need no I/O. A scrub always reads the block;
	* a resilver reads it only if one of its DVAs is a gang block or lies
	* on a top-level vdev whose DTL_PARTIAL contains the birth txg. Reads
	* are throttled by the spa_scrub_inflight limit and by an optional
	* delay when the pool has seen other I/O recently.
	*
	* Always returns 0.
	*/
	static int
	dsl_scan_scrub_cb(dsl_pool_t *dp,
	const blkptr_t *bp, const zbookmark_phys_t *zb)
	{
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	spa_t *spa = dp->dp_spa;
	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
	unsigned int scan_delay = 0;

	if (phys_birth <= scn->scn_phys.scn_min_txg ||
	phys_birth >= scn->scn_phys.scn_max_txg)
	return (0);

	count_block(dp->dp_blkstats, bp);

	if (BP_IS_EMBEDDED(bp))
	return (0);

	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
	zio_flags |= ZIO_FLAG_SCRUB;
	needs_io = B_TRUE;
	scan_delay = zfs_scrub_delay;
	} else {
	ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
	zio_flags |= ZIO_FLAG_RESILVER;
	needs_io = B_FALSE;
	scan_delay = zfs_resilver_delay;
	}

	/* If it's an intent log block, failure is expected. */
	if (zb->zb_level == ZB_ZIL_LEVEL)
	zio_flags |= ZIO_FLAG_SPECULATIVE;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
	vdev_t *vd = vdev_lookup_top(spa,
	DVA_GET_VDEV(&bp->blk_dva[d]));

	/*
	* Keep track of how much data we've examined so that
	* zpool(1M) status can make useful progress reports.
	*/
	scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
	spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);

	/* if it's a resilver, this may not be in the target range */
	if (!needs_io) {
	if (DVA_GET_GANG(&bp->blk_dva[d])) {
	/*
	* Gang members may be spread across multiple
	* vdevs, so the best estimate we have is the
	* scrub range, which has already been checked.
	* XXX -- it would be better to change our
	* allocation policy to ensure that all
	* gang members reside on the same vdev.
	*/
	needs_io = B_TRUE;
	} else {
	needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
	phys_birth, 1);
	}
	}
	}

	if (needs_io && !zfs_no_scrub_io) {
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t maxinflight = rvd->vdev_children *
	MAX(zfs_top_maxinflight, 1);

	/* Block until we are below the in-flight scrub I/O limit. */
	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= maxinflight)
	cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	/*
	* If we're seeing recent (zfs_scan_idle) "important" I/Os
	* then throttle our workload to limit the impact of a scan.
	*/
	if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
	delay(MAX((int)scan_delay, 0));

	/* dsl_scan_scrub_done() frees the buffer and drops the count. */
	zio_nowait(zio_read(NULL, spa, bp,
	abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
	NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
	}

	/*
	* Entry point used by the ZFS_IOC_POOL_SCAN ioctl to begin a scrub or
	* resilver; a scrub request against a paused scrub resumes it instead.
	*
	* Returns the result of the setup sync task when a new scan is started.
	* When a paused scrub is resumed, returns ECANCELED on success or the
	* error from dsl_scrub_set_pause_resume().
	*/
	int
	dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
	{
	spa_t *spa = dp->dp_spa;
	dsl_scan_t *scn = dp->dp_scan;
	int resume_err;

	/*
	* Purge all vdev caches and probe all devices. We do this here
	* rather than in sync context because this requires a writer lock
	* on the spa_config lock, which we can't do from sync context. The
	* spa_scrub_reopen flag indicates that vdev_open() should not
	* attempt to start another scrub.
	*/
	spa_vdev_state_enter(spa, SCL_NONE);
	spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;
	(void) spa_vdev_state_exit(spa, NULL, 0);

	/* Common case: nothing to resume, so set up a fresh scan. */
	if (func != POOL_SCAN_SCRUB || !dsl_scan_is_paused_scrub(scn)) {
	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
	dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
	}

	/* got scrub start cmd, resume paused scrub */
	resume_err = dsl_scrub_set_pause_resume(scn->scn_dp,
	POOL_SCRUB_NORMAL);
	if (resume_err != 0)
	return (SET_ERROR(resume_err));

	spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
	return (ECANCELED);
	}

	/*
	* Return whether a scan restart has been requested (scn_restart_txg is
	* non-zero) and is due, i.e. the requested txg is not later than the
	* txg of tx.
	*/
	static boolean_t
	dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
	{
	return (scn->scn_restart_txg != 0 &&
	scn->scn_restart_txg <= tx->tx_txg);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 332525)
	@@ -1,3506 +1,3899 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/space_map.h>
	#include <sys/metaslab_impl.h>
	#include <sys/vdev_impl.h>
	#include <sys/zio.h>
	#include <sys/spa_impl.h>
	#include <sys/zfeature.h>
	+#include <sys/vdev_indirect_mapping.h>

	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

	/*
	* Evaluates to non-zero when flags carries METASLAB_GANG_CHILD and/or
	* METASLAB_GANG_HEADER; the result is the masked bits, not a boolean.
	*/
	#define GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

	/*
	* Base aliquot of 512 KiB; metaslab_group_activate() multiplies it by the
	* vdev's child count (at least 1) to derive mg_aliquot.
	*/
	uint64_t metaslab_aliquot = 512ULL << 10;
	/*
	* Gang-block forcing threshold, exported as the gang_bang sysctl.
	* NOTE(review): the default of SPA_MAXBLOCKSIZE + 1 presumably lies above
	* any legal block size, i.e. forcing is off by default -- confirm against
	* the allocation path.
	*/
	uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
	SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
	&metaslab_gang_bang, 0,
	"Force gang block allocation for blocks larger than or equal to this value");

	/*
	* The in-core space map representation is more compact than its on-disk form.
	* The zfs_condense_pct determines how much more compact the in-core
	* space map representation must be before we compact it on-disk.
	* Values should be greater than or equal to 100.
	*/
	int zfs_condense_pct = 200;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
	&zfs_condense_pct, 0,
	"Condense on-disk spacemap when it is more than this many percents"
	" of in-memory counterpart");

	/*
	* Condensing a metaslab is not guaranteed to actually reduce the amount of
	* space used on disk. In particular, a space map uses data in increments of
	* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
	* same number of blocks after condensing. Since the goal of condensing is to
	* reduce the number of IOPs required to read the space map, we only want to
	* condense when we can be sure we will reduce the number of blocks used by the
	* space map. Unfortunately, we cannot precisely compute whether or not this is
	* the case in metaslab_should_condense since we are holding ms_lock. Instead,
	* we apply the following heuristic: do not condense a spacemap unless the
	* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
	* blocks.
	*/
	int zfs_metaslab_condense_block_threshold = 4;

	/*
	* The zfs_mg_noalloc_threshold defines which metaslab groups should
	* be eligible for allocation. The value is defined as a percentage of
	* free space. Metaslab groups that have more free space than
	* zfs_mg_noalloc_threshold are always eligible for allocations. Once
	* a metaslab group's free space is less than or equal to the
	* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
	* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
	* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
	* groups are allowed to accept allocations. Gang blocks are always
	* eligible to allocate on any metaslab group. The default value of 0 means
	* no metaslab group will be excluded based on this criterion.
	*/
	int zfs_mg_noalloc_threshold = 0;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
	&zfs_mg_noalloc_threshold, 0,
	"Percentage of metaslab group size that should be free"
	" to make it eligible for allocation");

	/*
	* Metaslab groups are considered eligible for allocations if their
	* fragmentation metric (measured as a percentage) is less than or equal to
	* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
	* then it will be skipped unless all metaslab groups within the metaslab
	* class have also crossed this threshold.
	*/
	int zfs_mg_fragmentation_threshold = 85;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
	&zfs_mg_fragmentation_threshold, 0,
	"Percentage of metaslab group size that should be considered "
	"eligible for allocations unless all metaslab groups within the metaslab class "
	"have also crossed this threshold");

	/*
	* Allow metaslabs to keep their active state as long as their fragmentation
	* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
	* active metaslab that exceeds this threshold will no longer keep its active
	* status allowing better metaslabs to be selected.
	*/
	int zfs_metaslab_fragmentation_threshold = 70;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
	&zfs_metaslab_fragmentation_threshold, 0,
	"Maximum percentage of metaslab fragmentation level to keep their active state");

	/*
	* When set will load all metaslabs when pool is first opened.
	*/
	int metaslab_debug_load = 0;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
	&metaslab_debug_load, 0,
	"Load all metaslabs when pool is first opened");

	/*
	* When set will prevent metaslabs from being unloaded.
	*/
	int metaslab_debug_unload = 0;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
	&metaslab_debug_unload, 0,
	"Prevent metaslabs from being unloaded");

	/*
	* Minimum size which forces the dynamic allocator to change
	* its allocation strategy. Once the space map cannot satisfy
	* an allocation of this size then it switches to using a more
	* aggressive strategy (i.e. search by size rather than offset).
	*/
	uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
	SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
	&metaslab_df_alloc_threshold, 0,
	"Minimum size which forces the dynamic allocator to change it's allocation strategy");

	/*
	* The minimum free space, in percent, which must be available
	* in a space map to continue allocations in a first-fit fashion.
	* Once the space map's free space drops below this level we dynamically
	* switch to using best-fit allocations.
	*/
	int metaslab_df_free_pct = 4;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
	&metaslab_df_free_pct, 0,
	"The minimum free space, in percent, which must be available in a "
	"space map to continue allocations in a first-fit fashion");

	/*
	* A metaslab is considered "free" if it contains a contiguous
	* segment which is greater than metaslab_min_alloc_size.
	*/
	uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
	SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
	&metaslab_min_alloc_size, 0,
	"A metaslab is considered \"free\" if it contains a contiguous "
	"segment which is greater than vfs.zfs.metaslab.min_alloc_size");

	/*
	* Percentage of all cpus that can be used by the metaslab taskq.
	*/
	int metaslab_load_pct = 50;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
	&metaslab_load_pct, 0,
	"Percentage of cpus that can be used by the metaslab taskq");

	/*
	* Determines how many txgs a metaslab may remain loaded without having any
	* allocations from it. As long as a metaslab continues to be used we will
	* keep it loaded.
	*/
	int metaslab_unload_delay = TXG_SIZE * 2;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
	&metaslab_unload_delay, 0,
	"Number of TXGs that an unused metaslab can be kept in memory");

	/*
	* Max number of metaslabs per group to preload.
	*/
	int metaslab_preload_limit = SPA_DVAS_PER_BP;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
	&metaslab_preload_limit, 0,
	"Max number of metaslabs per group to preload");

	/*
	* Enable/disable preloading of metaslabs. The sysctl description below
	* must describe this on/off switch, not the preload_limit count.
	*/
	boolean_t metaslab_preload_enabled = B_TRUE;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
	&metaslab_preload_enabled, 0,
	"Enable/disable preloading of metaslabs");

	/*
	* Enable/disable fragmentation weighting on metaslabs.
	*/
	boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
	&metaslab_fragmentation_factor_enabled, 0,
	"Enable fragmentation weighting on metaslabs");

	/*
	* Enable/disable lba weighting (i.e. outer tracks are given preference).
	*/
	boolean_t metaslab_lba_weighting_enabled = B_TRUE;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
	&metaslab_lba_weighting_enabled, 0,
	"Enable LBA weighting (i.e. outer tracks are given preference)");

	/*
	* Enable/disable metaslab group biasing.
	*/
	boolean_t metaslab_bias_enabled = B_TRUE;
	SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
	&metaslab_bias_enabled, 0,
	"Enable metaslab group biasing");

	/*
	+ * Enable/disable remapping of indirect DVAs to their concrete vdevs.
	+ */
	+boolean_t zfs_remap_blkptr_enable = B_TRUE;
	+
	+/*
	* Enable/disable segment-based metaslab selection.
	*/
	boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

	/*
	* When using segment-based metaslab selection, we will continue
	* allocating from the active metaslab until we have exhausted
	* zfs_metaslab_switch_threshold of its buckets.
	*/
	int zfs_metaslab_switch_threshold = 2;

	/*
	* Internal switch to enable/disable the metaslab allocation tracing
	* facility.
	*/
	boolean_t metaslab_trace_enabled = B_TRUE;

	/*
	* Maximum entries that the metaslab allocation tracing facility will keep
	* in a given list when running in non-debug mode. We limit the number
	* of entries in non-debug mode to prevent us from using up too much memory.
	* The limit should be sufficiently large that we don't expect any allocation
	* to ever exceed this value. In debug mode, the system will panic if this
	* limit is ever reached allowing for further investigation.
	*/
	uint64_t metaslab_trace_max_entries = 5000;

	static uint64_t metaslab_weight(metaslab_t *);
	static void metaslab_set_fragmentation(metaslab_t *);
	+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
	+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);

	kmem_cache_t *metaslab_alloc_trace_cache;

	/*
	* ==========================================================================
	* Metaslab classes
	* ==========================================================================
	*/
	/*
	* Allocate and initialize a metaslab class.
	*
	* The structure is zero-allocated with KM_SLEEP, bound to the given spa
	* and ops vector, and starts with an empty rotor. The class lock and the
	* tracked mc_alloc_slots refcount are initialized before the new class
	* is returned.
	*/
	metaslab_class_t *
	metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
	{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
	}

	/*
	* Tear down a metaslab class. Asserts that the rotor is empty and that
	* the alloc, deferred, space and dspace counters are all zero, then
	* destroys the slot refcount and the lock and frees the structure.
	*/
	void
	metaslab_class_destroy(metaslab_class_t *mc)
	{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
	}

	/*
	* Walk the class's circular rotor list and assert per-group invariants:
	* the vdev has a metaslab group, is its own top-level vdev, is not a
	* hole vdev, and the group points back at this class.
	*
	* Always returns 0; violations surface only through the ASSERTs.
	*/
	int
	metaslab_class_validate(metaslab_class_t *mc)
	{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	* Must hold one of the spa_config locks.
	*/
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	/* An empty rotor means there is nothing to validate. */
	if ((mg = mc->mc_rotor) == NULL)
	return (0);

	do {
	vd = mg->mg_vd;
	ASSERT(vd->vdev_mg != NULL);
	ASSERT3P(vd->vdev_top, ==, vd);
	ASSERT3P(mg->mg_class, ==, mc);
	ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
	}

	/*
	* Atomically apply the given signed deltas to the class's alloc,
	* deferred, space and dspace counters. Each counter is updated with its
	* own atomic add; the four updates are not atomic as a group.
	*/
	void
	metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
	int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
	{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
	}

	/*
	* Recompute mc_minblocksize as 1 << (smallest vdev_ashift among the
	* groups on the rotor). With an empty rotor the value falls back to
	* SPA_MINBLOCKSIZE.
	*/
	void
	metaslab_class_minblocksize_update(metaslab_class_t *mc)
	{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
	mc->mc_minblocksize = SPA_MINBLOCKSIZE;
	return;
	}

	/* The rotor is circular: stop once we are back at the start. */
	do {
	vd = mg->mg_vd;
	if (vd->vdev_ashift < minashift)
	minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
	}

	/* Return the class's current mc_alloc counter. */
	uint64_t
	metaslab_class_get_alloc(metaslab_class_t *mc)
	{
	return (mc->mc_alloc);
	}

	/* Return the class's current mc_deferred counter. */
	uint64_t
	metaslab_class_get_deferred(metaslab_class_t *mc)
	{
	return (mc->mc_deferred);
	}

	/* Return the class's current mc_space counter. */
	uint64_t
	metaslab_class_get_space(metaslab_class_t *mc)
	{
	return (mc->mc_space);
	}

	/*
	* Return mc_dspace when spa_deflate() is true for the owning pool,
	* otherwise fall back to mc_space.
	*/
	uint64_t
	metaslab_class_get_dspace(metaslab_class_t *mc)
	{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
	}

	/*
	* Return the cached minimum block size, as last computed by
	* metaslab_class_minblocksize_update().
	*/
	uint64_t
	metaslab_class_get_minblocksize(metaslab_class_t *mc)
	{
	return (mc->mc_minblocksize);
	}

	/*
	* Debug check, active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
	* zfs_flags: rebuild the class histogram by summing mg_histogram over
	* every eligible top-level vdev and VERIFY that each bucket matches
	* mc_histogram.
	*/
	void
	metaslab_class_histogram_verify(metaslab_class_t *mc)
	{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
	return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	metaslab_group_t *mg = tvd->vdev_mg;

	/*
	* Skip any holes, uninitialized top-levels, or
	* vdevs that are not in this metaslab class.
	*/
	- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
	+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
	mg->mg_class != mc) {
	continue;
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
	mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
	VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
	}

	/*
	* Calculate the metaslab class's fragmentation metric. The metric
	* is weighted based on the space contribution of each metaslab group.
	* The return value will be a number between 0 and 100 (inclusive), or
	* ZFS_FRAG_INVALID if the metric has not been set. See comment above the
	* zfs_frag_table for more information about the metric.
	*/
	uint64_t
	metaslab_class_fragmentation(metaslab_class_t *mc)
	{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	/* SCL_VDEV is held as reader for the whole walk and released on exit. */
	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	metaslab_group_t *mg = tvd->vdev_mg;

	/*
	- * Skip any holes, uninitialized top-levels, or
	- * vdevs that are not in this metalab class.
	+ * Skip any holes, uninitialized top-levels,
	+ * or vdevs that are not in this metalab class.
	*/
	- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
	+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
	mg->mg_class != mc) {
	continue;
	}

	/*
	* If a metaslab group does not contain a fragmentation
	* metric then just bail out.
	*/
	if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (ZFS_FRAG_INVALID);
	}

	/*
	* Determine how much this metaslab_group is contributing
	* to the overall pool fragmentation metric.
	*/
	fragmentation += mg->mg_fragmentation *
	metaslab_group_get_space(mg);
	}
	/*
	* NOTE(review): the class space is used as a divisor with no zero
	* check in this function -- confirm callers never reach here with an
	* empty class.
	*/
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
	}

	/*
	* Calculate the amount of expandable space that is available in
	* this metaslab class. If a device is expanded then its expandable
	* space will be the amount of allocatable space that is currently not
	* part of this metaslab class.
	*/
	uint64_t
	metaslab_class_expandable_space(metaslab_class_t *mc)
	{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
	uint64_t tspace;
	vdev_t *tvd = rvd->vdev_child[c];
	metaslab_group_t *mg = tvd->vdev_mg;

	- if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
	+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
	mg->mg_class != mc) {
	continue;
	}

	/*
	* Calculate if we have enough space to add additional
	* metaslabs. We report the expandable space in terms
	* of the metaslab size since that's the unit of expansion.
	* Adjust by efi system partition size.
	*/
	tspace = tvd->vdev_max_asize - tvd->vdev_asize;
	/* Only subtract spa_bootsize when it cannot underflow tspace. */
	if (tspace > mc->mc_spa->spa_bootsize) {
	tspace -= mc->mc_spa->spa_bootsize;
	}
	space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
	}

	/*
	* AVL comparator for metaslabs within a group.
	*
	* Orders by ms_weight descending (a heavier metaslab sorts first), then
	* by ms_start ascending. Returns 0 only when both keys are equal, in
	* which case the two arguments are asserted to be the same metaslab.
	*/
	static int
	metaslab_compare(const void *x1, const void *x2)
	{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
	return (1);
	if (m1->ms_weight > m2->ms_weight)
	return (-1);

	/*
	* If the weights are identical, use the offset to force uniqueness.
	*/
	if (m1->ms_start < m2->ms_start)
	return (-1);
	if (m1->ms_start > m2->ms_start)
	return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
	}

	/*
	* Verify that the space accounting on disk matches the in-core range_trees.
	*/
	void
	metaslab_verify_space(metaslab_t *msp, uint64_t txg)
	{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* The check is opt-in via the ZFS_DEBUG_METASLAB_VERIFY debug flag. */
	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
	return;

	/*
	* We can only verify the metaslab space when we're called
	* from syncing context with a loaded metaslab that has an allocated
	* space map. Calling this in non-syncing context does not
	* provide a consistent view of the metaslab since we're performing
	* allocations in the future.
	*/
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	!msp->ms_loaded)
	return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	space_map_alloc_delta(msp->ms_sm);

	/*
	* Account for future allocations since we would have already
	* deducted that space from the ms_freetree.
	*/
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
	allocated +=
	range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
	msp->ms_deferspace + range_tree_space(msp->ms_freedtree);

	VERIFY3U(sm_free_space, ==, msp_free_space);
	}

	/*
	* ==========================================================================
	* Metaslab groups
	* ==========================================================================
	*/
	/*
	* Update the allocatable flag and the metaslab group's capacity.
	* The allocatable flag is set to true if the group is activated, its
	* free capacity is above zfs_mg_noalloc_threshold, and its fragmentation
	* metric is either unset or no greater than zfs_mg_fragmentation_threshold.
	* If a metaslab group transitions from allocatable to non-allocatable or
	* vice versa then the metaslab group's class is updated to reflect the
	* transition.
	*/
	static void
	metaslab_group_alloc_update(metaslab_group_t *mg)
	{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	+ SCL_ALLOC);

	/* Lock order in this function: mg_lock first, then mc_lock. */
	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	/* Free capacity as a percentage; the +1 avoids dividing by zero. */
	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	(vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	* If the metaslab group was just added then it won't
	* have any space until we finish syncing out this txg.
	* At that point we will consider it initialized and available
	* for allocations. We also don't consider non-activated
	* metaslab groups (e.g. vdevs that are in the middle of being removed)
	* to be initialized, because they can't be used for allocation.
	*/
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
	mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
	ASSERT3U(mc->mc_groups, >, 0);
	mc->mc_groups--;
	}
	if (mg->mg_initialized)
	mg->mg_no_free_space = B_FALSE;

	/*
	* A metaslab group is considered allocatable if it has plenty
	* of free space or is not heavily fragmented. We only take
	* fragmentation into account if the metaslab group has a valid
	* fragmentation metric (i.e. a value between 0 and 100).
	*/
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	* The mc_alloc_groups maintains a count of the number of
	* groups in this metaslab class that are still above the
	* zfs_mg_noalloc_threshold. This is used by the allocating
	* threads to determine if they should avoid allocations to
	* a given group. The allocator will avoid allocations to a group
	* if that group has reached or is below the zfs_mg_noalloc_threshold
	* and there are still other groups that are above the threshold.
	* When a group transitions from allocatable to non-allocatable or
	* vice versa we update the metaslab class to reflect that change.
	* When the mc_alloc_groups value drops to 0 that means that all
	* groups have reached the zfs_mg_noalloc_threshold making all groups
	* eligible for allocations. This effectively means that all devices
	* are balanced again.
	*/
	if (was_allocatable && !mg->mg_allocatable)
	mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
	mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
	}

	/*
	* Allocate and initialize a metaslab group for vdev vd in class mc.
	*
	* The group starts inactive (activation count 0), uninitialized, and
	* flagged as having no free space. Its metaslab AVL tree is ordered by
	* metaslab_compare, and a per-group taskq is created whose thread count
	* is sized as a CPU percentage (metaslab_load_pct with
	* TASKQ_THREADS_CPU_PCT).
	*/
	metaslab_group_t *
	metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
	{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
	}

	/*
	* Free a metaslab group. The group must already be unlinked from the
	* rotor (mg_prev and mg_next NULL) and not active. Destroys the taskq,
	* the metaslab AVL tree, the lock and the queue-depth refcount before
	* freeing the structure.
	*/
	void
	metaslab_group_destroy(metaslab_group_t *mg)
	{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	* We may have gone below zero with the activation count
	* either because we never activated in the first place or
	* because we're done, and possibly removing the vdev.
	*/
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
	}

	/*
	* Activate a metaslab group and link it into its class's allocation
	* rotor. The caller must hold SCL_ALLOC as writer (asserted).
	*
	* The activation count is incremented first; if it is still <= 0 the
	* group stays off the rotor. Otherwise mg_aliquot is computed, the
	* allocatable state is refreshed, and the group is inserted after the
	* current rotor (or becomes the sole member) and made the new rotor.
	*/
	void
	metaslab_group_activate(metaslab_group_t *mg)
	{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	- ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
	+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
	return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
	/* First group in the class: a one-element circular list. */
	mg->mg_prev = mg;
	mg->mg_next = mg;
	} else {
	mgnext = mgprev->mg_next;
	mg->mg_prev = mgprev;
	mg->mg_next = mgnext;
	mgprev->mg_next = mg;
	mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
	}

	+/*
	+ * Passivate a metaslab group and remove it from the allocation rotor.
	+ * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
	+ * a metaslab group. This function will momentarily drop spa_config_locks
	+ * that are lower than the SCL_ALLOC lock (see comment below).
	+ */
	void
	metaslab_group_passivate(metaslab_group_t *mg)
	{
	metaslab_class_t *mc = mg->mg_class;
	+ spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	+ int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	- ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
	+ ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	+ (SCL_ALLOC | SCL_ZIO));

	/*
	* If the count does not land exactly on zero, the group was never on
	* the rotor, so there is nothing to unlink.
	*/
	if (--mg->mg_activation_count != 0) {
	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count < 0);
	return;
	}

	+ /*
	+ * The spa_config_lock is an array of rwlocks, ordered as
	+ * follows (from highest to lowest):
	+ * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	+ * SCL_ZIO > SCL_FREE > SCL_VDEV
	+ * (For more information about the spa_config_lock see spa_misc.c)
	+ * The higher the lock, the broader its coverage. When we passivate
	+ * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	+ * config locks. However, the metaslab group's taskq might be trying
	+ * to preload metaslabs so we must drop the SCL_ZIO lock and any
	+ * lower locks to allow the I/O to complete. At a minimum,
	+ * we continue to hold the SCL_ALLOC lock, which prevents any future
	+ * allocations from taking place and any changes to the vdev tree.
	+ */
	+ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	+ spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
	/* mg was the only group on the rotor. */
	mc->mc_rotor = NULL;
	} else {
	mc->mc_rotor = mgnext;
	mgprev->mg_next = mgnext;
	mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
	}

	/*
	* A group counts as initialized once its vdev reports non-zero vs_space
	* and the group has a positive activation count.
	*/
	boolean_t
	metaslab_group_initialized(metaslab_group_t *mg)
	{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
	}

	/*
	* Total space spanned by the group's metaslabs: the metaslab size
	* (1 << vdev_ms_shift) multiplied by vdev_ms_count.
	*/
	uint64_t
	metaslab_group_get_space(metaslab_group_t *mg)
	{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
	}

	/*
	* Debug check, active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
	* zfs_flags: rebuild the group histogram from every metaslab's space map
	* histogram (bucket i lands at i + vdev ashift) and VERIFY that each
	* bucket matches mg_histogram. Metaslabs without a space map are skipped.
	*/
	void
	metaslab_group_histogram_verify(metaslab_group_t *mg)
	{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
	return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	KM_SLEEP);

	/* Guarantees that the i + ashift index below stays in bounds. */
	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
	metaslab_t *msp = vd->vdev_ms[m];

	if (msp->ms_sm == NULL)
	continue;

	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
	mg_hist[i + ashift] +=
	msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
	VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
	}

	/*
	* Add msp's space map histogram to both the group and the class
	* histograms, placing space map bucket i at index i + vdev ashift.
	* The caller must hold ms_lock; a metaslab without a space map is a
	* no-op. The updates are made while holding mg_lock.
	*/
	static void
	metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
	{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
	return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
	mg->mg_histogram[i + ashift] +=
	msp->ms_sm->sm_phys->smp_histogram[i];
	mc->mc_histogram[i + ashift] +=
	msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
	}

	/*
	* Subtract msp's space map histogram from both the group and the class
	* histograms (space map bucket i lives at index i + vdev ashift).
	* The caller must hold ms_lock; a metaslab without a space map is a
	* no-op. Each bucket is asserted to be large enough before subtracting
	* so the counters cannot wrap below zero unnoticed on debug builds.
	*/
	void
	metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
	{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
	return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
	ASSERT3U(mg->mg_histogram[i + ashift], >=,
	msp->ms_sm->sm_phys->smp_histogram[i]);
	ASSERT3U(mc->mc_histogram[i + ashift], >=,
	msp->ms_sm->sm_phys->smp_histogram[i]);

	mg->mg_histogram[i + ashift] -=
	msp->ms_sm->sm_phys->smp_histogram[i];
	mc->mc_histogram[i + ashift] -=
	msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
	}

	/*
	* Attach a metaslab that belongs to no group to mg: set its back
	* pointer, reset its weight to 0 and insert it into the group's AVL
	* tree under mg_lock, then fold its histogram into the group and
	* class histograms under ms_lock.
	*/
	static void
	metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
	{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
	}

	/*
	* Detach msp from mg: first subtract its histogram from the group and
	* class histograms under ms_lock, then remove it from the group's AVL
	* tree and clear its back pointer under mg_lock.
	*/
	static void
	metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
	{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
	}

	/*
	* Give msp a new weight and reposition it in the group's AVL tree.
	* ms_weight is part of the tree's sort key (see metaslab_compare), so
	* the node is removed, updated and re-added under mg_lock. The caller
	* must hold ms_lock.
	*/
	static void
	metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
	{
	/*
	* Although in principle the weight can be any value, in
	* practice we do not use values in the range [1, 511].
	*/
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
	}

	/*
	* Calculate the fragmentation for a given metaslab group. We can use
	* a simple average here since all metaslabs within the group must have
	* the same size. The return value will be a value between 0 and 100
	* (inclusive), or ZFS_FRAG_INVALID unless more than half of the metaslabs
	* in this group have a fragmentation metric.
	*/
	uint64_t
	metaslab_group_fragmentation(metaslab_group_t *mg)
	{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	/* Sum the metric over the metaslabs that have one. */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
	metaslab_t *msp = vd->vdev_ms[m];

	if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
	continue;

	valid_ms++;
	fragmentation += msp->ms_fragmentation;
	}

	/*
	* Require strictly more than half of the metaslabs to be valid. This
	* also covers vdev_ms_count == 0, so the division below never sees a
	* zero divisor.
	*/
	if (valid_ms <= vd->vdev_ms_count / 2)
	return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
	}

	/*
	* Determine if a given metaslab group should skip allocations. A metaslab
	* group should avoid allocations if its free capacity is less than the
	* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
	* zfs_mg_fragmentation_threshold and there is at least one metaslab group
	* that can still handle allocations. If the allocation throttle is enabled
	* then we skip allocations to devices that have reached their maximum
	* allocation queue depth unless the selected metaslab group is the only
	* eligible group remaining.
	*/
	static boolean_t
	metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
	uint64_t psize)
	{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	* We can only consider skipping this metaslab group if it's
	* in the normal metaslab class and there are other metaslab
	* groups to select from. Otherwise, we always consider it eligible
	* for allocations.
	*/
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
	return (B_TRUE);

	/*
	* If the metaslab group's mg_allocatable flag is set (see comments
	* in metaslab_group_alloc_update() for more information) and
	* the allocation throttle is disabled then allow allocations to this
	* device. However, if the allocation throttle is enabled then
	* check if we have reached our allocation limit (mg_alloc_queue_depth)
	* to determine if we should allow allocations to this metaslab group.
	* If all metaslab groups are no longer considered allocatable
	* (mc_alloc_groups == 0) or we're trying to allocate the smallest
	* gang block size then we allow allocations on this metaslab group
	* regardless of the mg_allocatable or throttle settings.
	*/
	if (mg->mg_allocatable) {
	metaslab_group_t *mgp;
	int64_t qdepth;
	uint64_t qmax = mg->mg_max_alloc_queue_depth;

	if (!mc->mc_alloc_throttle_enabled)
	return (B_TRUE);

	/*
	* If this metaslab group does not have any free space, then
	* there is no point in looking further.
	*/
	if (mg->mg_no_free_space)
	return (B_FALSE);

	qdepth = refcount_count(&mg->mg_alloc_queue_depth);

	/*
	* If this metaslab group is below its qmax or it's
	* the only allocatable metaslab group, then attempt
	* to allocate from it.
	*/
	if (qdepth < qmax || mc->mc_alloc_groups == 1)
	return (B_TRUE);
	ASSERT3U(mc->mc_alloc_groups, >, 1);

	/*
	* Since this metaslab group is at or over its qmax, we
	* need to determine if there are metaslab groups after this
	* one that might be able to handle this allocation. This is
	* racy since we can't hold the locks for all metaslab
	* groups at the same time when we make this check.
	*/
	for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
	qmax = mgp->mg_max_alloc_queue_depth;

	qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

	/*
	* If there is another metaslab group that
	* might be able to handle the allocation, then
	* we return false so that we skip this group.
	*/
	if (qdepth < qmax && !mgp->mg_no_free_space)
	return (B_FALSE);
	}

	/*
	* We didn't find another group to handle the allocation
	* so we can't skip this metaslab group even though
	* we are at or over our qmax.
	*/
	return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
	return (B_TRUE);
	}
	return (B_FALSE);
	}

	/*
	* ==========================================================================
	* Range tree callbacks
	* ==========================================================================
	*/

	/*
	 * AVL comparator for the metaslab's private size-ordered tree.
	 *
	 * Segments are ordered by length (rs_end - rs_start), shortest first, so
	 * the largest segment is the last node. Segments of equal length are
	 * ordered by their starting offset so that no two distinct segments
	 * compare equal.
	 *
	 * x1, x2: pointers to range_seg_t (passed as const void * by the AVL code).
	 * Returns -1, 0 or 1.
	 */
	static int
	metaslab_rangesize_compare(const void *x1, const void *x2)
	{
		const range_seg_t *lhs = x1;
		const range_seg_t *rhs = x2;
		uint64_t lhs_size = lhs->rs_end - lhs->rs_start;
		uint64_t rhs_size = rhs->rs_end - rhs->rs_start;

		/* Primary key: segment length. */
		if (lhs_size != rhs_size)
			return (lhs_size < rhs_size ? -1 : 1);

		/* Secondary key: starting offset. */
		if (lhs->rs_start != rhs->rs_start)
			return (lhs->rs_start < rhs->rs_start ? -1 : 1);

		return (0);
	}

	/*
	 * Range tree "create" callback: set up the allocator-specific state that
	 * accompanies the range tree, namely the metaslab's size-ordered AVL tree.
	 *
	 * rt:  the range tree being created; its rt_arg must be the metaslab.
	 * arg: the owning metaslab_t, whose ms_tree must not be assigned yet.
	 */
	static void
	metaslab_rt_create(range_tree_t *rt, void *arg)
	{
		metaslab_t *ms = arg;

		ASSERT3P(rt->rt_arg, ==, ms);
		ASSERT(ms->ms_tree == NULL);

		avl_create(&ms->ms_size_tree, metaslab_rangesize_compare,
		    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
	}

	/*
	 * Range tree "destroy" callback: tear down the size-ordered AVL tree.
	 * The size tree must already be empty when this is called.
	 *
	 * rt:  the range tree being destroyed; must be the metaslab's ms_tree.
	 * arg: the owning metaslab_t.
	 */
	static void
	metaslab_rt_destroy(range_tree_t *rt, void *arg)
	{
		metaslab_t *ms = arg;

		ASSERT3P(rt->rt_arg, ==, ms);
		ASSERT3P(ms->ms_tree, ==, rt);
		ASSERT0(avl_numnodes(&ms->ms_size_tree));

		avl_destroy(&ms->ms_size_tree);
	}

	/*
	 * Range tree "add" callback: mirror a newly added segment into the
	 * metaslab's size-ordered AVL tree. Must not be called while the
	 * metaslab is condensing (enforced with VERIFY).
	 *
	 * rt:  the metaslab's ms_tree.
	 * rs:  the segment that was added to rt.
	 * arg: the owning metaslab_t.
	 */
	static void
	metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
	{
		metaslab_t *ms = arg;

		ASSERT3P(rt->rt_arg, ==, ms);
		ASSERT3P(ms->ms_tree, ==, rt);
		VERIFY(!ms->ms_condensing);

		avl_add(&ms->ms_size_tree, rs);
	}

	/*
	 * Range tree "remove" callback: drop a segment from the metaslab's
	 * size-ordered AVL tree when it leaves the range tree. Must not be
	 * called while the metaslab is condensing (enforced with VERIFY).
	 *
	 * rt:  the metaslab's ms_tree.
	 * rs:  the segment being removed from rt.
	 * arg: the owning metaslab_t.
	 */
	static void
	metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
	{
		metaslab_t *ms = arg;

		ASSERT3P(rt->rt_arg, ==, ms);
		ASSERT3P(ms->ms_tree, ==, rt);
		VERIFY(!ms->ms_condensing);

		avl_remove(&ms->ms_size_tree, rs);
	}

	/*
	 * Range tree "vacate" callback: empty the size-ordered AVL tree.
	 *
	 * rt:  the metaslab's ms_tree.
	 * arg: the owning metaslab_t.
	 */
	static void
	metaslab_rt_vacate(range_tree_t *rt, void *arg)
	{
		metaslab_t *ms = arg;

		ASSERT3P(rt->rt_arg, ==, ms);
		ASSERT3P(ms->ms_tree, ==, rt);

		/*
		 * The nodes are shared with the range tree, which is responsible
		 * for freeing them. Rather than walking and unlinking every node,
		 * simply reinitialize the AVL tree so that it is empty again.
		 */
		avl_create(&ms->ms_size_tree, metaslab_rangesize_compare,
		    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
	}

	/*
	* Callback table passed to range_tree_create() for a metaslab's ms_tree.
	* The initializer is positional.
	* NOTE(review): the order must match the field order of range_tree_ops_t,
	* which is declared elsewhere - confirm before reordering.
	*/
	static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
	};

	/*
	* ==========================================================================
	* Common allocator routines
	* ==========================================================================
	*/

	/*
	 * Return the length of the largest contiguous free segment tracked in
	 * the metaslab's size-ordered tree, or 0 if that tree is empty.
	 */
	uint64_t
	metaslab_block_maxsize(metaslab_t *msp)
	{
		/*
		 * The size tree is sorted by ascending segment length, so the
		 * last node is the largest segment. (The address of an embedded
		 * member can never be NULL, so only the lookup result is tested.)
		 */
		range_seg_t *largest = avl_last(&msp->ms_size_tree);

		if (largest == NULL)
			return (0ULL);

		return (largest->rs_end - largest->rs_start);
	}

	/*
	 * Look up the segment [start, start + size) in the AVL tree t. If there
	 * is no matching node, return the nearest node after the position where
	 * the search key would be inserted; this may be NULL.
	 */
	static range_seg_t *
	metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
	{
		range_seg_t key;
		range_seg_t *match;
		avl_index_t insert_pt;

		key.rs_start = start;
		key.rs_end = start + size;

		match = avl_find(t, &key, &insert_pt);
		if (match != NULL)
			return (match);

		return (avl_nearest(t, insert_pt, AVL_AFTER));
	}

	/*
	 * Helper used by the allocators to pick a block from an AVL tree of free
	 * segments.
	 *
	 * t:      tree of range_seg_t to search.
	 * cursor: in/out search position; on success it is advanced to the end
	 *         of the returned block, and it is reset to 0 before a retry.
	 * size:   number of bytes required.
	 * align:  required alignment of the returned offset (rounded up with
	 *         P2ROUNDUP).
	 *
	 * Returns the offset of a suitable block, or -1ULL if none was found.
	 */
	static uint64_t
	metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
	    uint64_t align)
	{
		range_seg_t *rs;

		/* Scan forward from the cursor for the first segment that fits. */
		for (rs = metaslab_block_find(t, *cursor, size); rs != NULL;
		    rs = AVL_NEXT(t, rs)) {
			uint64_t offset = P2ROUNDUP(rs->rs_start, align);

			if (offset + size <= rs->rs_end) {
				*cursor = offset + size;
				return (offset);
			}
		}

		/*
		 * If we know we've searched the whole map (*cursor == 0), give up.
		 * Otherwise, reset the cursor to the beginning and try again.
		 */
		if (*cursor == 0)
			return (-1ULL);

		*cursor = 0;
		return (metaslab_block_picker(t, cursor, size, align));
	}

	/*
	* ==========================================================================
	* The first-fit block allocator
	* ==========================================================================
	*/
	/*
	 * First-fit allocation entry point: search the metaslab's range tree
	 * (rt_root) starting from a per-alignment cursor. Returns the allocated
	 * offset, or -1ULL when metaslab_block_picker() finds nothing.
	 */
	static uint64_t
	metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
	{
		avl_tree_t *tree = &msp->ms_tree->rt_root;

		/*
		 * size & -size isolates the lowest set bit, i.e. the largest power
		 * of two that evenly divides the request. It selects the cursor
		 * bucket, so requests with similar alignment tend to be placed in
		 * the same area of the metaslab. It does not guarantee that
		 * allocations of other sizes are absent from that area.
		 */
		uint64_t align = size & -size;
		uint64_t *bucket_cursor = &msp->ms_lbas[highbit64(align) - 1];

		return (metaslab_block_picker(tree, bucket_cursor, size, align));
	}

	/* Operations vector for the first-fit allocator (positional initializer). */
	static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
	};

	/*
	* ==========================================================================
	* Dynamic block allocator -
	* Uses the first fit allocation scheme until space get low and then
	* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
	* and metaslab_df_free_pct to determine when to switch the allocation scheme.
	* ==========================================================================
	*/
	/*
	 * Dynamic-fit allocation entry point. Searches the metaslab's range tree
	 * (rt_root) from a per-alignment cursor, and switches to the size-sorted
	 * tree once free space runs low.
	 *
	 * Caller must hold msp->ms_lock. Returns the allocated offset, or -1ULL
	 * if the largest free segment is smaller than size or no block is found.
	 */
	static uint64_t
	metaslab_df_alloc(metaslab_t *msp, uint64_t size)
	{
		/*
		 * Largest power of two that evenly divides the request; used only
		 * to select the cursor bucket so that requests with similar
		 * alignment are placed in the same area of the metaslab.
		 */
		uint64_t align = size & -size;
		uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
		range_tree_t *rt = msp->ms_tree;
		avl_tree_t *t = &rt->rt_root;
		uint64_t max_size = metaslab_block_maxsize(msp);
		int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

		ASSERT(MUTEX_HELD(&msp->ms_lock));
		ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

		if (max_size < size)
			return (-1ULL);

		/*
		 * If we're running low on space, switch to the size-sorted AVL
		 * tree (best-fit) and restart the search from the beginning.
		 */
		if (max_size < metaslab_df_alloc_threshold ||
		    free_pct < metaslab_df_free_pct) {
			t = &msp->ms_size_tree;
			*cursor = 0;
		}

		/* Note: the block picker is called with an alignment of 1. */
		return (metaslab_block_picker(t, cursor, size, 1ULL));
	}

	/* Operations vector for the dynamic-fit allocator (positional initializer). */
	static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
	};

	/*
	* ==========================================================================
	* Cursor fit block allocator -
	* Select the largest region in the metaslab, set the cursor to the beginning
	* of the range and the cursor_end to the end of the range. As allocations
	* are made advance the cursor. Continue allocating from the cursor until
	* the range is exhausted and then find a new range.
	* ==========================================================================
	*/
	/*
	 * Cursor-fit allocation entry point. ms_lbas[0] holds the current cursor
	 * and ms_lbas[1] the end of the region being consumed. When the request
	 * no longer fits between them, the largest free segment becomes the new
	 * region.
	 *
	 * Caller must hold msp->ms_lock. Returns the allocated offset, or -1ULL
	 * if no free segment is at least size bytes long.
	 */
	static uint64_t
	metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
	{
		range_tree_t *rt = msp->ms_tree;
		avl_tree_t *t = &msp->ms_size_tree;
		uint64_t *cursor = &msp->ms_lbas[0];
		uint64_t *cursor_end = &msp->ms_lbas[1];
		uint64_t offset = 0;

		ASSERT(MUTEX_HELD(&msp->ms_lock));
		ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

		/*
		 * Compare the stored offsets, not the addresses of the two
		 * ms_lbas slots: the slots are adjacent array elements, so an
		 * address comparison would say nothing about the region.
		 */
		ASSERT3U(*cursor_end, >=, *cursor);

		if ((*cursor + size) > *cursor_end) {
			range_seg_t *rs;

			/* Current region exhausted: move to the largest segment. */
			rs = avl_last(&msp->ms_size_tree);
			if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
				return (-1ULL);

			*cursor = rs->rs_start;
			*cursor_end = rs->rs_end;
		}

		offset = *cursor;
		*cursor += size;

		return (offset);
	}

	/* Operations vector for the cursor-fit allocator (positional initializer). */
	static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
	};

	/*
	* ==========================================================================
	* New dynamic fit allocator -
	* Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
	* contiguous blocks. If no region is found then just use the largest segment
	* that remains.
	* ==========================================================================
	*/

	/*
	 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
	 * to request from the allocator.
	 */
	uint64_t metaslab_ndf_clump_shift = 4;

	/*
	 * New-dynamic-fit allocation entry point.
	 *
	 * First looks for a segment in the range tree at the cursor kept for this
	 * request's size class (highbit64(size)). If that fails, it falls back to
	 * the size-sorted tree and looks for a segment of roughly
	 * 2^(hbit + metaslab_ndf_clump_shift) bytes, capped at the largest free
	 * segment.
	 *
	 * Caller must hold msp->ms_lock. Returns the allocated offset, or -1ULL
	 * if no segment of at least size bytes is found.
	 */
	static uint64_t
	metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
	{
		avl_tree_t *t = &msp->ms_tree->rt_root;
		avl_index_t where;
		range_seg_t *rs, rsearch;
		uint64_t hbit = highbit64(size);
		uint64_t *cursor = &msp->ms_lbas[hbit - 1];
		uint64_t max_size = metaslab_block_maxsize(msp);

		ASSERT(MUTEX_HELD(&msp->ms_lock));
		ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

		if (max_size < size)
			return (-1ULL);

		rsearch.rs_start = *cursor;
		rsearch.rs_end = *cursor + size;

		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
			/* Nothing usable at the cursor: search by size instead. */
			t = &msp->ms_size_tree;

			rsearch.rs_start = 0;
			rsearch.rs_end = MIN(max_size,
			    1ULL << (hbit + metaslab_ndf_clump_shift));
			rs = avl_find(t, &rsearch, &where);
			if (rs == NULL)
				rs = avl_nearest(t, where, AVL_AFTER);
			ASSERT(rs != NULL);
		}

		if ((rs->rs_end - rs->rs_start) >= size) {
			*cursor = rs->rs_start + size;
			return (rs->rs_start);
		}
		return (-1ULL);
	}

	/* Operations vector for the new-dynamic-fit allocator (positional initializer). */
	static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
	};

	/*
	* Allocator operations vector; initialized to the dynamic-fit allocator.
	* NOTE(review): presumably consulted by the allocation path defined
	* elsewhere in the file - confirm.
	*/
	metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

	/*
	* ==========================================================================
	* Metaslabs
	* ==========================================================================
	*/

	/*
	 * Block until no load of this metaslab is in progress. The caller must
	 * hold msp->ms_lock; cv_wait() is passed that lock while sleeping on
	 * ms_load_cv.
	 */
	void
	metaslab_load_wait(metaslab_t *msp)
	{
		ASSERT(MUTEX_HELD(&msp->ms_lock));

		for (;;) {
			if (!msp->ms_loading)
				break;

			/* A metaslab that is still loading cannot be loaded yet. */
			ASSERT(!msp->ms_loaded);
			cv_wait(&msp->ms_load_cv, &msp->ms_lock);
		}
	}

	/*
	* Load the metaslab's free-space range tree (ms_tree).
	*
	* The caller must hold msp->ms_lock, and the metaslab must be neither
	* loaded nor loading. ms_loading is set for the duration of the load; the
	* lines marked "+" drop ms_lock around the space map read and retake it
	* afterwards. On success the metaslab is marked loaded, ranges held in the
	* defer trees are removed from ms_tree, and ms_max_size is refreshed.
	* Waiters on ms_load_cv are woken in either case.
	*
	* Returns 0 on success, or the error returned by space_map_load().
	*/
	int
	metaslab_load(metaslab_t *msp)
	{
	int error = 0;
	boolean_t success = B_FALSE;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	msp->ms_loading = B_TRUE;
	+ /*
	+ * Nobody else can manipulate a loading metaslab, so it's now safe
	+ * to drop the lock. This way we don't have to hold the lock while
	+ * reading the spacemap from disk.
	+ */
	+ mutex_exit(&msp->ms_lock);

	/*
	* If the space map has not been allocated yet, then treat
	* all the space in the metaslab as free and add it to the
	* ms_tree.
	*/
	if (msp->ms_sm != NULL)
	error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
	range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	success = (error == 0);
	+
	+ mutex_enter(&msp->ms_lock);
	msp->ms_loading = B_FALSE;

	if (success) {
	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
	range_tree_walk(msp->ms_defertree[t],
	range_tree_remove, msp->ms_tree);
	}
	msp->ms_max_size = metaslab_block_maxsize(msp);
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
	}

	/*
	 * Discard the in-core free-space information of a metaslab: empty
	 * ms_tree, mark the metaslab as not loaded, clear the active bits of its
	 * weight and reset the cached maximum segment size. The caller must hold
	 * msp->ms_lock.
	 */
	void
	metaslab_unload(metaslab_t *msp)
	{
		ASSERT(MUTEX_HELD(&msp->ms_lock));

		/* No per-segment callback is supplied to the vacate. */
		range_tree_vacate(msp->ms_tree, NULL, NULL);

		msp->ms_loaded = B_FALSE;
		msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
		msp->ms_max_size = 0;
	}

	/*
	* Allocate and initialize the in-core metaslab with index id in metaslab
	* group mg.
	*
	* mg:     metaslab group the new metaslab is added to.
	* id:     metaslab index; ms_start is id << vdev_ms_shift.
	* object: space map object number, or 0 if none has been allocated yet.
	* txg:    0 when opening an existing pool, TXG_INITIAL when creating one,
	*         otherwise the txg in which the space is being added.
	* msp:    out parameter; receives the new metaslab on success.
	*
	* Returns 0 on success, or the error from space_map_open(), in which case
	* the partially built metaslab is freed and *msp is left untouched.
	*/
	int
	metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
	metaslab_t **msp)
	{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	* We only open space map objects that already exist. All others
	* will be opened when we finally allocate an object for it.
	*/
	if (object != 0) {
	error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
	- ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
	+ ms->ms_size, vd->vdev_ashift);

	if (error != 0) {
	kmem_free(ms, sizeof (metaslab_t));
	return (error);
	}

	ASSERT(ms->ms_sm != NULL);
	}

	/*
	* We create the main range tree here, but we don't create the
	* other range trees until metaslab_sync_done(). This serves
	* two purposes: it allows metaslab_sync_done() to detect the
	* addition of new space; and for debugging, it ensures that we'd
	* data fault on any attempt to use this metaslab before it's ready.
	*/
	- ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
	+ ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	* If we're opening an existing pool (txg == 0) or creating
	* a new one (txg == TXG_INITIAL), all space is available now.
	* If we're adding space to an existing pool, the new space
	* does not become available until after this txg has synced.
	* The metaslab's weight will also be initialized when we sync
	* out this txg. This ensures that we don't attempt to allocate
	* from it before we have initialized it completely.
	*/
	if (txg <= TXG_INITIAL)
	metaslab_sync_done(ms, 0);

	/*
	* If metaslab_debug_load is set and we're initializing a metaslab
	* that has an allocated space map object, then load its space
	* map so that we can verify frees.
	*/
	if (metaslab_debug_load && ms->ms_sm != NULL) {
	mutex_enter(&ms->ms_lock);
	VERIFY0(metaslab_load(ms));
	mutex_exit(&ms->ms_lock);
	}

	/* For a nonzero txg, mark the vdev and this metaslab dirty in that txg. */
	if (txg != 0) {
	vdev_dirty(vd, 0, NULL, txg);
	vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
	}

	/*
	* Tear down and free a metaslab.
	*
	* The metaslab is first removed from its group. Then, under ms_lock, its
	* allocated space and size are subtracted from the vdev's space
	* accounting, the space map is closed, the metaslab is unloaded, and all
	* of its range trees are destroyed. Finally the condition variable and
	* mutexes are destroyed and the structure is freed.
	*/
	void
	metaslab_fini(metaslab_t *msp)
	{
	/* Saved up front: ms_group is expected to be NULL after the removal. */
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);
	range_tree_destroy(msp->ms_freeingtree);
	range_tree_destroy(msp->ms_freedtree);

	for (int t = 0; t < TXG_SIZE; t++) {
	range_tree_destroy(msp->ms_alloctree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
	range_tree_destroy(msp->ms_defertree[t]);
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	+ mutex_destroy(&msp->ms_sync_lock);

	kmem_free(msp, sizeof (metaslab_t));
	}

	#define FRAGMENTATION_TABLE_SIZE 17

	/*
	* This table defines a segment size based fragmentation metric that will
	* allow each metaslab to derive its own fragmentation value. This is done
	* by calculating the space in each bucket of the spacemap histogram and
	* multiplying that by the fragmentation metric in this table. Doing
	* this for all buckets and dividing it by the total amount of free
	* space in this metaslab (i.e. the total free space in all buckets) gives
	* us the fragmentation metric. This means that a high fragmentation metric
	* equates to most of the free space being comprised of small segments.
	* Conversely, if the metric is low, then most of the free space is in
	* large segments. A 10% change in fragmentation equates to approximately
	* double the number of segments.
	*
	* This table defines 0% fragmented space using 16MB segments. Testing has
	* shown that segments that are greater than or equal to 16MB do not suffer
	* from drastic performance problems. Using this value, we derive the rest
	* of the table. Since the fragmentation value is never stored on disk, it
	* is possible to change these calculations in the future.
	*/
	/*
	* Fragmentation percentage per segment-size bucket; each entry's comment
	* gives the segment size it applies to.
	*
	* Note: FRAGMENTATION_TABLE_SIZE is 17 but only 16 initializers are
	* listed. The final element is therefore implicitly initialized to 0, the
	* same value as the 16M entry.
	*/
	int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100, /* 512B */
	100, /* 1K */
	98, /* 2K */
	95, /* 4K */
	90, /* 8K */
	80, /* 16K */
	70, /* 32K */
	60, /* 64K */
	50, /* 128K */
	40, /* 256K */
	30, /* 512K */
	20, /* 1M */
	15, /* 2M */
	10, /* 4M */
	5, /* 8M */
	0 /* 16M */
	};

	/*
	* Calculate the metaslab's fragmentation metric and store it in
	* ms_fragmentation. A value of ZFS_FRAG_INVALID means that the histogram
	* feature is not enabled or that the metaslab has not been upgraded and
	* does not support this metric. Otherwise, the value should be in the
	* range [0, 100].
	*/
	static void
	metaslab_set_fragmentation(metaslab_t *msp)
	{
		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
		uint64_t weighted_sum = 0;
		uint64_t free_total = 0;
		boolean_t have_histogram = spa_feature_is_enabled(spa,
		    SPA_FEATURE_SPACEMAP_HISTOGRAM);

		/* Without the histogram feature there is nothing to derive from. */
		if (!have_histogram) {
			msp->ms_fragmentation = ZFS_FRAG_INVALID;
			return;
		}

		/*
		 * No space map at all means the whole metaslab is free, so it
		 * cannot be fragmented.
		 */
		if (msp->ms_sm == NULL) {
			msp->ms_fragmentation = 0;
			return;
		}

		/*
		 * A space map whose dbuf is not the size of space_map_phys_t has
		 * not been upgraded yet. Request a condense so that it is upgraded
		 * the next time we encounter it.
		 */
		if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
			vdev_t *vd = msp->ms_group->mg_vd;
			uint64_t txg = spa_syncing_txg(spa);

			/*
			 * Once the final dirty txg has been reached the pool must
			 * be shutting down, and nothing may be dirtied past that
			 * point. Skip the request; it can be retried on the next
			 * import.
			 */
			if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
				msp->ms_condense_wanted = B_TRUE;
				vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
				spa_dbgmsg(spa, "txg %llu, requesting force condense: "
				    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
				    vd->vdev_id);
			}
			msp->ms_fragmentation = ZFS_FRAG_INVALID;
			return;
		}

		/*
		 * Weight the free space of every non-empty histogram bucket by
		 * that bucket's table entry, then normalize by the total space.
		 */
		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
			uint64_t space = 0;
			uint8_t shift = msp->ms_sm->sm_shift;

			/* Buckets beyond the table reuse its last entry. */
			int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
			    FRAGMENTATION_TABLE_SIZE - 1);

			if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
				space = msp->ms_sm->sm_phys->smp_histogram[i] <<
				    (i + shift);
				free_total += space;

				ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
				weighted_sum += space * zfs_frag_table[idx];
			}
		}

		if (free_total > 0)
			weighted_sum /= free_total;
		ASSERT3U(weighted_sum, <=, 100);

		msp->ms_fragmentation = weighted_sum;
	}

	/*
	 * Compute a space-based weight (a selection preference value) for the
	 * given metaslab. The weight is derived from the amount of free space,
	 * optionally scaled by the fragmentation metric and by the metaslab's
	 * position on the vdev, and carries over the active bits when the
	 * metaslab is loaded and not too fragmented.
	 *
	 * Caller must hold msp->ms_lock; the vdev must not be in removal.
	 * Returns the weight, tagged as space-based via WEIGHT_SET_SPACEBASED().
	 */
	static uint64_t
	metaslab_space_weight(metaslab_t *msp)
	{
		metaslab_group_t *mg = msp->ms_group;
		vdev_t *vd = mg->mg_vd;
		uint64_t weight, space;

		ASSERT(MUTEX_HELD(&msp->ms_lock));
		ASSERT(!vd->vdev_removing);

		/* The baseline weight is the metaslab's free space. */
		space = msp->ms_size - space_map_allocated(msp->ms_sm);

		if (metaslab_fragmentation_factor_enabled &&
		    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
			/*
			 * Scale the baseline down in inverse proportion to the
			 * fragmentation. The fragmentation value is reduced by 1
			 * so that a 100% fragmented metaslab is not excluded
			 * completely.
			 */
			space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

			/*
			 * A weight below SPA_MINBLOCKSIZE would mean we never
			 * allocate from this metaslab again. If the scaling pushed
			 * a non-zero value below that, raise it back so that any
			 * remaining space can still be consumed.
			 */
			if (space > 0 && space < SPA_MINBLOCKSIZE)
				space = SPA_MINBLOCKSIZE;
		}
		weight = space;

		/*
		 * Modern disks have uniform bit density and constant angular
		 * velocity, so the outer recording zones are faster than the
		 * inner ones, typically by about 2:1. Account for this by giving
		 * lower-numbered metaslabs a higher weight (multiplier ranging
		 * from 2x down to 1x).
		 */
		if (metaslab_lba_weighting_enabled) {
			weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
			ASSERT(weight >= space && weight <= 2 * space);
		}

		/*
		 * If this metaslab is one we're actively using, keep its active
		 * bits so that it stays preferable to any inactive metaslab.
		 * Skip this when its fragmentation exceeds the threshold.
		 */
		if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
		    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
			weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
		}

		WEIGHT_SET_SPACEBASED(weight);
		return (weight);
	}

	/*
	 * Derive a segment-based weight from the in-core range tree histogram.
	 * The metaslab must be loaded. Only the range tree is consulted, which
	 * is what makes this usable from within a sync pass.
	 *
	 * Returns 0 if no histogram bucket in the scanned range holds a segment.
	 */
	static uint64_t
	metaslab_weight_from_range_tree(metaslab_t *msp)
	{
		uint64_t weight = 0;
		uint32_t segments = 0;

		ASSERT(msp->ms_loaded);

		/* Highest bucket index that the space map histogram can express. */
		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
		    i--) {
			/*
			 * Carry the running count down one bucket: doubling it
			 * expresses the larger segments in units of this bucket.
			 */
			segments <<= 1;
			segments += msp->ms_tree->rt_histogram[i];

			/*
			 * The range tree is more precise than the space map, so
			 * buckets above max_idx are only accumulated. This keeps
			 * loaded and unloaded metaslabs comparable.
			 */
			if (i <= max_idx && segments != 0) {
				WEIGHT_SET_COUNT(weight, segments);
				WEIGHT_SET_INDEX(weight, i);
				WEIGHT_SET_ACTIVE(weight, 0);
				break;
			}
		}
		return (weight);
	}

	/*
	 * Derive a segment-based weight from the on-disk space map histogram.
	 * Only call this after a sync pass has completely finished, because the
	 * on-disk information is updated in metaslab_sync().
	 *
	 * Returns 0 if every histogram bucket is empty.
	 */
	static uint64_t
	metaslab_weight_from_spacemap(metaslab_t *msp)
	{
		uint64_t weight = 0;

		/* Scan from the largest bucket down; the first non-empty one wins. */
		for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
			if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
				continue;

			WEIGHT_SET_COUNT(weight,
			    msp->ms_sm->sm_phys->smp_histogram[i]);
			WEIGHT_SET_INDEX(weight, i +
			    msp->ms_sm->sm_shift);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
		return (weight);
	}

	/*
	 * Compute a segment-based weight for the metaslab. The weight encodes
	 * the highest non-empty histogram bucket (its index and segment count).
	 * The caller must hold msp->ms_lock.
	 */
	static uint64_t
	metaslab_segment_weight(metaslab_t *msp)
	{
		metaslab_group_t *mg = msp->ms_group;
		uint8_t shift = mg->mg_vd->vdev_ashift;
		uint64_t weight = 0;

		ASSERT(MUTEX_HELD(&msp->ms_lock));

		/*
		 * Completely free metaslab: describe it as one segment spanning
		 * the whole metaslab. If that bucket is beyond what the histogram
		 * can express, describe it as several segments of the largest
		 * expressible bucket instead.
		 */
		if (space_map_allocated(msp->ms_sm) == 0) {
			int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
			int idx = highbit64(msp->ms_size) - 1;

			if (idx >= max_idx) {
				WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
				WEIGHT_SET_INDEX(weight, max_idx);
			} else {
				WEIGHT_SET_COUNT(weight, 1ULL);
				WEIGHT_SET_INDEX(weight, idx);
			}
			WEIGHT_SET_ACTIVE(weight, 0);
			ASSERT(!WEIGHT_IS_SPACEBASED(weight));

			return (weight);
		}

		ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

		/* A fully allocated metaslab simply gets a weight of 0. */
		if (space_map_allocated(msp->ms_sm) == msp->ms_size)
			return (0);

		/*
		 * Use the range tree when the metaslab is loaded; otherwise fall
		 * back to the space map histogram.
		 */
		weight = msp->ms_loaded ?
		    metaslab_weight_from_range_tree(msp) :
		    metaslab_weight_from_spacemap(msp);

		/*
		 * If the metaslab was active the last time its weight was
		 * calculated, keep it active so that the region associated with
		 * this weight is consumed entirely.
		 */
		if (msp->ms_activation_weight != 0 && weight != 0)
			WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
		return (weight);
	}

	/*
	 * Decide whether an allocation of asize bytes should be attempted from
	 * this metaslab.
	 *
	 *  - If ms_max_size is known (non-zero), compare against it directly.
	 *  - Otherwise, for a segment-based weight, use the bucket index encoded
	 *    in the weight.
	 *  - Otherwise (space-based weight), use the whole weight with the
	 *    weight type bit masked off.
	 */
	boolean_t
	metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
	{
		if (msp->ms_max_size != 0)
			return (msp->ms_max_size >= asize);

		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			return (asize <=
			    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
		}

		/*
		 * A segment weight with index i describes segments in the range
		 * [2^i, 2^(i+1)). asize may fall anywhere in that range, so the
		 * allocation is worth attempting whenever asize < 2^(i+1).
		 */
		return (asize <
		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
	}

	/*
	* Compute the current weight of a metaslab. The caller must hold
	* msp->ms_lock.
	*
	* Returns 0 if the vdev is being removed. Otherwise the fragmentation
	* metric is refreshed, ms_max_size is refreshed if the metaslab is loaded,
	* and the weight is computed with either the segment-based or the
	* space-based algorithm.
	*/
	static uint64_t
	metaslab_weight(metaslab_t *msp)
	{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	uint64_t weight;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	- * This vdev is in the process of being removed so there is nothing
	+ * If this vdev is in the process of being removed, there is nothing
	* for us to do here.
	*/
	- if (vd->vdev_removing) {
	- ASSERT0(space_map_allocated(msp->ms_sm));
	- ASSERT0(vd->vdev_ms_shift);
	+ if (vd->vdev_removing)
	return (0);
	- }

	metaslab_set_fragmentation(msp);

	/*
	* Update the maximum size if the metaslab is loaded. This will
	* ensure that we get an accurate maximum size if newly freed space
	* has been added back into the free tree.
	*/
	if (msp->ms_loaded)
	msp->ms_max_size = metaslab_block_maxsize(msp);

	/*
	* Segment-based weighting requires space map histogram support.
	*/
	if (zfs_metaslab_segment_weight_enabled &&
	spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	(msp->ms_sm == NULL \|\| msp->ms_sm->sm_dbuf->db_size ==
	sizeof (space_map_phys_t))) {
	weight = metaslab_segment_weight(msp);
	} else {
	weight = metaslab_space_weight(msp);
	}
	return (weight);
	}

	/*
	 * Make a metaslab active for allocation.
	 *
	 * If none of the METASLAB_ACTIVE_MASK bits are set in its weight, the
	 * metaslab is loaded if necessary (after waiting for any load already in
	 * progress), its current weight is recorded as the activation weight, and
	 * it is re-sorted in its group with activation_weight or'ed into the
	 * weight.
	 *
	 * Caller must hold msp->ms_lock. Returns 0 on success. If the load
	 * fails, the metaslab is re-sorted with a weight of 0 and the load error
	 * is returned.
	 */
	static int
	metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
	{
		ASSERT(MUTEX_HELD(&msp->ms_lock));

		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
			metaslab_load_wait(msp);
			if (!msp->ms_loaded) {
				int error = metaslab_load(msp);
				if (error) {
					metaslab_group_sort(msp->ms_group, msp, 0);
					return (error);
				}
			}

			msp->ms_activation_weight = msp->ms_weight;
			metaslab_group_sort(msp->ms_group, msp,
			    msp->ms_weight | activation_weight);
		}
		ASSERT(msp->ms_loaded);
		ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

		return (0);
	}

	/*
	 * Deactivate a metaslab: clear its activation weight and re-sort it in
	 * its group using the supplied weight, which must not carry any of the
	 * METASLAB_ACTIVE_MASK bits.
	 */
	static void
	metaslab_passivate(metaslab_t *msp, uint64_t weight)
	{
		uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;

		/*
		 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. In that case, it had better be empty,
		 * or we would be leaving space on the table.
		 */
		ASSERT(size >= SPA_MINBLOCKSIZE ||
		    range_tree_space(msp->ms_tree) == 0);
		ASSERT0(weight & METASLAB_ACTIVE_MASK);

		msp->ms_activation_weight = 0;
		metaslab_group_sort(msp->ms_group, msp, weight);
		ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
	}

	/*
	* Segment-based metaslabs are activated once and remain active until
	* we either fail an allocation attempt (similar to space-based metaslabs)
	* or have exhausted the free space in zfs_metaslab_switch_threshold
	* buckets since the metaslab was activated. This function checks to see
	* if we've exhausted the zfs_metaslab_switch_threshold buckets in the
	* metaslab and passivates it proactively. This will allow us to select a
	* metaslab with a larger contiguous region, if any remain within this
	* metaslab group. If we're in sync pass > 1, then we continue using this
	* metaslab so that we don't dirty more blocks and cause more sync passes.
	*/
	void
	metaslab_segment_may_passivate(metaslab_t *msp)
	{
		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

		/* Only segment-weighted metaslabs, and only in the first sync pass. */
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
			return;

		/*
		 * Since we are in the middle of a sync pass, the most accurate
		 * information that is accessible to us is the in-core range tree
		 * histogram; calculate the new weight based on that information.
		 */
		uint64_t weight = metaslab_weight_from_range_tree(msp);
		int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
		int current_idx = WEIGHT_GET_INDEX(weight);

		/*
		 * Passivate once the largest bucket has dropped by at least
		 * zfs_metaslab_switch_threshold since activation.
		 */
		if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
			metaslab_passivate(msp, weight);
	}

	/*
	 * Taskq callback that preloads one metaslab. arg is the metaslab_t to
	 * load. The group's mg_lock must not be held by the calling thread.
	 */
	static void
	metaslab_preload(void *arg)
	{
		metaslab_t *ms = arg;
		spa_t *spa = ms->ms_group->mg_vd->vdev_spa;

		ASSERT(!MUTEX_HELD(&ms->ms_group->mg_lock));

		mutex_enter(&ms->ms_lock);

		/* Let any load already in flight finish before deciding. */
		metaslab_load_wait(ms);
		if (!ms->ms_loaded) {
			/* A failed preload is deliberately ignored. */
			(void) metaslab_load(ms);
		}
		ms->ms_selected_txg = spa_syncing_txg(spa);

		mutex_exit(&ms->ms_lock);
	}

	/*
	* Schedule preloading of the metaslabs of group mg.
	*
	* If the pool is shutting down or preloading is disabled, this only waits
	* for the tasks already queued on mg_taskq and returns. Otherwise it walks
	* mg_metaslab_tree under mg_lock and dispatches metaslab_preload() for the
	* first metaslab_preload_limit metaslabs in the tree, plus any later one
	* that has ms_condense_wanted set.
	*/
	static void
	metaslab_group_preload(metaslab_group_t *mg)
	{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	/* Number of metaslabs visited so far. */
	int m = 0;

	if (spa_shutting_down(spa) \|\| !metaslab_preload_enabled) {
	taskq_wait(mg->mg_taskq);
	return;
	}

	mutex_enter(&mg->mg_lock);
	+
	/*
	* Load the next potential metaslabs
	*/
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
	+ ASSERT3P(msp->ms_group, ==, mg);
	+
	/*
	* We preload only the maximum number of metaslabs specified
	* by metaslab_preload_limit. If a metaslab is being forced
	* to condense then we preload it too. This will ensure
	* that force condensing happens in the next txg.
	*/
	if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
	continue;
	}

	VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
	msp, TQ_SLEEP) != 0);
	}
	mutex_exit(&mg->mg_lock);
	}

	/*
	* Determine if the space map's on-disk footprint is past our tolerance
	* for inefficiency. We would like to use the following criteria to make
	* our decision:
	*
	* 1. The size of the space map object should not dramatically increase as a
	* result of writing out the free space range tree.
	*
	* 2. The minimal on-disk space map representation is zfs_condense_pct/100
	* times the size of the free space range tree representation
	- * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
	+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
	*
	* 3. The on-disk size of the space map should actually decrease.
	*
	* Checking the first condition is tricky since we don't want to walk
	* the entire AVL tree calculating the estimated on-disk size. Instead we
	* use the size-ordered range tree in the metaslab and calculate the
	* size required to write out the largest segment in our free tree. If the
	* size required to represent that segment on disk is larger than the space
	* map object then we avoid condensing this map.
	*
	* To determine the second criterion we use a best-case estimate and assume
	* each segment can be represented on-disk as a single 64-bit entry. We refer
	* to this best-case estimate as the space map's minimal form.
	*
	* Unfortunately, we cannot compute the on-disk size of the space map in this
	* context because we cannot accurately compute the effects of compression, etc.
	* Instead, we apply the heuristic described in the block comment for
	* zfs_metaslab_condense_block_threshold - we only condense if the space used
	* is greater than a threshold number of blocks.
	*/
	static boolean_t
	metaslab_should_condense(metaslab_t *msp)
	{
		space_map_t *sm = msp->ms_sm;
		range_seg_t *rs;
		uint64_t size, entries, segsz, object_size, optimal_size, record_size;
		dmu_object_info_t doi;
		/* Shift in 64 bits so the result cannot overflow a plain int. */
		uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;

		ASSERT(MUTEX_HELD(&msp->ms_lock));
		ASSERT(msp->ms_loaded);

		/*
		 * Use the size-ordered ms_size_tree to obtain the largest segment
		 * in the free tree. We always condense metaslabs that are empty
		 * and metaslabs for which a condense request has been made.
		 */
		rs = avl_last(&msp->ms_size_tree);
		if (rs == NULL || msp->ms_condense_wanted)
			return (B_TRUE);

		/*
		 * Calculate the number of 64-bit entries this segment would
		 * require when written to disk. If this single segment would be
		 * larger on-disk than the entire current on-disk structure, then
		 * clearly condensing will increase the on-disk structure size.
		 */
		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
		entries = size / (MIN(size, SM_RUN_MAX));
		segsz = entries * sizeof (uint64_t);

		/* Best case: one 64-bit entry per segment of the free tree. */
		optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
		object_size = space_map_length(msp->ms_sm);

		dmu_object_info_from_db(sm->sm_dbuf, &doi);
		record_size = MAX(doi.doi_data_block_size, vdev_blocksize);

		/*
		 * Condense only if all three hold: the largest segment fits in
		 * the current object, the object is at least zfs_condense_pct
		 * percent of the optimal size, and the object exceeds the
		 * block-count threshold.
		 */
		return (segsz <= object_size &&
		    object_size >= (optimal_size * zfs_condense_pct / 100) &&
		    object_size > zfs_metaslab_condense_block_threshold * record_size);
	}

	/*
	* Condense the on-disk space map representation to its minimized form.
	* The minimized form consists of a small number of allocations followed by
	* the entries of the free range tree.
	*/
	static void
	metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
	{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	/* Only valid on a loaded metaslab, under ms_lock, in sync pass 1. */
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);


	/*
	* Log the request before ms_condense_wanted is cleared below, so
	* the message still reports whether this condense was forced.
	*/
	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
	"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
	msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
	msp->ms_group->mg_vd->vdev_spa->spa_name,
	space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
	msp->ms_condense_wanted ? "TRUE" : "FALSE");

	msp->ms_condense_wanted = B_FALSE;

	/*
	* Create a range tree that is 100% allocated. We remove segments
	* that have been freed in this txg, any deferred frees that exist,
	* and any allocation in the future. Removing segments should be
	* a relatively inexpensive operation since we expect these trees to
	* have a small number of nodes.
	*/
	- condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
	+ condense_tree = range_tree_create(NULL, NULL);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	* Remove what's been freed in this txg from the condense_tree.
	* Since we're in sync_pass 1, we know that all the frees from
	* this txg are in the freeingtree.
	*/
	range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
	range_tree_walk(msp->ms_defertree[t],
	range_tree_remove, condense_tree);
	}

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
	range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
	range_tree_remove, condense_tree);
	}

	/*
	* We're about to drop the metaslab's lock thus allowing
	* other consumers to change its content. Set the
	* metaslab's ms_condensing flag to ensure that
	* allocations on this metaslab do not occur while we're
	* in the middle of committing it to disk. This is only critical
	* for the ms_tree as all other range trees use per txg
	* views of their content.
	*/
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);
	- mutex_enter(&msp->ms_lock);

	/*
	* While we would ideally like to create a space map representation
	* that consists only of allocation records, doing so can be
	* prohibitively expensive because the in-core free tree can be
	* large, and therefore computationally expensive to subtract
	* from the condense_tree. Instead we sync out two trees, a cheap
	* allocation only tree followed by the in-core free tree. While not
	* optimal, this is typically close to optimal, and much cheaper to
	* compute.
	*/
	space_map_write(sm, condense_tree, SM_ALLOC, tx);
	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);

	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
	+ mutex_enter(&msp->ms_lock);
	msp->ms_condensing = B_FALSE;
	}

	/*
	* Write a metaslab to disk in the context of the specified transaction group.
	*/
	void
	metaslab_sync(metaslab_t *msp, uint64_t txg)
	{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
	dmu_tx_t *tx;
	/* Object number on entry; compared at the end to detect a change. */
	uint64_t object = space_map_object(msp->ms_sm);

	ASSERT(!vd->vdev_ishole);

	/*
	* This metaslab has just been added so there's no work to do now.
	*/
	if (msp->ms_freeingtree == NULL) {
	ASSERT3P(alloctree, ==, NULL);
	return;
	}

	ASSERT3P(alloctree, !=, NULL);
	ASSERT3P(msp->ms_freeingtree, !=, NULL);
	ASSERT3P(msp->ms_freedtree, !=, NULL);

	/*
	* Normally, we don't want to process a metaslab if there
	* are no allocations or frees to perform. However, if the metaslab
	* is being forced to condense and it's loaded, we need to let it
	* through.
	*/
	if (range_tree_space(alloctree) == 0 &&
	range_tree_space(msp->ms_freeingtree) == 0 &&
	!(msp->ms_loaded && msp->ms_condense_wanted))
	return;


	VERIFY(txg <= spa_final_dirty_txg(spa));

	/*
	* The only state that can actually be changing concurrently with
	* metaslab_sync() is the metaslab's ms_tree. No other thread can
	* be modifying this txg's alloctree, freeingtree, freedtree, or
	- * space_map_phys_t. Therefore, we only hold ms_lock to satify
	- * space map ASSERTs. We drop it whenever we call into the DMU,
	- * because the DMU can call down to us (e.g. via zio_free()) at
	- * any time.
	+ * space_map_phys_t. We drop ms_lock whenever we could call
	+ * into the DMU, because the DMU can call down to us
	+ * (e.g. via zio_free()) at any time.
	+ *
	+ * The spa_vdev_remove_thread() can be reading metaslab state
	+ * concurrently, and it is locked out by the ms_sync_lock. Note
	+ * that the ms_lock is insufficient for this, because it is dropped
	+ * by space_map_write().
	*/

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/* No space map yet for this metaslab: allocate one and open it. */
	if (msp->ms_sm == NULL) {
	uint64_t new_object;

	new_object = space_map_alloc(mos, tx);
	VERIFY3U(new_object, !=, 0);

	VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
	- msp->ms_start, msp->ms_size, vd->vdev_ashift,
	- &msp->ms_lock));
	+ msp->ms_start, msp->ms_size, vd->vdev_ashift));
	ASSERT(msp->ms_sm != NULL);
	}

	+ mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	/*
	* Note: metaslab_condense() clears the space map's histogram.
	* Therefore we must verify and remove this histogram before
	* condensing.
	*/
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	metaslab_group_histogram_remove(mg, msp);

	/*
	* Either rewrite the whole space map in condensed form (only for a
	* loaded metaslab in sync pass 1), or append this txg's allocations
	* and frees to the existing space map.
	*/
	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
	metaslab_should_condense(msp)) {
	metaslab_condense(msp, txg, tx);
	} else {
	+ mutex_exit(&msp->ms_lock);
	space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
	space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
	+ mutex_enter(&msp->ms_lock);
	}

	if (msp->ms_loaded) {
	/*
	- * When the space map is loaded, we have an accruate
	+ * When the space map is loaded, we have an accurate
	* histogram in the range tree. This gives us an opportunity
	* to bring the space map's histogram up-to-date so we clear
	* it first before updating it.
	*/
	space_map_histogram_clear(msp->ms_sm);
	space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);

	/*
	* Since we've cleared the histogram we need to add back
	* any free space that has already been processed, plus
	* any deferred space. This allows the on-disk histogram
	* to accurately reflect all free space even if some space
	* is not yet available for allocation (i.e. deferred).
	*/
	space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);

	/*
	* Add back any deferred free space that has not been
	* added back into the in-core free tree yet. This will
	* ensure that we don't end up with a space map histogram
	* that is completely empty unless the metaslab is fully
	* allocated.
	*/
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
	space_map_histogram_add(msp->ms_sm,
	msp->ms_defertree[t], tx);
	}
	}

	/*
	* Always add the free space from this sync pass to the space
	* map histogram. We want to make sure that the on-disk histogram
	* accounts for all free space. If the space map is not loaded,
	* then we will lose some accuracy but will correct it the next
	* time we load the space map.
	*/
	space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);

	metaslab_group_histogram_add(mg, msp);
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	/*
	* For sync pass 1, we avoid traversing this txg's free range tree
	* and instead will just swap the pointers for freeingtree and
	* freedtree. We can safely do this since the freed_tree is
	* guaranteed to be empty on the initial pass.
	*/
	if (spa_sync_pass(spa) == 1) {
	range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
	} else {
	range_tree_vacate(msp->ms_freeingtree,
	range_tree_add, msp->ms_freedtree);
	}
	range_tree_vacate(alloctree, NULL, NULL);

	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freeingtree));

	mutex_exit(&msp->ms_lock);

	/*
	* If the space map object number differs from the one captured on
	* entry, store the new number in this metaslab's slot (indexed by
	* ms_id) of the vdev's metaslab array object.
	*/
	if (object != space_map_object(msp->ms_sm)) {
	object = space_map_object(msp->ms_sm);
	dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
	msp->ms_id, sizeof (uint64_t), &object, tx);
	}
	+ mutex_exit(&msp->ms_sync_lock);
	dmu_tx_commit(tx);
	}

	/*
	* Called after a transaction group has completely synced to mark
	* all of the metaslab's free space as usable.
	*/
	void
	metaslab_sync_done(metaslab_t *msp, uint64_t txg)
	{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;
	boolean_t defer_allowed = B_TRUE;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	* If this metaslab is just becoming available, initialize its
	* range trees and add its capacity to the vdev.
	*/
	if (msp->ms_freedtree == NULL) {
	for (int t = 0; t < TXG_SIZE; t++) {
	ASSERT(msp->ms_alloctree[t] == NULL);

	- msp->ms_alloctree[t] = range_tree_create(NULL, msp,
	- &msp->ms_lock);
	+ msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
	}

	ASSERT3P(msp->ms_freeingtree, ==, NULL);
	- msp->ms_freeingtree = range_tree_create(NULL, msp,
	- &msp->ms_lock);
	+ msp->ms_freeingtree = range_tree_create(NULL, NULL);

	ASSERT3P(msp->ms_freedtree, ==, NULL);
	- msp->ms_freedtree = range_tree_create(NULL, msp,
	- &msp->ms_lock);
	+ msp->ms_freedtree = range_tree_create(NULL, NULL);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
	ASSERT(msp->ms_defertree[t] == NULL);

	- msp->ms_defertree[t] = range_tree_create(NULL, msp,
	- &msp->ms_lock);
	+ msp->ms_defertree[t] = range_tree_create(NULL, NULL);
	}

	vdev_space_update(vd, 0, 0, msp->ms_size);
	}

	/* The defer tree slot that this txg recycles. */
	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];

	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
	metaslab_class_get_alloc(spa_normal_class(spa));
	- if (free_space <= spa_get_slop_space(spa)) {
	+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
	defer_allowed = B_FALSE;
	}

	/*
	* When deferral is allowed, this txg's freed space enters the defer
	* slot while the slot's previous contents leave it. Otherwise nothing
	* new is deferred, so the deferred total only shrinks by the slot's
	* previous contents.
	*/
	defer_delta = 0;
	alloc_delta = space_map_alloc_delta(msp->ms_sm);
	if (defer_allowed) {
	defer_delta = range_tree_space(msp->ms_freedtree) -
	range_tree_space(*defer_tree);
	} else {
	defer_delta -= range_tree_space(*defer_tree);
	}

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	/*
	* If there's a metaslab_load() in progress, wait for it to complete
	* so that we have a consistent view of the in-core space map.
	*/
	metaslab_load_wait(msp);

	/*
	* Move the frees from the defer_tree back to the free
	* range tree (if it's loaded). Swap the freed_tree and the
	* defer_tree -- this is safe to do because we've just emptied out
	* the defer_tree.
	*/
	range_tree_vacate(*defer_tree,
	msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	if (defer_allowed) {
	range_tree_swap(&msp->ms_freedtree, defer_tree);
	} else {
	range_tree_vacate(msp->ms_freedtree,
	msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	}

	space_map_update(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
	/*
	* Keep syncing this metaslab until all deferred frees
	* are back in circulation.
	*/
	vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	* Calculate the new weights before unloading any metaslabs.
	* This will give us the most accurate weighting.
	*/
	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	/*
	* If the metaslab is loaded and we've not tried to load or allocate
	* from it in 'metaslab_unload_delay' txgs, then unload it.
	*/
	if (msp->ms_loaded &&
	msp->ms_selected_txg + metaslab_unload_delay < txg) {
	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
	VERIFY0(range_tree_space(
	msp->ms_alloctree[(txg + t) & TXG_MASK]));
	}

	if (!metaslab_debug_unload)
	metaslab_unload(msp);
	}

	+ ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	+ ASSERT0(range_tree_space(msp->ms_freeingtree));
	+ ASSERT0(range_tree_space(msp->ms_freedtree));
	+
	mutex_exit(&msp->ms_lock);
	}

	/*
	* Refresh a metaslab group's allocation state and fragmentation value,
	* then preload metaslabs for the group if it is still active.
	*/
	void
	metaslab_sync_reassess(metaslab_group_t *mg)
	{
	+ spa_t *spa = mg->mg_class->mc_spa;
	+
	+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
	metaslab_group_alloc_update(mg);
	mg->mg_fragmentation = metaslab_group_fragmentation(mg);

	/*
	- * Preload the next potential metaslabs
	+ * Preload the next potential metaslabs but only on active
	+ * metaslab groups. We can get into a state where the metaslab
	+ * is no longer active since we dirty metaslabs as we remove a
	+ * a device, thus potentially making the metaslab group eligible
	+ * for preloading.
	*/
	- metaslab_group_preload(mg);
	+ if (mg->mg_activation_count > 0) {
	+ metaslab_group_preload(mg);
	+ }
	+ spa_config_exit(spa, SCL_ALLOC, FTAG);
	}

	/*
	* Return the distance, in bytes, between metaslab msp and the metaslab
	* that contains dva's offset. The distance is measured in whole
	* metaslabs (index difference shifted by vdev_ms_shift), so a DVA
	* inside msp itself yields 0. A DVA on a different vdev yields
	* 1ULL << 63.
	*/
	static uint64_t
	metaslab_distance(metaslab_t *msp, dva_t *dva)
	{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	/* Both values below are metaslab indices, not byte offsets. */
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_id;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
	return (1ULL << 63);

	if (offset < start)
	return ((start - offset) << ms_shift);
	if (offset > start)
	return ((offset - start) << ms_shift);
	return (0);
	}

	/*
	* ==========================================================================
	* Metaslab allocation tracing facility
	* ==========================================================================
	*/
	/*
	* kstat handle and the named counter it exports. The counter is
	* incremented by metaslab_trace_add() each time a trace list is found
	* at its maximum size.
	*/
	kstat_t *metaslab_trace_ksp;
	kstat_named_t metaslab_trace_over_limit;

	/*
	* Create the kmem cache used for allocation trace entries and publish
	* the over-limit counter as a virtual named kstat. A failed
	* kstat_create() is tolerated: the cache is still created and the
	* counter is simply not exported.
	*/
	void
	metaslab_alloc_trace_init(void)
	{
	ASSERT(metaslab_alloc_trace_cache == NULL);
	metaslab_alloc_trace_cache = kmem_cache_create(
	"metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
	0, NULL, NULL, NULL, NULL, NULL, 0);
	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
	"misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
	if (metaslab_trace_ksp != NULL) {
	metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
	kstat_named_init(&metaslab_trace_over_limit,
	"metaslab_trace_over_limit", KSTAT_DATA_UINT64);
	kstat_install(metaslab_trace_ksp);
	}
	}

	/*
	* Undo metaslab_alloc_trace_init(): delete the kstat if one was
	* created, destroy the trace entry cache, and reset both globals to NULL.
	*/
	void
	metaslab_alloc_trace_fini(void)
	{
	if (metaslab_trace_ksp != NULL) {
	kstat_delete(metaslab_trace_ksp);
	metaslab_trace_ksp = NULL;
	}
	kmem_cache_destroy(metaslab_alloc_trace_cache);
	metaslab_alloc_trace_cache = NULL;
	}

	/*
	* Add an allocation trace element to the allocation tracing list.
	*/
	static void
	metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
	metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
	{
	/* Tracing is optional; do nothing when it is disabled. */
	if (!metaslab_trace_enabled)
	return;

	/*
	* When the tracing list reaches its maximum we remove
	* the second element in the list before adding a new one.
	* By removing the second element we preserve the original
	* entry as a clue to what allocations steps have already been
	* performed.
	*/
	if (zal->zal_size == metaslab_trace_max_entries) {
	metaslab_alloc_trace_t *mat_next;
	#ifdef DEBUG
	panic("too many entries in allocation list");
	#endif
	atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
	zal->zal_size--;
	mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
	list_remove(&zal->zal_list, mat_next);
	kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
	}

	metaslab_alloc_trace_t *mat =
	kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
	list_link_init(&mat->mat_list_node);
	mat->mat_mg = mg;
	mat->mat_msp = msp;
	mat->mat_size = psize;
	mat->mat_dva_id = dva_id;
	mat->mat_offset = offset;
	mat->mat_weight = 0;

	/* msp may be NULL (group-level events); the weight then stays 0. */
	if (msp != NULL)
	mat->mat_weight = msp->ms_weight;

	/*
	* The list is part of the zio so locking is not required. Only
	* a single thread will perform allocations for a given zio.
	*/
	list_insert_tail(&zal->zal_list, mat);
	zal->zal_size++;

	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
	}

	/*
	* Initialize an empty allocation trace list whose elements are
	* metaslab_alloc_trace_t entries linked through mat_list_node.
	*/
	void
	metaslab_trace_init(zio_alloc_list_t *zal)
	{
	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
	offsetof(metaslab_alloc_trace_t, mat_list_node));
	zal->zal_size = 0;
	}

	/*
	* Return every entry on the trace list to the trace cache, then
	* destroy the list and reset its size.
	*/
	void
	metaslab_trace_fini(zio_alloc_list_t *zal)
	{
	metaslab_alloc_trace_t *mat;

	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
	kmem_cache_free(metaslab_alloc_trace_cache, mat);
	list_destroy(&zal->zal_list);
	zal->zal_size = 0;
	}

	/*
	* ==========================================================================
	* Metaslab block operations
	* ==========================================================================
	*/

	/*
	* Add a reference, tagged with 'tag', to the allocation queue depth of
	* the metaslab group belonging to top-level vdev 'vdev'. This is a no-op
	* unless the allocation is asynchronous, is not exempt from throttling,
	* and the group's class has the allocation throttle enabled.
	*/
	static void
	metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
	{
	if (!(flags & METASLAB_ASYNC_ALLOC) ||
	flags & METASLAB_DONT_THROTTLE)
	return;

	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
	if (!mg->mg_class->mc_alloc_throttle_enabled)
	return;

	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
	}

	/*
	* Drop the reference tagged with 'tag' from the allocation queue depth
	* of the metaslab group belonging to top-level vdev 'vdev'. The same
	* conditions as metaslab_group_alloc_increment() apply, so the two
	* calls stay balanced for a given set of flags.
	*/
	void
	metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
	{
	if (!(flags & METASLAB_ASYNC_ALLOC) ||
	flags & METASLAB_DONT_THROTTLE)
	return;

	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
	if (!mg->mg_class->mc_alloc_throttle_enabled)
	return;

	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
	}

	/*
	* Debug-only check (compiled in under ZFS_DEBUG) that 'tag' holds no
	* allocation queue depth reference on any metaslab group referenced
	* by the DVAs of bp. Without ZFS_DEBUG this function does nothing.
	*/
	void
	metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
	{
	#ifdef ZFS_DEBUG
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	for (int d = 0; d < ndvas; d++) {
	uint64_t vdev = DVA_GET_VDEV(&dva[d]);
	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
	VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
	}
	#endif
	}

	/*
	* Try to allocate 'size' bytes from metaslab msp in transaction group
	* txg using the metaslab class's allocator (msop_alloc). Returns the
	* start offset of the allocated range, or -1ULL on failure. On success
	* the range is moved from the free tree (ms_tree) to this txg's
	* alloctree. The metaslab must not be condensing.
	*/
	static uint64_t
	metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
	{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;
	metaslab_class_t *mc = msp->ms_group->mg_class;

	VERIFY(!msp->ms_condensing);

	start = mc->mc_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;

	/* Offset and size must both be aligned to the vdev's ashift. */
	VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
	range_tree_remove(rt, start, size);

	/*
	* This txg's alloctree is still empty, so dirty the metaslab
	* for this txg before adding the first allocation to it.
	*/
	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
	vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);

	/* Track the last successful allocation */
	msp->ms_alloc_txg = txg;
	metaslab_verify_space(msp, txg);
	}

	/*
	* Now that we've attempted the allocation we need to update the
	* metaslab's maximum block size since it may have changed.
	*/
	msp->ms_max_size = metaslab_block_maxsize(msp);
	return (start);
	}

	/*
	* Allocate asize bytes in txg from some metaslab of group mg, for DVA
	* index d of the block whose DVAs are in dva[]. Metaslabs are tried in
	* the order of mg_metaslab_tree, starting from the best weight and
	* moving to the next candidate after each failed attempt. Returns the
	* allocated offset, or -1ULL if no metaslab in the group could satisfy
	* the request. Each step is recorded in the trace list zal.
	*/
	static uint64_t
	metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
	uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
	{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	/*
	* Use the secondary activation weight when an earlier DVA of this
	* block already lives on this group's vdev.
	*/
	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
	if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
	activation_weight = METASLAB_WEIGHT_SECONDARY;
	break;
	}
	}

	/*
	* 'search' is a scratch key for avl_find(); only ms_weight and
	* ms_start are set. It is freed on both return paths.
	*/
	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
	search->ms_weight = UINT64_MAX;
	search->ms_start = 0;
	for (;;) {
	boolean_t was_active;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	avl_index_t idx;

	mutex_enter(&mg->mg_lock);

	/*
	* Find the metaslab with the highest weight that is less
	* than what we've already tried. In the common case, this
	* means that we will examine each metaslab at most once.
	* Note that concurrent callers could reorder metaslabs
	* by activation/passivation once we have dropped the mg_lock.
	* If a metaslab is activated by another thread, and we fail
	* to allocate from the metaslab we have selected, we may
	* not try the newly-activated metaslab, and instead activate
	* another metaslab. This is not optimal, but generally
	* does not cause any problems (a possible exception being
	* if every metaslab is completely full except for the
	* newly-activated metaslab which we fail to examine).
	*/
	msp = avl_find(t, search, &idx);
	if (msp == NULL)
	msp = avl_nearest(t, idx, AVL_AFTER);
	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {

	if (!metaslab_should_allocate(msp, asize)) {
	metaslab_trace_add(zal, mg, msp, asize, d,
	TRACE_TOO_SMALL);
	continue;
	}

	/*
	* If the selected metaslab is condensing, skip it.
	*/
	if (msp->ms_condensing)
	continue;

	was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	if (activation_weight == METASLAB_WEIGHT_PRIMARY)
	break;

	target_distance = min_distance +
	(space_map_allocated(msp->ms_sm) != 0 ? 0 :
	min_distance >> 1);

	for (i = 0; i < d; i++) {
	if (metaslab_distance(msp, &dva[i]) <
	target_distance)
	break;
	}
	if (i == d)
	break;
	}
	mutex_exit(&mg->mg_lock);
	if (msp == NULL) {
	kmem_free(search, sizeof (*search));
	return (-1ULL);
	}
	/* Resume the next search just past the metaslab chosen here. */
	search->ms_weight = msp->ms_weight;
	search->ms_start = msp->ms_start + 1;

	mutex_enter(&msp->ms_lock);

	/*
	* Ensure that the metaslab we have selected is still
	* capable of handling our request. It's possible that
	* another thread may have changed the weight while we
	* were blocked on the metaslab lock. We check the
	* active status first to see if we need to reselect
	* a new metaslab.
	*/
	if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
	mutex_exit(&msp->ms_lock);
	continue;
	}

	if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
	activation_weight == METASLAB_WEIGHT_PRIMARY) {
	metaslab_passivate(msp,
	msp->ms_weight & ~METASLAB_ACTIVE_MASK);
	mutex_exit(&msp->ms_lock);
	continue;
	}

	if (metaslab_activate(msp, activation_weight) != 0) {
	mutex_exit(&msp->ms_lock);
	continue;
	}
	msp->ms_selected_txg = txg;

	/*
	* Now that we have the lock, recheck to see if we should
	* continue to use this metaslab for this allocation. The
	* metaslab is now loaded so metaslab_should_allocate() can
	* accurately determine if the allocation attempt should
	* proceed.
	*/
	if (!metaslab_should_allocate(msp, asize)) {
	/* Passivate this metaslab and select a new one. */
	metaslab_trace_add(zal, mg, msp, asize, d,
	TRACE_TOO_SMALL);
	goto next;
	}

	/*
	* If this metaslab is currently condensing then pick again as
	* we can't manipulate this metaslab until it's committed
	* to disk.
	*/
	if (msp->ms_condensing) {
	metaslab_trace_add(zal, mg, msp, asize, d,
	TRACE_CONDENSING);
	mutex_exit(&msp->ms_lock);
	continue;
	}

	offset = metaslab_block_alloc(msp, asize, txg);
	metaslab_trace_add(zal, mg, msp, asize, d, offset);

	/* Success: leave the loop with msp->ms_lock still held. */
	if (offset != -1ULL) {
	/* Proactively passivate the metaslab, if needed */
	metaslab_segment_may_passivate(msp);
	break;
	}
	next:
	ASSERT(msp->ms_loaded);

	/*
	* We were unable to allocate from this metaslab so determine
	* a new weight for this metaslab. Now that we have loaded
	* the metaslab we can provide a better hint to the metaslab
	* selector.
	*
	* For space-based metaslabs, we use the maximum block size.
	* This information is only available when the metaslab
	* is loaded and is more accurate than the generic free
	* space weight that was calculated by metaslab_weight().
	* This information allows us to quickly compare the maximum
	* available allocation in the metaslab to the allocation
	* size being requested.
	*
	* For segment-based metaslabs, determine the new weight
	* based on the highest bucket in the range tree. We
	* explicitly use the loaded segment weight (i.e. the range
	* tree histogram) since it contains the space that is
	* currently available for allocation and is accurate
	* even within a sync pass.
	*/
	if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
	uint64_t weight = metaslab_block_maxsize(msp);
	WEIGHT_SET_SPACEBASED(weight);
	metaslab_passivate(msp, weight);
	} else {
	metaslab_passivate(msp,
	metaslab_weight_from_range_tree(msp));
	}

	/*
	* We have just failed an allocation attempt, check
	* that metaslab_should_allocate() agrees. Otherwise,
	* we may end up in an infinite loop retrying the same
	* metaslab.
	*/
	ASSERT(!metaslab_should_allocate(msp, asize));
	mutex_exit(&msp->ms_lock);
	}
	/* Reached only via the success 'break' above, with ms_lock held. */
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));
	return (offset);
	}

	/*
	* Allocate asize bytes from metaslab group mg by calling
	* metaslab_group_alloc_normal(), then update the group's allocation
	* statistics under mg_lock. Returns the allocated offset, or -1ULL on
	* failure. A failure to allocate even SPA_GANGBLOCKSIZE marks the group
	* as having no free space.
	*/
	static uint64_t
	metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
	uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
	{
	uint64_t offset;
	ASSERT(mg->mg_initialized);

	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
	min_distance, dva, d);

	mutex_enter(&mg->mg_lock);
	if (offset == -1ULL) {
	mg->mg_failed_allocations++;
	metaslab_trace_add(zal, mg, NULL, asize, d,
	TRACE_GROUP_FAILURE);
	if (asize == SPA_GANGBLOCKSIZE) {
	/*
	* This metaslab group was unable to allocate
	* the minimum gang block size so it must be out of
	* space. We must notify the allocation throttle
	* to start skipping allocation attempts to this
	* metaslab group until more space becomes available.
	* Note: this failure cannot be caused by the
	* allocation throttle since the allocation throttle
	* is only responsible for skipping devices and
	* not failing block allocations.
	*/
	mg->mg_no_free_space = B_TRUE;
	}
	}
	/* Counts every attempt, whether it succeeded or failed. */
	mg->mg_allocations++;
	mutex_exit(&mg->mg_lock);
	return (offset);
	}

	/*
	* If we have to write a ditto block (i.e. more than one DVA for a given BP)
	* on the same vdev as an existing DVA of this BP, then try to allocate it
	* at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
	* existing DVAs.
	*/
	/* With the default of 3 the minimum distance is vdev_asize >> 3 (1/8th). */
	int ditto_same_vdev_distance_shift = 3;

	/*
	* Allocate a block for the specified i/o.
	*/
	-static int
	+int
	metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
	dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
	zio_alloc_list_t *zal)
	{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	boolean_t try_hard = B_FALSE;

	/* The slot being filled, dva[d], must not already hold a valid DVA. */
	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	* For testing, make some blocks above a certain size be gang blocks.
	*/
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
	metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
	return (SET_ERROR(ENOSPC));
	}

	/*
	* Start at the rotor and loop through all mgs until we find something.
	* Note that there's no locking on mc_rotor or mc_aliquot because
	* nothing actually breaks if we miss a few updates -- we just won't
	* allocate quite as evenly. It all balances out over time.
	*
	* If we are doing ditto or log blocks, try to spread them across
	* consecutive vdevs. If we're forced to reuse a vdev before we've
	* allocated all of our ditto blocks, then try and spread them out on
	* that vdev as much as possible. If it turns out to not be possible,
	* gradually lower our standards until anything becomes acceptable.
	* Also, allocating on consecutive vdevs (as opposed to random vdevs)
	* gives us hope of containing our fault domains to something we're
	* able to reason about. Otherwise, any two top-level vdev failures
	* will guarantee the loss of data. With consecutive allocation,
	* only two adjacent top-level vdev failures will result in data loss.
	*
	* If we are doing gang blocks (hintdva is non-NULL), try to keep
	* ourselves on the same vdev as our gang block header. That
	* way, we can hope for locality in vdev_cache, plus it makes our
	* fault domains something tractable.
	*/
	if (hintdva) {
	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

	/*
	* It's possible the vdev we're using as the hint no
	- * longer exists (i.e. removed). Consult the rotor when
	+ * longer exists or its mg has been closed (e.g. by
	+ * device removal). Consult the rotor when
	* all else fails.
	*/
	- if (vd != NULL) {
	+ if (vd != NULL && vd->vdev_mg != NULL) {
	mg = vd->vdev_mg;

	if (flags & METASLAB_HINTBP_AVOID &&
	mg->mg_next != NULL)
	mg = mg->mg_next;
	} else {
	mg = mc->mc_rotor;
	}
	} else if (d != 0) {
	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
	mg = vd->vdev_mg->mg_next;
	} else {
	mg = mc->mc_rotor;
	}

	/*
	* If the hint put us into the wrong metaslab class, or into a
	* metaslab group that has been passivated, just follow the rotor.
	*/
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
	mg = mc->mc_rotor;

	/* Remember the starting group so each pass stops after one full lap. */
	rotor = mg;
	top:
	do {
	boolean_t allocatable;

	ASSERT(mg->mg_activation_count == 1);
	vd = mg->mg_vd;

	/*
	* Don't allocate from faulted devices.
	*/
	if (try_hard) {
	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
	allocatable = vdev_allocatable(vd);
	spa_config_exit(spa, SCL_ZIO, FTAG);
	} else {
	allocatable = vdev_allocatable(vd);
	}

	/*
	* Determine if the selected metaslab group is eligible
	* for allocations. If we're ganging then don't allow
	* this metaslab group to skip allocations since that would
	* inadvertently return ENOSPC and suspend the pool
	* even though space is still available.
	*/
	if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
	allocatable = metaslab_group_allocatable(mg, rotor,
	psize);
	}

	if (!allocatable) {
	metaslab_trace_add(zal, mg, NULL, psize, d,
	TRACE_NOT_ALLOCATABLE);
	goto next;
	}

	ASSERT(mg->mg_initialized);

	/*
	* Avoid writing single-copy data to a failing,
	* non-redundant vdev, unless we've already tried all
	* other vdevs.
	*/
	if ((vd->vdev_stat.vs_write_errors > 0 ||
	vd->vdev_state < VDEV_STATE_HEALTHY) &&
	d == 0 && !try_hard && vd->vdev_children == 0) {
	metaslab_trace_add(zal, mg, NULL, psize, d,
	TRACE_VDEV_ERROR);
	goto next;
	}

	ASSERT(mg->mg_class == mc);

	/*
	* If we don't need to try hard, then require that the
	* block be 1/8th of the device away from any other DVAs
	* in this BP. If we are trying hard, allow any offset
	* to be used (distance=0).
	*/
	uint64_t distance = 0;
	if (!try_hard) {
	distance = vd->vdev_asize >>
	ditto_same_vdev_distance_shift;
	if (distance <= (1ULL << vd->vdev_ms_shift))
	distance = 0;
	}

	uint64_t asize = vdev_psize_to_asize(vd, psize);
	ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

	uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
	distance, dva, d);

	if (offset != -1ULL) {
	/*
	* If we've just selected this metaslab group,
	* figure out whether the corresponding vdev is
	* over- or under-used relative to the pool,
	* and set an allocation bias to even it out.
	*/
	if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
	vdev_stat_t *vs = &vd->vdev_stat;
	int64_t vu, cu;

	vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
	cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

	/*
	* Calculate how much more or less we should
	* try to allocate from this device during
	* this iteration around the rotor.
	* For example, if a device is 80% full
	* and the pool is 20% full then we should
	* reduce allocations by 60% on this device.
	*
	* mg_bias = (20 - 80) * 512K / 100 = -307K
	*
	* This reduces allocations by 307K for this
	* iteration.
	*/
	mg->mg_bias = ((cu - vu) *
	(int64_t)mg->mg_aliquot) / 100;
	} else if (!metaslab_bias_enabled) {
	mg->mg_bias = 0;
	}

	/*
	* Once this group has received its (biased) aliquot,
	* advance the rotor to the next group.
	*/
	if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
	mg->mg_aliquot + mg->mg_bias) {
	mc->mc_rotor = mg->mg_next;
	mc->mc_aliquot = 0;
	}

	DVA_SET_VDEV(&dva[d], vd->vdev_id);
	DVA_SET_OFFSET(&dva[d], offset);
	DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
	DVA_SET_ASIZE(&dva[d], asize);

	return (0);
	}
	next:
	mc->mc_rotor = mg->mg_next;
	mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/*
	* If we haven't tried hard, do so now.
	*/
	if (!try_hard) {
	try_hard = B_TRUE;
	goto top;
	}

	/* Both passes failed: clear the DVA slot and report ENOSPC. */
	bzero(&dva[d], sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
	return (SET_ERROR(ENOSPC));
	}

	/*
	* Record the range [offset, offset + asize) of the concrete vdev vd as
	* freed in txg by adding it to the owning metaslab's ms_freeingtree.
	* txg is asserted to be the pool's currently syncing txg.
	*/
	+void
	+metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
	+ uint64_t txg)
	+{
	+ metaslab_t *msp;
	+ spa_t *spa = vd->vdev_spa;
	+
	+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
	+ ASSERT(vdev_is_concrete(vd));
	+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	+
	+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+
	/*
	* The range must lie entirely inside this metaslab and both its
	* offset and its size must be aligned to the vdev's ashift.
	*/
	+ VERIFY(!msp->ms_condensing);
	+ VERIFY3U(offset, >=, msp->ms_start);
	+ VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	+ VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
	+
	+ metaslab_check_free_impl(vd, offset, asize);
	+ mutex_enter(&msp->ms_lock);
	/* The first range put into an empty freeing tree dirties the metaslab. */
	+ if (range_tree_space(msp->ms_freeingtree) == 0) {
	+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
	+ }
	+ range_tree_add(msp->ms_freeingtree, offset, asize);
	+ mutex_exit(&msp->ms_lock);
	+}
	+
	/*
	* Callback that metaslab_free_impl() hands to vdev_op_remap(). arg points
	* to the txg of the free. A segment that lands on a vdev which itself has
	* a remap operation is marked obsolete; any other segment is freed by
	* calling back into metaslab_free_impl(). inner_offset is unused.
	*/
	+/* ARGSUSED */
	+void
	+metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
	+ uint64_t size, void *arg)
	+{
	+ uint64_t *txgp = arg;
	+
	+ if (vd->vdev_ops->vdev_op_remap != NULL)
	+ vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
	+ else
	+ metaslab_free_impl(vd, offset, size, *txgp);
	+}
	+
	/*
	* Free [offset, offset + size) on vd in txg, choosing the path by the
	* kind of vdev:
	*  - a concrete vdev that is the target of the pool's in-progress
	*    removal (spa_vdev_removal) goes through free_from_removing_vdev();
	*  - a vdev with a remap operation is marked obsolete and then remapped
	*    with metaslab_free_impl_cb() as the callback;
	*  - any other vdev is handled by metaslab_free_concrete().
	* Nothing is done when txg is later than spa_freeze_txg().
	*/
	+static void
	+metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
	+ uint64_t txg)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+
	+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	+
	+ if (txg > spa_freeze_txg(spa))
	+ return;
	+
	+ if (spa->spa_vdev_removal != NULL &&
	+ spa->spa_vdev_removal->svr_vdev == vd &&
	+ vdev_is_concrete(vd)) {
	+ /*
	+ * Note: we check if the vdev is concrete because when
	+ * we complete the removal, we first change the vdev to be
	+ * an indirect vdev (in open context), and then (in syncing
	+ * context) clear spa_vdev_removal.
	+ */
	+ free_from_removing_vdev(vd, offset, size, txg);
	+ } else if (vd->vdev_ops->vdev_op_remap != NULL) {
	+ vdev_indirect_mark_obsolete(vd, offset, size, txg);
	+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
	+ metaslab_free_impl_cb, &txg);
	+ } else {
	+ metaslab_free_concrete(vd, offset, size, txg);
	+ }
	+}
	+
	/* State shared between spa_remap_blkptr() and remap_blkptr_cb(). */
	+typedef struct remap_blkptr_cb_arg {
	+ blkptr_t *rbca_bp; /* block pointer whose dva[0] is rewritten */
	+ spa_remap_cb_t rbca_cb; /* caller's callback; may be NULL */
	+ vdev_t *rbca_remap_vd; /* vdev reported by the next rbca_cb call */
	+ uint64_t rbca_remap_offset; /* offset reported by the next rbca_cb call */
	+ void *rbca_cb_arg; /* opaque argument passed through to rbca_cb */
	+} remap_blkptr_cb_arg_t;
	+
	/*
	* Callback that spa_remap_blkptr() hands to vdev_op_remap(); arg is a
	* remap_blkptr_cb_arg_t. If size differs from the asize of dva[0] (a
	* split block) nothing is changed. Otherwise the caller's callback, if
	* one was supplied, is invoked for the vdev/offset recorded in the
	* argument, and dva[0] of the block pointer is rewritten to refer to
	* (vd, offset), with blk_phys_birth taken from the indirect births of
	* the vdev that dva[0] referred to before the rewrite.
	*/
	+void
	+remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
	+ uint64_t size, void *arg)
	+{
	+ remap_blkptr_cb_arg_t *rbca = arg;
	+ blkptr_t *bp = rbca->rbca_bp;
	+
	+ /* We can not remap split blocks. */
	+ if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
	+ return;
	+ ASSERT0(inner_offset);
	+
	+ if (rbca->rbca_cb != NULL) {
	+ /*
	+ * At this point we know that we are not handling split
	+ * blocks and we invoke the callback on the previous
	+ * vdev which must be indirect.
	+ */
	+ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
	+
	+ rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
	+ rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
	+
	+ /* set up remap_blkptr_cb_arg for the next call */
	+ rbca->rbca_remap_vd = vd;
	+ rbca->rbca_remap_offset = offset;
	+ }
	+
	+ /*
	+ * The phys birth time is that of dva[0]. This ensures that we know
	+ * when each dva was written, so that resilver can determine which
	+ * blocks need to be scrubbed (i.e. those written during the time
	+ * the vdev was offline). It also ensures that the key used in
	+ * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
	+ * we didn't change the phys_birth, a lookup in the ARC for a
	+ * remapped BP could find the data that was previously stored at
	+ * this vdev + offset.
	+ */
	/* Look up the old vdev before dva[0] is overwritten below. */
	+ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	+ DVA_GET_VDEV(&bp->blk_dva[0]));
	+ vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	+ bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	+ DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
	+
	+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	+ DVA_SET_OFFSET(&bp->blk_dva[0], offset);
	+}
	+
	/*
	- * Free the block represented by DVA in the context of the specified
	- * transaction group.
	+ * If the block pointer contains any indirect DVAs, modify them to refer to
	+ * concrete DVAs. Note that this will sometimes not be possible, leaving
	+ * the indirect DVA in place. This happens if the indirect DVA spans multiple
	+ * segments in the mapping (i.e. it is a "split block").
	+ *
	+ * If the BP was remapped, calls the callback on the original dva (note the
	+ * callback can be called multiple times if the original indirect DVA refers
	+ * to another indirect DVA, etc). The callback may be NULL, in which case
	+ * no notification is made; arg is passed through to the callback.
	+ *
	+ * Returns TRUE if the BP was remapped.
	*/
	-static void
	-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
	+boolean_t
	+spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
	{
	- uint64_t vdev = DVA_GET_VDEV(dva);
	+ remap_blkptr_cb_arg_t rbca;
	+
	+ if (!zfs_remap_blkptr_enable)
	+ return (B_FALSE);
	+
	+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
	+ return (B_FALSE);
	+
	+ /*
	+ * Dedup BP's can not be remapped, because ddt_phys_select() depends
	+ * on DVA[0] being the same in the BP as in the DDT (dedup table).
	+ */
	+ if (BP_GET_DEDUP(bp))
	+ return (B_FALSE);
	+
	+ /*
	+ * Gang blocks can not be remapped, because
	+ * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	+ * the BP used to read the gang block header (GBH) being the same
	+ * as the DVA[0] that we allocated for the GBH.
	+ */
	+ if (BP_IS_GANG(bp))
	+ return (B_FALSE);
	+
	+ /*
	+ * Embedded BP's have no DVA to remap.
	+ */
	+ if (BP_GET_NDVAS(bp) < 1)
	+ return (B_FALSE);
	+
	+ /*
	+ * Note: we only remap dva[0]. If we remapped other dvas, we
	+ * would no longer know what their phys birth txg is.
	+ */
	+ dva_t *dva = &bp->blk_dva[0];
	+
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	- vdev_t *vd;
	+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
	+
	+ if (vd->vdev_ops->vdev_op_remap == NULL)
	+ return (B_FALSE);
	+
	+ rbca.rbca_bp = bp;
	+ rbca.rbca_cb = callback;
	+ rbca.rbca_remap_vd = vd;
	+ rbca.rbca_remap_offset = offset;
	+ rbca.rbca_cb_arg = arg;
	+
	+ /*
	+ * remap_blkptr_cb() will be called in order for each level of
	+ * indirection, until a concrete vdev is reached or a split block is
	+ * encountered. rbca_remap_vd and rbca_remap_offset are updated within
	+ * the callback as we go from the one indirect vdev to the next one
	+ * (either concrete or indirect again) in that order.
	+ */
	+ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
	+
	+ /* Check if the DVA wasn't remapped because it is a split block */
	+ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
	+ return (B_FALSE);
	+
	+ return (B_TRUE);
	+}
	+
	+/*
	+ * Undo the allocation of a DVA which happened in the given transaction group.
	+ * The range is removed from the metaslab's ms_alloctree for txg and put
	+ * back into ms_tree. Nothing is done when txg is past spa_freeze_txg().
	+ */
	+void
	+metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
	+{
	metaslab_t *msp;
	+ vdev_t *vd;
	+ uint64_t vdev = DVA_GET_VDEV(dva);
	+ uint64_t offset = DVA_GET_OFFSET(dva);
	+ uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
	return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	(offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
	- cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
	+ cmn_err(CE_WARN, "metaslab_unalloc_dva(): bad DVA %llu:%llu",
	(u_longlong_t)vdev, (u_longlong_t)offset);
	ASSERT(0);
	return;
	}

	- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+ ASSERT(!vd->vdev_removing);
	+ ASSERT(vdev_is_concrete(vd));
	+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	if (DVA_GET_GANG(dva))
	size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+
	mutex_enter(&msp->ms_lock);
	+ range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
	+ offset, size);

	- if (now) {
	- range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
	- offset, size);
	-
	- VERIFY(!msp->ms_condensing);
	- VERIFY3U(offset, >=, msp->ms_start);
	- VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	- VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
	- msp->ms_size);
	- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	- range_tree_add(msp->ms_tree, offset, size);
	- msp->ms_max_size = metaslab_block_maxsize(msp);
	- } else {
	- VERIFY3U(txg, ==, spa->spa_syncing_txg);
	- if (range_tree_space(msp->ms_freeingtree) == 0)
	- vdev_dirty(vd, VDD_METASLAB, msp, txg);
	- range_tree_add(msp->ms_freeingtree, offset, size);
	- }
	-
	+ VERIFY(!msp->ms_condensing);
	+ VERIFY3U(offset, >=, msp->ms_start);
	+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	+ VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
	+ msp->ms_size);
	+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	+ range_tree_add(msp->ms_tree, offset, size);
	mutex_exit(&msp->ms_lock);
	}

	/*
	- * Intent log support: upon opening the pool after a crash, notify the SPA
	- * of blocks that the intent log has allocated for immediate write, but
	- * which are still considered free by the SPA because the last transaction
	- * group didn't commit yet.
	+ * Free the block represented by DVA in the context of the specified
	+ * transaction group.
	*/
	-static int
	-metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
	+void
	+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
	{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	- vdev_t *vd;
	- metaslab_t *msp;
	- int error = 0;
	+ vdev_t *vd = vdev_lookup_top(spa, vdev);

	ASSERT(DVA_IS_VALID(dva));
	+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
	- return (SET_ERROR(ENXIO));
	-
	- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	-
	- if (DVA_GET_GANG(dva))
	+ if (DVA_GET_GANG(dva)) {
	size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	-
	- mutex_enter(&msp->ms_lock);
	-
	- if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
	- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
	-
	- if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
	- error = SET_ERROR(ENOENT);
	-
	- if (error || txg == 0) { /* txg == 0 indicates dry run */
	- mutex_exit(&msp->ms_lock);
	- return (error);
	}

	- VERIFY(!msp->ms_condensing);
	- VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	- VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
	- range_tree_remove(msp->ms_tree, offset, size);
	-
	- if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
	- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
	- vdev_dirty(vd, VDD_METASLAB, msp, txg);
	- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
	- }
	-
	- mutex_exit(&msp->ms_lock);
	-
	- return (0);
	+ metaslab_free_impl(vd, offset, size, txg);
	}

	/*
	* Reserve some allocation slots. The reservation system must be called
	* before we call into the allocator. If there aren't any available slots
	* then the I/O will be throttled until an I/O completes and its slots are
	* freed up. The function returns true if it was successful in placing
	* the reservation.
	*
	* A gang allocation (GANG_ALLOCATION(flags)) is always granted its slots,
	* even when fewer than "slots" are available.
	*/
	boolean_t
	metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
	int flags)
	{
	uint64_t available_slots = 0;
	boolean_t slot_reserved = B_FALSE;

	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);

	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
	if (reserved_slots < mc->mc_alloc_max_slots)
	available_slots = mc->mc_alloc_max_slots - reserved_slots;

	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
	/*
	* We reserve the slots individually so that we can unreserve
	* them individually when an I/O completes.
	*/
	for (int d = 0; d < slots; d++) {
	reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
	}
	zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
	slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
	}

	/*
	* Release "slots" allocation slots held by zio on class mc, one
	* reference at a time, under mc_lock.
	*/
	void
	metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
	{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
	(void) refcount_remove(&mc->mc_alloc_slots, zio);
	}
	mutex_exit(&mc->mc_lock);
	}

	+/*
	+ * Claim [offset, offset + size) on vd in txg: the range must currently be
	+ * in the owning metaslab's ms_tree; it is removed from there and, on a
	+ * writeable pool, added to ms_alloctree for txg. With txg == 0 only the
	+ * checks are performed (dry run).
	+ *
	+ * Returns 0 on success, ENXIO if offset is beyond the vdev's metaslabs,
	+ * ENOENT if the range is not in ms_tree, or the error from
	+ * metaslab_activate().
	+ */
	+static int
	+metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
	+ uint64_t txg)
	+{
	+ metaslab_t *msp;
	+ spa_t *spa = vd->vdev_spa;
	+ int error = 0;
	+
	+ if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
	+ return (SET_ERROR(ENXIO));
	+
	+ ASSERT3P(vd->vdev_ms, !=, NULL);
	+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+
	+ mutex_enter(&msp->ms_lock);
	+
	+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
	+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
	+
	+ if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
	+ error = SET_ERROR(ENOENT);
	+
	+ if (error || txg == 0) { /* txg == 0 indicates dry run */
	+ mutex_exit(&msp->ms_lock);
	+ return (error);
	+ }
	+
	+ VERIFY(!msp->ms_condensing);
	+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	+ VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
	+ range_tree_remove(msp->ms_tree, offset, size);
	+
	+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
	+ if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
	+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
	+ range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
	+ }
	+
	+ mutex_exit(&msp->ms_lock);
	+
	+ return (0);
	+}
	+
	/* Argument block handed to metaslab_claim_impl_cb(). */
	+typedef struct metaslab_claim_cb_arg_t {
	+ uint64_t mcca_txg; /* txg passed on to metaslab_claim_concrete() */
	+ int mcca_error; /* first error seen; 0 while none has occurred */
	+} metaslab_claim_cb_arg_t;
	+
	/*
	* Callback that metaslab_claim_impl() hands to vdev_op_remap(); arg is a
	* metaslab_claim_cb_arg_t. Claims the segment with
	* metaslab_claim_concrete() and records its result in mcca_error. Once an
	* error has been recorded, later invocations do nothing.
	*/
	+/* ARGSUSED */
	+static void
	+metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
	+ uint64_t size, void *arg)
	+{
	+ metaslab_claim_cb_arg_t *mcca_arg = arg;
	+
	+ if (mcca_arg->mcca_error == 0) {
	+ mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
	+ size, mcca_arg->mcca_txg);
	+ }
	+}
	+
	/*
	* Claim [offset, offset + size) on vd in txg. For a vdev that has a remap
	* operation, the segments reported by vdev_op_remap() are claimed first
	* and, if that succeeded, the range is also claimed on vd itself. For any
	* other vdev the range is claimed directly. Returns 0 or the first error
	* from metaslab_claim_concrete().
	*/
	int
	+metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
	+{
	+ if (vd->vdev_ops->vdev_op_remap != NULL) {
	+ metaslab_claim_cb_arg_t arg;
	+
	+ /*
	+ * Only zdb(1M) can claim on indirect vdevs. This is used
	+ * to detect leaks of mapped space (that are not accounted
	+ * for in the obsolete counts, spacemap, or bpobj).
	+ */
	+ ASSERT(!spa_writeable(vd->vdev_spa));
	+ arg.mcca_error = 0;
	+ arg.mcca_txg = txg;
	+
	+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
	+ metaslab_claim_impl_cb, &arg);
	+
	+ if (arg.mcca_error == 0) {
	+ arg.mcca_error = metaslab_claim_concrete(vd,
	+ offset, size, txg);
	+ }
	+ return (arg.mcca_error);
	+ } else {
	+ return (metaslab_claim_concrete(vd, offset, size, txg));
	+ }
	+}
	+
	+/*
	+ * Intent log support: upon opening the pool after a crash, notify the SPA
	+ * of blocks that the intent log has allocated for immediate write, but
	+ * which are still considered free by the SPA because the last transaction
	+ * group didn't commit yet.
	+ *
	+ * Returns ENXIO if the DVA's vdev cannot be found, otherwise the result
	+ * of metaslab_claim_impl().
	+ */
	+static int
	+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
	+{
	+ uint64_t vdev = DVA_GET_VDEV(dva);
	+ uint64_t offset = DVA_GET_OFFSET(dva);
	+ uint64_t size = DVA_GET_ASIZE(dva);
	+ vdev_t *vd;
	+
	+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
	+ return (SET_ERROR(ENXIO));
	+ }
	+
	+ ASSERT(DVA_IS_VALID(dva));
	+
	+ if (DVA_GET_GANG(dva))
	+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	+
	+ return (metaslab_claim_impl(vd, offset, size, txg));
	+}
	+
	+/*
	+ * Allocate ndvas DVAs of psize bytes each for bp from class mc in txg.
	+ * If any DVA cannot be allocated, the DVAs allocated so far are
	+ * unallocated and zeroed and the error is returned. On success the birth
	+ * txg of bp is set to txg. Returns ENOSPC when the class has no vdevs.
	+ */
	+int
	metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
	int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
	zio_alloc_list_t *zal, zio_t *zio)
	{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) { /* no vdevs in this class */
	spa_config_exit(spa, SCL_ALLOC, FTAG);
	return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
	error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
	txg, flags, zal);
	if (error != 0) {
	for (d--; d >= 0; d--) {
	- metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
	+ metaslab_unalloc_dva(spa, &dva[d], txg);
	metaslab_group_alloc_decrement(spa,
	DVA_GET_VDEV(&dva[d]), zio, flags);
	bzero(&dva[d], sizeof (dva_t));
	}
	spa_config_exit(spa, SCL_ALLOC, FTAG);
	return (error);
	} else {
	/*
	* Update the metaslab group's queue depth
	* based on the newly allocated dva.
	*/
	metaslab_group_alloc_increment(spa,
	DVA_GET_VDEV(&dva[d]), zio, flags);
	}

	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
	}

	/*
	* Free every DVA of bp in txg while holding SCL_FREE as reader. When
	* "now" is set the allocation is undone with metaslab_unalloc_dva() (bp
	* is asserted to have been born no earlier than the syncing txg);
	* otherwise each DVA is freed with metaslab_free_dva().
	*/
	void
	metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
	{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	- for (int d = 0; d < ndvas; d++)
	- metaslab_free_dva(spa, &dva[d], txg, now);
	+ for (int d = 0; d < ndvas; d++) {
	+ if (now) {
	+ metaslab_unalloc_dva(spa, &dva[d], txg);
	+ } else {
	+ metaslab_free_dva(spa, &dva[d], txg);
	+ }
	+ }

	spa_config_exit(spa, SCL_FREE, FTAG);
	}

	/*
	* Claim every DVA of bp in txg while holding SCL_ALLOC as reader.
	* txg == 0 requests a dry run. Returns 0, or the error from the first
	* DVA that could not be claimed; a failure with txg != 0 is not expected
	* (asserted) because the dry run has already succeeded.
	*/
	int
	metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
	{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
	/*
	* First do a dry run to make sure all DVAs are claimable,
	* so we don't have to unwind from partial failures below.
	*/
	if ((error = metaslab_claim(spa, bp, 0)) != 0)
	return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
	if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
	break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
	}

	/*
	* Callback that metaslab_check_free_impl() hands to vdev_op_remap().
	* Segments that land on an indirect vdev are ignored; all others are
	* checked with metaslab_check_free_impl(). inner and arg are unused.
	*/
	+/* ARGSUSED */
	+static void
	+metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
	+ uint64_t size, void *arg)
	+{
	+ if (vd->vdev_ops == &vdev_indirect_ops)
	+ return;
	+
	+ metaslab_check_free_impl(vd, offset, size);
	+}
	+
	/*
	* Debugging aid, active only when ZFS_DEBUG_ZIO_FREE is set in zfs_flags:
	* run range_tree_verify() for [offset, offset + size) against the
	* metaslab's ms_tree (only if the metaslab is loaded), ms_freeingtree,
	* ms_freedtree and every ms_defertree. For a vdev with a remap operation
	* the check is applied through metaslab_check_free_impl_cb() instead.
	*/
	+static void
	+metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
	+{
	+ metaslab_t *msp;
	+ spa_t *spa = vd->vdev_spa;
	+
	+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
	+ return;
	+
	+ if (vd->vdev_ops->vdev_op_remap != NULL) {
	+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
	+ metaslab_check_free_impl_cb, NULL);
	+ return;
	+ }
	+
	+ ASSERT(vdev_is_concrete(vd));
	+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	+
	+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	+
	+ mutex_enter(&msp->ms_lock);
	+ if (msp->ms_loaded)
	+ range_tree_verify(msp->ms_tree, offset, size);
	+
	+ range_tree_verify(msp->ms_freeingtree, offset, size);
	+ range_tree_verify(msp->ms_freedtree, offset, size);
	+ for (int j = 0; j < TXG_DEFER_SIZE; j++)
	+ range_tree_verify(msp->ms_defertree[j], offset, size);
	+ mutex_exit(&msp->ms_lock);
	+}
	+
	/*
	* Debugging aid, active only when ZFS_DEBUG_ZIO_FREE is set in zfs_flags:
	* run metaslab_check_free_impl() on every DVA of bp while holding
	* SCL_VDEV as reader. A gang DVA is checked at the allocated size of a
	* gang block header rather than at its recorded asize.
	*/
	void
	metaslab_check_free(spa_t *spa, const blkptr_t *bp)
	{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
	return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
	uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
	vdev_t *vd = vdev_lookup_top(spa, vdev);
	uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
	uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
	- metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	- if (msp->ms_loaded)
	- range_tree_verify(msp->ms_tree, offset, size);
	+ if (DVA_GET_GANG(&bp->blk_dva[i]))
	+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	- range_tree_verify(msp->ms_freeingtree, offset, size);
	- range_tree_verify(msp->ms_freedtree, offset, size);
	- for (int j = 0; j < TXG_DEFER_SIZE; j++)
	- range_tree_verify(msp->ms_defertree[j], offset, size);
	+ ASSERT3P(vd, !=, NULL);
	+
	+ metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c (revision 332525)
	@@ -1,411 +1,403 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/
	/*
	- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/dmu.h>
	#include <sys/dnode.h>
	#include <sys/zio.h>
	#include <sys/range_tree.h>

	/* kmem cache for range_seg_t; created in range_tree_init(). */
	kmem_cache_t *range_seg_cache;

	/*
	* Create the global kmem cache used for range_seg_t allocations. The
	* cache is asserted not to exist yet.
	*/
	void
	range_tree_init(void)
	{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	}

	/*
	* Destroy the range_seg_t kmem cache and reset the global pointer to NULL.
	*/
	void
	range_tree_fini(void)
	{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
	}

	/*
	* Recompute the segment-size histogram by walking every segment of rt
	* and VERIFY that it matches rt->rt_histogram bucket by bucket. Each
	* segment is counted in bucket highbit64(size) - 1. A mismatching bucket
	* is logged with zfs_dbgmsg() before the VERIFY fires.
	*/
	void
	range_tree_stat_verify(range_tree_t *rt)
	{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	rs = AVL_NEXT(&rt->rt_root, rs)) {
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	hist[idx]++;
	/* Catch the counter wrapping around to zero. */
	ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
	if (hist[i] != rt->rt_histogram[i]) {
	zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
	i, hist, hist[i], rt->rt_histogram[i]);
	}
	VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
	}

	/*
	* Count segment rs in the histogram of rt: increment the bucket indexed
	* by highbit64(size) - 1, where size is the length of the segment.
	*/
	static void
	range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
	{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	}

	/*
	* Remove segment rs from the histogram of rt: decrement the bucket
	* indexed by highbit64(size) - 1, which is asserted to be non-zero.
	*/
	static void
	range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
	{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
	}

	/*
	* NOTE: caller is responsible for all locking.
	*
	* AVL comparator for range_seg_t. Two segments that overlap, or that
	* start at the same offset, compare as equal (0); otherwise the result
	* is -1 or 1 according to the order of their start offsets.
	*/
	static int
	range_tree_seg_compare(const void *x1, const void *x2)
	{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;

	if (r1->rs_start < r2->rs_start) {
	if (r1->rs_end > r2->rs_start)
	return (0);
	return (-1);
	}
	if (r1->rs_start > r2->rs_start) {
	if (r1->rs_start < r2->rs_end)
	return (0);
	return (1);
	}
	return (0);
	}

	/*
	* Allocate and initialize an empty range tree. ops may be NULL; when it
	* is not, its rtop_create hook is called with arg before returning.
	*/
	range_tree_t *
	-range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
	+range_tree_create(range_tree_ops_t *ops, void *arg)
	{
	range_tree_t *rt;

	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	- rt->rt_lock = lp;
	rt->rt_ops = ops;
	rt->rt_arg = arg;

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
	}

	/*
	* Tear down a range tree. The tree must be empty (rt_space is verified
	* to be zero). The rtop_destroy hook, if any, runs before the AVL tree
	* is destroyed and the structure is freed.
	*/
	void
	range_tree_destroy(range_tree_t *rt)
	{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
	}

	/*
	* Add [start, start + size) to the range tree passed as arg, merging it
	* with a segment that ends exactly at start and/or one that begins
	* exactly at the end of the new range. size must be non-zero. Adding a
	* range that is already fully contained in one segment is reported with
	* zfs_panic_recover(); any other overlap trips a VERIFY.
	*/
	void
	range_tree_add(void *arg, uint64_t start, uint64_t size)
	{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size;
	boolean_t merge_before, merge_after;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
	zfs_panic_recover("zfs: allocating allocated segment"
	"(offset=%llu size=%llu)\n",
	(longlong_t)start, (longlong_t)size);
	return;
	}

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(rs == NULL);

	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end == start);
	merge_after = (rs_after != NULL && rs_after->rs_start == end);

	if (merge_before && merge_after) {
	avl_remove(&rt->rt_root, rs_before);
	if (rt->rt_ops != NULL) {
	rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
	rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
	}

	range_tree_stat_decr(rt, rs_before);
	range_tree_stat_decr(rt, rs_after);

	rs_after->rs_start = rs_before->rs_start;
	kmem_cache_free(range_seg_cache, rs_before);
	rs = rs_after;
	} else if (merge_before) {
	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

	range_tree_stat_decr(rt, rs_before);

	rs_before->rs_end = end;
	rs = rs_before;
	} else if (merge_after) {
	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

	range_tree_stat_decr(rt, rs_after);

	rs_after->rs_start = start;
	rs = rs_after;
	} else {
	rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
	rs->rs_start = start;
	rs->rs_end = end;
	avl_insert(&rt->rt_root, rs, where);
	}

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size;
	}

	/*
	* Remove [start, start + size) from the range tree passed as arg. The
	* range must be non-empty and fully contained in a single segment; that
	* segment is shrunk, split in two, or deleted as needed. Removing a range
	* that overlaps no segment is reported with zfs_panic_recover().
	*/
	void
	range_tree_remove(void *arg, uint64_t start, uint64_t size)
	{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
	zfs_panic_recover("zfs: freeing free segment "
	"(offset=%llu size=%llu)",
	(longlong_t)start, (longlong_t)size);
	return;
	}
	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
	newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
	newseg->rs_start = end;
	newseg->rs_end = rs->rs_end;
	range_tree_stat_incr(rt, newseg);

	rs->rs_end = start;

	avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
	rs->rs_end = start;
	} else if (right_over) {
	rs->rs_start = end;
	} else {
	avl_remove(&rt->rt_root, rs);
	kmem_cache_free(range_seg_cache, rs);
	rs = NULL;
	}

	if (rs != NULL) {
	range_tree_stat_incr(rt, rs);

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
	}

	/*
	* Look up [start, start + size) with avl_find() and return the segment
	* that compares equal to it under the tree's comparator, or NULL if
	* there is none. The returned segment may only partially overlap the
	* range. size must be non-zero.
	*/
	static range_seg_t *
	range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
	{
	avl_index_t where;
	range_seg_t rsearch;
	uint64_t end = start + size;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, &where));
	}

	/*
	* Return the segment that fully contains [start, start + size), or NULL
	* if no single segment covers the whole range.
	*/
	static range_seg_t *
	range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
	{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
	return (rs);
	return (NULL);
	}

	/*
	* Panic if [off, off + size) is fully contained in a segment of rt.
	*/
	void
	range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
	{
	range_seg_t *rs;

	- mutex_enter(rt->rt_lock);
	rs = range_tree_find(rt, off, size);
	if (rs != NULL)
	panic("freeing free block; rs=%p", (void *)rs);
	- mutex_exit(rt->rt_lock);
	}

	/*
	* Report whether [start, start + size) is fully contained in a single
	* segment of rt.
	*/
	boolean_t
	range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
	{
	return (range_tree_find(rt, start, size) != NULL);
	}

	/*
	* Ensure that this range is not in the tree, regardless of whether
	* it is currently in the tree.
	*
	* A zero-length range is a no-op. Otherwise, for as long as some segment
	* overlaps the range, the overlapping part of that segment is removed.
	*/
	void
	range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
	{
	range_seg_t *rs;

	+ if (size == 0)
	+ return;
	+
	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
	/* Clip the segment to the requested range before removing it. */
	uint64_t free_start = MAX(rs->rs_start, start);
	uint64_t free_end = MIN(rs->rs_end, start + size);
	range_tree_remove(rt, free_start, free_end - free_start);
	}
	}

	/*
	* Exchange the two range tree pointers. The tree that *rtdst refers to
	* on entry is asserted to be empty.
	*/
	void
	range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
	{
	range_tree_t *rt;

	- ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
	}

	/*
	* Empty the range tree. The rtop_vacate hook, if any, runs first. Every
	* segment is then passed to func (when func is not NULL) as
	* (arg, start, size) and freed. Finally the histogram and rt_space are
	* reset to zero.
	*/
	void
	range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
	{
	range_seg_t *rs;
	void *cookie = NULL;

	- ASSERT(MUTEX_HELD(rt->rt_lock));

	if (rt->rt_ops != NULL)
	rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
	if (func != NULL)
	func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
	kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
	}

	/*
	* Call func(arg, start, size) for every segment of rt, in AVL order.
	* func must not be NULL.
	*/
	void
	range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
	{
	range_seg_t *rs;

	- ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
	func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
	}

	/*
	* Return rt_space, the running total that range_tree_add() increases and
	* range_tree_remove() decreases by the size of each range.
	*/
	uint64_t
	range_tree_space(range_tree_t *rt)
	{
	return (rt->rt_space);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 332525)
	@@ -1,7411 +1,7339 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	* Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2013 Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Toomas Soome <tsoome@me.com>
	* Copyright 2017 Joyent, Inc.
	* Copyright (c) 2017 Datto Inc.
	* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
	*/

	/*
	* SPA: Storage Pool Allocator
	*
	* This file contains all the routines used when modifying on-disk SPA state.
	* This includes opening, importing, destroying, exporting a pool, and syncing a
	* pool.
	*/

	#include <sys/zfs_context.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa_impl.h>
	#include <sys/zio.h>
	#include <sys/zio_checksum.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/zap.h>
	#include <sys/zil.h>
	#include <sys/ddt.h>
	#include <sys/vdev_impl.h>
	+#include <sys/vdev_removal.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/vdev_indirect_births.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	#include <sys/uberblock_impl.h>
	#include <sys/txg.h>
	#include <sys/avl.h>
	+#include <sys/bpobj.h>
	#include <sys/dmu_traverse.h>
	#include <sys/dmu_objset.h>
	#include <sys/unique.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_synctask.h>
	#include <sys/fs/zfs.h>
	#include <sys/arc.h>
	#include <sys/callb.h>
	#include <sys/spa_boot.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/dsl_scan.h>
	#include <sys/dmu_send.h>
	#include <sys/dsl_destroy.h>
	#include <sys/dsl_userhold.h>
	#include <sys/zfeature.h>
	#include <sys/zvol.h>
	#include <sys/trim_map.h>
	#include <sys/abd.h>

	#ifdef _KERNEL
	#include <sys/callb.h>
	#include <sys/cpupart.h>
	#include <sys/zone.h>
	#endif /* _KERNEL */

	#include "zfs_prop.h"
	#include "zfs_comutil.h"

	/* Check hostid on import? */
	static int check_hostid = 1;

	/*
	* The interval, in seconds, at which failed configuration cache file writes
	* should be retried.
	*/
	-static int zfs_ccw_retry_interval = 300;
	+int zfs_ccw_retry_interval = 300;

	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
	"Check hostid on import?");
	TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
	&zfs_ccw_retry_interval, 0,
	"Configuration cache file write, retry after failure, interval (seconds)");

	/*
	* How the taskqs for one (I/O type, taskq type) slot are created.
	*/
	typedef enum zti_modes {
	ZTI_MODE_FIXED, /* value is # of threads (min 1) */
	ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
	ZTI_MODE_NULL, /* don't create a taskq */
	ZTI_NMODES
	} zti_modes_t;

	/*
	* Initializers for zio_taskq_info_t, in field order (mode, value, count):
	* ZTI_P(n, q) is fixed mode with value n and count q, ZTI_BATCH is batch
	* mode with a count of 1, and ZTI_NULL is null mode with a count of 0.
	*/
	#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
	#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
	#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }

	/* ZTI_N(n) is ZTI_P with a count of 1; ZTI_ONE is ZTI_N with value 1. */
	#define ZTI_N(n) ZTI_P(n, 1)
	#define ZTI_ONE ZTI_N(1)

	/* One entry of the zio_taskqs table. */
	typedef struct zio_taskq_info {
	zti_modes_t zti_mode; /* creation mode */
	uint_t zti_value; /* per-mode value (# of threads when fixed) */
	uint_t zti_count; /* number of taskqs to create */
	} zio_taskq_info_t;

	static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
	};

	/*
	* This table defines the taskq settings for each ZFS I/O type. When
	* initializing a pool, we use this table to create an appropriately sized
	* taskq. Some operations are low volume and therefore have a small, static
	* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
	* macros. Other operations process a large amount of data; the ZTI_BATCH
	* macro causes us to create a taskq oriented for throughput. Some operations
	* are so high frequency and short-lived that the taskq itself can become a
	* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
	* additional degree of parallelism specified by the number of threads per-
	* taskq and the number of taskqs; when dispatching an event in this case, the
	* particular taskq is chosen at random.
	*
	* The different taskq priorities are to handle the different contexts (issue
	* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
	* need to be handled with minimum delay.
	*/
	const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
	{ ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
	{ ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
	};

	/* Forward declarations of file-local functions. */
	-static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
	-    const char *name);
	-static void spa_event_post(sysevent_t *ev);
	static void spa_sync_version(void *arg, dmu_tx_t *tx);
	static void spa_sync_props(void *arg, dmu_tx_t *tx);
	static boolean_t spa_has_active_shared_spare(spa_t *spa);
	static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
	-    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
	+    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
	    char **ereport);
	static void spa_vdev_resilver_done(spa_t *spa);

	uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
	#ifdef PSRSET_BIND
	id_t zio_taskq_psrset_bind = PS_NONE;
	#endif
	#ifdef SYSDC
	boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
	uint_t zio_taskq_basedc = 80; /* base duty cycle */
	#endif

	boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
	extern int zfs_sync_pass_deferred_free;

	/*
	* This (illegal) pool name is used when temporarily importing a spa_t in order
	* to get the vdev stats associated with the imported devices.
	*/
	#define TRYIMPORT_NAME "$import"

	/*
	* ==========================================================================
	* SPA properties routines
	* ==========================================================================
	*/

	/*
	 * Add a (source=src, propname=propval) list to an nvlist.
	 *
	 * A temporary nvlist holding ZPROP_SOURCE and ZPROP_VALUE is built and
	 * added to nvl under the property's name, then freed.  The value is stored
	 * as the string strval when strval is non-NULL, otherwise as the integer
	 * intval.  Every nvlist operation is wrapped in VERIFY.
	 */
	static void
	spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
	    uint64_t intval, zprop_source_t src)
	{
		const char *propname = zpool_prop_to_name(prop);
		nvlist_t *propval;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

		if (strval != NULL)
			VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
		else
			VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

		VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
		nvlist_free(propval);
	}

	/*
	 * Get property values from the spa configuration.
	 *
	 * Adds to *nvp the properties that are computed from in-core pool state:
	 * space accounting, fragmentation, read-only flag, capacity, dedup ratio,
	 * health and version (when a root vdev exists), freeing/leaked byte counts
	 * (when a DSL pool exists), plus guid, comment, altroot, maximum block
	 * size and cachefile.  The caller must hold spa_props_lock.
	 */
	static void
	spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
	{
		vdev_t *rvd = spa->spa_root_vdev;
		dsl_pool_t *pool = spa->spa_dsl_pool;
		uint64_t size, alloc, cap, version;
		zprop_source_t src = ZPROP_SRC_NONE;
		spa_config_dirent_t *dp;
		metaslab_class_t *mc = spa_normal_class(spa);

		ASSERT(MUTEX_HELD(&spa->spa_props_lock));

		if (rvd != NULL) {
			alloc = metaslab_class_get_alloc(mc);
			size = metaslab_class_get_space(mc);
			spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
			    size - alloc, src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
			    metaslab_class_fragmentation(mc), src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
			    metaslab_class_expandable_space(mc), src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
			    (spa_mode(spa) == FREAD), src);

			/* Percentage of space allocated; avoid dividing by zero. */
			cap = (size == 0) ? 0 : (alloc * 100 / size);
			spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
			    ddt_get_pool_dedup_ratio(spa), src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
			    rvd->vdev_state, src);

			/*
			 * Note that src keeps the value chosen here for the
			 * freeing, leaked and guid properties added below.
			 */
			version = spa_version(spa);
			if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
				src = ZPROP_SRC_DEFAULT;
			else
				src = ZPROP_SRC_LOCAL;
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
		}

		if (pool != NULL) {
			/*
			 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
			 * when opening pools before this version freedir will be NULL.
			 */
			if (pool->dp_free_dir != NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
				    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
				    src);
			} else {
				spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
				    NULL, 0, src);
			}

			if (pool->dp_leak_dir != NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
				    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
				    src);
			} else {
				spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
				    NULL, 0, src);
			}
		}

		spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

		if (spa->spa_comment != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
			    0, ZPROP_SRC_LOCAL);
		}

		if (spa->spa_root != NULL)
			spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
			    0, ZPROP_SRC_LOCAL);

		if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
			    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
			    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
		}

		/*
		 * Only the head of the config list is consulted.  A NULL path is
		 * reported as "none"; a path equal to spa_config_path is not
		 * reported at all.
		 */
		if ((dp = list_head(&spa->spa_config_list)) != NULL) {
			if (dp->scd_path == NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
				    "none", 0, ZPROP_SRC_LOCAL);
			} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
				    dp->scd_path, 0, ZPROP_SRC_LOCAL);
			}
		}
	}

	/*
	 * Get zpool property values.
	 *
	 * Allocates *nvp and fills it first from the in-core configuration and
	 * then from the MOS pool property object, if there is one.  Returns 0 on
	 * success.  If walking the property object ends with an error other than
	 * ENOENT, the list is freed, *nvp is set to NULL and that error is
	 * returned.
	 */
	int
	spa_prop_get(spa_t *spa, nvlist_t **nvp)
	{
		objset_t *mos = spa->spa_meta_objset;
		zap_cursor_t zc;
		zap_attribute_t za;
		int err;

		VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		mutex_enter(&spa->spa_props_lock);

		/*
		 * Get properties from the spa config.
		 */
		spa_prop_get_config(spa, nvp);

		/* If no pool property object, no more prop to get. */
		if (mos == NULL || spa->spa_pool_props_object == 0) {
			mutex_exit(&spa->spa_props_lock);
			return (0);
		}

		/*
		 * Get properties from the MOS pool property object.
		 *
		 * A "break" inside the switch below leaves only the switch: the
		 * loop moves on to the next entry and err is overwritten by the
		 * next zap_cursor_retrieve() call.
		 */
		for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t intval = 0;
			char *strval = NULL;
			zprop_source_t src = ZPROP_SRC_DEFAULT;
			zpool_prop_t prop;

			if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
				continue;

			switch (za.za_integer_length) {
			case 8:
				/* integer property */
				if (za.za_first_integer !=
				    zpool_prop_default_numeric(prop))
					src = ZPROP_SRC_LOCAL;

				if (prop == ZPOOL_PROP_BOOTFS) {
					dsl_pool_t *dp;
					dsl_dataset_t *ds = NULL;

					/*
					 * bootfs is stored as a dataset object
					 * number but reported as a dataset name.
					 */
					dp = spa_get_dsl(spa);
					dsl_pool_config_enter(dp, FTAG);
					err = dsl_dataset_hold_obj(dp,
					    za.za_first_integer, FTAG, &ds);
					if (err != 0) {
						dsl_pool_config_exit(dp, FTAG);
						break;
					}

					strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
					    KM_SLEEP);
					dsl_dataset_name(ds, strval);
					dsl_dataset_rele(ds, FTAG);
					dsl_pool_config_exit(dp, FTAG);
				} else {
					strval = NULL;
					intval = za.za_first_integer;
				}

				spa_prop_add_list(*nvp, prop, strval, intval, src);

				if (strval != NULL)
					kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

				break;

			case 1:
				/* string property */
				strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
				err = zap_lookup(mos, spa->spa_pool_props_object,
				    za.za_name, 1, za.za_num_integers, strval);
				if (err) {
					kmem_free(strval, za.za_num_integers);
					break;
				}
				spa_prop_add_list(*nvp, prop, strval, 0, src);
				kmem_free(strval, za.za_num_integers);
				break;

			default:
				break;
			}
		}
		zap_cursor_fini(&zc);
		mutex_exit(&spa->spa_props_lock);

		if (err && err != ENOENT) {
			nvlist_free(*nvp);
			*nvp = NULL;
			return (err);
		}

		return (0);
	}

	/*
	 * Validate the given pool properties nvlist and modify the list
	 * for the property values to be set.
	 *
	 * Each pair in props is checked according to its property; validation
	 * stops at the first error, which is returned.  When bootfs was supplied
	 * and everything validated, its string value in props is replaced by the
	 * corresponding dataset object number.  Returns 0 on success.
	 */
	static int
	spa_prop_validate(spa_t *spa, nvlist_t *props)
	{
		nvpair_t *elem;
		int error = 0, reset_bootfs = 0;
		uint64_t objnum = 0;
		boolean_t has_feature = B_FALSE;

		elem = NULL;
		while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
			uint64_t intval;
			char *strval, *slash, *check, *fname;
			const char *propname = nvpair_name(elem);
			zpool_prop_t prop = zpool_name_to_prop(propname);

			switch (prop) {
			case ZPOOL_PROP_INVAL:
				/* Not a known property: only features are allowed. */
				if (!zpool_prop_feature(propname)) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/*
				 * Sanitize the input.
				 */
				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (nvpair_value_uint64(elem, &intval) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (intval != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/* The feature name follows the '@' in the property name. */
				fname = strchr(propname, '@') + 1;
				if (zfeature_lookup_name(fname, NULL) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				has_feature = B_TRUE;
				break;

			case ZPOOL_PROP_VERSION:
				/*
				 * The version may not go down, may not exceed
				 * SPA_VERSION_BEFORE_FEATURES, and may not follow
				 * a feature property in the same list.
				 */
				error = nvpair_value_uint64(elem, &intval);
				if (!error &&
				    (intval < spa_version(spa) ||
				    intval > SPA_VERSION_BEFORE_FEATURES ||
				    has_feature))
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_DELEGATION:
			case ZPOOL_PROP_AUTOREPLACE:
			case ZPOOL_PROP_LISTSNAPS:
			case ZPOOL_PROP_AUTOEXPAND:
				/* Boolean properties: only 0 and 1 are accepted. */
				error = nvpair_value_uint64(elem, &intval);
				if (!error && intval > 1)
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_BOOTFS:
				/*
				 * If the pool version is less than SPA_VERSION_BOOTFS,
				 * or the pool is still being created (version == 0),
				 * the bootfs property cannot be set.
				 */
				if (spa_version(spa) < SPA_VERSION_BOOTFS) {
					error = SET_ERROR(ENOTSUP);
					break;
				}

				/*
				 * Make sure the vdev config is bootable
				 */
				if (!vdev_is_bootable(spa->spa_root_vdev)) {
					error = SET_ERROR(ENOTSUP);
					break;
				}

				reset_bootfs = 1;

				error = nvpair_value_string(elem, &strval);

				if (!error) {
					objset_t *os;
					uint64_t propval;

					/* An empty name selects the default value. */
					if (strval == NULL || strval[0] == '\0') {
						objnum = zpool_prop_default_numeric(
						    ZPOOL_PROP_BOOTFS);
						break;
					}

					error = dmu_objset_hold(strval, FTAG, &os);
					if (error != 0)
						break;

					/*
					 * Must be ZPL, and its property settings
					 * must be supported by GRUB (compression
					 * is not gzip, and large blocks are not used).
					 */

					if (dmu_objset_type(os) != DMU_OST_ZFS) {
						error = SET_ERROR(ENOTSUP);
					} else if ((error =
					    dsl_prop_get_int_ds(dmu_objset_ds(os),
					    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
					    &propval)) == 0 &&
					    !BOOTFS_COMPRESS_VALID(propval)) {
						error = SET_ERROR(ENOTSUP);
					} else {
						objnum = dmu_objset_id(os);
					}
					dmu_objset_rele(os, FTAG);
				}
				break;

			case ZPOOL_PROP_FAILUREMODE:
				error = nvpair_value_uint64(elem, &intval);
				if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
				    intval > ZIO_FAILURE_MODE_PANIC))
					error = SET_ERROR(EINVAL);

				/*
				 * This is a special case which only occurs when
				 * the pool has completely failed. This allows
				 * the user to change the in-core failmode property
				 * without syncing it out to disk (I/Os might
				 * currently be blocked). We do this by returning
				 * EIO to the caller (spa_prop_set) to trick it
				 * into thinking we encountered a property validation
				 * error.
				 */
				if (!error && spa_suspended(spa)) {
					spa->spa_failmode = intval;
					error = SET_ERROR(EIO);
				}
				break;

			case ZPOOL_PROP_CACHEFILE:
				if ((error = nvpair_value_string(elem, &strval)) != 0)
					break;

				/* "" and "none" need no path validation. */
				if (strval[0] == '\0')
					break;

				if (strcmp(strval, "none") == 0)
					break;

				/* Anything else must be an absolute path... */
				if (strval[0] != '/') {
					error = SET_ERROR(EINVAL);
					break;
				}

				slash = strrchr(strval, '/');
				ASSERT(slash != NULL);

				/* ...whose last component is not empty, "." or "..". */
				if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
				    strcmp(slash, "/..") == 0)
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_COMMENT:
				if ((error = nvpair_value_string(elem, &strval)) != 0)
					break;
				for (check = strval; *check != '\0'; check++) {
					/*
					 * The kernel doesn't have an easy isprint()
					 * check. For this kernel check, we merely
					 * check ASCII apart from DEL. Fix this if
					 * there is an easy-to-use kernel isprint().
					 *
					 * NOTE(review): where plain char is signed,
					 * bytes >= 0x80 compare as negative and are
					 * not rejected by this test.
					 */
					if (*check >= 0x7f) {
						error = SET_ERROR(EINVAL);
						break;
					}
				}
				if (strlen(strval) > ZPROP_MAX_COMMENT)
					error = SET_ERROR(E2BIG);
				break;

			case ZPOOL_PROP_DEDUPDITTO:
				if (spa_version(spa) < SPA_VERSION_DEDUP)
					error = SET_ERROR(ENOTSUP);
				else
					error = nvpair_value_uint64(elem, &intval);
				if (error == 0 &&
				    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
					error = SET_ERROR(EINVAL);
				break;

			default:
				/* No validation is done here for other properties. */
				break;
			}

			if (error)
				break;
		}

		/* Swap the bootfs dataset name for its object number. */
		if (!error && reset_bootfs) {
			error = nvlist_remove(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

			if (!error) {
				error = nvlist_add_uint64(props,
				    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
			}
		}

		return (error);
	}

	/*
	 * If nvp carries a cachefile property, push a new entry for it onto the
	 * head of spa->spa_config_list: an empty string selects spa_config_path,
	 * "none" stores a NULL path, and anything else is copied as given.  When
	 * need_sync is set an asynchronous config update is requested.  Does
	 * nothing if the property is absent.
	 */
	void
	spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
	{
		char *cachefile;
		spa_config_dirent_t *dp;

		if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
		    &cachefile) != 0)
			return;

		dp = kmem_alloc(sizeof (spa_config_dirent_t),
		    KM_SLEEP);

		if (cachefile[0] == '\0')
			dp->scd_path = spa_strdup(spa_config_path);
		else if (strcmp(cachefile, "none") == 0)
			dp->scd_path = NULL;
		else
			dp->scd_path = spa_strdup(cachefile);

		list_insert_head(&spa->spa_config_list, dp);
		if (need_sync)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	/*
	 * Validate and apply the pool properties in nvp.
	 *
	 * cachefile, altroot and readonly are skipped here.  Version changes and
	 * feature properties are handled through a spa_sync_version sync task;
	 * the first property of any other kind ends the scan and causes the whole
	 * list to be handed to a spa_sync_props sync task.  Returns 0 on success,
	 * or the validation or sync-task error.
	 */
	int
	spa_prop_set(spa_t *spa, nvlist_t *nvp)
	{
		int error;
		nvpair_t *elem = NULL;
		boolean_t need_sync = B_FALSE;

		if ((error = spa_prop_validate(spa, nvp)) != 0)
			return (error);

		while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
			zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

			if (prop == ZPOOL_PROP_CACHEFILE ||
			    prop == ZPOOL_PROP_ALTROOT ||
			    prop == ZPOOL_PROP_READONLY)
				continue;

			if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
				uint64_t ver;

				if (prop == ZPOOL_PROP_VERSION) {
					VERIFY(nvpair_value_uint64(elem, &ver) == 0);
				} else {
					/* A feature property implies the features version. */
					ASSERT(zpool_prop_feature(nvpair_name(elem)));
					ver = SPA_VERSION_FEATURES;
					need_sync = B_TRUE;
				}

				/* Save time if the version is already set. */
				if (ver == spa_version(spa))
					continue;

				/*
				 * In addition to the pool directory object, we might
				 * create the pool properties object, the features for
				 * read object, the features for write object, or the
				 * feature descriptions object.
				 */
				error = dsl_sync_task(spa->spa_name, NULL,
				    spa_sync_version, &ver,
				    6, ZFS_SPACE_CHECK_RESERVED);
				if (error)
					return (error);
				continue;
			}

			need_sync = B_TRUE;
			break;
		}

		if (need_sync) {
			return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
			    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
		}

		return (0);
	}

	/*
	 * If the bootfs property value is dsobj, clear it.
	 *
	 * The entry is removed from the pool property object in transaction tx
	 * and the in-core spa_bootfs is reset to 0.  Nothing happens when bootfs
	 * refers to a different object or the pool has no property object.
	 */
	void
	spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
	{
		if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
			VERIFY(zap_remove(spa->spa_meta_objset,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
			spa->spa_bootfs = 0;
		}
	}

	/ARGSUSED/
	static int
	spa_change_guid_check(void arg, dmu_tx_t tx)
	{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
	return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
	}

	/*
	 * Sync half of the guid-change sync task.  arg points to the new guid.
	 * The root vdev's guid is replaced, its guid sum is adjusted by the
	 * difference between new and old guid, the root vdev is marked config
	 * dirty, and the change is recorded in the pool history.
	 */
	static void
	spa_change_guid_sync(void *arg, dmu_tx_t *tx)
	{
		uint64_t *newguid = arg;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		uint64_t oldguid;
		vdev_t *rvd = spa->spa_root_vdev;

		oldguid = spa_guid(spa);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		rvd->vdev_guid = *newguid;
		rvd->vdev_guid_sum += (*newguid - oldguid);
		vdev_config_dirty(rvd);
		spa_config_exit(spa, SCL_STATE, FTAG);

		spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
		    oldguid, *newguid);
	}

	/*
	* Change the GUID for the pool. This is done so that we can later
	* re-import a pool built from a clone of our own vdevs. We will modify
	* the root vdev's guid, our own pool guid, and then mark all of our
	* vdevs dirty. Note that we must make sure that all our vdevs are
	* online when we do this, or else any vdevs that weren't present
	* would be orphaned from our pool. We are also going to issue a
	* sysevent to update any watchers.
	*
	* Returns 0 on success or the error reported by dsl_sync_task().
	*/
	int
	spa_change_guid(spa_t *spa)
	{
	int error;
	uint64_t guid;

	/* Both locks are held across the sync task and the follow-up work. */
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	/* Only on success: rewrite the cache file and post the reguid event. */
	if (error == 0) {
	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	/* Release in the reverse order of acquisition. */
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
	}

	/*
	* ==========================================================================
	* SPA state manipulation (open/create/destroy/import/export)
	* ==========================================================================
	*/

	static int
	spa_error_entry_compare(const void a, const void b)
	{
	spa_error_entry_t sa = (spa_error_entry_t )a;
	spa_error_entry_t sb = (spa_error_entry_t )b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	sizeof (zbookmark_phys_t));

	if (ret < 0)
	return (-1);
	else if (ret > 0)
	return (1);
	else
	return (0);
	}

	/*
	 * Utility function which retrieves copies of the current logs and
	 * re-initializes them in the process.
	 *
	 * The "last" and "scrub" error trees are copied by value into the
	 * caller's avl_tree_t structures, and the spa's own trees are then
	 * re-created empty.  The caller must hold spa_errlist_lock.
	 */
	void
	spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
	{
		ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

		bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
		bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

		avl_create(&spa->spa_errlist_scrub,
		    spa_error_entry_compare, sizeof (spa_error_entry_t),
		    offsetof(spa_error_entry_t, se_avl));
		avl_create(&spa->spa_errlist_last,
		    spa_error_entry_compare, sizeof (spa_error_entry_t),
		    offsetof(spa_error_entry_t, se_avl));
	}

	/*
	 * Create the taskqs for I/O type t and taskq type q as described by
	 * zio_taskqs[t][q], and record them in spa->spa_zio_taskq[t][q].
	 *
	 * ZTI_MODE_NULL leaves the slot empty (count 0, NULL array).  Otherwise
	 * zti_count taskqs are created: in fixed mode with zti_value threads (at
	 * least 1), in batch mode sized as zio_taskq_batch_pct percent of CPUs.
	 * Any other mode panics.
	 */
	static void
	spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
	{
		const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
		enum zti_modes mode = ztip->zti_mode;
		uint_t value = ztip->zti_value;
		uint_t count = ztip->zti_count;
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		char name[32];
		uint_t flags = 0;
		boolean_t batch = B_FALSE;

		if (mode == ZTI_MODE_NULL) {
			tqs->stqs_count = 0;
			tqs->stqs_taskq = NULL;
			return;
		}

		ASSERT3U(count, >, 0);

		tqs->stqs_count = count;
		tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

		switch (mode) {
		case ZTI_MODE_FIXED:
			ASSERT3U(value, >=, 1);
			value = MAX(value, 1);
			break;

		case ZTI_MODE_BATCH:
			batch = B_TRUE;
			flags |= TASKQ_THREADS_CPU_PCT;
			value = zio_taskq_batch_pct;
			break;

		default:
			panic("unrecognized mode for %s_%s taskq (%u:%u) in "
			    "spa_activate()",
			    zio_type_name[t], zio_taskq_types[q], mode, value);
			break;
		}

		for (uint_t i = 0; i < count; i++) {
			taskq_t *tq;

			/* Only number the taskq name when there are several. */
			if (count > 1) {
				(void) snprintf(name, sizeof (name), "%s_%s_%u",
				    zio_type_name[t], zio_taskq_types[q], i);
			} else {
				(void) snprintf(name, sizeof (name), "%s_%s",
				    zio_type_name[t], zio_taskq_types[q]);
			}

	#ifdef SYSDC
			if (zio_taskq_sysdc && spa->spa_proc != &p0) {
				if (batch)
					flags |= TASKQ_DC_BATCH;

				tq = taskq_create_sysdc(name, value, 50, INT_MAX,
				    spa->spa_proc, zio_taskq_basedc, flags);
			} else {
	#endif
				pri_t pri = maxclsyspri;
				/*
				 * The write issue taskq can be extremely CPU
				 * intensive.  Run it at slightly lower priority
				 * than the other taskqs.
				 * FreeBSD notes:
				 * - numerically higher priorities are lower priorities;
				 * - if priorities divided by four (RQ_PPQ) are equal
				 *   then a difference between them is insignificant.
				 */
				if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
	#ifdef illumos
					pri--;
	#else
					pri += 4;
	#endif

				tq = taskq_create_proc(name, value, pri, 50,
				    INT_MAX, spa->spa_proc, flags);
	#ifdef SYSDC
			}
	#endif

			tqs->stqs_taskq[i] = tq;
		}
	}

	/*
	 * Tear down the taskqs of slot [t][q]: destroy each taskq, free the
	 * pointer array and clear the array pointer.  A slot with no array is
	 * expected to have a zero count and is left untouched.
	 */
	static void
	spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
	{
		spa_taskqs_t *slot = &spa->spa_zio_taskq[t][q];
		uint_t idx;

		if (slot->stqs_taskq != NULL) {
			idx = 0;
			while (idx < slot->stqs_count) {
				ASSERT3P(slot->stqs_taskq[idx], !=, NULL);
				taskq_destroy(slot->stqs_taskq[idx]);
				idx++;
			}

			kmem_free(slot->stqs_taskq,
			    slot->stqs_count * sizeof (taskq_t *));
			slot->stqs_taskq = NULL;
		} else {
			ASSERT0(slot->stqs_count);
		}
	}

	/*
	 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
	 * Note that a type may have multiple discrete taskqs to avoid lock contention
	 * on the taskq itself. In that case we choose which taskq by taking a
	 * time counter modulo the number of taskqs: cpu_ticks() in the kernel,
	 * gethrtime() otherwise.
	 *
	 * The slot for [t][q] must have been created with at least one taskq.
	 */
	void
	spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
	    task_func_t func, void *arg, uint_t flags, taskq_ent_t *ent)
	{
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		taskq_t *tq;

		ASSERT3P(tqs->stqs_taskq, !=, NULL);
		ASSERT3U(tqs->stqs_count, !=, 0);

		if (tqs->stqs_count == 1) {
			tq = tqs->stqs_taskq[0];
		} else {
	#ifdef _KERNEL
			tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
	#else
			tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
	#endif
		}

		taskq_dispatch_ent(tq, func, arg, flags, ent);
	}

	/*
	 * Initialize the taskq slot for every (I/O type, taskq type) pair, one
	 * I/O type at a time.
	 */
	static void
	spa_create_zio_taskqs(spa_t *spa)
	{
		int t = 0;

		while (t < ZIO_TYPES) {
			int q = 0;

			while (q < ZIO_TASKQ_TYPES) {
				spa_taskqs_init(spa, t, q);
				q++;
			}
			t++;
		}
	}

	#ifdef _KERNEL
	#ifdef SPA_PROCESS
	/*
	* Main function of the per-pool process started by spa_activate() through
	* newproc(); arg is the spa_t. It names the process "zpool-<poolname>",
	* optionally binds it to a processor set and enters the SDC scheduling
	* class, creates the zio taskqs, and then sleeps until spa_proc_state
	* leaves SPA_PROC_ACTIVE, at which point it marks itself gone and exits.
	*/
	static void
	spa_thread(void *arg)
	{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	"zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&pidlock);
	mutex_enter(&curproc->p_lock);

	/* A failed bind is only warned about; the thread carries on. */
	if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
	0, NULL, NULL) == 0) {
	curthread->t_bind_pset = zio_taskq_psrset_bind;
	} else {
	cmn_err(CE_WARN,
	"Couldn't bind process for zfs pool \"%s\" to "
	"pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
	}

	mutex_exit(&curproc->p_lock);
	mutex_exit(&pidlock);
	mutex_exit(&cpu_lock);
	pool_unlock();
	}
	#endif

	#ifdef SYSDC
	if (zio_taskq_sysdc) {
	sysdc_thread_enter(curthread, 100, 0);
	}
	#endif

	/* Publish the process and thread id before the taskqs are created. */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Wake the waiter in spa_activate(). */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* Sleep here until the state is moved away from SPA_PROC_ACTIVE. */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
	cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
	}
	#endif /* SPA_PROCESS */
	#endif

	/*
	* Activate an uninitialized pool.
	*
	* Marks the pool active with the given open mode, creates the normal and
	* log metaslab classes, optionally starts a covering process (only when
	* SPA_PROCESS is defined), creates the zio taskqs if no such process did,
	* starts the TRIM thread, and initializes the per-txg root zios, dirty
	* lists, vdev txg list and error-list AVL trees.
	*/
	static void
	spa_activate(spa_t *spa, int mode)
	{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
	if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
	NULL, 0) == 0) {
	/* Wait for spa_thread() to move the state on from CREATED. */
	spa->spa_proc_state = SPA_PROC_CREATED;
	while (spa->spa_proc_state == SPA_PROC_CREATED) {
	cv_wait(&spa->spa_proc_cv,
	&spa->spa_proc_lock);
	}
	ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
	ASSERT(spa->spa_proc != &p0);
	ASSERT(spa->spa_did != 0);
	} else {
	#ifdef _KERNEL
	cmn_err(CE_WARN,
	"Couldn't create process for zfs pool \"%s\"\n",
	spa->spa_name);
	#endif
	}
	}
	#endif /* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
	spa_create_zio_taskqs(spa);
	}

	/*
	* Start TRIM thread.
	*/
	trim_thread_create(spa);

	/* One root zio per txg slot; spa_deactivate() waits on and clears them. */
	+ for (size_t i = 0; i < TXG_SIZE; i++)
	+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
	+
	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	spa_error_entry_compare, sizeof (spa_error_entry_t),
	offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	spa_error_entry_compare, sizeof (spa_error_entry_t),
	offsetof(spa_error_entry_t, se_avl));
	}

	/*
	* Opposite of spa_activate().
	*
	* Expects syncing to be off and the DSL pool, root vdev and async zio root
	* to be gone already. Tears down what spa_activate() set up, returns the
	* pool to POOL_STATE_UNINITIALIZED, and shuts down the covering process if
	* there is one.
	*/
	static void
	spa_deactivate(spa_t *spa)
	{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	* Stop TRIM thread in case spa_unload() wasn't called directly
	* before spa_deactivate().
	*/
	trim_thread_destroy(spa);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
	for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
	spa_taskqs_fini(spa, t, q);
	}
	}

	/* Wait on each per-txg root zio created by spa_activate(), then clear it. */
	+ for (size_t i = 0; i < TXG_SIZE; i++) {
	+ ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
	+ VERIFY0(zio_wait(spa->spa_txg_zio[i]));
	+ spa->spa_txg_zio[i] = NULL;
	+ }
	+
	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	* If this was part of an import or the open otherwise failed, we may
	* still have errors left in the queues. Empty them just in case.
	*/
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	* If a covering process exists, ask it to deactivate and wait until
	* the state has moved on from SPA_PROC_DEACTIVATE.
	*/
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
	ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
	spa->spa_proc_state = SPA_PROC_DEACTIVATE;
	cv_broadcast(&spa->spa_proc_cv);
	while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
	ASSERT(spa->spa_proc != &p0);
	cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	}
	ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
	spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	#ifdef SPA_PROCESS
	/*
	* We want to make sure spa_thread() has actually exited the ZFS
	* module, so that the module can't be unloaded out from underneath
	* it.
	*/
	if (spa->spa_did != 0) {
	thread_join(spa->spa_did);
	spa->spa_did = 0;
	}
	#endif /* SPA_PROCESS */
	}

	/*
	 * Verify a pool configuration, and construct the vdev tree appropriately.
	 * This will create all the necessary vdevs in the appropriate layout, with
	 * each vdev in the CLOSED state. This will prep the pool before
	 * open/creation/import. All vdev validation is done by the vdev_alloc()
	 * routine.
	 *
	 * spa:    pool the vdevs belong to
	 * vdp:    out parameter; receives the vdev allocated for 'nv'. Reset to
	 *         NULL when parsing the children fails after allocation.
	 * nv:     nvlist describing this vdev (and, recursively, its children)
	 * parent: parent vdev, or NULL for the root of the tree being built
	 * id:     index of this vdev within its parent
	 * atype:  VDEV_ALLOC_* allocation type handed through to vdev_alloc()
	 *
	 * Returns 0 on success, the vdev_alloc() error, EINVAL when the children
	 * array cannot be looked up for a reason other than ENOENT, or the error
	 * from parsing a child.
	 */
	static int
	spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
	    uint_t id, int atype)
	{
		nvlist_t **child;
		uint_t children;
		int error;

		if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
			return (error);

		/* Leaf vdevs have no children to parse. */
		if ((*vdp)->vdev_ops->vdev_op_leaf)
			return (0);

		error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children);

		/* An interior vdev without a children array is accepted as-is. */
		if (error == ENOENT)
			return (0);

		if (error) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (SET_ERROR(EINVAL));
		}

		/*
		 * Recurse into every child. The child vdev is attached to *vdp
		 * via the 'parent' argument, so 'vd' itself is not used further.
		 */
		for (uint_t c = 0; c < children; c++) {
			vdev_t *vd;

			if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
			    atype)) != 0) {
				vdev_free(*vdp);
				*vdp = NULL;
				return (error);
			}
		}

		ASSERT(*vdp != NULL);

		return (0);
	}

	/*
	* Opposite of spa_load().
	*
	* Must be called with spa_namespace_lock held. Stops the TRIM thread,
	* async tasks and syncing, waits for outstanding async I/O, then frees
	* the vdev tree, closes the DSL pool, unloads the DDT and releases the
	* spare and l2cache auxiliary vdevs and their configs.
	*/
	static void
	spa_unload(spa_t *spa)
	{
	/* NOTE(review): shadowed by the block-scoped 'i' in the async zio loop. */
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	* Stop TRIM thread.
	*/
	trim_thread_destroy(spa);

	/*
	* Stop async tasks.
	*/
	spa_async_suspend(spa);

	/*
	* Stop syncing.
	*/
	if (spa->spa_sync_on) {
	txg_sync_stop(spa->spa_dsl_pool);
	spa->spa_sync_on = B_FALSE;
	}

	/*
	* Even though vdev_free() also calls vdev_metaslab_fini, we need
	* to call it earlier, before we wait for async i/o to complete.
	* This ensures that there is no async metaslab prefetching, by
	* calling taskq_wait(mg_taskq).
	*/
	if (spa->spa_root_vdev != NULL) {
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
	vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
	spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	* Wait for any outstanding async I/O to complete.
	*/
	if (spa->spa_async_zio_root != NULL) {
	for (int i = 0; i < max_ncpus; i++)
	(void) zio_wait(spa->spa_async_zio_root[i]);
	kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
	spa->spa_async_zio_root = NULL;
	}

	/* Release any in-progress vdev removal state and condensing state. */
	+ if (spa->spa_vdev_removal != NULL) {
	+ spa_vdev_removal_destroy(spa->spa_vdev_removal);
	+ spa->spa_vdev_removal = NULL;
	+ }
	+
	+ spa_condense_fini(spa);
	+
	bpobj_close(&spa->spa_deferred_bpobj);

	/* Everything from here to the end runs with SCL_ALL held as writer. */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	* Close all vdevs.
	*/
	if (spa->spa_root_vdev)
	vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	* Close the dsl pool.
	*/
	if (spa->spa_dsl_pool) {
	dsl_pool_close(spa->spa_dsl_pool);
	spa->spa_dsl_pool = NULL;
	spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	* Drop and purge level 2 cache
	*/
	spa_l2cache_drop(spa);

	/* Free the spare vdevs, their array and the cached spares config. */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
	vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
	kmem_free(spa->spa_spares.sav_vdevs,
	spa->spa_spares.sav_count * sizeof (void *));
	spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
	nvlist_free(spa->spa_spares.sav_config);
	spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* Same for the l2cache vdevs, clearing their stats before freeing. */
	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
	vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
	vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
	kmem_free(spa->spa_l2cache.sav_vdevs,
	spa->spa_l2cache.sav_count * sizeof (void *));
	spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
	nvlist_free(spa->spa_l2cache.sav_config);
	spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	+ spa->spa_indirect_vdevs_loaded = B_FALSE;
	+
	if (spa->spa_comment != NULL) {
	spa_strfree(spa->spa_comment);
	spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	* Load (or re-load) the current list of vdevs describing the active spares for
	* this pool. When this is called, we have some form of basic information in
	* 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
	* then re-generate a more complete list including status information.
	*
	* The caller must hold SCL_ALL as writer. On return spa_spares.sav_vdevs and
	* spa_spares.sav_count describe the freshly parsed spares (NULL / 0 when
	* there are none).
	*/
	-static void
	+void
	spa_load_spares(spa_t *spa)
	{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	* First, close and free any existing spare vdevs.
	*/
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
	vd = spa->spa_spares.sav_vdevs[i];

	/* Undo the call to spa_activate() below */
	if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
	B_FALSE)) != NULL && tvd->vdev_isspare)
	spa_spare_remove(tvd);
	vdev_close(vd);
	vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
	kmem_free(spa->spa_spares.sav_vdevs,
	spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
	nspares = 0;
	else
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
	return;

	/*
	* Construct the array of vdevs, opening them to get status in the
	* process. For each spare, there is potentially two different vdev_t
	* structures associated with it: one in the list of spares (used only
	* for basic validation purposes) and one in the active vdev
	* configuration (if it's spared in). During this phase we open and
	* validate each vdev on the spare list. If the vdev also exists in the
	* active configuration, then we also mark this vdev as an active spare.
	*/
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
	VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
	VDEV_ALLOC_SPARE) == 0);
	ASSERT(vd != NULL);

	spa->spa_spares.sav_vdevs[i] = vd;

	if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
	B_FALSE)) != NULL) {
	if (!tvd->vdev_isspare)
	spa_spare_add(tvd);

	/*
	* We only mark the spare active if we were successfully
	* able to load the vdev. Otherwise, importing a pool
	* with a bad active spare would result in strange
	* behavior, because multiple pool would think the spare
	* is actively in use.
	*
	* There is a vulnerability here to an equally bizarre
	* circumstance, where a dead active spare is later
	* brought back to life (onlined or otherwise). Given
	* the rarity of this scenario, and the extra complexity
	* it adds, we ignore the possibility.
	*/
	if (!vdev_is_dead(tvd))
	spa_spare_activate(tvd);
	}

	/* A spare is its own top-level vdev and belongs to spa_spares. */
	vd->vdev_top = vd;
	vd->vdev_aux = &spa->spa_spares;

	/* A spare that fails to open stays in the list but is not added. */
	if (vdev_open(vd) != 0)
	continue;

	if (vdev_validate_aux(vd) == 0)
	spa_spare_add(vd);
	}

	/*
	* Recompute the stashed list of spares, with status information
	* this time.
	*/
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
	spares[i] = vdev_config_generate(spa,
	spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	/* The generated nvlists and the temporary array are released here. */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
	nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
	}

	/*
	* Load (or re-load) the current list of vdevs describing the active l2cache for
	* this pool. When this is called, we have some form of basic information in
	* 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
	* then re-generate a more complete list including status information.
	* Devices which are already active have their details maintained, and are
	* not re-opened.
	*
	* The caller must hold SCL_ALL as writer.
	*/
	-static void
	+void
	spa_load_l2cache(spa_t *spa)
	{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
	ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
	nl2cache = 0;
	newvdevs = NULL;
	}

	/* Detach the old array so entries can be moved into the new one. */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	* Process new nvlist of vdevs.
	*/
	for (i = 0; i < nl2cache; i++) {
	VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
	&guid) == 0);

	newvdevs[i] = NULL;
	for (j = 0; j < oldnvdevs; j++) {
	vd = oldvdevs[j];
	if (vd != NULL && guid == vd->vdev_guid) {
	/*
	* Retain previous vdev for add/remove ops.
	*/
	newvdevs[i] = vd;
	oldvdevs[j] = NULL;
	break;
	}
	}

	if (newvdevs[i] == NULL) {
	/*
	* Create new vdev
	*/
	VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
	VDEV_ALLOC_L2CACHE) == 0);
	ASSERT(vd != NULL);
	newvdevs[i] = vd;

	/*
	* Commit this vdev as an l2cache device,
	* even if it fails to open.
	*/
	spa_l2cache_add(vd);

	vd->vdev_top = vd;
	vd->vdev_aux = sav;

	spa_l2cache_activate(vd);

	if (vdev_open(vd) != 0)
	continue;

	(void) vdev_validate_aux(vd);

	if (!vdev_is_dead(vd))
	l2arc_add_vdev(spa, vd);
	}
	}

	/*
	* Purge vdevs that were dropped
	*/
	for (i = 0; i < oldnvdevs; i++) {
	uint64_t pool;

	vd = oldvdevs[i];
	if (vd != NULL) {
	ASSERT(vd->vdev_isl2cache);

	if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
	pool != 0ULL && l2arc_vdev_present(vd))
	l2arc_remove_vdev(vd);
	vdev_clear_stats(vd);
	vdev_free(vd);
	}
	}

	if (oldvdevs)
	kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/*
	* Without a config, sav_count is still 0 here, so the cleanup at
	* 'out' neither reads nor frees the uninitialized l2cache pointer.
	*/
	if (sav->sav_config == NULL)
	goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	* Recompute the stashed list of l2cache devices, with status
	* information this time.
	*/
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
	l2cache[i] = vdev_config_generate(spa,
	sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
	out:
	for (i = 0; i < sav->sav_count; i++)
	nvlist_free(l2cache[i]);
	if (sav->sav_count)
	kmem_free(l2cache, sav->sav_count * sizeof (void *));
	}

	/*
	 * Read the packed nvlist stored in MOS object 'obj' and unpack it.
	 *
	 * The packed size is taken from the first uint64_t of the object's bonus
	 * buffer; that many bytes are then read from offset 0 of the object.
	 *
	 * spa:   pool whose meta objset holds the object
	 * obj:   object number of the packed nvlist
	 * value: out parameter; set to NULL up front and to the unpacked nvlist
	 *        on success
	 *
	 * Returns 0 on success or the error from dmu_bonus_hold(), dmu_read() or
	 * nvlist_unpack().
	 */
	static int
	load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
	{
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		int error;

		*value = NULL;

		error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
		if (error != 0)
			return (error);

		/* Copy the size out before dropping the hold on the bonus buffer. */
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
		    DMU_READ_PREFETCH);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, value, 0);
		kmem_free(packed, nvsize);

		return (error);
	}

	/*
	* Checks to see if the given vdev could not be opened, in which case we post a
	* sysevent to notify the autoreplace code that the device has been removed.
	*
	* The whole subtree rooted at 'vd' is walked; children are visited before
	* the vdev itself.
	*/
	static void
	spa_check_removed(vdev_t *vd)
	{
	for (int c = 0; c < vd->vdev_children; c++)
	spa_check_removed(vd->vdev_child[c]);

	/* Only dead leaf vdevs are reported. */
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	- !vd->vdev_ishole) {
	+ vdev_is_concrete(vd)) {
	zfs_post_autoreplace(vd->vdev_spa, vd);
	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
	}
	}

	/*
	 * Copy the per-vdev ZAP object numbers (vdev_top_zap and vdev_leaf_zap)
	 * from the MOS-config vdev 'mvd' onto 'vd', recursing through the
	 * children pairwise. Both trees are expected to have the same shape.
	 */
	static void
	spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
	{
		ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

		vd->vdev_top_zap = mvd->vdev_top_zap;
		vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

		for (uint64_t i = 0; i < vd->vdev_children; i++) {
			spa_config_valid_zaps(vd->vdev_child[i],
			    mvd->vdev_child[i]);
		}
	}

	/*
	* Validate the current config against the MOS config
	*
	* 'config' is parsed into a temporary vdev tree (mrvd) which is compared,
	* top-level vdev by top-level vdev, with the pool's current root vdev.
	* Returns B_TRUE when, after any fix-ups, the root vdev's guid sum matches
	* the one recorded in the uberblock.
	*/
	static boolean_t
	spa_config_valid(spa_t *spa, nvlist_t *config)
	{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	* If we're doing a normal import, then build up any additional
	* diagnostic information about missing devices in this config.
	* We'll pass this up to the user for further processing.
	*/
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
	nvlist_t **child, *nv;
	uint64_t idx = 0;

	child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
	KM_SLEEP);
	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	/* Collect configs for log vdevs that are missing in the current tree. */
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	vdev_t *mtvd = mrvd->vdev_child[c];

	if (tvd->vdev_ops == &vdev_missing_ops &&
	mtvd->vdev_ops != &vdev_missing_ops &&
	mtvd->vdev_islog)
	child[idx++] = vdev_config_generate(spa, mtvd,
	B_FALSE, 0);
	}

	if (idx) {
	VERIFY(nvlist_add_nvlist_array(nv,
	ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
	VERIFY(nvlist_add_nvlist(spa->spa_load_info,
	ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

	for (int i = 0; i < idx; i++)
	nvlist_free(child[i]);
	}
	nvlist_free(nv);
	kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	* Compare the root vdev tree with the information we have
	* from the MOS config (mrvd). Check each top-level vdev
	* with the corresponding MOS config top-level (mtvd).
	*/
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	vdev_t *mtvd = mrvd->vdev_child[c];

	/*
	* Resolve any "missing" vdevs in the current configuration.
	+ * Also trust the MOS config about any "indirect" vdevs.
	* If we find that the MOS config has more accurate information
	* about the top-level vdev then use that vdev instead.
	*/
	- if (tvd->vdev_ops == &vdev_missing_ops &&
	- mtvd->vdev_ops != &vdev_missing_ops) {
	+ if ((tvd->vdev_ops == &vdev_missing_ops &&
	+ mtvd->vdev_ops != &vdev_missing_ops) ||
	+ (mtvd->vdev_ops == &vdev_indirect_ops &&
	+ tvd->vdev_ops != &vdev_indirect_ops)) {

	- if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
	- continue;
	-
	/*
	* Device specific actions.
	*/
	if (mtvd->vdev_islog) {
	+ if (!(spa->spa_import_flags &
	+ ZFS_IMPORT_MISSING_LOG)) {
	+ continue;
	+ }
	+
	spa_set_log_state(spa, SPA_LOG_CLEAR);
	- } else {
	- /*
	- * XXX - once we have 'readonly' pool
	- * support we should be able to handle
	- * missing data devices by transitioning
	- * the pool to readonly.
	- */
	+ } else if (mtvd->vdev_ops != &vdev_indirect_ops) {
	continue;
	}

	/*
	* Swap the missing vdev with the data we were
	* able to obtain from the MOS config.
	*/
	vdev_remove_child(rvd, tvd);
	vdev_remove_child(mrvd, mtvd);

	vdev_add_child(rvd, mtvd);
	vdev_add_child(mrvd, tvd);

	- spa_config_exit(spa, SCL_ALL, FTAG);
	- vdev_load(mtvd);
	- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	-
	vdev_reopen(rvd);
	} else {
	if (mtvd->vdev_islog) {
	/*
	* Load the slog device's state from the MOS
	* config since it's possible that the label
	* does not contain the most up-to-date
	* information.
	*/
	vdev_load_log_state(tvd, mtvd);
	vdev_reopen(tvd);
	}

	/*
	* Per-vdev ZAP info is stored exclusively in the MOS.
	*/
	spa_config_valid_zaps(tvd, mtvd);
	}
	+
	+ /*
	+ * Never trust this info from userland; always use what's
	+ * in the MOS. This prevents it from getting out of sync
	+ * with the rest of the info in the MOS.
	+ */
	+ tvd->vdev_removing = mtvd->vdev_removing;
	+ tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
	}

	/* The temporary MOS tree (holding any swapped-out vdevs) is discarded. */
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	* Ensure we were able to validate the config.
	*/
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
	}

	/*
	 * Check for missing log devices.
	 *
	 * When the log state is SPA_LOG_UNKNOWN or SPA_LOG_MISSING (the latter is
	 * rechecked in case the slog has been restored), zil_check_log_chain is
	 * run over every dataset below the pool's root directory. A failure sets
	 * the log state to SPA_LOG_MISSING and B_TRUE is returned; in every other
	 * case the result is B_FALSE.
	 */
	static boolean_t
	spa_check_logs(spa_t *spa)
	{
		dsl_pool_t *pool = spa_get_dsl(spa);
		boolean_t missing = B_FALSE;

		switch (spa->spa_log_state) {
		case SPA_LOG_MISSING:
		case SPA_LOG_UNKNOWN:
			if (dmu_objset_find_dp(pool, pool->dp_root_dir_obj,
			    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0) {
				missing = B_TRUE;
				spa_set_log_state(spa, SPA_LOG_MISSING);
			}
			break;
		}

		return (missing);
	}

	/*
	 * Passivate the metaslab group of every top-level log vdev.
	 *
	 * Returns B_TRUE when at least one log vdev was found, B_FALSE otherwise
	 * (including when spa_has_slogs() reports none).
	 */
	static boolean_t
	spa_passivate_log(spa_t *spa)
	{
		vdev_t *root = spa->spa_root_vdev;
		boolean_t found = B_FALSE;

		ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

		if (spa_has_slogs(spa)) {
			for (int idx = 0; idx < root->vdev_children; idx++) {
				vdev_t *top = root->vdev_child[idx];

				if (!top->vdev_islog)
					continue;

				metaslab_group_passivate(top->vdev_mg);
				found = B_TRUE;
			}
		}

		return (found);
	}

	/*
	 * Activate the metaslab group of every top-level log vdev.
	 */
	static void
	spa_activate_log(spa_t *spa)
	{
		vdev_t *root = spa->spa_root_vdev;

		ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

		for (int idx = 0; idx < root->vdev_children; idx++) {
			vdev_t *top = root->vdev_child[idx];

			if (!top->vdev_islog)
				continue;

			metaslab_group_activate(top->vdev_mg);
		}
	}

	/*
	* Run the per-dataset ZIL callback over the pool's datasets
	* (DS_FIND_CHILDREN) and, if that succeeds, wait for the current txg to be
	* synced. Returns the dmu_objset_find() result.
	*/
	int
	-spa_offline_log(spa_t *spa)
	+spa_reset_logs(spa_t *spa)
	{
	int error;

	- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	+ error = dmu_objset_find(spa_name(spa), zil_reset,
	NULL, DS_FIND_CHILDREN);
	if (error == 0) {
	/*
	* We successfully offlined the log device, sync out the
	* current txg so that the "stubby" block can be removed
	* by zil_sync().
	*/
	txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
	}

	/*
	 * Run spa_check_removed() on every vdev of the given auxiliary vdev list.
	 */
	static void
	spa_aux_check_removed(spa_aux_vdev_t *sav)
	{
		int idx = 0;

		while (idx < sav->sav_count) {
			spa_check_removed(sav->sav_vdevs[idx]);
			idx++;
		}
	}

	/*
	 * zio callback: for a zio that completed without error, raise
	 * spa_claim_max_txg to the birth txg of the zio's block pointer if that
	 * is larger. Zios with an error are ignored.
	 */
	void
	spa_claim_notify(zio_t *zio)
	{
		spa_t *pool = zio->io_spa;

		if (zio->io_error == 0) {
			/* any mutex will do */
			mutex_enter(&pool->spa_props_lock);
			if (zio->io_bp->blk_birth > pool->spa_claim_max_txg)
				pool->spa_claim_max_txg = zio->io_bp->blk_birth;
			mutex_exit(&pool->spa_props_lock);
		}
	}

	/*
	 * Error counters filled in by spa_load_verify_done() while a pool is
	 * being verified.
	 */
	struct spa_load_error {
		uint64_t sle_meta_count;	/* failed metadata reads */
		uint64_t sle_data_count;	/* all other failed reads */
	};
	typedef struct spa_load_error spa_load_error_t;

	/*
	 * Completion callback for the reads issued by spa_load_verify_cb().
	 *
	 * Frees the read buffer, accounts a failed read in the spa_load_error_t
	 * passed through io_private, and drops the in-flight count taken by
	 * spa_load_verify_cb(), waking anyone waiting on spa_scrub_io_cv.
	 */
	static void
	spa_load_verify_done(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;
		spa_load_error_t *sle = zio->io_private;
		dmu_object_type_t type = BP_GET_TYPE(bp);
		int error = zio->io_error;
		spa_t *spa = zio->io_spa;

		abd_free(zio->io_abd);
		if (error) {
			/*
			 * Indirect blocks and metadata object types count as
			 * metadata errors, except for intent-log blocks, which
			 * are counted with the data errors.
			 */
			if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
			    type != DMU_OT_INTENT_LOG)
				atomic_inc_64(&sle->sle_meta_count);
			else
				atomic_inc_64(&sle->sle_data_count);
		}

		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_inflight--;
		cv_broadcast(&spa->spa_scrub_io_cv);
		mutex_exit(&spa->spa_scrub_lock);
	}

	/*
	* Maximum number of concurrent scrub i/os to create while verifying
	* a pool while importing it.
	*
	* spa_load_verify_metadata and spa_load_verify_data select whether
	* metadata and user data, respectively, are checked on import. All three
	* tunables are exported below as vfs.zfs sysctls.
	*/
	int spa_load_verify_maxinflight = 10000;
	boolean_t spa_load_verify_metadata = B_TRUE;
	boolean_t spa_load_verify_data = B_TRUE;

	SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
	&spa_load_verify_maxinflight, 0,
	"Maximum number of concurrent scrub I/Os to create while verifying a "
	"pool while importing it");

	SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
	&spa_load_verify_metadata, 0,
	"Check metadata on import?");

	SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
	&spa_load_verify_data, 0,
	"Check user data on import?");

	/ARGSUSED/
	static int
	spa_load_verify_cb(spa_t spa, zilog_t zilog, const blkptr_t *bp,
	const zbookmark_phys_t zb, const dnode_phys_t dnp, void *arg)
	{
	if (bp == NULL \|\| BP_IS_HOLE(bp) \|\| BP_IS_EMBEDDED(bp))
	return (0);
	/*
	* Note: normally this routine will not be called if
	* spa_load_verify_metadata is not set. However, it may be useful
	* to manually set the flag after the traversal has begun.
	*/
	if (!spa_load_verify_metadata)
	return (0);
	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
	return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
	cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
	spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	ZIO_FLAG_SPECULATIVE \| ZIO_FLAG_CANFAIL \|
	ZIO_FLAG_SCRUB \| ZIO_FLAG_RAW, zb));
	return (0);
	}

	/*
	 * dmu_objset_find_dp() callback: fail with ENAMETOOLONG when the name of
	 * dataset 'ds' is ZFS_MAX_DATASET_NAME_LEN characters or longer. 'dp' and
	 * 'arg' are unused.
	 */
	/* ARGSUSED */
	int
	verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
	{
		if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));

		return (0);
	}

	/*
	 * Verify a pool that is being loaded.
	 *
	 * Nothing is checked when the rewind policy requests
	 * ZPOOL_NEVER_REWIND. Otherwise every dataset name length is validated
	 * and, if spa_load_verify_metadata is set, the pool is traversed from
	 * spa_verify_min_txg with spa_load_verify_cb() issuing reads. The
	 * resulting error counts are stored in the spa and compared with the
	 * policy's zrp_maxmeta / zrp_maxdata limits.
	 *
	 * Returns 0 when verification passed, the dataset-name error if that
	 * check failed, ENXIO or EIO for a traversal error (any other traversal
	 * error is mapped to EIO), or EIO when too many blocks were unreadable.
	 */
	static int
	spa_load_verify(spa_t *spa)
	{
		zio_t *rio;
		spa_load_error_t sle = { 0 };
		zpool_rewind_policy_t policy;
		boolean_t verify_ok = B_FALSE;
		int error = 0;

		zpool_get_rewind_policy(spa->spa_config, &policy);

		if (policy.zrp_request & ZPOOL_NEVER_REWIND)
			return (0);

		dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
		error = dmu_objset_find_dp(spa->spa_dsl_pool,
		    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len,
		    NULL, DS_FIND_CHILDREN);
		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
		if (error != 0)
			return (error);

		/* Root zio; its private pointer carries the error counters. */
		rio = zio_root(spa, NULL, &sle,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

		if (spa_load_verify_metadata) {
			error = traverse_pool(spa, spa->spa_verify_min_txg,
			    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
			    spa_load_verify_cb, rio);
		}

		/* Wait for every read issued by the callback to complete. */
		(void) zio_wait(rio);

		spa->spa_load_meta_errors = sle.sle_meta_count;
		spa->spa_load_data_errors = sle.sle_data_count;

		if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
		    sle.sle_data_count <= policy.zrp_maxdata) {
			int64_t loss = 0;

			verify_ok = B_TRUE;
			spa->spa_load_txg = spa->spa_uberblock.ub_txg;
			spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

			/* Time between the last synced uberblock and this one. */
			loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
			VERIFY(nvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
			VERIFY(nvlist_add_int64(spa->spa_load_info,
			    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
			VERIFY(nvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_LOAD_DATA_ERRORS,
			    sle.sle_data_count) == 0);
		} else {
			spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
		}

		if (error) {
			if (error != ENXIO && error != EIO)
				error = SET_ERROR(EIO);
			return (error);
		}

		return (verify_ok ? 0 : EIO);
	}

	/*
	 * Find a value in the pool props object.
	 *
	 * Looks up the uint64 stored under the name of 'prop' and writes it to
	 * '*val'. The zap_lookup() result is deliberately discarded, so '*val' is
	 * only written by a successful lookup; callers should preset it to the
	 * value they want when the property is absent.
	 */
	static void
	spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
	{
		(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
		    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
	}

	/*
	 * Find a value in the pool directory object.
	 *
	 * Looks up the uint64 stored under 'name' in DMU_POOL_DIRECTORY_OBJECT
	 * and writes it to '*val'. Returns the zap_lookup() result.
	 */
	static int
	spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
	{
		return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    name, sizeof (uint64_t), 1, val));
	}

	/*
	* Put 'vdev' into VDEV_STATE_CANT_OPEN with the given aux reason and hand
	* 'err' back to the caller, so a failure can be recorded and returned in a
	* single statement.
	*/
	static int
	spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
	{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	- return (err);
	+ return (SET_ERROR(err));
	}

	/*
	 * Fix up config after a partly-completed split. This is done with the
	 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
	 * pool have that entry in their config, but only the splitting one
	 * contains a list of all the guids of the vdevs that are being split off.
	 *
	 * This function determines what to do with that list: either rejoin
	 * all the disks to the pool, or complete the splitting process. To
	 * attempt the rejoin, each disk that is offlined is marked online again,
	 * and we do a reopen() call. If the vdev label for every disk that was
	 * marked online indicates it was successfully split off
	 * (VDEV_AUX_SPLIT_POOL) then we call vdev_split() on each disk, and
	 * complete the split.
	 *
	 * Otherwise we leave the config alone, with all the vdevs in place in
	 * the original pool.
	 */
	static void
	spa_try_repair(spa_t *spa, nvlist_t *config)
	{
		/* Only meaningful after a reopen; 0 keeps it defined otherwise. */
		uint_t extracted = 0;
		uint64_t *glist;
		uint_t i, gcount;
		nvlist_t *nvl;
		vdev_t **vd;
		boolean_t attempt_reopen;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
			return;

		/* check that the config is complete */
		if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
		    &glist, &gcount) != 0)
			return;

		/* Zeroed, so slots for holes and unknown guids stay NULL. */
		vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

		/* attempt to online all the vdevs & validate */
		attempt_reopen = B_TRUE;
		for (i = 0; i < gcount; i++) {
			if (glist[i] == 0)	/* vdev is hole */
				continue;

			vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
			if (vd[i] == NULL) {
				/*
				 * Don't bother attempting to reopen the disks;
				 * just do the split.
				 */
				attempt_reopen = B_FALSE;
			} else {
				/* attempt to re-online it */
				vd[i]->vdev_offline = B_FALSE;
			}
		}

		if (attempt_reopen) {
			vdev_reopen(spa->spa_root_vdev);

			/*
			 * check each device to see what state it's in; stop at
			 * the first one that was not split off, so 'extracted'
			 * only reaches gcount when all of them were.
			 */
			for (extracted = 0, i = 0; i < gcount; i++) {
				if (vd[i] != NULL &&
				    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
					break;
				++extracted;
			}
		}

		/*
		 * If every disk has been moved to the new pool, or if we never
		 * even attempted to look at them, then we split them off for
		 * good.
		 */
		if (!attempt_reopen || gcount == extracted) {
			for (i = 0; i < gcount; i++)
				if (vd[i] != NULL)
					vdev_split(vd[i]);
			vdev_reopen(spa->spa_root_vdev);
		}

		kmem_free(vd, gcount * sizeof (vdev_t *));
	}

	/*
	 * Load a pool from the configuration in spa->spa_config.
	 *
	 * Pulls the pool guid, comment, version and config txg out of the
	 * config, rejects an import or tryimport of a guid that already exists
	 * (EEXIST), and otherwise hands off to spa_load_impl(). Afterwards the
	 * minimum reference count is recorded, an ereport is posted for any
	 * error other than EBADF, and spa_load_state is set to SPA_LOAD_ERROR or
	 * SPA_LOAD_NONE.
	 *
	 * Returns 0 on success, EINVAL when the config has no pool guid, EEXIST
	 * as described above, or the spa_load_impl() error.
	 */
	static int
	spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
	    boolean_t mosconfig)
	{
		nvlist_t *config = spa->spa_config;
		char *ereport = FM_EREPORT_ZFS_POOL;
		char *comment;
		int error;
		uint64_t pool_guid;
		nvlist_t *nvl;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid))
			return (SET_ERROR(EINVAL));

		ASSERT(spa->spa_comment == NULL);
		if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT,
		    &comment) == 0)
			spa->spa_comment = spa_strdup(comment);

		/*
		 * Versioning wasn't explicitly added to the label until later,
		 * so if it's not present treat it as the initial version.
		 */
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &spa->spa_ubsync.ub_version) != 0)
			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &spa->spa_config_txg);

		if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
		    spa_guid_exists(pool_guid, 0)) {
			error = SET_ERROR(EEXIST);
		} else {
			spa->spa_config_guid = pool_guid;

			/* Keep a private copy of any pending split description. */
			if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
			    &nvl) == 0) {
				VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
				    KM_SLEEP) == 0);
			}

			/* Start each load attempt with fresh load info. */
			nvlist_free(spa->spa_load_info);
			spa->spa_load_info = fnvlist_alloc();

			gethrestime(&spa->spa_loaded_ts);
			error = spa_load_impl(spa, pool_guid, config, state, type,
			    mosconfig, &ereport);
		}

		/*
		 * Don't count references from objsets that are already closed
		 * and are making their way through the eviction process.
		 */
		spa_evicting_os_wait(spa);
		spa->spa_minref = refcount_count(&spa->spa_refcount);
		if (error) {
			/* The load timestamp is kept only for EEXIST failures. */
			if (error != EEXIST) {
				spa->spa_loaded_ts.tv_sec = 0;
				spa->spa_loaded_ts.tv_nsec = 0;
			}
			if (error != EBADF) {
				zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
			}
		}
		spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
		spa->spa_ena = 0;

		return (error);
	}

	/*
	 * Return how many per-vdev ZAPs (top-level and leaf) are referenced by
	 * the vdev tree rooted at 'vd'. On debug builds each ZAP is additionally
	 * asserted to be present in the spa's spa_all_vdev_zaps list.
	 */
	static uint64_t
	vdev_count_verify_zaps(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		uint64_t nzaps = 0;

		if (vd->vdev_top_zap != 0) {
			ASSERT0(zap_lookup_int(spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
			nzaps++;
		}

		if (vd->vdev_leaf_zap != 0) {
			ASSERT0(zap_lookup_int(spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
			nzaps++;
		}

		/* Add the ZAPs of every subtree. */
		for (uint64_t c = 0; c < vd->vdev_children; c++)
			nzaps += vdev_count_verify_zaps(vd->vdev_child[c]);

		return (nzaps);
	}

	/*
	* Load an existing storage pool, using the pool's builtin spa_config as a
	* source of configuration information.
	*/
	static int
	spa_load_impl(spa_t spa, uint64_t pool_guid, nvlist_t config,
	- spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
	+ spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
	char **ereport)
	{
	int error = 0;
	nvlist_t *nvroot = NULL;
	nvlist_t *label;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t children, config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;
	uint64_t obj;
	boolean_t missing_feat_write = B_FALSE;

	/*
	* If this is an untrusted config, access the pool in read-only mode.
	* This prevents things like resilvering recently removed devices.
	*/
	- if (!mosconfig)
	+ if (!trust_config)
	spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
	return (SET_ERROR(EINVAL));

	parse = (type == SPA_IMPORT_EXISTING ?
	VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

	/*
	* Create "The Godfather" zio to hold all async IOs
	*/
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
	spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
	ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \|
	ZIO_FLAG_GODFATHER);
	}

	/*
	* Parse the configuration into a vdev tree. We explicitly set the
	* value that will be returned by spa_version() since parsing the
	* configuration requires knowing the version number.
	*/
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
	return (error);

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);

	if (type != SPA_IMPORT_ASSEMBLE) {
	ASSERT(spa_guid(spa) == pool_guid);
	}

	/*
	* Try to open all vdevs, loading each label in the process.
	*/
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
	return (error);

	/*
	* We need to validate the vdev labels against the configuration that
	* we have in hand, which is dependent on the setting of mosconfig. If
	* mosconfig is true then we're validating the vdev labels based on
	* that config. Otherwise, we're validating against the cached config
	* (zpool.cache) that was read when we loaded the zfs module, and then
	* later we will recursively call spa_load() and validate against
	* the vdev config.
	*
	* If we're assembling a new pool that's been split off from an
	* existing pool, the labels haven't yet been updated so we skip
	* validation for now.
	*/
	if (type != SPA_IMPORT_ASSEMBLE) {
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	- error = vdev_validate(rvd, mosconfig);
	+ error = vdev_validate(rvd, trust_config);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
	return (error);

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
	return (SET_ERROR(ENXIO));
	}

	/*
	* Find the best uberblock.
	*/
	vdev_uberblock_load(rvd, ub, &label);

	/*
	* If we weren't able to find a single valid uberblock, return failure.
	*/
	if (ub->ub_txg == 0) {
	nvlist_free(label);
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
	}

	/*
	* If the pool has an unsupported version we can't open it.
	*/
	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
	nvlist_free(label);
	return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
	}

	if (ub->ub_version >= SPA_VERSION_FEATURES) {
	nvlist_t *features;

	/*
	* If we weren't able to find what's necessary for reading the
	* MOS in the label, return failure.
	*/
	if (label == NULL \|\| nvlist_lookup_nvlist(label,
	ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
	nvlist_free(label);
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
	ENXIO));
	}

	/*
	* Update our in-core representation with the definitive values
	* from the label.
	*/
	nvlist_free(spa->spa_label_features);
	VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
	}

	nvlist_free(label);

	/*
	* Look through entries in the label nvlist's features_for_read. If
	* there is a feature listed there which we don't understand then we
	* cannot open a pool.
	*/
	if (ub->ub_version >= SPA_VERSION_FEATURES) {
	nvlist_t *unsup_feat;

	VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
	0);

	for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
	NULL); nvp != NULL;
	nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
	if (!zfeature_is_supported(nvpair_name(nvp))) {
	VERIFY(nvlist_add_string(unsup_feat,
	nvpair_name(nvp), "") == 0);
	}
	}

	if (!nvlist_empty(unsup_feat)) {
	VERIFY(nvlist_add_nvlist(spa->spa_load_info,
	ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
	nvlist_free(unsup_feat);
	return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
	ENOTSUP));
	}

	nvlist_free(unsup_feat);
	}

	/*
	* If the vdev guid sum doesn't match the uberblock, we have an
	* incomplete configuration. We first check to see if the pool
	* is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
	* If it is, defer the vdev_guid_sum check till later so we
	* can handle missing vdevs.
	*/
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	- &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
	+ &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE &&
	rvd->vdev_guid_sum != ub->ub_guid_sum)
	return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_try_repair(spa, config);
	spa_config_exit(spa, SCL_ALL, FTAG);
	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	}

	/*
	* Initialize internal SPA structures.
	*/
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;
	spa->spa_prev_software_version = ub->ub_software_version;

	+ /*
	+ * Everything that we read before we do spa_remove_init() must
	+ * have been rewritten after the last device removal was initiated.
	+ * Otherwise we could be reading from indirect vdevs before
	+ * we have loaded their mappings.
	+ */
	+
	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	+ /*
	+ * Validate the config, using the MOS config to fill in any
	+ * information which might be missing. If we fail to validate
	+ * the config then declare the pool unfit for use. If we're
	+ * assembling a pool from a split, the log is not transferred
	+ * over.
	+ */
	+ if (type != SPA_IMPORT_ASSEMBLE) {
	+ nvlist_t *mos_config;
	+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
	+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	+
	+ if (!spa_config_valid(spa, mos_config)) {
	+ nvlist_free(mos_config);
	+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
	+ ENXIO));
	+ }
	+ nvlist_free(mos_config);
	+
	+ /*
	+ * Now that we've validated the config, check the state of the
	+ * root vdev. If it can't be opened, it indicates one or
	+ * more toplevel vdevs are faulted.
	+ */
	+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
	+ return (SET_ERROR(ENXIO));
	+ }
	+
	+ /*
	+ * Everything that we read before spa_remove_init() must be stored
	+ * on concreted vdevs. Therefore we do this as early as possible.
	+ */
	+ if (spa_remove_init(spa) != 0)
	+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	+
	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
	boolean_t missing_feat_read = B_FALSE;
	nvlist_t unsup_feat, enabled_feat;

	if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
	&spa->spa_feat_for_read_obj) != 0) {
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
	&spa->spa_feat_for_write_obj) != 0) {
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
	&spa->spa_feat_desc_obj) != 0) {
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	enabled_feat = fnvlist_alloc();
	unsup_feat = fnvlist_alloc();

	if (!spa_features_check(spa, B_FALSE,
	unsup_feat, enabled_feat))
	missing_feat_read = B_TRUE;

	if (spa_writeable(spa) \|\| state == SPA_LOAD_TRYIMPORT) {
	if (!spa_features_check(spa, B_TRUE,
	unsup_feat, enabled_feat)) {
	missing_feat_write = B_TRUE;
	}
	}

	fnvlist_add_nvlist(spa->spa_load_info,
	ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

	if (!nvlist_empty(unsup_feat)) {
	fnvlist_add_nvlist(spa->spa_load_info,
	ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
	}

	fnvlist_free(enabled_feat);
	fnvlist_free(unsup_feat);

	if (!missing_feat_read) {
	fnvlist_add_boolean(spa->spa_load_info,
	ZPOOL_CONFIG_CAN_RDONLY);
	}

	/*
	* If the state is SPA_LOAD_TRYIMPORT, our objective is
	* twofold: to determine whether the pool is available for
	* import in read-write mode and (if it is not) whether the
	* pool is available for import in read-only mode. If the pool
	* is available for import in read-write mode, it is displayed
	* as available in userland; if it is not available for import
	* in read-only mode, it is displayed as unavailable in
	* userland. If the pool is available for import in read-only
	* mode but not read-write mode, it is displayed as unavailable
	* in userland with a special note that the pool is actually
	* available for open in read-only mode.
	*
	* As a result, if the state is SPA_LOAD_TRYIMPORT and we are
	* missing a feature for write, we must first determine whether
	* the pool can be opened read-only before returning to
	* userland in order to know whether to display the
	* abovementioned note.
	*/
	if (missing_feat_read \|\| (missing_feat_write &&
	spa_writeable(spa))) {
	return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
	ENOTSUP));
	}

	/*
	* Load refcounts for ZFS features from disk into an in-memory
	* cache during SPA initialization.
	*/
	for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
	uint64_t refcount;

	error = feature_get_refcount_from_disk(spa,
	&spa_feature_table[i], &refcount);
	if (error == 0) {
	spa->spa_feat_refcount_cache[i] = refcount;
	} else if (error == ENOTSUP) {
	spa->spa_feat_refcount_cache[i] =
	SPA_FEATURE_DISABLED;
	} else {
	return (spa_vdev_err(rvd,
	VDEV_AUX_CORRUPT_DATA, EIO));
	}
	}
	}

	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
	if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
	&spa->spa_feat_enabled_txg_obj) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	spa->spa_is_initializing = B_TRUE;
	error = dsl_pool_open(spa->spa_dsl_pool);
	spa->spa_is_initializing = B_FALSE;
	if (error != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	- if (!mosconfig) {
	+ if (!trust_config) {
	uint64_t hostid;
	- nvlist_t policy = NULL, nvconfig;
	+ nvlist_t *policy = NULL;
	+ nvlist_t *mos_config;

	- if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
	+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	- if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
	+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
	ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
	char *hostname;
	unsigned long myhostid = 0;

	- VERIFY(nvlist_lookup_string(nvconfig,
	+ VERIFY(nvlist_lookup_string(mos_config,
	ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

	#ifdef _KERNEL
	myhostid = zone_get_hostid(NULL);
	#else /* _KERNEL */
	/*
	* We're emulating the system's hostid in userland, so
	* we can't use zone_get_hostid().
	*/
	(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
	#endif /* _KERNEL */
	if (check_hostid && hostid != 0 && myhostid != 0 &&
	hostid != myhostid) {
	- nvlist_free(nvconfig);
	+ nvlist_free(mos_config);
	cmn_err(CE_WARN, "pool '%s' could not be "
	"loaded as it was last accessed by "
	"another system (host: %s hostid: 0x%lx). "
	"See: http://illumos.org/msg/ZFS-8000-EY",
	spa_name(spa), hostname,
	(unsigned long)hostid);
	return (SET_ERROR(EBADF));
	}
	}
	if (nvlist_lookup_nvlist(spa->spa_config,
	ZPOOL_REWIND_POLICY, &policy) == 0)
	- VERIFY(nvlist_add_nvlist(nvconfig,
	+ VERIFY(nvlist_add_nvlist(mos_config,
	ZPOOL_REWIND_POLICY, policy) == 0);

	- spa_config_set(spa, nvconfig);
	+ spa_config_set(spa, mos_config);
	spa_unload(spa);
	spa_deactivate(spa);
	spa_activate(spa, orig_mode);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
	}

	/* Grab the secret checksum salt from the MOS. */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_CHECKSUM_SALT, 1,
	sizeof (spa->spa_cksum_salt.zcs_bytes),
	spa->spa_cksum_salt.zcs_bytes);
	if (error == ENOENT) {
	/* Generate a new salt for subsequent use */
	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
	sizeof (spa->spa_cksum_salt.zcs_bytes));
	} else if (error != 0) {
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	* Load the bit that tells us to use the new accounting function
	* (raid-z deflation). If we have an older pool, this will not
	* be present.
	*/
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	&spa->spa_creation_version);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	* Load the persistent error log. If we have an older pool, this will
	* not be present.
	*/
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	&spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	* Load the history object. If we have an older pool, this
	* will not be present.
	*/
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	* Load the per-vdev ZAP map. If we have an older pool, this will not
	* be present; in this case, defer its creation to a later time to
	* avoid dirtying the MOS this early / out of sync context. See
	* spa_sync_config_object.
	*/

	/* The sentinel is only available in the MOS config. */
	nvlist_t *mos_config;
	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
	&spa->spa_all_vdev_zaps);

	if (error == ENOENT) {
	VERIFY(!nvlist_exists(mos_config,
	ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
	spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
	ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	} else if (error != 0) {
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
	/*
	* An older version of ZFS overwrote the sentinel value, so
	* we have orphaned per-vdev ZAPs in the MOS. Defer their
	* destruction to later; see spa_sync_config_object.
	*/
	spa->spa_avz_action = AVZ_ACTION_DESTROY;
	/*
	* We're assuming that no vdevs have had their ZAPs created
	* before this. Better be sure of it.
	*/
	ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	}
	nvlist_free(mos_config);

	/*
	* If we're assembling the pool from the split-off vdevs of
	* an existing pool, we don't want to attach the spares & cache
	* devices.
	*/

	/*
	* Load any hot spares for this pool.
	*/
	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
	ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
	if (load_nvlist(spa, spa->spa_spares.sav_object,
	&spa->spa_spares.sav_config) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_spares(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
	spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	* Load any level 2 ARC devices for this pool.
	*/
	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
	&spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
	ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
	if (load_nvlist(spa, spa->spa_l2cache.sav_object,
	&spa->spa_l2cache.sav_config) != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_l2cache(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
	uint64_t autoreplace;

	spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
	spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
	spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
	spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
	spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
	spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize);
	spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
	&spa->spa_dedup_ditto);

	spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	* If the 'autoreplace' property is set, then post a resource notifying
	* the ZFS DE that it should not issue any faults for unopenable
	* devices. We also iterate over the vdevs, and post a sysevent for any
	* unopenable vdevs so that the normal autoreplace handler can take
	* over.
	*/
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
	spa_check_removed(spa->spa_root_vdev);
	/*
	* For the import case, this is done in spa_import(), because
	* at this point we're using the spare definitions from
	* the MOS config, not necessarily from the userland config.
	*/
	if (state != SPA_LOAD_IMPORT) {
	spa_aux_check_removed(&spa->spa_spares);
	spa_aux_check_removed(&spa->spa_l2cache);
	}
	}

	/*
	* Load the vdev state for all toplevel vdevs.
	*/
	- vdev_load(rvd);
	+ error = vdev_load(rvd);
	+ if (error != 0) {
	+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	+ }

	+ error = spa_condense_init(spa);
	+ if (error != 0) {
	+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	+ }
	+
	/*
	* Propagate the leaf DTLs we just loaded all the way up the tree.
	*/
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	* Load the DDTs (dedup tables).
	*/
	error = ddt_load(spa);
	if (error != 0)
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	- /*
	- * Validate the config, using the MOS config to fill in any
	- * information which might be missing. If we fail to validate
	- * the config then declare the pool unfit for use. If we're
	- * assembling a pool from a split, the log is not transferred
	- * over.
	- */
	- if (type != SPA_IMPORT_ASSEMBLE) {
	- nvlist_t *nvconfig;
	-
	- if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
	- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	-
	- if (!spa_config_valid(spa, nvconfig)) {
	- nvlist_free(nvconfig);
	- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
	- ENXIO));
	- }
	- nvlist_free(nvconfig);
	-
	- /*
	- * Now that we've validated the config, check the state of the
	- * root vdev. If it can't be opened, it indicates one or
	- * more toplevel vdevs are faulted.
	- */
	- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
	- return (SET_ERROR(ENXIO));
	-
	- if (spa_writeable(spa) && spa_check_logs(spa)) {
	- *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
	- return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
	- }
	+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) &&
	+ spa_check_logs(spa)) {
	+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
	+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
	}

	if (missing_feat_write) {
	ASSERT(state == SPA_LOAD_TRYIMPORT);

	/*
	* At this point, we know that we can open the pool in
	* read-only mode but not read-write mode. We now have enough
	* information and can return to userland.
	*/
	return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
	}

	/*
	* We've successfully opened the pool, verify that we're ready
	* to start pushing transactions.
	*/
	if (state != SPA_LOAD_TRYIMPORT) {
	if (error = spa_load_verify(spa))
	return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
	error));
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER \|\|
	spa->spa_load_max_txg == UINT64_MAX)) {
	dmu_tx_t *tx;
	int need_update = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	+ /*
	+ * We must check this before we start the sync thread, because
	+ * we only want to start a condense thread for condense
	+ * operations that were in progress when the pool was
	+ * imported. Once we start syncing, spa_sync() could
	+ * initiate a condense (and start a thread for it). In
	+ * that case it would be wrong to start a second
	+ * condense thread.
	+ */
	+ boolean_t condense_in_progress =
	+ (spa->spa_condensing_indirect != NULL);
	+
	ASSERT(state != SPA_LOAD_TRYIMPORT);

	/*
	* Claim log blocks that haven't been committed yet.
	* This must all happen in a single txg.
	* Note: spa_claim_max_txg is updated by spa_claim_notify(),
	* invoked from zil_claim_log_block()'s i/o done callback.
	* Price of rollback is that we abandon the log.
	*/
	spa->spa_claiming = B_TRUE;

	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	zil_claim, tx, DS_FIND_CHILDREN);
	dmu_tx_commit(tx);

	spa->spa_claiming = B_FALSE;

	spa_set_log_state(spa, SPA_LOG_GOOD);
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	* Wait for all claims to sync. We sync up to the highest
	* claimed log block birth time so that claimed log blocks
	* don't appear to be from the future. spa_claim_max_txg
	* will have been set for us by either zil_check_log_chain()
	* (invoked from spa_check_logs()) or zil_claim() above.
	*/
	txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

	/*
	* If the config cache is stale, or we have uninitialized
	* metaslabs (see spa_vdev_add()), then update the config.
	*
	* If this is a verbatim import, trust the current
	* in-core spa_config and update the disk labels.
	*/
	if (config_cache_txg != spa->spa_config_txg \|\|
	state == SPA_LOAD_IMPORT \|\|
	state == SPA_LOAD_RECOVER \|\|
	(spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
	need_update = B_TRUE;

	for (int c = 0; c < rvd->vdev_children; c++)
	if (rvd->vdev_child[c]->vdev_ms_array == 0)
	need_update = B_TRUE;

	/*
	* Update the config cache asynchronously in case we're the
	* root pool, in which case the config cache isn't writable yet.
	*/
	if (need_update)
	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

	/*
	* Check all DTLs to see if anything needs resilvering.
	*/
	if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
	vdev_resilver_needed(rvd, NULL, NULL))
	spa_async_request(spa, SPA_ASYNC_RESILVER);

	/*
	* Log the fact that we booted up (so that we can detect if
	* we rebooted in the middle of an operation).
	*/
	spa_history_log_version(spa, "open");

	/*
	* Delete any inconsistent datasets.
	*/
	(void) dmu_objset_find(spa_name(spa),
	dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

	/*
	* Clean up any stale temporary dataset userrefs.
	*/
	dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	+
	+ /*
	+ * Note: unlike condensing, we don't need an analogous
	+ * "removal_in_progress" dance because no other thread
	+ * can start a removal while we hold the spa_namespace_lock.
	+ */
	+ spa_restart_removal(spa);
	+
	+ if (condense_in_progress)
	+ spa_condense_indirect_restart(spa);
	}

	return (0);
	}

	/*
	 * Tear the pool down and run spa_load() again with a lower txg ceiling.
	 *
	 * The pool is unloaded and deactivated, spa_load_max_txg is set to one
	 * less than spa_uberblock.ub_txg, and the pool is re-activated with the
	 * mode it had on entry.  Async requests are suspended before the load
	 * is retried.  Returns whatever spa_load() returns.
	 */
	static int
	spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
	{
		/* Remember the open mode; it is needed to re-activate below. */
		int saved_mode = spa->spa_mode;

		spa_unload(spa);
		spa_deactivate(spa);

		/* Cap the next attempt just below the current uberblock's txg. */
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;

		spa_activate(spa, saved_mode);
		spa_async_suspend(spa);

		return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
	}

	/*
	 * If spa_load() fails this function will try loading prior txg's. If
	 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
	 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
	 * function will not rewind the pool and will return the same error as
	 * spa_load().
	 *
	 * max_request is the txg ceiling for the first attempt (UINT64_MAX for
	 * "no limit"); rewind_flags carries the ZPOOL_*_REWIND policy bits.
	 */
	static int
	spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
	    uint64_t max_request, int rewind_flags)
	{
		nvlist_t *loadinfo = NULL;
		nvlist_t *config = NULL;
		int load_error, rewind_error;
		uint64_t safe_rewind_txg;
		uint64_t min_txg;

		if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
			spa->spa_load_max_txg = spa->spa_load_txg;
			spa_set_log_state(spa, SPA_LOG_CLEAR);
		} else {
			spa->spa_load_max_txg = max_request;
			if (max_request != UINT64_MAX)
				spa->spa_extreme_rewind = B_TRUE;
		}

		/* First attempt: if it succeeds there is nothing to rewind. */
		load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
		    mosconfig);
		if (load_error == 0)
			return (0);

		/*
		 * Snapshot the config produced by the failed attempt (only
		 * possible if a root vdev exists) so it can be restored later.
		 */
		if (spa->spa_root_vdev != NULL)
			config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

		spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
		spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

		if (rewind_flags & ZPOOL_NEVER_REWIND) {
			nvlist_free(config);
			return (load_error);
		}

		if (state == SPA_LOAD_RECOVER) {
			/* Price of rolling back is discarding txgs, including log */
			spa_set_log_state(spa, SPA_LOG_CLEAR);
		} else {
			/*
			 * If we aren't rolling back save the load info from our first
			 * import attempt so that we can restore it after attempting
			 * to rewind.
			 */
			loadinfo = spa->spa_load_info;
			spa->spa_load_info = fnvlist_alloc();
		}

		spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
		safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
		min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
		    TXG_INITIAL : safe_rewind_txg;

		/*
		 * Continue as long as we're finding errors, we're still within
		 * the acceptable rewind range, and we're still finding uberblocks
		 */
		while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
		    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
			if (spa->spa_load_max_txg < safe_rewind_txg)
				spa->spa_extreme_rewind = B_TRUE;
			rewind_error = spa_load_retry(spa, state, mosconfig);
		}

		spa->spa_extreme_rewind = B_FALSE;
		spa->spa_load_max_txg = UINT64_MAX;

		/*
		 * Restore the saved config unless a recovery rewind succeeded;
		 * otherwise the snapshot is no longer needed.
		 */
		if (config && (rewind_error || state != SPA_LOAD_RECOVER))
			spa_config_set(spa, config);
		else
			nvlist_free(config);

		if (state == SPA_LOAD_RECOVER) {
			ASSERT3P(loadinfo, ==, NULL);
			return (rewind_error);
		} else {
			/* Store the rewind info as part of the initial load info */
			fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
			    spa->spa_load_info);

			/* Restore the initial load info */
			fnvlist_free(spa->spa_load_info);
			spa->spa_load_info = loadinfo;

			return (load_error);
		}
	}

	/*
	 * Pool Open/Import
	 *
	 * The import case is identical to an open except that the configuration
	 * is sent down from userland, instead of grabbed from the configuration
	 * cache.  For the case of an open, the pool configuration will exist in
	 * the POOL_STATE_UNINITIALIZED state.
	 *
	 * The stats information (gen/count/ustats) is used to gather vdev
	 * statistics at the same time we open the pool, without having to keep
	 * around the spa_t in some ambiguous state.
	 *
	 *	pool		name handed to spa_lookup()
	 *	spapp		out: the opened spa_t on success, NULL otherwise
	 *	tag		holder tag handed to spa_open_ref()
	 *	nvpolicy	optional rewind policy; if NULL, spa->spa_config
	 *			is consulted instead
	 *	config		optional out: receives a generated config on
	 *			success, or a copy of spa_config plus load info
	 *			when the load fails
	 *
	 * Returns 0 on success, ENOENT if the pool is not found (or the load
	 * reported EBADF and the pool was removed from the namespace), or the
	 * error from spa_load_best().
	 */
	static int
	spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
	    nvlist_t **config)
	{
		spa_t *spa;
		spa_load_state_t state = SPA_LOAD_OPEN;
		int error;
		int locked = B_FALSE;
		int firstopen = B_FALSE;

		*spapp = NULL;

		/*
		 * As disgusting as this is, we need to support recursive calls to this
		 * function because dsl_dir_open() is called during spa_load(), and ends
		 * up calling spa_open() again. The real fix is to figure out how to
		 * avoid dsl_dir_open() calling this in the first place.
		 */
		if (mutex_owner(&spa_namespace_lock) != curthread) {
			mutex_enter(&spa_namespace_lock);
			locked = B_TRUE;
		}

		if ((spa = spa_lookup(pool)) == NULL) {
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(ENOENT));
		}

		if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
			zpool_rewind_policy_t policy;

			firstopen = B_TRUE;

			zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
			    &policy);
			if (policy.zrp_request & ZPOOL_DO_REWIND)
				state = SPA_LOAD_RECOVER;

			spa_activate(spa, spa_mode_global);

			if (state != SPA_LOAD_RECOVER)
				spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

			error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
			    policy.zrp_request);

			if (error == EBADF) {
				/*
				 * If vdev_validate() returns failure (indicated by
				 * EBADF), it indicates that one of the vdevs indicates
				 * that the pool has been exported or destroyed. If
				 * this is the case, the config cache is out of sync and
				 * we should remove the pool from the namespace.
				 */
				spa_unload(spa);
				spa_deactivate(spa);
	- spa_config_sync(spa, B_TRUE, B_TRUE);
	+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
				spa_remove(spa);
				if (locked)
					mutex_exit(&spa_namespace_lock);
				return (SET_ERROR(ENOENT));
			}

			if (error) {
				/*
				 * We can't open the pool, but we still have useful
				 * information: the state of each vdev after the
				 * attempted vdev_open(). Return this to the user.
				 */
				if (config != NULL && spa->spa_config) {
					VERIFY(nvlist_dup(spa->spa_config, config,
					    KM_SLEEP) == 0);
					VERIFY(nvlist_add_nvlist(*config,
					    ZPOOL_CONFIG_LOAD_INFO,
					    spa->spa_load_info) == 0);
				}
				spa_unload(spa);
				spa_deactivate(spa);
				spa->spa_last_open_failed = error;
				if (locked)
					mutex_exit(&spa_namespace_lock);
				*spapp = NULL;
				return (error);
			}
		}

		spa_open_ref(spa, tag);

		if (config != NULL)
			*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

		/*
		 * If we've recovered the pool, pass back any information we
		 * gathered while doing the load.  Callers such as spa_open() pass
		 * a NULL config, so there may be nowhere to put it.
		 */
		if (state == SPA_LOAD_RECOVER && config != NULL) {
			VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
			    spa->spa_load_info) == 0);
		}

		if (locked) {
			spa->spa_last_open_failed = 0;
			spa->spa_last_ubsync_txg = 0;
			spa->spa_load_txg = 0;
			mutex_exit(&spa_namespace_lock);
	#ifdef __FreeBSD__
	#ifdef _KERNEL
			if (firstopen)
				zvol_create_minors(spa->spa_name);
	#endif
	#endif
		}

		*spapp = spa;

		return (0);
	}

	/*
	 * Open the named pool, supplying an explicit rewind policy and optionally
	 * collecting a config nvlist.  All arguments are forwarded unchanged to
	 * spa_open_common(), whose return value is returned.
	 */
	int
	spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
	    nvlist_t **config)
	{
		return (spa_open_common(name, spapp, tag, policy, config));
	}

	/*
	 * Open the named pool with no rewind policy and no config output: both
	 * are passed to spa_open_common() as NULL.  Returns what
	 * spa_open_common() returns.
	 */
	int
	spa_open(const char *name, spa_t **spapp, void *tag)
	{
		return (spa_open_common(name, spapp, tag, NULL, NULL));
	}

	/*
	 * Find the pool called 'name' and take an inject reference on it, which
	 * keeps it from being exported or destroyed.  The lookup and the
	 * increment both happen under spa_namespace_lock.
	 *
	 * Returns the spa_t, or NULL if spa_lookup() finds no such pool.
	 */
	spa_t *
	spa_inject_addref(char *name)
	{
		spa_t *found;

		mutex_enter(&spa_namespace_lock);
		found = spa_lookup(name);
		if (found != NULL)
			found->spa_inject_ref++;
		mutex_exit(&spa_namespace_lock);

		return (found);
	}

	/*
	 * Release one inject reference on 'spa'.  The counter is decremented
	 * while holding spa_namespace_lock.
	 */
	void
	spa_inject_delref(spa_t *spa)
	{
		mutex_enter(&spa_namespace_lock);
		spa->spa_inject_ref -= 1;
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Add spares device information to the nvlist.
	 *
	 * The spares array from spa_spares.sav_config is copied under the
	 * ZPOOL_CONFIG_VDEV_TREE entry of 'config'.  Does nothing when the pool
	 * has no spares.  The caller must hold SCL_CONFIG as reader.
	 */
	static void
	spa_add_spares(spa_t *spa, nvlist_t *config)
	{
		nvlist_t **spares;
		uint_t i, nspares;
		nvlist_t *nvroot;
		uint64_t guid;
		vdev_stat_t *vs;
		uint_t vsc;
		uint64_t pool;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

		if (spa->spa_spares.sav_count == 0)
			return;

		VERIFY(nvlist_lookup_nvlist(config,
		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
		if (nspares != 0) {
			/*
			 * Add the array to nvroot, then look it up again so that
			 * 'spares' refers to nvroot's entries for the updates below.
			 */
			VERIFY(nvlist_add_nvlist_array(nvroot,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
			VERIFY(nvlist_lookup_nvlist_array(nvroot,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

			/*
			 * Go through and find any spares which have since been
			 * repurposed as an active spare. If this is the case, update
			 * their status appropriately.
			 */
			for (i = 0; i < nspares; i++) {
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &guid) == 0);
				if (spa_spare_exists(guid, &pool, NULL) &&
				    pool != 0ULL) {
					VERIFY(nvlist_lookup_uint64_array(
					    spares[i], ZPOOL_CONFIG_VDEV_STATS,
					    (uint64_t **)&vs, &vsc) == 0);
					vs->vs_state = VDEV_STATE_CANT_OPEN;
					vs->vs_aux = VDEV_AUX_SPARED;
				}
			}
		}
	}

	/*
	 * Add l2cache device information to the nvlist, including vdev stats.
	 *
	 * The l2cache array from spa_l2cache.sav_config is copied under the
	 * ZPOOL_CONFIG_VDEV_TREE entry of 'config', and each entry's stats are
	 * refreshed from the matching in-core vdev.  Does nothing when the pool
	 * has no l2cache devices.  The caller must hold SCL_CONFIG as reader.
	 */
	static void
	spa_add_l2cache(spa_t *spa, nvlist_t *config)
	{
		nvlist_t **l2cache;
		uint_t i, j, nl2cache;
		nvlist_t *nvroot;
		uint64_t guid;
		vdev_t *vd;
		vdev_stat_t *vs;
		uint_t vsc;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

		if (spa->spa_l2cache.sav_count == 0)
			return;

		VERIFY(nvlist_lookup_nvlist(config,
		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		if (nl2cache != 0) {
			/*
			 * Add the array to nvroot, then look it up again so that
			 * 'l2cache' refers to nvroot's entries for the updates below.
			 */
			VERIFY(nvlist_add_nvlist_array(nvroot,
			    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
			VERIFY(nvlist_lookup_nvlist_array(nvroot,
			    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

			/*
			 * Update level 2 cache device stats.
			 */

			for (i = 0; i < nl2cache; i++) {
				VERIFY(nvlist_lookup_uint64(l2cache[i],
				    ZPOOL_CONFIG_GUID, &guid) == 0);

				/* Find the in-core vdev with this guid. */
				vd = NULL;
				for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
					if (guid ==
					    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
						vd = spa->spa_l2cache.sav_vdevs[j];
						break;
					}
				}
				ASSERT(vd != NULL);

				VERIFY(nvlist_lookup_uint64_array(l2cache[i],
				    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
				    == 0);
				vdev_get_stats(vd, vs);
			}
		}
	}

	/*
	 * Add a ZPOOL_CONFIG_FEATURE_STATS nvlist to 'config'.
	 *
	 * Every entry in the features-for-read and features-for-write ZAP
	 * objects is added as a name -> uint64 pair.  If the pool is suspended
	 * the ZAPs are not read and an empty list is added instead.  The caller
	 * must hold SCL_CONFIG as reader.
	 */
	static void
	spa_add_feature_stats(spa_t *spa, nvlist_t *config)
	{
		nvlist_t *features;
		zap_cursor_t zc;
		zap_attribute_t za;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
		VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		/* We may be unable to read features if pool is suspended. */
		if (spa_suspended(spa))
			goto out;

		if (spa->spa_feat_for_read_obj != 0) {
			for (zap_cursor_init(&zc, spa->spa_meta_objset,
			    spa->spa_feat_for_read_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				ASSERT(za.za_integer_length == sizeof (uint64_t) &&
				    za.za_num_integers == 1);
				VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
				    za.za_first_integer));
			}
			zap_cursor_fini(&zc);
		}

		if (spa->spa_feat_for_write_obj != 0) {
			for (zap_cursor_init(&zc, spa->spa_meta_objset,
			    spa->spa_feat_for_write_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				ASSERT(za.za_integer_length == sizeof (uint64_t) &&
				    za.za_num_integers == 1);
				VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
				    za.za_first_integer));
			}
			zap_cursor_fini(&zc);
		}

	out:
		/* Add the list to config, then free the local one. */
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
		    features) == 0);
		nvlist_free(features);
	}

	/*
	 * Open pool 'name' and return its configuration in '*config', augmented
	 * with the load timestamp, error-log size, suspended state (only when
	 * suspended), spares, l2cache devices and feature stats.  If 'altroot'
	 * is non-NULL the pool's alternate root is copied into it (at most
	 * 'buflen' bytes), or it is set to the empty string when the pool is
	 * not found.  Returns the error from spa_open_common(); '*config' is
	 * set to NULL before the open is attempted.
	 */
	int
	spa_get_stats(const char *name, nvlist_t **config,
	    char *altroot, size_t buflen)
	{
		int error;
		spa_t *spa;

		*config = NULL;
		error = spa_open_common(name, &spa, FTAG, NULL, config);

		if (spa != NULL) {
			/*
			 * This still leaves a window of inconsistency where the spares
			 * or l2cache devices could change and the config would be
			 * self-inconsistent.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

			if (*config != NULL) {
				uint64_t loadtimes[2];

				loadtimes[0] = spa->spa_loaded_ts.tv_sec;
				loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
				VERIFY(nvlist_add_uint64_array(*config,
				    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);

				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_ERRCOUNT,
				    spa_get_errlog_size(spa)) == 0);

				if (spa_suspended(spa))
					VERIFY(nvlist_add_uint64(*config,
					    ZPOOL_CONFIG_SUSPENDED,
					    spa->spa_failmode) == 0);

				spa_add_spares(spa, *config);
				spa_add_l2cache(spa, *config);
				spa_add_feature_stats(spa, *config);
			}
		}

		/*
		 * We want to get the alternate root even for faulted pools, so we cheat
		 * and call spa_lookup() directly.
		 */
		if (altroot) {
			if (spa == NULL) {
				mutex_enter(&spa_namespace_lock);
				spa = spa_lookup(name);
				if (spa)
					spa_altroot(spa, altroot, buflen);
				else
					altroot[0] = '\0';
				/* No reference was taken; don't close it below. */
				spa = NULL;
				mutex_exit(&spa_namespace_lock);
			} else {
				spa_altroot(spa, altroot, buflen);
			}
		}

		if (spa != NULL) {
			spa_config_exit(spa, SCL_CONFIG, FTAG);
			spa_close(spa, FTAG);
		}

		return (error);
	}

	/*
	 * Validate that the auxiliary device array is well formed.  We must have an
	 * array of nvlists, each which describes a valid leaf vdev.  If this is an
	 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
	 * specified, as long as they are well-formed.
	 *
	 * 'config' names the nvlist array inside 'nvroot' to check and 'sav' is
	 * the matching aux-vdev bookkeeping; 'version' is the minimum pool
	 * version supporting this device type and 'label' the label type passed
	 * to vdev_label_init().  Returns 0 when the array is absent or valid,
	 * EINVAL for an empty array or a non-leaf device, ENOTSUP when the pool
	 * version is too old, or the error from parsing/opening/labelling a
	 * device.  Open/label errors are ignored for the VDEV_ALLOC_SPARE and
	 * VDEV_ALLOC_L2CACHE modes.  The caller must hold SCL_ALL as writer.
	 */
	static int
	spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
	    spa_aux_vdev_t *sav, const char *config, uint64_t version,
	    vdev_labeltype_t label)
	{
		nvlist_t **dev;
		uint_t i, ndev;
		vdev_t *vd;
		int error;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/*
		 * It's acceptable to have no devs specified.
		 */
		if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
			return (0);

		if (ndev == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * Make sure the pool is formatted with a version that supports this
		 * device type.
		 */
		if (spa_version(spa) < version)
			return (SET_ERROR(ENOTSUP));

		/*
		 * Set the pending device list so we correctly handle device in-use
		 * checking.
		 */
		sav->sav_pending = dev;
		sav->sav_npending = ndev;

		for (i = 0; i < ndev; i++) {
			if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
			    mode)) != 0)
				goto out;

			if (!vd->vdev_ops->vdev_op_leaf) {
				vdev_free(vd);
				error = SET_ERROR(EINVAL);
				goto out;
			}

			/*
			 * The L2ARC currently only supports disk devices in
			 * kernel context.  For user-level testing, we allow it.
			 */
	#ifdef _KERNEL
			if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
			    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
				error = SET_ERROR(ENOTBLK);
				vdev_free(vd);
				goto out;
			}
	#endif
			vd->vdev_top = vd;

			/* Record the device's guid in its config once labelled. */
			if ((error = vdev_open(vd)) == 0 &&
			    (error = vdev_label_init(vd, crtxg, label)) == 0) {
				VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
				    vd->vdev_guid) == 0);
			}

			vdev_free(vd);

			if (error &&
			    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
				goto out;
			else
				error = 0;
		}

	out:
		sav->sav_pending = NULL;
		sav->sav_npending = 0;
		return (error);
	}

	/*
	 * Validate both auxiliary device arrays found in 'nvroot': the spares
	 * first, then the l2cache devices.  Returns the first non-zero error
	 * from spa_validate_aux_devs(), or 0 when both arrays pass.  The caller
	 * must hold SCL_ALL as writer.
	 */
	static int
	spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
	{
		int error;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
		    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
		    VDEV_LABEL_SPARE)) != 0) {
			return (error);
		}

		return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
		    VDEV_LABEL_L2CACHE));
	}

	/*
	 * Record 'ndevs' device configs from 'devs' in sav->sav_config under the
	 * array name 'config'.  If sav already has a config, the new devices are
	 * appended to the existing array; otherwise a fresh nvlist is allocated
	 * holding just 'devs'.  The nvlists in 'devs' are not consumed.
	 */
	static void
	spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
	    const char *config)
	{
		int i;

		if (sav->sav_config != NULL) {
			nvlist_t **olddevs;
			uint_t oldndevs;
			nvlist_t **newdevs;

			/*
			 * Generate new dev list by concatenating with the
			 * current dev list.
			 */
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
			    &olddevs, &oldndevs) == 0);

			/* One pointer slot per old plus new device. */
			newdevs = kmem_alloc(sizeof (void *) *
			    (ndevs + oldndevs), KM_SLEEP);
			for (i = 0; i < oldndevs; i++)
				VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
				    KM_SLEEP) == 0);
			for (i = 0; i < ndevs; i++)
				VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
				    KM_SLEEP) == 0);

			/*
			 * The old array is duplicated above before being removed,
			 * since 'olddevs' points into sav_config.
			 */
			VERIFY(nvlist_remove(sav->sav_config, config,
			    DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(sav->sav_config,
			    config, newdevs, ndevs + oldndevs) == 0);
			for (i = 0; i < oldndevs + ndevs; i++)
				nvlist_free(newdevs[i]);
			kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
		} else {
			/*
			 * Generate a new dev list.
			 */
			VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
			    KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
			    devs, ndevs) == 0);
		}
	}

	/*
	 * Stop and drop level 2 ARC devices.
	 *
	 * Walks every vdev in the pool's l2cache list and removes it from the
	 * L2ARC when it is registered as an l2cache device with a non-zero
	 * owning pool and is currently present in the L2ARC.
	 */
	void
	spa_l2cache_drop(spa_t *spa)
	{
		spa_aux_vdev_t *l2 = &spa->spa_l2cache;

		for (int idx = 0; idx < l2->sav_count; idx++) {
			vdev_t *cache_vd = l2->sav_vdevs[idx];
			uint64_t owner;

			ASSERT(cache_vd != NULL);

			/* Checks run in the same order as a short-circuit &&. */
			if (!spa_l2cache_exists(cache_vd->vdev_guid, &owner))
				continue;
			if (owner == 0ULL)
				continue;
			if (!l2arc_vdev_present(cache_vd))
				continue;

			l2arc_remove_vdev(cache_vd);
		}
	}

	/*
	* Pool Creation
	*
	* Create a new pool named 'pool' from the vdev tree 'nvroot', applying the
	* optional properties in 'props'; 'zplprops' is handed to
	* dsl_pool_create(). Returns 0 on success, EEXIST if a pool of that name
	* is already in the namespace, EINVAL if zfs_allocatable_devs() rejects
	* 'nvroot', or the error from property validation, config parsing, vdev
	* creation or aux device validation. spa_namespace_lock is held from the
	* initial lookup until the function returns.
	*
	* NOTE(review): this text is a rendered diff. Lines starting with "+" or
	* "-" are patch markers, the backslash-escaped bars look like escaped
	* pipe characters, and several pointer declarators (e.g. on 'pool',
	* 'nvroot', 'spares', 'l2cache') appear to have lost their asterisks --
	* confirm against the real spa.c before compiling.
	*/
	int
	spa_create(const char pool, nvlist_t nvroot, nvlist_t *props,
	nvlist_t *zplprops)
	{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t spares, l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;
	boolean_t has_features;

	/*
	* If this pool already exists, return failure.
	*/
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EEXIST));
	}

	/*
	* Allocate a new spa_t structure.
	*/
	(void) nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	/*
	* Any feature property, or the absence of an explicit version
	* property, selects the current SPA_VERSION below.
	*/
	has_features = B_FALSE;
	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
	elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
	if (zpool_prop_feature(nvpair_name(elem)))
	has_features = B_TRUE;
	}

	if (has_features \|\| nvlist_lookup_uint64(props,
	zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
	version = SPA_VERSION;
	}
	ASSERT(SPA_VERSION_IS_SUPPORTED(version));

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_load_state = SPA_LOAD_CREATE;
	+ spa->spa_removing_phys.sr_state = DSS_NONE;
	+ spa->spa_removing_phys.sr_removing_vdev = -1;
	+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;

	/*
	* Create "The Godfather" zio to hold all async IOs
	*/
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
	spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
	ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \|
	ZIO_FLAG_GODFATHER);
	}

	/*
	* Create the root vdev.
	*/
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 \|\| rvd != NULL);
	ASSERT(error != 0 \|\| spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
	error = SET_ERROR(EINVAL);

	if (error == 0 &&
	(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	(error = spa_validate_aux(spa, nvroot, txg,
	VDEV_ALLOC_ADD)) == 0) {
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_ashift_optimize(rvd->vdev_child[c]);
	vdev_metaslab_set_size(rvd->vdev_child[c]);
	vdev_expand(rvd->vdev_child[c], txg);
	}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Tear down the partially built pool if any step above failed. */
	if (error != 0) {
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	/*
	* Get the list of spares, if specified.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	&spares, &nspares) == 0) {
	VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
	KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_spares(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	* Get the list of level 2 cache devices, if specified.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	&l2cache, &nl2cache) == 0) {
	VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_l2cache(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/* spa_is_initializing is set only around the DSL pool creation. */
	spa->spa_is_initializing = B_TRUE;
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;
	spa->spa_is_initializing = B_FALSE;

	/*
	* Create DDTs (dedup tables).
	*/
	ddt_create(spa);

	spa_update_dspace(spa);

	/* Everything up to dmu_tx_commit() below is written in this tx. */
	tx = dmu_tx_create_assigned(dp, txg);

	/*
	* Create the pool config object.
	*/
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (spa_version(spa) >= SPA_VERSION_FEATURES)
	spa_feature_create_zap_objects(spa, tx);

	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	sizeof (uint64_t), 1, &version, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add deflate");
	}
	}

	/*
	* Create the deferred-free bpobj. Turn off compression
	* because sync-to-convergence takes longer if the blocksize
	* keeps changing.
	*/
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	sizeof (uint64_t), 1, &obj, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	spa->spa_meta_objset, obj));

	/*
	* Create the pool's history object.
	*/
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
	spa_history_create_obj(spa, tx);

	/*
	* Generate some random noise for salted checksums to operate on.
	*/
	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
	sizeof (spa->spa_cksum_salt.zcs_bytes));

	/*
	* Set pool properties.
	*/
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
	spa_configfile_set(spa, props, B_FALSE);
	spa_sync_props(props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	* We explicitly wait for the first transaction to complete so that our
	* bean counters are appropriately updated.
	*/
	txg_wait_synced(spa->spa_dsl_pool, txg);

	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);

	spa_history_log_version(spa, "create");

	/*
	* Don't count references from objsets that are already closed
	* and are making their way through the eviction process.
	*/
	spa_evicting_os_wait(spa);
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	spa->spa_load_state = SPA_LOAD_NONE;

	mutex_exit(&spa_namespace_lock);

	return (0);
	}

	#ifdef _KERNEL
	#ifdef illumos
	/*
	 * Get the root pool information from the root disk, then import the root pool
	 * during the system boot up time.
	 */
	extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

	/*
	 * Read the label of the boot device identified by 'devpath'/'devid' and
	 * turn it into a pool config whose vdev tree is a root vdev with the
	 * label's top-level vdev as its only child.  The label's
	 * ZPOOL_CONFIG_GUID is stored in '*guid'.  Returns the config, which the
	 * caller must free with nvlist_free(), or NULL if the label cannot be
	 * read.
	 */
	static nvlist_t *
	spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
	{
		nvlist_t *config;
		nvlist_t *nvtop, *nvroot;
		uint64_t pgid;

		if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
			return (NULL);

		/*
		 * Add this top-level vdev to the child array.
		 */
		VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvtop) == 0);
		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pgid) == 0);
		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

		/*
		 * Put this pool's top-level vdevs into a root vdev.
		 */
		VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_ROOT) == 0);
		VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
		VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    &nvtop, 1) == 0);

		/*
		 * Replace the existing vdev_tree with the new root vdev in
		 * this pool's configuration (remove the old, add the new).
		 */
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
		nvlist_free(nvroot);
		return (config);
	}

	/*
	 * Walk the vdev tree and see if we can find a device with "better"
	 * configuration.  A configuration is "better" if the label on that
	 * device has a more recent txg.
	 *
	 * 'txg' is in/out: on entry the txg to beat, on return the highest label
	 * txg seen.  '*avd' is updated to the leaf vdev carrying that label and
	 * is left untouched if no leaf beats the incoming txg.  Leaves whose
	 * label cannot be read are skipped.
	 */
	static void
	spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
	{
		for (int c = 0; c < vd->vdev_children; c++)
			spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

		if (vd->vdev_ops->vdev_op_leaf) {
			nvlist_t *label;
			uint64_t label_txg;

			if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
			    &label) != 0)
				return;

			VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
			    &label_txg) == 0);

			/*
			 * Do we have a better boot device?
			 */
			if (label_txg > *txg) {
				*txg = label_txg;
				*avd = vd;
			}
			nvlist_free(label);
		}
	}

	/*
	 * Import a root pool.
	 *
	 * For x86. devpath_list will consist of devid and/or physpath name of
	 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
	 * The GRUB "findroot" command will return the vdev we should boot.
	 *
	 * For Sparc, devpath_list consists the physpath name of the booting device
	 * no matter the rootpool is a single device pool or a mirrored pool.
	 * e.g.
	 *	"/pci@1f,0/ide@d/disk@0,0:a"
	 *
	 * Returns 0 on success, EIO if the boot device label cannot be read,
	 * ENOENT if the boot vdev is not found in the parsed tree, EINVAL if a
	 * device with a newer label exists or the boot device is a spared-out
	 * member of a spare vdev, or the error from spa_config_parse().
	 */
	int
	spa_import_rootpool(char *devpath, char *devid)
	{
		spa_t *spa;
		vdev_t *rvd, *bvd, *avd = NULL;
		nvlist_t *config, *nvtop;
		uint64_t guid, txg;
		char *pname;
		int error;

		/*
		 * Read the label from the boot device and generate a configuration.
		 */
		config = spa_generate_rootconf(devpath, devid, &guid);
	#if defined(_OBP) && defined(_KERNEL)
		if (config == NULL) {
			if (strstr(devpath, "/iscsi/ssd") != NULL) {
				/* iscsi boot */
				get_iscsi_bootpath_phy(devpath);
				config = spa_generate_rootconf(devpath, devid, &guid);
			}
		}
	#endif
		if (config == NULL) {
			cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
			    devpath);
			return (SET_ERROR(EIO));
		}

		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &pname) == 0);
		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(pname)) != NULL) {
			/*
			 * Remove the existing root pool from the namespace so that we
			 * can replace it with the correct config we just read in.
			 */
			spa_remove(spa);
		}

		spa = spa_add(pname, config, NULL);
		spa->spa_is_root = B_TRUE;
		spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &spa->spa_ubsync.ub_version) != 0)
			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

		/*
		 * Build up a vdev tree based on the boot device's label config.
		 */
		VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvtop) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
		    VDEV_ALLOC_ROOTPOOL);
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (error) {
			mutex_exit(&spa_namespace_lock);
			nvlist_free(config);
			cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
			    pname);
			return (error);
		}

		/*
		 * Get the boot vdev.
		 */
		if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
			cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
			    (u_longlong_t)guid);
			error = SET_ERROR(ENOENT);
			goto out;
		}

		/*
		 * Determine if there is a better boot device.
		 */
		avd = bvd;
		spa_alt_rootvdev(rvd, &avd, &txg);
		if (avd != bvd) {
			cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
			    "try booting from '%s'", avd->vdev_path);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/*
		 * If the boot device is part of a spare vdev then ensure that
		 * we're booting off the active spare.
		 */
		if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    !bvd->vdev_isspare) {
			cmn_err(CE_NOTE, "The boot device is currently spared. Please "
			    "try booting from '%s'",
			    bvd->vdev_parent->
			    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		error = 0;
	out:
		/* The parsed tree was only needed for the checks above. */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		vdev_free(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);
		mutex_exit(&spa_namespace_lock);

		nvlist_free(config);
		return (error);
	}

	#else /* !illumos */

	extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
	    uint64_t *count);

	/*
	 * Build a pool configuration for root pool 'name' from the labels
	 * returned by vdev_geom_read_pool_label().  The label with the highest
	 * txg is used as the base config.  Its vdev tree is replaced by a root
	 * vdev whose children are the per-label top-level vdev trees; slots
	 * listed in the hole array become VDEV_TYPE_HOLE entries and any slot
	 * still empty becomes VDEV_TYPE_MISSING.
	 *
	 * Returns the config, which the caller must free with nvlist_free(), or
	 * NULL if the labels cannot be read or none carries a non-zero txg.
	 */
	static nvlist_t *
	spa_generate_rootconf(const char *name)
	{
		nvlist_t **configs, **tops;
		nvlist_t *config;
		nvlist_t *best_cfg, *nvtop, *nvroot;
		uint64_t *holes;
		uint64_t best_txg;
		uint64_t nchildren;
		uint64_t pgid;
		uint64_t count;
		uint64_t i;
		uint_t nholes;

		if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
			return (NULL);

		ASSERT3U(count, !=, 0);
		best_cfg = NULL;
		best_txg = 0;
		for (i = 0; i < count; i++) {
			uint64_t txg;

			VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
			    &txg) == 0);
			if (txg > best_txg) {
				best_txg = txg;
				best_cfg = configs[i];
			}
		}

		/*
		 * No label had a txg above zero, so there is no base config to
		 * build on.  Bail out instead of using an unset pointer.
		 */
		if (best_cfg == NULL) {
			for (i = 0; i < count; i++)
				nvlist_free(configs[i]);
			kmem_free(configs, count * sizeof(void *));
			return (NULL);
		}

		/* Both lookups are optional; the defaults are kept on failure. */
		nchildren = 1;
		nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
		holes = NULL;
		nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
		    &holes, &nholes);

		tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
		for (i = 0; i < nchildren; i++) {
			if (i >= count)
				break;
			if (configs[i] == NULL)
				continue;
			VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
			    &nvtop) == 0);
			nvlist_dup(nvtop, &tops[i], KM_SLEEP);
		}
		for (i = 0; holes != NULL && i < nholes; i++) {
			/*
			 * holes[i] is the index into tops[], so that is the
			 * value that must be bounds-checked.
			 */
			if (holes[i] >= nchildren)
				continue;
			if (tops[holes[i]] != NULL)
				continue;
			nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
			VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
			    VDEV_TYPE_HOLE) == 0);
			VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
			    holes[i]) == 0);
			VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
			    0) == 0);
		}
		for (i = 0; i < nchildren; i++) {
			if (tops[i] != NULL)
				continue;
			nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
			VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
			    VDEV_TYPE_MISSING) == 0);
			VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
			    i) == 0);
			VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
			    0) == 0);
		}

		/*
		 * Create pool config based on the best vdev config.
		 */
		nvlist_dup(best_cfg, &config, KM_SLEEP);

		/*
		 * Put this pool's top-level vdevs into a root vdev.
		 */
		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pgid) == 0);
		VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_ROOT) == 0);
		VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
		VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    tops, nchildren) == 0);

		/*
		 * Replace the existing vdev_tree with the new root vdev in
		 * this pool's configuration (remove the old, add the new).
		 */
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);

		/*
		 * Drop vdev config elements that should not be present at pool level.
		 */
		nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
		nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);

		for (i = 0; i < count; i++)
			nvlist_free(configs[i]);
		kmem_free(configs, count * sizeof(void *));
		for (i = 0; i < nchildren; i++)
			nvlist_free(tops[i]);
		kmem_free(tops, nchildren * sizeof(void *));
		nvlist_free(nvroot);
		return (config);
	}

	int
	spa_import_rootpool(const char *name)
	{
	spa_t *spa;
	vdev_t rvd, bvd, *avd = NULL;
	nvlist_t config, nvtop;
	uint64_t txg;
	char *pname;
	int error;

	/*
	* Read the label from the boot device and generate a configuration.
	*/
	config = spa_generate_rootconf(name);

	mutex_enter(&spa_namespace_lock);
	if (config != NULL) {
	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	&pname) == 0 && strcmp(name, pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
	== 0);

	if ((spa = spa_lookup(pname)) != NULL) {
	/*
	* The pool could already be imported,
	* e.g., after reboot -r.
	*/
	if (spa->spa_state == POOL_STATE_ACTIVE) {
	mutex_exit(&spa_namespace_lock);
	nvlist_free(config);
	return (0);
	}

	/*
	* Remove the existing root pool from the namespace so
	* that we can replace it with the correct config
	* we just read in.
	*/
	spa_remove(spa);
	}
	spa = spa_add(pname, config, NULL);

	/*
	* Set spa_ubsync.ub_version as it can be used in vdev_alloc()
	* via spa_version().
	*/
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	&spa->spa_ubsync.ub_version) != 0)
	spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
	} else if ((spa = spa_lookup(name)) == NULL) {
	mutex_exit(&spa_namespace_lock);
	nvlist_free(config);
	cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
	name);
	return (EIO);
	} else {
	VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
	}
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	* Build up a vdev tree based on the boot device's label config.
	*/
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	&nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
	mutex_exit(&spa_namespace_lock);
	nvlist_free(config);
	cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
	pname);
	return (error);
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (0);
	}

	#endif /* illumos */
	#endif /* _KERNEL */

	/*
	* Import a non-root pool into the system.
	*
	* 'pool' is the name to import under, 'config' the user-supplied pool
	* config (ZPOOL_CONFIG_LOAD_INFO is added to it after loading), 'props'
	* optional pool properties and 'flags' the import flags. Returns 0 on
	* success, EEXIST if a pool of that name is already in the namespace, or
	* the error from loading, aux device validation or spa_prop_set().
	* With ZFS_IMPORT_VERBATIM the pool is only added to the namespace and
	* the cache file; it is not activated or loaded here.
	*
	* NOTE(review): this text is a rendered diff. Lines starting with "+" or
	* "-" are patch markers, the backslash-escaped bars look like escaped
	* pipe characters, and pointer declarators on 'pool', 'config', 'spares'
	* and 'l2cache' appear to have lost their asterisks -- confirm against
	* the real spa.c before compiling.
	*/
	int
	spa_import(const char pool, nvlist_t config, nvlist_t *props, uint64_t flags)
	{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t spares, l2cache;
	uint_t nspares, nl2cache;

	/*
	* If a pool with this name exists, return failure.
	*/
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EEXIST));
	}

	/*
	* Create and initialize the spa structure.
	*/
	(void) nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	/* A readonly property downgrades the open mode to FREAD. */
	if (readonly)
	mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	* Verbatim import - Take a pool and insert it into the namespace
	* as if it had been loaded at boot.
	*/
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
	if (props != NULL)
	spa_configfile_set(spa, props, B_FALSE);

	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);

	mutex_exit(&spa_namespace_lock);
	return (0);
	}

	spa_activate(spa, mode);

	/*
	* Don't start async tasks until we know everything is healthy.
	*/
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
	state = SPA_LOAD_RECOVER;

	/*
	* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
	* because the user-supplied config is actually the one to trust when
	* doing an import.
	*/
	if (state != SPA_LOAD_RECOVER)
	spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	policy.zrp_request);

	/*
	* Propagate anything learned while loading the pool and pass it
	* back to caller (i.e. rewind info, missing devices, etc).
	*/
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	* Toss any existing sparelist, as it doesn't have any validity
	* anymore, and conflicts with spa_has_spare().
	*/
	if (spa->spa_spares.sav_config) {
	nvlist_free(spa->spa_spares.sav_config);
	spa->spa_spares.sav_config = NULL;
	spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
	nvlist_free(spa->spa_l2cache.sav_config);
	spa->spa_l2cache.sav_config = NULL;
	spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	&nvroot) == 0);
	/*
	* Validation runs once per allocation mode; the second call is
	* skipped if the first one failed.
	*/
	if (error == 0)
	error = spa_validate_aux(spa, nvroot, -1ULL,
	VDEV_ALLOC_SPARE);
	if (error == 0)
	error = spa_validate_aux(spa, nvroot, -1ULL,
	VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
	spa_configfile_set(spa, props, B_FALSE);

	/* On any failure, undo the activation and drop the pool again. */
	if (error != 0 \|\| (props && spa_writeable(spa) &&
	(error = spa_prop_set(spa, props)))) {
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	spa_async_resume(spa);

	/*
	* Override any spares and level 2 cache devices as specified by
	* the user, as these may have correct device names/devids, etc.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	&spares, &nspares) == 0) {
	if (spa->spa_spares.sav_config)
	VERIFY(nvlist_remove(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
	else
	VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_spares(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	&l2cache, &nl2cache) == 0) {
	if (spa->spa_l2cache.sav_config)
	VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
	else
	VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_l2cache(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	* Check for any removed devices.
	*/
	if (spa->spa_autoreplace) {
	spa_aux_check_removed(&spa->spa_spares);
	spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
	/*
	* Update the config cache to include the newly-imported pool.
	*/
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	* It's possible that the pool was expanded while it was exported.
	* We kick off an async task to handle this for us.
	*/
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	spa_history_log_version(spa, "import");

	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);

	mutex_exit(&spa_namespace_lock);

	/* zvol minors are created after the namespace lock is dropped. */
	#ifdef __FreeBSD__
	#ifdef _KERNEL
	zvol_create_minors(pool);
	#endif
	#endif
	return (0);
	}

	/*
	 * Trial-load the pool described by 'tryconfig' under the temporary name
	 * TRYIMPORT_NAME, opened with FREAD, and return the config generated
	 * from what was loaded.  The returned config carries the original pool
	 * name and state, the uberblock timestamp, the load info, the bootfs
	 * dataset name (when set and resolvable) and the spare and l2cache
	 * lists.  The temporary pool is always unloaded and removed before
	 * returning.
	 *
	 * Returns NULL if 'tryconfig' lacks a pool name or state, or if no root
	 * vdev could be built from it.  The caller owns the returned nvlist.
	 */
	nvlist_t *
	spa_tryimport(nvlist_t *tryconfig)
	{
		nvlist_t *config = NULL;
		char *poolname;
		spa_t *spa;
		uint64_t state;
		int error;

		if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
			return (NULL);

		if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
			return (NULL);

		/*
		 * Create and initialize the spa structure.
		 */
		mutex_enter(&spa_namespace_lock);
		spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
		spa_activate(spa, FREAD);

		/*
		 * Pass off the heavy lifting to spa_load().
		 * Pass TRUE for mosconfig because the user-supplied config
		 * is actually the one to trust when doing an import.
		 */
		error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

		/*
		 * If 'tryconfig' was at least parsable, return the current config.
		 */
		if (spa->spa_root_vdev != NULL) {
			config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
			VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
			    poolname) == 0);
			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
			    state) == 0);
			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
			    spa->spa_uberblock.ub_timestamp) == 0);
			VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
			    spa->spa_load_info) == 0);

			/*
			 * If the bootfs property exists on this pool then we
			 * copy it out so that external consumers can tell which
			 * pools are bootable.
			 */
			if ((!error || error == EEXIST) && spa->spa_bootfs) {
				char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/*
				 * We have to play games with the name since the
				 * pool was opened as TRYIMPORT_NAME.
				 */
				if (dsl_dsobj_to_dsname(spa_name(spa),
				    spa->spa_bootfs, tmpname) == 0) {
					char *cp;
					char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

					/*
					 * Swap the leading pool component for
					 * the real pool name; a name without
					 * '/' is copied unchanged.
					 */
					cp = strchr(tmpname, '/');
					if (cp == NULL) {
						(void) strlcpy(dsname, tmpname,
						    MAXPATHLEN);
					} else {
						(void) snprintf(dsname, MAXPATHLEN,
						    "%s/%s", poolname, ++cp);
					}
					VERIFY(nvlist_add_string(config,
					    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
					kmem_free(dsname, MAXPATHLEN);
				}
				kmem_free(tmpname, MAXPATHLEN);
			}

			/*
			 * Add the list of hot spares and level 2 cache devices.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			spa_add_spares(spa, config);
			spa_add_l2cache(spa, config);
			spa_config_exit(spa, SCL_CONFIG, FTAG);
		}

		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);

		return (config);
	}

	/*
	* Pool export/destroy
	*
	* The act of destroying or exporting a pool is very simple. We make sure there
	* is no more pending I/O and any references to the pool are gone. Then, we
	* update the pool state and sync all the labels to disk, removing the
	* configuration from the cache afterwards. If the 'hardforce' flag is set, then
	* we don't sync the labels or remove the configuration cache.
	*
	* 'new_state' is the pool state to record; POOL_STATE_UNINITIALIZED
	* unloads the pool but leaves it in the namespace. If 'oldconfig' is
	* non-NULL it is first set to NULL and later receives a copy of the
	* pool's config when one exists. 'force' skips the active shared spare
	* check. Returns 0 on success, EROFS when the global mode lacks FWRITE,
	* ENOENT when the pool is not found, EBUSY when references remain, and
	* EXDEV when an active shared spare blocks a non-forced export.
	*
	* NOTE(review): this text is a rendered diff. Lines starting with "+" or
	* "-" are patch markers, the backslash-escaped bars look like escaped
	* pipe characters, and the declarators of 'pool' and 'oldconfig' appear
	* to have lost an asterisk each -- confirm against the real spa.c.
	*/
	static int
	spa_export_common(char pool, int new_state, nvlist_t *oldconfig,
	boolean_t force, boolean_t hardforce)
	{
	spa_t *spa;

	if (oldconfig)
	*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
	return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(ENOENT));
	}

	/*
	* Put a hold on the pool, drop the namespace lock, stop async tasks,
	* reacquire the namespace lock, and see if we can export.
	*/
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	* The pool will be in core if it's openable,
	* in which case we can modify its state.
	*/
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
	/*
	* Objsets may be open only because they're dirty, so we
	* have to force it to sync before checking spa_refcnt.
	*/
	txg_wait_synced(spa->spa_dsl_pool, 0);
	spa_evicting_os_wait(spa);

	/*
	* A pool cannot be exported or destroyed if there are active
	* references. If we are resetting a pool, allow references by
	* fault injection handlers.
	*/
	if (!spa_refcount_zero(spa) \|\|
	(spa->spa_inject_ref != 0 &&
	new_state != POOL_STATE_UNINITIALIZED)) {
	spa_async_resume(spa);
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EBUSY));
	}

	/*
	* A pool cannot be exported if it has an active shared spare.
	* This is to prevent other pools stealing the active spare
	* from an exported pool. At user's own will, such pool can
	* be forcedly exported.
	*/
	if (!force && new_state == POOL_STATE_EXPORTED &&
	spa_has_active_shared_spare(spa)) {
	spa_async_resume(spa);
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EXDEV));
	}

	/*
	* We want this to be reflected on every label,
	* so mark them all dirty. spa_unload() will do the
	* final sync that pushes these changes out.
	*/
	if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_state = new_state;
	spa->spa_final_txg = spa_last_synced_txg(spa) +
	TXG_DEFER_SIZE + 1;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_config_exit(spa, SCL_ALL, FTAG);
	}
	}

	/*
	* NOTE(review): ESC_ZFS_POOL_DESTROY is posted for every new_state,
	* including export and reset -- confirm this is intended.
	*/
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
	spa_unload(spa);
	spa_deactivate(spa);
	}

	/* Hand a copy of the config back to the caller if one was asked for. */
	if (oldconfig && spa->spa_config)
	VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	/* A reset (POOL_STATE_UNINITIALIZED) keeps the pool in the namespace. */
	if (new_state != POOL_STATE_UNINITIALIZED) {
	if (!hardforce)
	- spa_config_sync(spa, B_TRUE, B_TRUE);
	+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
	spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
	}

	/*
	* Destroy a storage pool.
	*
	* Thin wrapper: calls spa_export_common() with POOL_STATE_DESTROYED, no
	* old-config output, and both force flags B_FALSE, and returns its result.
	*/
	int
	spa_destroy(char *pool)
	{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	B_FALSE, B_FALSE));
	}

	/*
	* Export a storage pool.
	*
	* Thin wrapper: calls spa_export_common() with POOL_STATE_EXPORTED and
	* passes 'oldconfig', 'force' and 'hardforce' through unchanged; returns
	* its result.
	*/
	int
	spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
	boolean_t hardforce)
	{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	force, hardforce));
	}

	/*
	* Similar to spa_export(), this unloads the spa_t without actually removing it
	* from the namespace in any way.
	*
	* Implemented as spa_export_common() with POOL_STATE_UNINITIALIZED, no
	* old-config output, and both force flags B_FALSE.
	*/
	int
	spa_reset(char *pool)
	{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	B_FALSE, B_FALSE));
	}

	/*
	* ==========================================================================
	* Device manipulation
	* ==========================================================================
	*/

	/*
	* Add a device to a storage pool.
	*
	* 'nvroot' is parsed with spa_config_parse(VDEV_ALLOC_ADD) and may also
	* carry ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays. Returns 0 on
	* success; on failure returns whatever spa_vdev_exit() yields for the
	* failing check (EINVAL when nvroot supplies no children, spares or cache
	* devices).
	*/
	int
	spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
	{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	VDEV_ALLOC_ADD)) != 0)
	return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	&nspares) != 0)
	nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	&nl2cache) != 0)
	nl2cache = 0;

	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
	return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	(error = vdev_create(vd, txg, B_FALSE)) != 0)
	return (spa_vdev_exit(spa, vd, txg, error));

	/*
	* We must validate the spares and l2cache devices after checking the
	* children. Otherwise, vdev_inuse() will blindly overwrite the spare.
	*/
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
	return (spa_vdev_exit(spa, vd, txg, error));

	/*
	- * Transfer each new top-level vdev from vd to rvd.
	+ * If we are in the middle of a device removal, we can only add
	+ * devices which match the existing devices in the pool.
	+ * If we are in the middle of a removal, or have some indirect
	+ * vdevs, we can not add raidz toplevels.
	*/
	+ if (spa->spa_vdev_removal != NULL ||
	+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
	+ for (int c = 0; c < vd->vdev_children; c++) {
	+ tvd = vd->vdev_child[c];
	+ if (spa->spa_vdev_removal != NULL &&
	+ tvd->vdev_ashift !=
	+ spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
	+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
	+ }
	+ /* Fail if top level vdev is raidz */
	+ if (tvd->vdev_ops == &vdev_raidz_ops) {
	+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
	+ }
	+ /*
	+ * Need the top level mirror to be
	+ * a mirror of leaf vdevs only
	+ */
	+ if (tvd->vdev_ops == &vdev_mirror_ops) {
	+ for (uint64_t cid = 0;
	+ cid < tvd->vdev_children; cid++) {
	+ vdev_t *cvd = tvd->vdev_child[cid];
	+ if (!cvd->vdev_ops->vdev_op_leaf) {
	+ return (spa_vdev_exit(spa, vd,
	+ txg, EINVAL));
	+ }
	+ }
	+ }
	+ }
	+ }
	+
	for (int c = 0; c < vd->vdev_children; c++) {

	/*
	* Set the vdev id to the first hole, if one exists.
	*/
	for (id = 0; id < rvd->vdev_children; id++) {
	if (rvd->vdev_child[id]->vdev_ishole) {
	vdev_free(rvd->vdev_child[id]);
	break;
	}
	}
	tvd = vd->vdev_child[c];
	vdev_remove_child(vd, tvd);
	tvd->vdev_id = id;
	vdev_add_child(rvd, tvd);
	vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
	spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
	ZPOOL_CONFIG_SPARES);
	spa_load_spares(spa);
	spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
	spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
	ZPOOL_CONFIG_L2CACHE);
	spa_load_l2cache(spa);
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	* We have to be careful when adding new vdevs to an existing pool.
	* If other threads start allocating from these vdevs before we
	* sync the config cache, and we lose power, then upon reboot we may
	* fail to open the pool because there are DVAs that the config cache
	* can't translate. Therefore, we first add the vdevs without
	* initializing metaslabs; sync the config cache (via spa_vdev_exit());
	* and then let spa_config_update() initialize the new metaslabs.
	*
	* spa_load() checks for added-but-not-initialized vdevs, so that
	* if we lose power at any point in this sequence, the remaining
	* steps will be completed the next time we load the pool.
	*/
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
	mutex_exit(&spa_namespace_lock);

	return (0);
	}

	/*
	* Attach a device to a mirror. The arguments are the path to any device
	* in the mirror, and the nvroot for the new device. If the path specifies
	* a device that is not mirrored, we automatically insert the mirror vdev.
	*
	* If 'replacing' is specified, the new device is intended to replace the
	* existing device; in this case the two devices are made into their own
	* mirror using the 'replacing' vdev, which is functionally identical to
	* the mirror vdev (it actually reuses all the same ops) but has a few
	* extra rules: you can't attach to it after it's been created, and upon
	* completion of resilvering, the first disk (the one being replaced)
	* is automatically detached.
	*
	* 'guid' identifies the existing device (looked up with
	* spa_lookup_by_guid()). Returns 0 on success; otherwise the value
	* returned by spa_vdev_exit() for the check that failed.
	*/
	int
	spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
	{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	+ if (spa->spa_vdev_removal != NULL ||
	+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
	+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
	+ }
	+
	if (oldvd == NULL)
	return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	VDEV_ALLOC_ATTACH)) != 0)
	return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
	return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
	return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	* Spares can't replace logs
	*/
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
	/*
	* For attach, the only allowable parent is a mirror or the root
	* vdev.
	*/
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	pvd->vdev_ops != &vdev_root_ops)
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	pvops = &vdev_mirror_ops;
	} else {
	/*
	* Active hot spares can only be replaced by inactive hot
	* spares.
	*/
	if (pvd->vdev_ops == &vdev_spare_ops &&
	oldvd->vdev_isspare &&
	!spa_has_spare(spa, newvd->vdev_guid))
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	/*
	* If the source is a hot spare, and the parent isn't already a
	* spare, then we want to create a new hot spare. Otherwise, we
	* want to create a replacing vdev. The user is not allowed to
	* attach to a spared vdev child unless the 'isspare' state is
	* the same (spare replaces spare, non-spare replaces
	* non-spare).
	*/
	if (pvd->vdev_ops == &vdev_replacing_ops &&
	spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
	} else if (pvd->vdev_ops == &vdev_spare_ops &&
	newvd->vdev_isspare != oldvd->vdev_isspare) {
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
	}

	if (newvd->vdev_isspare)
	pvops = &vdev_spare_ops;
	else
	pvops = &vdev_replacing_ops;
	}

	/*
	* Make sure the new device is big enough.
	*/
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
	return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	* The new device cannot have a higher alignment requirement
	* than the top-level vdev.
	*/
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
	return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	* If this is an in-place replacement, update oldvd's path and devid
	* to make it distinguishable from newvd, and unopenable from now on.
	*/
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
	spa_strfree(oldvd->vdev_path);
	/* +5 covers the "/old" suffix plus the terminating NUL */
	oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
	KM_SLEEP);
	(void) sprintf(oldvd->vdev_path, "%s/%s",
	newvd->vdev_path, "old");
	if (oldvd->vdev_devid != NULL) {
	spa_strfree(oldvd->vdev_devid);
	oldvd->vdev_devid = NULL;
	}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilver_txg = txg;

	/*
	* If the parent is not a mirror, or if we're replacing, insert the new
	* mirror/replacing/spare vdev above oldvd.
	*/
	if (pvd->vdev_ops != pvops)
	pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	* Extract the new device from its root and add it to pvd.
	*/
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	* Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	* for any dmu_sync-ed blocks. It will propagate upward when
	* spa_vdev_exit() calls vdev_dtl_reassess().
	*/
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
	spa_spare_activate(newvd);
	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
	}

	/* copy what the history record needs before spa_vdev_exit() runs */
	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	* Mark newvd's DTL dirty in this txg.
	*/
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	* Schedule the resilver to restart in the future. We do this to
	* ensure that dmu_sync-ed blocks have been stitched into the
	* respective datasets.
	*/
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	if (spa->spa_bootfs)
	spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);

	/*
	* Commit the config
	*/
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(spa, "vdev attach", NULL,
	"%s vdev=%s %s vdev=%s",
	replacing && newvd_isspare ? "spare in" :
	replacing ? "replace" : "attach", newvdpath,
	replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	return (0);
	}

	/*
	* Detach a device from a mirror or replacing vdev.
	*
	* If 'replace_done' is specified, only detach if the parent
	* is a replacing vdev.
	*
	* 'guid' identifies the device to detach; a non-zero 'pguid' must match
	* the guid of its current parent or the call fails with EBUSY. Returns
	* the result of the final spa_vdev_exit(), or the spa_vdev_exit() value
	* for whichever earlier check failed.
	*/
	int
	spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
	{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
	return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	* If the parent/child relationship is not as expected, don't do it.
	* Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	* vdev that's replacing B with C. The user's intent in replacing
	* is to go from M(A,B) to M(A,C). If the user decides to cancel
	* the replace by detaching C, the expected behavior is to end up
	* M(A,B). But suppose that right after deciding to detach C,
	* the replacement of B completes. We would have M(A,C), and then
	* ask to detach C, which would leave us with just A -- not what
	* the user wanted. To prevent this, we make sure that the
	* parent/child relationship hasn't changed -- in this example,
	* that C's parent is still the replacing vdev R.
	*/
	if (pvd->vdev_guid != pguid && pguid != 0)
	return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	* Only 'replacing' or 'spare' vdevs can be replaced.
	*/
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	pvd->vdev_ops != &vdev_spare_ops)
	return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	* Only mirror, replacing, and spare vdevs support detach.
	*/
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	pvd->vdev_ops != &vdev_mirror_ops &&
	pvd->vdev_ops != &vdev_spare_ops)
	return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	* If this device has the only valid copy of some data,
	* we cannot safely detach it.
	*/
	if (vdev_dtl_required(vd))
	return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	* If we are detaching the second disk from a replacing vdev, then
	* check to see if we changed the original vdev's path to have "/old"
	* at the end in spa_vdev_attach(). If so, undo that change now.
	*/
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	vd->vdev_path != NULL) {
	size_t len = strlen(vd->vdev_path);

	for (int c = 0; c < pvd->vdev_children; c++) {
	cvd = pvd->vdev_child[c];

	if (cvd == vd || cvd->vdev_path == NULL)
	continue;

	if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
	strcmp(cvd->vdev_path + len, "/old") == 0) {
	spa_strfree(cvd->vdev_path);
	cvd->vdev_path = spa_strdup(vd->vdev_path);
	break;
	}
	}
	}

	/*
	* If we are detaching the original disk from a spare, then it implies
	* that the spare should become a real disk, and be removed from the
	* active spare list for the pool.
	*/
	if (pvd->vdev_ops == &vdev_spare_ops &&
	vd->vdev_id == 0 &&
	pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
	unspare = B_TRUE;

	/*
	* Erase the disk labels so the disk can be used for other things.
	* This must be done after all other error cases are handled,
	* but before we disembowel vd (so we can still do I/O to it).
	* But if we can't do it, don't treat the error as fatal --
	* it may be that the unwritability of the disk is the reason
	* it's being detached!
	*/
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	* Remove vd from its parent and compact the parent's children.
	*/
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	* Remember one of the remaining children so we can get tvd below.
	*/
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	* If we need to remove the remaining child from the list of hot spares,
	* do it now, marking the vdev as no longer a spare in the process.
	* We must do this before vdev_remove_parent(), because that can
	* change the GUID if it creates a new toplevel GUID. For a similar
	* reason, we must remove the spare now, in the same txg as the detach;
	* otherwise someone could attach a new sibling, change the GUID, and
	* the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	*/
	if (unspare) {
	ASSERT(cvd->vdev_isspare);
	spa_spare_remove(cvd);
	unspare_guid = cvd->vdev_guid;
	(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
	cvd->vdev_unspare = B_TRUE;
	}

	/*
	* If the parent mirror/replacing vdev only has one child,
	* the parent is no longer needed. Remove it from the tree.
	*/
	if (pvd->vdev_children == 1) {
	if (pvd->vdev_ops == &vdev_spare_ops)
	cvd->vdev_unspare = B_FALSE;
	vdev_remove_parent(cvd);
	}


	/*
	* We don't set tvd until now because the parent we just removed
	* may have been the previous top-level vdev.
	*/
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	* Reevaluate the parent vdev state.
	*/
	vdev_propagate_state(cvd);

	/*
	* If the 'autoexpand' property is set on the pool then automatically
	* try to expand the size of the pool. For example if the device we
	* just detached was smaller than the others, it may be possible to
	* add metaslabs (i.e. grow the pool). We need to reopen the vdev
	* first so that we can obtain the updated sizes of the leaf vdevs.
	*/
	if (spa->spa_autoexpand) {
	vdev_reopen(tvd);
	vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	* Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
	* vd->vdev_detached is set and free vd's DTL object in syncing context.
	* But first make sure we're not on any other txg's DTL list, to
	* prevent vd from being accessed after it's freed.
	*/
	vdpath = spa_strdup(vd->vdev_path);
	for (int t = 0; t < TXG_SIZE; t++)
	(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	/* from here on 'error' holds the exit status, not the label result */
	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	"vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	* If this was the removal of the original device in a hot spare vdev,
	* then we want to go through and remove the device from the hot spare
	* list of every other pool.
	*/
	if (unspare) {
	spa_t *altspa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((altspa = spa_next(altspa)) != NULL) {
	if (altspa->spa_state != POOL_STATE_ACTIVE ||
	altspa == spa)
	continue;

	spa_open_ref(altspa, FTAG);
	mutex_exit(&spa_namespace_lock);
	(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
	mutex_enter(&spa_namespace_lock);
	spa_close(altspa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);

	/* search the rest of the vdevs for spares to remove */
	spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
	}

	/*
	* Split a set of devices from their mirrors, and create a new pool from them.
	*
	* 'newname' is the name of the new pool, 'config' describes the devices to
	* split off (its ZPOOL_CONFIG_VDEV_TREE children are validated against the
	* pool's top-level vdevs), and 'props' optionally supplies properties for
	* the new pool. When 'exp' is set the new pool is created in the exported
	* state and exported before returning. Returns 0 on success or an errno
	* value on failure.
	*/
	int
	spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
	nvlist_t *props, boolean_t exp)
	{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL; /* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	- error = spa_offline_log(spa);
	+ error = spa_reset_logs(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
	spa_activate_log(spa);

	if (error != 0)
	return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
	return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	* scan through all the children to ensure they're all mirrors
	*/
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	&children) != 0)
	return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
	vdev_t *vd = rvd->vdev_child[c];

	/* don't count the holes & logs as children */
	- if (vd->vdev_islog || vd->vdev_ishole) {
	+ if (vd->vdev_islog || !vdev_is_concrete(vd)) {
	if (lastlog == 0)
	lastlog = c;
	continue;
	}

	lastlog = 0;
	}
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
	return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
	return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
	uint64_t is_hole = 0;

	(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
	&is_hole);

	if (is_hole != 0) {
	if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
	spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
	continue;
	} else {
	error = SET_ERROR(EINVAL);
	break;
	}
	}

	/* which disk is going to be split? */
	if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
	&glist[c]) != 0) {
	error = SET_ERROR(EINVAL);
	break;
	}

	/* look it up in the spa */
	vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
	if (vml[c] == NULL) {
	error = SET_ERROR(ENODEV);
	break;
	}

	/* make sure there's nothing stopping the split */
	if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
	vml[c]->vdev_islog ||
	- vml[c]->vdev_ishole ||
	+ !vdev_is_concrete(vml[c]) ||
	vml[c]->vdev_isspare ||
	vml[c]->vdev_isl2cache ||
	!vdev_writeable(vml[c]) ||
	vml[c]->vdev_children != 0 ||
	vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
	c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
	error = SET_ERROR(EINVAL);
	break;
	}

	if (vdev_dtl_required(vml[c])) {
	error = SET_ERROR(EBUSY);
	break;
	}

	/* we need certain info from the top level */
	VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
	vml[c]->vdev_top->vdev_ms_array) == 0);
	VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
	vml[c]->vdev_top->vdev_ms_shift) == 0);
	VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
	vml[c]->vdev_top->vdev_asize) == 0);
	VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
	vml[c]->vdev_top->vdev_ashift) == 0);

	/* transfer per-vdev ZAPs */
	ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
	VERIFY0(nvlist_add_uint64(child[c],
	ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));

	ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
	VERIFY0(nvlist_add_uint64(child[c],
	ZPOOL_CONFIG_VDEV_TOP_ZAP,
	vml[c]->vdev_parent->vdev_top_zap));
	}

	if (error != 0) {
	kmem_free(vml, children * sizeof (vdev_t *));
	kmem_free(glist, children * sizeof (uint64_t));
	return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
	if (vml[c] != NULL)
	vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	* Temporarily record the splitting vdevs in the spa config. This
	* will disappear once the config is regenerated.
	*/
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	spa_generate_guid(NULL)) == 0);
	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
	(void) nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
	zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

	#ifndef illumos
	/* mark that we are creating new spa by splitting */
	newspa->spa_splitting_newspa = B_TRUE;
	#endif
	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
	#ifndef illumos
	newspa->spa_splitting_newspa = B_FALSE;
	#endif
	if (error)
	goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
	VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
	ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
	spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
	B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
	spa_configfile_set(newspa, props, B_FALSE);
	error = spa_prop_set(newspa, props);
	if (error)
	goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
	zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
	dmu_tx_abort(tx);
	for (c = 0; c < children; c++) {
	if (vml[c] != NULL) {
	vdev_split(vml[c]);
	/* tx was aborted above when error != 0, so skip logging */
	if (error == 0)
	spa_history_log_internal(spa, "detach", tx,
	"vdev=%s", vml[c]->vdev_path);

	vdev_free(vml[c]);
	}
	}
	spa->spa_avz_action = AVZ_ACTION_REBUILD;
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
	dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
	zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(newspa, "split", NULL,
	"from pool %s", spa_name(spa));

	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
	error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
	B_FALSE, B_FALSE);

	return (error);

	out:
	/* failure path: tear down the new pool and undo the offlining */
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
	if (vml[c] != NULL)
	vml[c]->vdev_offline = B_FALSE;
	}
	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
	}

	/*
	* Removed by this change: walks nvpp[0..count) and returns the first nvlist
	* whose ZPOOL_CONFIG_GUID equals target_guid, or NULL when none matches.
	*/
	-static nvlist_t *
	-spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
	-{
	- for (int i = 0; i < count; i++) {
	- uint64_t guid;
	-
	- VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
	- &guid) == 0);
	-
	- if (guid == target_guid)
	- return (nvpp[i]);
	- }
	-
	- return (NULL);
	-}
	-
	/*
	* Removed by this change: replaces the nvlist array 'name' in 'config' with
	* a copy of dev[0..count) that omits dev_to_remove.
	*/
	-static void
	-spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
	- nvlist_t *dev_to_remove)
	-{
	- nvlist_t **newdev = NULL;
	-
	- if (count > 1)
	- newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
	-
	- for (int i = 0, j = 0; i < count; i++) {
	- if (dev[i] == dev_to_remove)
	- continue;
	- VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	- }
	-
	- VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	- VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
	-
	- for (int i = 0; i < count - 1; i++)
	- nvlist_free(newdev[i]);
	-
	- if (count > 1)
	- kmem_free(newdev, (count - 1) * sizeof (void *));
	-}
	-
	/*
	- * Evacuate the device.
	- */
	-static int
	-spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
	-{
	- uint64_t txg;
	- int error = 0;
	-
	- ASSERT(MUTEX_HELD(&spa_namespace_lock));
	- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	- ASSERT(vd == vd->vdev_top);
	-
	- /*
	- * Evacuate the device. We don't hold the config lock as writer
	- * since we need to do I/O but we do keep the
	- * spa_namespace_lock held. Once this completes the device
	- * should no longer have any blocks allocated on it.
	- */
	- if (vd->vdev_islog) {
	- if (vd->vdev_stat.vs_alloc != 0)
	- error = spa_offline_log(spa);
	- } else {
	- error = SET_ERROR(ENOTSUP);
	- }
	-
	- if (error)
	- return (error);
	-
	- /*
	- * The evacuation succeeded. Remove any remaining MOS metadata
	- * associated with this vdev, and wait for these changes to sync.
	- */
	- ASSERT0(vd->vdev_stat.vs_alloc);
	- txg = spa_vdev_config_enter(spa);
	- vd->vdev_removing = B_TRUE;
	- vdev_dirty_leaves(vd, VDD_DTL, txg);
	- vdev_config_dirty(vd);
	- spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	-
	- return (0);
	-}
	-
	-/*
	- * Complete the removal by cleaning up the namespace.
	- */
	-static void
	-spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
	-{
	- vdev_t *rvd = spa->spa_root_vdev;
	- uint64_t id = vd->vdev_id;
	- boolean_t last_vdev = (id == (rvd->vdev_children - 1));
	-
	- ASSERT(MUTEX_HELD(&spa_namespace_lock));
	- ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	- ASSERT(vd == vd->vdev_top);
	-
	- /*
	- * Only remove any devices which are empty.
	- */
	- if (vd->vdev_stat.vs_alloc != 0)
	- return;
	-
	- (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	-
	- if (list_link_active(&vd->vdev_state_dirty_node))
	- vdev_state_clean(vd);
	- if (list_link_active(&vd->vdev_config_dirty_node))
	- vdev_config_clean(vd);
	-
	- vdev_free(vd);
	-
	- if (last_vdev) {
	- vdev_compact_children(rvd);
	- } else {
	- vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
	- vdev_add_child(rvd, vd);
	- }
	- vdev_config_dirty(rvd);
	-
	- /*
	- * Reassess the health of our root vdev.
	- */
	- vdev_reopen(rvd);
	-}
	-
	-/*
	- * Remove a device from the pool -
	- *
	- * Removing a device from the vdev namespace requires several steps
	- * and can take a significant amount of time. As a result we use
	- * the spa_vdev_config_[enter/exit] functions which allow us to
	- * grab and release the spa_config_lock while still holding the namespace
	- * lock. During each step the configuration is synced out.
	- *
	- * Currently, this supports removing only hot spares, slogs, and level 2 ARC
	- * devices.
	- */
	-int
	-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
	-{
	-	vdev_t *vd;
	-	sysevent_t *ev = NULL;
	-	metaslab_group_t *mg;
	-	nvlist_t **spares, **l2cache, *nv;
	-	uint64_t txg = 0;
	-	uint_t nspares, nl2cache;
	-	int error = 0;
	-	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	-
	-	ASSERT(spa_writeable(spa));
	-
	-	if (!locked)
	-		txg = spa_vdev_enter(spa);
	-
	-	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
	-
	-	if (spa->spa_spares.sav_vdevs != NULL &&
	-	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	-	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	-	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
	-		/*
	-		 * Only remove the hot spare if it's not currently in use
	-		 * in this pool.
	-		 */
	-		if (vd == NULL || unspare) {
	-			if (vd == NULL)
	-				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	-			ev = spa_event_create(spa, vd, NULL,
	-			    ESC_ZFS_VDEV_REMOVE_AUX);
	-			spa_vdev_remove_aux(spa->spa_spares.sav_config,
	-			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
	-			spa_load_spares(spa);
	-			spa->spa_spares.sav_sync = B_TRUE;
	-		} else {
	-			error = SET_ERROR(EBUSY);
	-		}
	-	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	-	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	-	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	-	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
	-		/*
	-		 * Cache devices can always be removed.
	-		 */
	-		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	-		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
	-		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
	-		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
	-		spa_load_l2cache(spa);
	-		spa->spa_l2cache.sav_sync = B_TRUE;
	-	} else if (vd != NULL && vd->vdev_islog) {
	-		ASSERT(!locked);
	-		ASSERT(vd == vd->vdev_top);
	-
	-		mg = vd->vdev_mg;
	-
	-		/*
	-		 * Stop allocating from this vdev.
	-		 */
	-		metaslab_group_passivate(mg);
	-
	-		/*
	-		 * Wait for the youngest allocations and frees to sync,
	-		 * and then wait for the deferral of those frees to finish.
	-		 */
	-		spa_vdev_config_exit(spa, NULL,
	-		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
	-
	-		/*
	-		 * Attempt to evacuate the vdev.
	-		 */
	-		error = spa_vdev_remove_evacuate(spa, vd);
	-
	-		txg = spa_vdev_config_enter(spa);
	-
	-		/*
	-		 * If we couldn't evacuate the vdev, unwind.
	-		 */
	-		if (error) {
	-			metaslab_group_activate(mg);
	-			return (spa_vdev_exit(spa, NULL, txg, error));
	-		}
	-
	-		/*
	-		 * Clean up the vdev namespace.
	-		 */
	-		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV);
	-		spa_vdev_remove_from_namespace(spa, vd);
	-
	-	} else if (vd != NULL) {
	-		/*
	-		 * Normal vdevs cannot be removed (yet).
	-		 */
	-		error = SET_ERROR(ENOTSUP);
	-	} else {
	-		/*
	-		 * There is no vdev of any kind with the specified guid.
	-		 */
	-		error = SET_ERROR(ENOENT);
	-	}
	-
	-	if (!locked)
	-		error = spa_vdev_exit(spa, NULL, txg, error);
	-
	-	if (ev)
	-		spa_event_post(ev);
	-
	-	return (error);
	-}
	-
	-/*
	* Find any device that's done replacing, or a vdev marked 'unspare' that's
	* currently spared, so we can detach it.
	*/
	static vdev_t *
	spa_vdev_resilver_done_hunt(vdev_t *vd)
	{
		vdev_t *newvd, *oldvd;

		/* Depth-first: a detachable vdev found in any subtree wins. */
		for (int c = 0; c < vd->vdev_children; c++) {
			oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
			if (oldvd != NULL)
				return (oldvd);
		}

		/*
		 * Check for a completed replacement.  We always consider the first
		 * vdev in the list to be the oldest vdev, and the last one to be
		 * the newest (see spa_vdev_attach() for how that works).  In
		 * the case where the newest vdev is faulted, we will not automatically
		 * remove it after a resilver completes.  This is OK as it will require
		 * user intervention to determine which disk the admin wishes to keep.
		 */
		if (vd->vdev_ops == &vdev_replacing_ops) {
			ASSERT(vd->vdev_children > 1);

			newvd = vd->vdev_child[vd->vdev_children - 1];
			oldvd = vd->vdev_child[0];

			if (vdev_dtl_empty(newvd, DTL_MISSING) &&
			    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
			    !vdev_dtl_required(oldvd))
				return (oldvd);
		}

		/*
		 * Check for a completed resilver with the 'unspare' flag set.
		 */
		if (vd->vdev_ops == &vdev_spare_ops) {
			vdev_t *first = vd->vdev_child[0];
			vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

			if (last->vdev_unspare) {
				oldvd = first;
				newvd = last;
			} else if (first->vdev_unspare) {
				oldvd = last;
				newvd = first;
			} else {
				/* newvd is left unset; it is only read if oldvd != NULL */
				oldvd = NULL;
			}

			if (oldvd != NULL &&
			    vdev_dtl_empty(newvd, DTL_MISSING) &&
			    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
			    !vdev_dtl_required(oldvd))
				return (oldvd);

			/*
			 * If there are more than two spares attached to a disk,
			 * and those spares are not required, then we want to
			 * attempt to free them up now so that they can be used
			 * by other pools.  Once we're back down to a single
			 * disk+spare, we stop removing them.
			 */
			if (vd->vdev_children > 2) {
				newvd = vd->vdev_child[1];

				if (newvd->vdev_isspare && last->vdev_isspare &&
				    vdev_dtl_empty(last, DTL_MISSING) &&
				    vdev_dtl_empty(last, DTL_OUTAGE) &&
				    !vdev_dtl_required(newvd))
					return (newvd);
			}
		}

		return (NULL);
	}

	/*
	 * Detach every vdev that spa_vdev_resilver_done_hunt() reports, one at a
	 * time.  The guids are captured while the config lock is held, because the
	 * lock is dropped around each spa_vdev_detach() call.
	 */
	static void
	spa_vdev_resilver_done(spa_t *spa)
	{
		vdev_t *vd, *pvd, *ppvd;
		uint64_t guid, sguid, pguid, ppguid;

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

		while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
			pvd = vd->vdev_parent;
			ppvd = pvd->vdev_parent;
			guid = vd->vdev_guid;
			pguid = pvd->vdev_guid;
			ppguid = ppvd->vdev_guid;
			sguid = 0;
			/*
			 * If we have just finished replacing a hot spared device, then
			 * we need to detach the parent's first child (the original hot
			 * spare) as well.
			 */
			if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
			    ppvd->vdev_children == 2) {
				ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
				sguid = ppvd->vdev_child[1]->vdev_guid;
			}
			ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

			/*
			 * The config lock is not held past this point, so the early
			 * returns below leave it released, matching the normal exit.
			 */
			spa_config_exit(spa, SCL_ALL, FTAG);
			if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
				return;
			if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
				return;
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		}

		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	* Update the stored path or FRU for this vdev.
	*/
	int
	spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
	    boolean_t ispath)
	{
		vdev_t *vd;
		boolean_t sync = B_FALSE;	/* set only when a string changed */

		ASSERT(spa_writeable(spa));

		spa_vdev_state_enter(spa, SCL_ALL);

		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENOENT));

		/* Only leaf vdevs carry a path or FRU. */
		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		if (ispath) {
			/*
			 * NOTE(review): vdev_path is not NULL-checked here, unlike
			 * vdev_fru below; presumably leaf vdevs always have a
			 * path -- confirm.
			 */
			if (strcmp(value, vd->vdev_path) != 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = spa_strdup(value);
				sync = B_TRUE;
			}
		} else {
			if (vd->vdev_fru == NULL) {
				vd->vdev_fru = spa_strdup(value);
				sync = B_TRUE;
			} else if (strcmp(value, vd->vdev_fru) != 0) {
				spa_strfree(vd->vdev_fru);
				vd->vdev_fru = spa_strdup(value);
				sync = B_TRUE;
			}
		}

		/* Pass the vdev to the exit routine only if something changed. */
		return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
	}

	/*
	 * Update the stored path of the vdev identified by 'guid'; thin wrapper
	 * around spa_vdev_set_common() with ispath == B_TRUE.
	 */
	int
	spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
	{
		return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
	}

	/*
	 * Update the stored FRU of the vdev identified by 'guid'; thin wrapper
	 * around spa_vdev_set_common() with ispath == B_FALSE.
	 */
	int
	spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
	{
		return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
	}

	/*
	* ==========================================================================
	* SPA Scanning
	* ==========================================================================
	*/
	/*
	 * Forward a scrub pause/resume command to the DSL pool.  Refused with
	 * EBUSY while dsl_scan_resilvering() reports a resilver.
	 */
	int
	spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
	{
		int err;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

		if (dsl_scan_resilvering(spa->spa_dsl_pool)) {
			err = SET_ERROR(EBUSY);
		} else {
			err = dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd);
		}

		return (err);
	}

	/*
	 * Cancel the current scan via dsl_scan_cancel().  Refused with EBUSY
	 * while dsl_scan_resilvering() reports a resilver.
	 */
	int
	spa_scan_stop(spa_t *spa)
	{
		int err;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

		if (dsl_scan_resilvering(spa->spa_dsl_pool)) {
			err = SET_ERROR(EBUSY);
		} else {
			err = dsl_scan_cancel(spa->spa_dsl_pool);
		}

		return (err);
	}

	/*
	 * Start a scan of type 'func'.  Returns ENOTSUP for POOL_SCAN_NONE or an
	 * out-of-range function, otherwise the result of dsl_scan().
	 */
	int
	spa_scan(spa_t *spa, pool_scan_func_t func)
	{
		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

		if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
			return (SET_ERROR(ENOTSUP));

		/*
		 * If a resilver was requested, but there is no DTL on a
		 * writeable leaf device, we have nothing to do.
		 */
		if (func == POOL_SCAN_RESILVER &&
		    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
			return (0);
		}

		return (dsl_scan(spa->spa_dsl_pool, func));
	}

	/*
	* ==========================================================================
	* SPA async task processing
	* ==========================================================================
	*/

	/*
	 * Walk the vdev tree rooted at 'vd' and move every vdev flagged
	 * vdev_remove_wanted into VDEV_STATE_REMOVED.
	 */
	static void
	spa_async_remove(spa_t *spa, vdev_t *vd)
	{
		if (vd->vdev_remove_wanted) {
			vd->vdev_remove_wanted = B_FALSE;
			vd->vdev_delayed_close = B_FALSE;
			vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

			/*
			 * We want to clear the stats, but we don't want to do a full
			 * vdev_clear() as that will cause us to throw away
			 * degraded/faulted state as well as attempt to reopen the
			 * device, all of which is a waste.
			 */
			vd->vdev_stat.vs_read_errors = 0;
			vd->vdev_stat.vs_write_errors = 0;
			vd->vdev_stat.vs_checksum_errors = 0;

			vdev_state_dirty(vd->vdev_top);
			/* Tell userspace that the vdev is gone. */
			zfs_post_remove(spa, vd);
		}

		for (int c = 0; c < vd->vdev_children; c++)
			spa_async_remove(spa, vd->vdev_child[c]);
	}

	/*
	 * Walk the vdev tree rooted at 'vd' and reopen every vdev flagged
	 * vdev_probe_wanted, clearing the flag first.
	 */
	static void
	spa_async_probe(spa_t *spa, vdev_t *vd)
	{
		if (vd->vdev_probe_wanted) {
			vd->vdev_probe_wanted = B_FALSE;
			vdev_reopen(vd);	/* vdev_open() does the actual probe */
		}

		for (int c = 0; c < vd->vdev_children; c++)
			spa_async_probe(spa, vd->vdev_child[c]);
	}

	/*
	 * If the pool has autoexpand set, log an ESC_ZFS_VDEV_AUTOEXPAND sysevent
	 * for every leaf vdev under 'vd' that has a physical path.  Children are
	 * visited before the vdev itself.
	 */
	static void
	spa_async_autoexpand(spa_t *spa, vdev_t *vd)
	{
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath;

		if (!spa->spa_autoexpand)
			return;

		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			spa_async_autoexpand(spa, cvd);
		}

		if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
			return;

		/* The event carries the path with a "/devices" prefix. */
		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
		(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}

	static void
	spa_async_thread(void *arg)
	{
	spa_t spa = (spa_t )arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	* See if the config needs to be updated.
	*/
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
	uint64_t old_space, new_space;

	mutex_enter(&spa_namespace_lock);
	old_space = metaslab_class_get_space(spa_normal_class(spa));
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	new_space = metaslab_class_get_space(spa_normal_class(spa));
	mutex_exit(&spa_namespace_lock);

	/*
	* If the pool grew as a result of the config update,
	* then log an internal history event.
	*/
	if (new_space != old_space) {
	spa_history_log_internal(spa, "vdev online", NULL,
	"pool '%s' size: %llu(+%llu)",
	spa_name(spa), new_space, new_space - old_space);
	}
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	spa_async_autoexpand(spa, spa->spa_root_vdev);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	* See if any devices need to be probed.
	*/
	if (tasks & SPA_ASYNC_PROBE) {
	spa_vdev_state_enter(spa, SCL_NONE);
	spa_async_probe(spa, spa->spa_root_vdev);
	(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	* If any devices are done replacing, detach them.
	*/
	if (tasks & SPA_ASYNC_RESILVER_DONE)
	spa_vdev_resilver_done(spa);

	/*
	* Kick off a resilver.
	*/
	if (tasks & SPA_ASYNC_RESILVER)
	dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	* Let the world know that we're done.
	*/
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
	}

	/*
	 * Body of the async thread dedicated to SPA_ASYNC_REMOVE.  Keeps
	 * servicing removal requests until none are pending when it rechecks
	 * under spa_async_lock, then clears spa_async_thread_vd and wakes waiters.
	 */
	static void
	spa_async_thread_vd(void *arg)
	{
		spa_t *spa = arg;
		int pending;

		mutex_enter(&spa->spa_async_lock);
		pending = spa->spa_async_tasks;

		/* Each pass starts and ends with spa_async_lock held. */
		for (;;) {
			spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
			mutex_exit(&spa->spa_async_lock);

			/*
			 * See if any devices need to be marked REMOVED.
			 */
			if (pending & SPA_ASYNC_REMOVE) {
				int i;

				spa_vdev_state_enter(spa, SCL_NONE);
				spa_async_remove(spa, spa->spa_root_vdev);
				for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
					spa_async_remove(spa,
					    spa->spa_l2cache.sav_vdevs[i]);
				}
				for (i = 0; i < spa->spa_spares.sav_count; i++) {
					spa_async_remove(spa,
					    spa->spa_spares.sav_vdevs[i]);
				}
				(void) spa_vdev_state_exit(spa, NULL, 0);
			}

			/*
			 * Go around again if another removal was requested while
			 * the lock was dropped.
			 */
			mutex_enter(&spa->spa_async_lock);
			pending = spa->spa_async_tasks;
			if ((pending & SPA_ASYNC_REMOVE) == 0)
				break;
		}

		/*
		 * Let the world know that we're done.
		 */
		spa->spa_async_thread_vd = NULL;
		cv_broadcast(&spa->spa_async_cv);
		mutex_exit(&spa->spa_async_lock);
		thread_exit();
	}

	void
	spa_async_suspend(spa_t *spa)
	{
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_suspended++;
	-	while (spa->spa_async_thread != NULL &&
	-	    spa->spa_async_thread_vd != NULL)
	+	while (spa->spa_async_thread != NULL ||
	+	    spa->spa_async_thread_vd != NULL ||
	+	    spa->spa_condense_thread != NULL)
			cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
		mutex_exit(&spa->spa_async_lock);
	+
	+	spa_vdev_remove_suspend(spa);
	}

	/*
	* Undo one spa_async_suspend(): decrement spa_async_suspended under
	* spa_async_lock. The line marked '+' is added by this revision and
	* calls spa_restart_removal() after the lock is dropped.
	*/
	void
	spa_async_resume(spa_t *spa)
	{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
	+ spa_restart_removal(spa);
	}

	/*
	 * Report whether spa_async_thread() has work to do.  SPA_ASYNC_REMOVE is
	 * ignored here.  A pending config update counts only when it is not being
	 * held off: after spa_ccw_fail_time is set, it stays suspended until
	 * zfs_ccw_retry_interval seconds have elapsed.
	 */
	static boolean_t
	spa_async_tasks_pending(spa_t *spa)
	{
		uint_t non_config_tasks;
		uint_t config_task;
		boolean_t config_task_suspended;

		non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
		    SPA_ASYNC_REMOVE);
		config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
		if (spa->spa_ccw_fail_time == 0) {
			config_task_suspended = B_FALSE;
		} else {
			config_task_suspended =
			    (gethrtime() - spa->spa_ccw_fail_time) <
			    (zfs_ccw_retry_interval * NANOSEC);
		}

		return (non_config_tasks || (config_task && !config_task_suspended));
	}

	/*
	 * Start spa_async_thread() if there is pending work, async processing is
	 * not suspended, no such thread is already running, and rootdir is set.
	 */
	static void
	spa_async_dispatch(spa_t *spa)
	{
		boolean_t start;

		mutex_enter(&spa->spa_async_lock);

		/* Same short-circuit order as the individual checks imply. */
		start = spa_async_tasks_pending(spa) &&
		    !spa->spa_async_suspended &&
		    spa->spa_async_thread == NULL &&
		    rootdir != NULL;

		if (start) {
			spa->spa_async_thread = thread_create(NULL, 0,
			    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
		}

		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Start spa_async_thread_vd() if a SPA_ASYNC_REMOVE request is pending,
	 * async processing is not suspended, no such thread is already running,
	 * and rootdir is set.
	 */
	static void
	spa_async_dispatch_vd(spa_t *spa)
	{
		boolean_t start;

		mutex_enter(&spa->spa_async_lock);

		start = (spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
		    !spa->spa_async_suspended &&
		    spa->spa_async_thread_vd == NULL &&
		    rootdir != NULL;

		if (start) {
			spa->spa_async_thread_vd = thread_create(NULL, 0,
			    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
		}

		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Record an async task request by OR-ing 'task' into spa_async_tasks
	 * under spa_async_lock.  Only the SPA_ASYNC_REMOVE thread is dispatched
	 * from here; this function does not call spa_async_dispatch().
	 */
	void
	spa_async_request(spa_t *spa, int task)
	{
		zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_tasks |= task;
		mutex_exit(&spa->spa_async_lock);
		spa_async_dispatch_vd(spa);
	}

	/*
	* ==========================================================================
	* SPA syncing routines
	* ==========================================================================
	*/

	/*
	 * Iteration callback: append block pointer 'bp' to the bpobj passed as
	 * 'arg'.  Always returns 0.
	 */
	static int
	bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		bpobj_t *bpo = arg;
		bpobj_enqueue(bpo, bp, tx);
		return (0);
	}

	/*
	 * Iteration callback: issue an asynchronous free of 'bp' as a child of
	 * the zio passed as 'arg', in the txg of 'tx'.  Always returns 0.
	 */
	static int
	spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		zio_t *zio = arg;

		zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
		    BP_GET_PSIZE(bp), zio->io_flags));
		return (0);
	}

	/*
	* Note: this simple function is not inlined to make it easier to dtrace the
	* amount of time spent syncing frees.
	*/
	static void
	spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
	{
		/* Parent zio that collects every free issued by the callback. */
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
		VERIFY(zio_wait(zio) == 0);
	}

	/*
	* Note: this simple function is not inlined to make it easier to dtrace the
	* amount of time spent syncing deferred frees.
	*/
	static void
	spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
	{
		/* Parent zio that collects every free issued by the callback. */
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}


	/*
	 * Pack nvlist 'nv' in XDR encoding and write it to MOS object 'obj' in
	 * transaction 'tx'.  The packed size is stored in the object's bonus
	 * buffer.
	 */
	static void
	spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
	{
		char *packed = NULL;
		size_t bufsize;
		size_t nvsize = 0;
		dmu_buf_t *db;

		VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

		/*
		 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
		 * information.  This avoids the dmu_buf_will_dirty() path and
		 * saves us a pre-read to get data we don't actually care about.
		 */
		bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
		packed = kmem_alloc(bufsize, KM_SLEEP);

		VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
		    KM_SLEEP) == 0);
		/* Zero the padding between the packed data and the block end. */
		bzero(packed + nvsize, bufsize - nvsize);

		dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

		kmem_free(packed, bufsize);

		/* Store the unpadded size in the bonus buffer. */
		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
		dmu_buf_will_dirty(db, tx);
		*(uint64_t *)db->db_data = nvsize;
		dmu_buf_rele(db, FTAG);
	}

	/*
	 * Write the list of auxiliary vdevs in 'sav' to its MOS object, if
	 * sav_sync is set.  'config' is the nvlist key for the vdev array and
	 * 'entry' is the pool directory name used when the object is first
	 * allocated.  Clears sav_sync when done.
	 */
	static void
	spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
	    const char *config, const char *entry)
	{
		nvlist_t *nvroot;
		nvlist_t **list;
		int i;

		if (!sav->sav_sync)
			return;

		/*
		 * Update the MOS nvlist describing the list of available devices.
		 * spa_validate_aux() will have already made sure this nvlist is
		 * valid and the vdevs are labeled appropriately.
		 */
		if (sav->sav_object == 0) {
			sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
			    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
			    sizeof (uint64_t), tx);
			VERIFY(zap_update(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
			    &sav->sav_object, tx) == 0);
		}

		VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		if (sav->sav_count == 0) {
			VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
		} else {
			list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
			for (i = 0; i < sav->sav_count; i++)
				list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
				    B_FALSE, VDEV_CONFIG_L2CACHE);
			VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
			    sav->sav_count) == 0);
			/* The per-vdev nvlists and the array are freed here. */
			for (i = 0; i < sav->sav_count; i++)
				nvlist_free(list[i]);
			kmem_free(list, sav->sav_count * sizeof (void *));
		}

		spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
		nvlist_free(nvroot);

		sav->sav_sync = B_FALSE;
	}

	/*
	* Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
	* The all-vdev ZAP must be empty.
	*/
	static void
	spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
	{
		spa_t *spa = vd->vdev_spa;

		/* Record whichever per-vdev ZAPs this vdev has. */
		if (vd->vdev_top_zap != 0) {
			VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
			    vd->vdev_top_zap, tx));
		}
		if (vd->vdev_leaf_zap != 0) {
			VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
			    vd->vdev_leaf_zap, tx));
		}
		for (uint64_t i = 0; i < vd->vdev_children; i++) {
			spa_avz_build(vd->vdev_child[i], avz, tx);
		}
	}

	/*
	 * Bring the all-vdev ZAP (AVZ) in line with spa_avz_action, create any
	 * missing per-vdev ZAPs, then generate the pool config and write it to
	 * the MOS config object.  Does nothing when no vdev config is dirty and
	 * no AVZ action is pending.
	 */
	static void
	spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
	{
		nvlist_t *config;

		/*
		 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
		 * its config may not be dirty but we still need to build per-vdev ZAPs.
		 * Similarly, if the pool is being assembled (e.g. after a split), we
		 * need to rebuild the AVZ although the config may not be dirty.
		 */
		if (list_is_empty(&spa->spa_config_dirty_list) &&
		    spa->spa_avz_action == AVZ_ACTION_NONE)
			return;

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
		    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
		    spa->spa_all_vdev_zaps != 0);

		if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
			/* Make and build the new AVZ */
			uint64_t new_avz = zap_create(spa->spa_meta_objset,
			    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
			spa_avz_build(spa->spa_root_vdev, new_avz, tx);

			/* Diff old AVZ with new one */
			zap_cursor_t zc;
			zap_attribute_t za;

			for (zap_cursor_init(&zc, spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				uint64_t vdzap = za.za_first_integer;
				if (zap_lookup_int(spa->spa_meta_objset, new_avz,
				    vdzap) == ENOENT) {
					/*
					 * ZAP is listed in old AVZ but not in new one;
					 * destroy it
					 */
					VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
					    tx));
				}
			}

			zap_cursor_fini(&zc);

			/* Destroy the old AVZ */
			VERIFY0(zap_destroy(spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps, tx));

			/* Replace the old AVZ in the dir obj with the new one */
			VERIFY0(zap_update(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
			    sizeof (new_avz), 1, &new_avz, tx));

			spa->spa_all_vdev_zaps = new_avz;
		} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
			zap_cursor_t zc;
			zap_attribute_t za;

			/* Walk through the AVZ and destroy all listed ZAPs */
			for (zap_cursor_init(&zc, spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				uint64_t zap = za.za_first_integer;
				VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
			}

			zap_cursor_fini(&zc);

			/* Destroy and unlink the AVZ itself */
			VERIFY0(zap_destroy(spa->spa_meta_objset,
			    spa->spa_all_vdev_zaps, tx));
			VERIFY0(zap_remove(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
			spa->spa_all_vdev_zaps = 0;
		}

		/* No AVZ yet, or it was just destroyed above: create a new one. */
		if (spa->spa_all_vdev_zaps == 0) {
			spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
			    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_VDEV_ZAP_MAP, tx);
		}
		spa->spa_avz_action = AVZ_ACTION_NONE;

		/* Create ZAPs for vdevs that don't have them. */
		vdev_construct_zaps(spa->spa_root_vdev, tx);

		config = spa_config_generate(spa, spa->spa_root_vdev,
		    dmu_tx_get_txg(tx), B_FALSE);

		/*
		 * If we're upgrading the spa version then make sure that
		 * the config object gets updated with the correct version.
		 */
		if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
			fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
			    spa->spa_uberblock.ub_version);

		spa_config_exit(spa, SCL_STATE, FTAG);

		/* Replace the previous syncing config with the new one. */
		nvlist_free(spa->spa_config_syncing);
		spa->spa_config_syncing = config;

		spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
	}

	/*
	 * Set the pool version in the in-core uberblock to the value pointed to
	 * by 'arg', dirty the root vdev's config, and log the change.
	 */
	static void
	spa_sync_version(void *arg, dmu_tx_t *tx)
	{
		uint64_t *versionp = arg;
		uint64_t version = *versionp;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;

		/*
		 * Setting the version is special cased when first creating the pool.
		 */
		ASSERT(tx->tx_txg != TXG_INITIAL);

		ASSERT(SPA_VERSION_IS_SUPPORTED(version));
		ASSERT(version >= spa_version(spa));

		spa->spa_uberblock.ub_version = version;
		vdev_config_dirty(spa->spa_root_vdev);
		spa_history_log_internal(spa, "set", tx, "version=%lld", version);
	}

	/*
	* Set zpool properties.
	*/
	static void
	spa_sync_props(void *arg, dmu_tx_t *tx)
	{
		nvlist_t *nvp = arg;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		objset_t *mos = spa->spa_meta_objset;
		nvpair_t *elem = NULL;

		mutex_enter(&spa->spa_props_lock);

		/* Apply each property in the nvlist in turn. */
		while ((elem = nvlist_next_nvpair(nvp, elem))) {
			uint64_t intval;
			char *strval, *fname;
			zpool_prop_t prop;
			const char *propname;
			zprop_type_t proptype;
			spa_feature_t fid;

			switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
			case ZPOOL_PROP_INVAL:
				/*
				 * We checked this earlier in spa_prop_validate().
				 */
				ASSERT(zpool_prop_feature(nvpair_name(elem)));

				/* The feature name is the text after the '@'. */
				fname = strchr(nvpair_name(elem), '@') + 1;
				VERIFY0(zfeature_lookup_name(fname, &fid));

				spa_feature_enable(spa, fid, tx);
				spa_history_log_internal(spa, "set", tx,
				    "%s=enabled", nvpair_name(elem));
				break;

			case ZPOOL_PROP_VERSION:
				intval = fnvpair_value_uint64(elem);
				/*
				 * The version is synced separately before other
				 * properties and should be correct by now.
				 */
				ASSERT3U(spa_version(spa), >=, intval);
				break;

			case ZPOOL_PROP_ALTROOT:
				/*
				 * 'altroot' is a non-persistent property. It should
				 * have been set temporarily at creation or import time.
				 */
				ASSERT(spa->spa_root != NULL);
				break;

			case ZPOOL_PROP_READONLY:
			case ZPOOL_PROP_CACHEFILE:
				/*
				 * 'readonly' and 'cachefile' are also non-persistent
				 * properties.
				 */
				break;
			case ZPOOL_PROP_COMMENT:
				strval = fnvpair_value_string(elem);
				if (spa->spa_comment != NULL)
					spa_strfree(spa->spa_comment);
				spa->spa_comment = spa_strdup(strval);
				/*
				 * We need to dirty the configuration on all the vdevs
				 * so that their labels get updated.  It's unnecessary
				 * to do this for pool creation since the vdev's
				 * configuration has already been dirtied.
				 */
				if (tx->tx_txg != TXG_INITIAL)
					vdev_config_dirty(spa->spa_root_vdev);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
				break;
			default:
				/*
				 * Set pool property values in the poolprops mos object.
				 */
				if (spa->spa_pool_props_object == 0) {
					spa->spa_pool_props_object =
					    zap_create_link(mos, DMU_OT_POOL_PROPS,
					    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
					    tx);
				}

				/* normalize the property name */
				propname = zpool_prop_to_name(prop);
				proptype = zpool_prop_get_type(prop);

				if (nvpair_type(elem) == DATA_TYPE_STRING) {
					ASSERT(proptype == PROP_TYPE_STRING);
					strval = fnvpair_value_string(elem);
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object, propname,
					    1, strlen(strval) + 1, strval, tx));
					spa_history_log_internal(spa, "set", tx,
					    "%s=%s", nvpair_name(elem), strval);
				} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
					intval = fnvpair_value_uint64(elem);

					if (proptype == PROP_TYPE_INDEX) {
						const char *unused;
						VERIFY0(zpool_prop_index_to_string(
						    prop, intval, &unused));
					}
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object, propname,
					    8, 1, &intval, tx));
					spa_history_log_internal(spa, "set", tx,
					    "%s=%lld", nvpair_name(elem), intval);
				} else {
					ASSERT(0); /* not allowed */
				}

				/*
				 * Mirror selected properties into the in-core spa.
				 * NOTE(review): intval is assigned only on the
				 * uint64 path above; presumably every property
				 * listed here is uint64-valued -- confirm.
				 */
				switch (prop) {
				case ZPOOL_PROP_DELEGATION:
					spa->spa_delegation = intval;
					break;
				case ZPOOL_PROP_BOOTFS:
					spa->spa_bootfs = intval;
					break;
				case ZPOOL_PROP_FAILUREMODE:
					spa->spa_failmode = intval;
					break;
				case ZPOOL_PROP_AUTOEXPAND:
					spa->spa_autoexpand = intval;
					if (tx->tx_txg != TXG_INITIAL)
						spa_async_request(spa,
						    SPA_ASYNC_AUTOEXPAND);
					break;
				case ZPOOL_PROP_DEDUPDITTO:
					spa->spa_dedup_ditto = intval;
					break;
				default:
					break;
				}
			}

		}

		mutex_exit(&spa->spa_props_lock);
	}

	/*
	* Perform one-time upgrade on-disk changes. spa_version() does not
	* reflect the new version this txg, so there must be no changes this
	* txg to anything that the upgrade code depends on after it executes.
	* Therefore this must be called after dsl_pool_sync() does the sync
	* tasks.
	*/
	static void
	spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = spa->spa_dsl_pool;

		ASSERT(spa->spa_sync_pass == 1);

		rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

		/*
		 * Each check below fires only in the txg where the version crosses
		 * the threshold: the last-synced uberblock (spa_ubsync) is below it
		 * and the in-core uberblock is at or above it.
		 */
		if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
			dsl_pool_create_origin(dp, tx);

			/* Keeping the origin open increases spa_minref */
			spa->spa_minref += 3;
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
			dsl_pool_upgrade_clones(dp, tx);
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
			dsl_pool_upgrade_dir_clones(dp, tx);

			/* Keeping the freedir open increases spa_minref */
			spa->spa_minref += 3;
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
			spa_feature_create_zap_objects(spa, tx);
		}

		/*
		 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
		 * when possibility to use lz4 compression for metadata was added
		 * Old pools that have this feature enabled must be upgraded to have
		 * this feature active
		 */
		if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
			boolean_t lz4_en = spa_feature_is_enabled(spa,
			    SPA_FEATURE_LZ4_COMPRESS);
			boolean_t lz4_ac = spa_feature_is_active(spa,
			    SPA_FEATURE_LZ4_COMPRESS);

			if (lz4_en && !lz4_ac)
				spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
		}

		/*
		 * If we haven't written the salt, do so now.  Note that the
		 * feature may not be activated yet, but that's fine since
		 * the presence of this ZAP entry is backwards compatible.
		 */
		if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
			VERIFY0(zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
			    sizeof (spa->spa_cksum_salt.zcs_bytes),
			    spa->spa_cksum_salt.zcs_bytes, tx));
		}

		rrw_exit(&dp->dp_config_rwlock, FTAG);
	}

	+/*
	+ * Assertion-only consistency checks on a vdev's indirect-mapping,
	+ * indirect-births and obsolete-space-map state.  Nothing is modified.
	+ */
	+static void
	+vdev_indirect_state_sync_verify(vdev_t *vd)
	+{
	+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
	+
	+	if (vd->vdev_ops == &vdev_indirect_ops) {
	+		ASSERT(vim != NULL);
	+		ASSERT(vib != NULL);
	+	}
	+
	+	if (vdev_obsolete_sm_object(vd) != 0) {
	+		ASSERT(vd->vdev_obsolete_sm != NULL);
	+		ASSERT(vd->vdev_removing ||
	+		    vd->vdev_ops == &vdev_indirect_ops);
	+		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
	+		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
	+
	+		ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	+		    space_map_object(vd->vdev_obsolete_sm));
	+		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
	+		    space_map_allocated(vd->vdev_obsolete_sm));
	+	}
	+	ASSERT(vd->vdev_obsolete_segments != NULL);
	+
	+	/*
	+	 * Since frees / remaps to an indirect vdev can only
	+	 * happen in syncing context, the obsolete segments
	+	 * tree must be empty when we start syncing.
	+	 */
	+	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
	+}
	+
	/*
	* Sync the specified transaction group. New blocks may be dirtied as
	* part of the process, so we iterate until it converges.
	*
	* spa - pool to sync; it must be writeable (this is VERIFY'd).
	* txg - transaction group being committed.
	*
	* SCL_CONFIG is held as reader from just after the open-context i/o wait
	* until the last-synced uberblock has been recorded. The label/uberblock
	* write near the end is retried, suspending and resuming i/o in between,
	* until vdev_config_sync() reports success.
	*/
	void
	spa_sync(spa_t *spa, uint64_t txg)
	{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;
	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
	zfs_vdev_queue_depth_pct / 100;

	VERIFY(spa_writeable(spa));

	/*
	+ * Wait for i/os issued in open context that need to complete
	+ * before this txg syncs.
	+ */
	+ VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
	+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
	+
	+ /*
	* Lock out configuration changes.
	*/
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	* If there are any pending vdev state changes, convert them
	* into config changes that go out with this transaction group.
	*/
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
	/*
	* We need the write lock here because, for aux vdevs,
	* calling vdev_config_dirty() modifies sav_config.
	* This is ugly and will become unnecessary when we
	* eliminate the aux vdev wart by integrating all vdevs
	* into the root vdev tree.
	*/
	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
	vdev_state_clean(vd);
	vdev_config_dirty(vd);
	}
	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	* Arm the deadman timer relative to the start of this sync.
	*/
	spa->spa_sync_starttime = gethrtime();
	#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	spa->spa_sync_starttime + spa->spa_deadman_synctime));
	#else /* !illumos */
	#ifdef _KERNEL
	callout_schedule(&spa->spa_deadman_cycid,
	hz * spa->spa_deadman_synctime / NANOSEC);
	#endif
	#endif /* illumos */

	/*
	* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	* set spa_deflate if we have no raid-z vdevs.
	*/
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
	int i;

	for (i = 0; i < rvd->vdev_children; i++) {
	vd = rvd->vdev_child[i];
	if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
	break;
	}
	if (i == rvd->vdev_children) {
	spa->spa_deflate = TRUE;
	VERIFY(0 == zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	sizeof (uint64_t), 1, &spa->spa_deflate, tx));
	}
	}

	/*
	* Set the top-level vdev's max queue depth. Evaluate each
	* top-level's async write queue depth in case it changed.
	* The max queue depth will not change in the middle of syncing
	* out this txg.
	*/
	uint64_t queue_depth_total = 0;
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	metaslab_group_t *mg = tvd->vdev_mg;

	if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
	!metaslab_group_initialized(mg))
	continue;

	/*
	* It is safe to do a lock-free check here because only async
	* allocations look at mg_max_alloc_queue_depth, and async
	* allocations all happen from spa_sync().
	*/
	ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
	mg->mg_max_alloc_queue_depth = max_queue_depth;
	queue_depth_total += mg->mg_max_alloc_queue_depth;
	}
	metaslab_class_t *mc = spa_normal_class(spa);
	ASSERT0(refcount_count(&mc->mc_alloc_slots));
	mc->mc_alloc_max_slots = queue_depth_total;
	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

	ASSERT3U(mc->mc_alloc_max_slots, <=,
	max_queue_depth * rvd->vdev_children);

	+ for (int c = 0; c < rvd->vdev_children; c++) {
	+ vdev_t *vd = rvd->vdev_child[c];
	+ vdev_indirect_state_sync_verify(vd);
	+
	+ if (vdev_indirect_should_condense(vd)) {
	+ spa_condense_indirect_start_sync(vd, tx);
	+ break;
	+ }
	+ }
	+
	/*
	* Iterate to convergence.
	*/
	do {
	int pass = ++spa->spa_sync_pass;

	spa_sync_config_object(spa, tx);
	spa_sync_aux_dev(spa, &spa->spa_spares, tx,
	ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
	spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
	ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
	spa_errlog_sync(spa, txg);
	dsl_pool_sync(dp, txg);

	if (pass < zfs_sync_pass_deferred_free) {
	spa_sync_frees(spa, free_bpl, tx);
	} else {
	/*
	* We can not defer frees in pass 1, because
	* we sync the deferred frees later in pass 1.
	*/
	ASSERT3U(pass, >, 1);
	bplist_iterate(free_bpl, bpobj_enqueue_cb,
	&spa->spa_deferred_bpobj, tx);
	}

	ddt_sync(spa, txg);
	dsl_scan_sync(dp, tx);

	- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
	+ if (spa->spa_vdev_removal != NULL)
	+ svr_sync(spa, tx);
	+
	+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
	+ != NULL)
	vdev_sync(vd, txg);

	if (pass == 1) {
	spa_sync_upgrades(spa, tx);
	ASSERT3U(txg, >=,
	spa->spa_uberblock.ub_rootbp.blk_birth);
	/*
	* Note: We need to check if the MOS is dirty
	* because we could have marked the MOS dirty
	* without updating the uberblock (e.g. if we
	* have sync tasks but no dirty user data). We
	* need to check the uberblock's rootbp because
	* it is updated if we have synced out dirty
	* data (though in this case the MOS will most
	* likely also be dirty due to second order
	* effects, we don't want to rely on that here).
	*/
	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
	!dmu_objset_is_dirty(mos, txg)) {
	/*
	* Nothing changed on the first pass,
	* therefore this TXG is a no-op. Avoid
	* syncing deferred frees, so that we
	* can keep this TXG as a no-op.
	*/
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
	txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
	break;
	}
	spa_sync_deferred_frees(spa, tx);
	}

	} while (dmu_objset_is_dirty(mos, txg));

	if (!list_is_empty(&spa->spa_config_dirty_list)) {
	/*
	* Make sure that the number of ZAPs for all the vdevs matches
	* the number of ZAPs in the per-vdev ZAP list. This only gets
	* called if the config is dirty; otherwise there may be
	* outstanding AVZ operations that weren't completed in
	* spa_sync_config_object.
	*/
	uint64_t all_vdev_zap_entry_count;
	ASSERT0(zap_count(spa->spa_meta_objset,
	spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
	ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
	all_vdev_zap_entry_count);
	}

	+ if (spa->spa_vdev_removal != NULL) {
	+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
	+ }
	+
	/*
	* Rewrite the vdev configuration (which includes the uberblock)
	* to commit the transaction group.
	*
	* If there are no dirty vdevs, we sync the uberblock to a few
	* random top-level vdevs that are known to be visible in the
	* config cache (see spa_vdev_add() for a complete description).
	* If there are dirty vdevs, sync the uberblock to all vdevs.
	*/
	for (;;) {
	/*
	* We hold SCL_STATE to prevent vdev open/close/etc.
	* while we're attempting to write the vdev labels.
	*/
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	if (list_is_empty(&spa->spa_config_dirty_list)) {
	vdev_t *svd[SPA_DVAS_PER_BP];
	int svdcount = 0;
	int children = rvd->vdev_children;
	int c0 = spa_get_random(children);

	for (int c = 0; c < children; c++) {
	vd = rvd->vdev_child[(c0 + c) % children];
	- if (vd->vdev_ms_array == 0 || vd->vdev_islog)
	+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
	+ !vdev_is_concrete(vd))
	continue;
	svd[svdcount++] = vd;
	if (svdcount == SPA_DVAS_PER_BP)
	break;
	}
	error = vdev_config_sync(svd, svdcount, txg);
	} else {
	error = vdev_config_sync(rvd->vdev_child,
	rvd->vdev_children, txg);
	}

	if (error == 0)
	spa->spa_last_synced_guid = rvd->vdev_guid;

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (error == 0)
	break;
	zio_suspend(spa, NULL);
	zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	* The txg is committed; disarm the deadman timer.
	*/
	#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
	#else /* !illumos */
	#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
	#endif
	#endif /* illumos */

	/*
	* Clear the dirty config list.
	*/
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
	vdev_config_clean(vd);

	/*
	* Now that the new config has synced transactionally,
	* let it become visible to the config cache.
	*/
	if (spa->spa_config_syncing != NULL) {
	spa_config_set(spa, spa->spa_config_syncing);
	spa->spa_config_txg = txg;
	spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	* Update usable space statistics.
	*/
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	!= NULL)
	vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	* It had better be the case that we didn't dirty anything
	* since vdev_config_sync().
	*/
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	/*
	* Update the last synced uberblock here. We want to do this at
	* the end of spa_sync() so that consumers of spa_last_synced_txg()
	* will be guaranteed that all the processing associated with
	* that txg has been completed.
	*/
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	* If any async tasks have been requested, kick them off.
	*/
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
	}

	/*
	 * Wait for a txg sync on every pool that is active, writeable and not
	 * suspended.  The namespace lock is not held across the wait itself: a
	 * reference is taken on the spa_t, the lock is dropped for the duration
	 * of txg_wait_synced(), and then re-taken before the reference is
	 * released and the walk continues.
	 */
	void
	spa_sync_allpools(void)
	{
		spa_t *pool;

		mutex_enter(&spa_namespace_lock);
		for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
			if (spa_state(pool) == POOL_STATE_ACTIVE &&
			    spa_writeable(pool) && !spa_suspended(pool)) {
				spa_open_ref(pool, FTAG);
				mutex_exit(&spa_namespace_lock);
				txg_wait_synced(spa_get_dsl(pool), 0);
				mutex_enter(&spa_namespace_lock);
				spa_close(pool, FTAG);
			}
		}
		mutex_exit(&spa_namespace_lock);
	}

	/*
	* ==========================================================================
	* Miscellaneous routines
	* ==========================================================================
	*/

	/*
	 * Remove all pools in the system.
	 *
	 * Repeatedly takes the first pool in the namespace, suspends its async
	 * tasks, unloads and deactivates it unless it is still
	 * POOL_STATE_UNINITIALIZED, and removes it, until no pool is left.
	 */
	void
	spa_evict_all(void)
	{
		spa_t *victim;

		/*
		 * All pools should be closed by now, so every spa in the AVL
		 * tree should be unreferenced and all cached state can go.
		 */
		mutex_enter(&spa_namespace_lock);
		for (;;) {
			victim = spa_next(NULL);
			if (victim == NULL)
				break;

			/*
			 * Async tasks have to be stopped with the namespace
			 * lock dropped: the async thread may need to detach a
			 * replaced device, which itself grabs
			 * spa_namespace_lock.  Hold a reference meanwhile.
			 */
			spa_open_ref(victim, FTAG);
			mutex_exit(&spa_namespace_lock);
			spa_async_suspend(victim);
			mutex_enter(&spa_namespace_lock);
			spa_close(victim, FTAG);

			if (victim->spa_state != POOL_STATE_UNINITIALIZED) {
				spa_unload(victim);
				spa_deactivate(victim);
			}
			spa_remove(victim);
		}
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Find the vdev with the given guid.  The root vdev tree is searched
	 * first; when 'aux' is set and nothing was found there, the l2cache
	 * devices and then the spares are searched as well.  Returns NULL if no
	 * vdev matches.
	 */
	vdev_t *
	spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
	{
		vdev_t *found = vdev_lookup_by_guid(spa->spa_root_vdev, guid);

		if (found != NULL || !aux)
			return (found);

		/* Same search order as before: l2cache first, then spares. */
		spa_aux_vdev_t *aux_lists[] = { &spa->spa_l2cache, &spa->spa_spares };

		for (int l = 0; l < 2; l++) {
			spa_aux_vdev_t *sav = aux_lists[l];

			for (int i = 0; i < sav->sav_count; i++) {
				vdev_t *candidate = sav->sav_vdevs[i];

				if (candidate->vdev_guid == guid)
					return (candidate);
			}
		}

		return (NULL);
	}

	/*
	* Set the pool's uberblock version to 'version' and wait for the change to
	* be synced out. The pool must be writeable, and 'version' must not be
	* lower than the current uberblock version (both are ASSERTed).
	*/
	void
	spa_upgrade(spa_t *spa, uint64_t version)
	{
	ASSERT(spa_writeable(spa));

	/* All config locks are held as writer while the version is changed. */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	* This should only be called for a non-faulted pool, and since a
	* future version would result in an unopenable pool, this shouldn't be
	* possible.
	*/
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	/* Dirty the root vdev's config so the change goes out with a sync. */
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Do not return until the change has been synced. */
	txg_wait_synced(spa_get_dsl(spa), 0);
	}

	/*
	 * Report whether 'guid' names one of this pool's spares: either a spare
	 * vdev already in sav_vdevs, or a pending spare whose config nvlist
	 * carries that ZPOOL_CONFIG_GUID.
	 */
	boolean_t
	spa_has_spare(spa_t *spa, uint64_t guid)
	{
		spa_aux_vdev_t *spares = &spa->spa_spares;

		for (int s = 0; s < spares->sav_count; s++) {
			if (spares->sav_vdevs[s]->vdev_guid == guid)
				return (B_TRUE);
		}

		for (int p = 0; p < spares->sav_npending; p++) {
			uint64_t pending_guid;

			/* Pending entries without a guid can never match. */
			if (nvlist_lookup_uint64(spares->sav_pending[p],
			    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
				continue;
			if (pending_guid == guid)
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Check if a pool has an active shared spare device.
	 * Note: reference count of an active spare is 2, as a spare and as a replace,
	 * so only a count above 2 for a spare registered to this pool qualifies.
	 */
	static boolean_t
	spa_has_active_shared_spare(spa_t *spa)
	{
		spa_aux_vdev_t *spares = &spa->spa_spares;

		for (int s = 0; s < spares->sav_count; s++) {
			uint64_t owner;
			int refs;

			if (!spa_spare_exists(spares->sav_vdevs[s]->vdev_guid,
			    &owner, &refs))
				continue;

			if (owner != 0ULL && owner == spa_guid(spa) && refs > 2)
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	* Build, but do not post, a sysevent called 'name' for the given pool.
	*
	* spa - pool the event is about; its name and guid are always attached.
	* vd - optional vdev; when non-NULL its guid, and its path if it has
	* one, are attached too.
	* hist_nvl - optional nvlist merged into the attribute list.
	* name - event name passed to sysevent_alloc().
	*
	* If adding or attaching an attribute fails, the attribute list is freed
	* and the event is returned without it. Outside of _KERNEL builds nothing
	* is allocated and NULL is returned.
	*/
	-static sysevent_t *
	+sysevent_t *
	spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
	{
	sysevent_t *ev = NULL;
	#ifdef _KERNEL
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	SE_SLEEP);
	ASSERT(ev != NULL);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
	goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
	goto done;

	if (vd) {
	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = vd->vdev_guid;
	if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
	SE_SLEEP) != 0)
	goto done;

	if (vd->vdev_path) {
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = vd->vdev_path;
	if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
	&value, SE_SLEEP) != 0)
	goto done;
	}
	}

	if (hist_nvl != NULL) {
	fnvlist_merge((nvlist_t *)attr, hist_nvl);
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
	goto done;
	/* The attribute list is attached to the event now; don't free it below. */
	attr = NULL;

	done:
	if (attr)
	sysevent_free_attr(attr);

	#endif
	return (ev);
	}

	/*
	* Log the given sysevent (the result of log_sysevent() is deliberately
	* ignored) and then free it. Does nothing outside of _KERNEL builds.
	*/
	-static void
	+void
	spa_event_post(sysevent_t *ev)
	{
	#ifdef _KERNEL
	sysevent_id_t eid;

	(void) log_sysevent(ev, SE_SLEEP, &eid);
	+ sysevent_free(ev);
	+#endif
	+}
	+
	/*
	* Free a sysevent without logging it. Does nothing outside of _KERNEL
	* builds.
	*/
	+void
	+spa_event_discard(sysevent_t *ev)
	+{
	+#ifdef _KERNEL
	sysevent_free(ev);
	#endif
	}

	/*
	* Post a sysevent corresponding to the given event. The 'name' must be one of
	* the event definitions in sys/sysevent/eventdefs.h. The payload will be
	* filled in from the spa and (optionally) the vdev and history nvl. This
	* doesn't do anything in the userland libzpool, as we don't want consumers to
	* misinterpret ztest or zdb as real changes.
	*
	* This is simply spa_event_create() followed by spa_event_post() on the
	* resulting event.
	*/
	void
	spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
	{
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 332525)
	@@ -1,563 +1,563 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright 2017 Joyent, Inc.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa_impl.h>
	#include <sys/nvpair.h>
	#include <sys/uio.h>
	#include <sys/fs/zfs.h>
	#include <sys/vdev_impl.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/utsname.h>
	#include <sys/sunddi.h>
	#include <sys/zfeature.h>
	#ifdef _KERNEL
	#include <sys/kobj.h>
	#include <sys/zone.h>
	#endif

	/*
	* Pool configuration repository.
	*
	* Pool configuration is stored as a packed nvlist on the filesystem. By
	* default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
	* (when the ZFS module is loaded). Pools can also have the 'cachefile'
	* property set that allows them to be stored in an alternate location under
	* the control of external software.
	*
	* For each cache file, we have a single nvlist which holds all the
	* configuration information. When the module loads, we read this information
	* from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
	* maintained independently in spa.c. Whenever the namespace is modified, or
	- * the configuration of a pool is changed, we call spa_config_sync(), which
	+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
	* walks through all the active pools and writes the configuration to disk.
	*/

	/*
	* Generation number of the pool configuration. It is incremented each time
	* the cache files are rewritten, and spa_all_configs() compares it with the
	* caller's value to tell whether anything changed since the last call.
	*/
	static uint64_t spa_config_generation = 1;

	/*
	* This can be overridden in userland to preserve an alternate namespace for
	* userland pools when doing testing.
	*/
	const char *spa_config_path = ZPOOL_CACHE;

	/*
	 * Called when the module is first loaded, this routine loads the configuration
	 * file into the SPA namespace. It does not actually open or load the pools; it
	 * only populates the namespace.
	 *
	 * The file named by spa_config_path is read in full and unpacked as an
	 * nvlist. Every nvlist-typed pair in it whose name is not already known to
	 * the namespace is added with spa_add(). A file that cannot be opened,
	 * sized, read or unpacked is silently ignored.
	 */
	void
	spa_config_load(void)
	{
		void *buf = NULL;
		nvlist_t *nvlist, *child;
		nvpair_t *nvpair;
		char *pathname;
		struct _buf *file;
		uint64_t fsize;

		/*
		 * Open the configuration file.
		 */
		pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);

		file = kobj_open_file(pathname);

		kmem_free(pathname, MAXPATHLEN);

		if (file == (struct _buf *)-1)
			return;

		if (kobj_get_filesize(file, &fsize) != 0)
			goto out;

		buf = kmem_alloc(fsize, KM_SLEEP);

		/*
		 * Read the nvlist from the file.
		 */
		if (kobj_read_file(file, buf, fsize, 0) < 0)
			goto out;

		/*
		 * Unpack the nvlist.
		 */
		if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
			goto out;

		/*
		 * Iterate over all elements in the nvlist, creating a new spa_t for
		 * each one with the specified configuration.
		 */
		mutex_enter(&spa_namespace_lock);
		nvpair = NULL;
		while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
			if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
				continue;

			child = fnvpair_value_nvlist(nvpair);

			/* Pools already present in the namespace are left alone. */
			if (spa_lookup(nvpair_name(nvpair)) != NULL)
				continue;
			(void) spa_add(nvpair_name(nvpair), child, NULL);
		}
		mutex_exit(&spa_namespace_lock);

		nvlist_free(nvlist);

	out:
		/* buf is still NULL if we bailed out before allocating it. */
		if (buf != NULL)
			kmem_free(buf, fsize);

		kobj_close_file(file);
	}

	/*
	 * Recursively strip ZPOOL_CONFIG_VDEV_STATS and ZPOOL_CONFIG_SCAN_STATS
	 * from 'nvl': first from every entry of its ZPOOL_CONFIG_CHILDREN array
	 * (if it has one), then from its ZPOOL_CONFIG_VDEV_TREE (if it has one),
	 * and finally from 'nvl' itself.
	 */
	static void
	spa_config_clean(nvlist_t *nvl)
	{
		nvlist_t **kids;
		nvlist_t *tree = NULL;
		uint_t nkids;

		if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &kids,
		    &nkids) == 0) {
			uint_t k = 0;

			while (k < nkids)
				spa_config_clean(kids[k++]);
		}

		if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &tree) == 0)
			spa_config_clean(tree);

		nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
		nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
	}

	/*
	 * Write the packed form of 'nvl' to the cache file named by dp->scd_path,
	 * or remove that file when 'nvl' is NULL.
	 *
	 * Returns 0 on success, otherwise the error from the first of
	 * vn_remove/vn_open/vn_rdwr/VOP_FSYNC/vn_rename that failed.
	 */
	static int
	spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
	{
		size_t buflen;
		char *buf;
		vnode_t *vp;
		int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
		char *temp;
		int err;

		/*
		 * If the nvlist is empty (NULL), then remove the old cachefile.
		 */
		if (nvl == NULL) {
			err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
			return (err);
		}

		/*
		 * Pack the configuration into a buffer.
		 */
		buf = fnvlist_pack(nvl, &buflen);
		temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		/*
		 * Write the configuration to disk.  We need to do the traditional
		 * 'write to temporary file, sync, move over original' to make sure we
		 * always have a consistent view of the data.
		 */
		(void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);

		err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
		if (err == 0) {
			err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
			    0, RLIM64_INFINITY, kcred, NULL);
			if (err == 0)
				err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
			if (err == 0)
				err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
			(void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
		}

		/*
		 * Best-effort cleanup of the temporary file; the result is
		 * ignored on purpose (after a successful rename there is
		 * nothing left under this name).
		 */
		(void) vn_remove(temp, UIO_SYSSPACE, RMFILE);

		fnvlist_pack_free(buf, buflen);
		kmem_free(temp, MAXPATHLEN);
		return (err);
	}

	/*
	* Synchronize pool configuration to disk. This must be called with the
	* namespace lock held. Synchronizing the pool cache is typically done after
	* the configuration has been synced to the MOS. This exposes a window where
	* the MOS config will have been updated but the cache file has not. If
	* the system were to crash at that instant then the cached config may not
	- * contain the correct information to open the pool and an explicity import
	+ * contain the correct information to open the pool and an explicit import
	* would be required.
	*
	* target - pool whose list of cachefiles drives the rewrite.
	* removing - if set, 'target' itself is left out of the files written.
	* postsysevent - if set, ESC_ZFS_CONFIG_SYNC is posted for 'target' at the end.
	*
	* Nothing is written when there is no root directory yet or the module was
	* not opened for writing. A failed write is remembered in spa_ccw_fail_time
	* and a SPA_ASYNC_CONFIG_UPDATE request is queued so that it is retried.
	*/
	void
	-spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
	+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
	{
	spa_config_dirent_t *dp, *tdp;
	nvlist_t *nvl;
	boolean_t ccw_failure;
	int error;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (rootdir == NULL || !(spa_mode_global & FWRITE))
	return;

	/*
	* Iterate over all cachefiles for the pool, past or present. When the
	* cachefile is changed, the new one is pushed onto this list, allowing
	* us to update previous cachefiles that no longer contain this pool.
	*/
	ccw_failure = B_FALSE;
	for (dp = list_head(&target->spa_config_list); dp != NULL;
	dp = list_next(&target->spa_config_list, dp)) {
	spa_t *spa = NULL;
	if (dp->scd_path == NULL)
	continue;

	/*
	* Iterate over all pools, adding any matching pools to 'nvl'.
	*/
	nvl = NULL;
	while ((spa = spa_next(spa)) != NULL) {
	nvlist_t *nvroot = NULL;
	/*
	* Skip over our own pool if we're about to remove
	* ourselves from the spa namespace or any pool that
	* is readonly. Since we cannot guarantee that a
	* readonly pool would successfully import upon reboot,
	* we don't allow them to be written to the cache file.
	*/
	if ((spa == target && removing) ||
	(spa_state(spa) == POOL_STATE_ACTIVE &&
	!spa_writeable(spa)))
	continue;

	mutex_enter(&spa->spa_props_lock);
	tdp = list_head(&spa->spa_config_list);
	if (spa->spa_config == NULL ||
	tdp->scd_path == NULL ||
	strcmp(tdp->scd_path, dp->scd_path) != 0) {
	mutex_exit(&spa->spa_props_lock);
	continue;
	}

	if (nvl == NULL)
	nvl = fnvlist_alloc();

	fnvlist_add_nvlist(nvl, spa->spa_name,
	spa->spa_config);
	mutex_exit(&spa->spa_props_lock);

	/*
	* Strip the statistics from the entry just added to 'nvl'.
	*/
	if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0)
	spa_config_clean(nvroot);
	}

	error = spa_config_write(dp, nvl);
	if (error != 0)
	ccw_failure = B_TRUE;
	nvlist_free(nvl);
	}

	if (ccw_failure) {
	/*
	* Keep trying so that configuration data is
	* written if/when any temporary filesystem
	* resource issues are resolved.
	*/
	if (target->spa_ccw_fail_time == 0) {
	zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
	target, NULL, NULL, 0, 0);
	}
	target->spa_ccw_fail_time = gethrtime();
	spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
	} else {
	/*
	* Do not rate limit future attempts to update
	* the config cache.
	*/
	target->spa_ccw_fail_time = 0;
	}

	/*
	* Remove any config entries older than the current one.
	*/
	dp = list_head(&target->spa_config_list);
	while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
	list_remove(&target->spa_config_list, tdp);
	if (tdp->scd_path != NULL)
	spa_strfree(tdp->scd_path);
	kmem_free(tdp, sizeof (spa_config_dirent_t));
	}

	spa_config_generation++;

	if (postsysevent)
	spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
	}

	/*
	 * Sigh.  Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
	 * and we don't want to allow the local zone to see all the pools anyway.
	 * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
	 * information for all pools visible within the zone.
	 *
	 * generation - in/out.  If it already equals the current config
	 * generation, NULL is returned and nothing else happens.  Otherwise it
	 * is updated to the current generation and a newly allocated nvlist is
	 * returned, keyed by pool name, holding the cached config of every pool
	 * visible to the caller.
	 */
	nvlist_t *
	spa_all_configs(uint64_t *generation)
	{
		nvlist_t *pools;
		spa_t *spa = NULL;

		if (*generation == spa_config_generation)
			return (NULL);

		pools = fnvlist_alloc();

		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (INGLOBALZONE(curthread) ||
			    zone_dataset_visible(spa_name(spa), NULL)) {
				mutex_enter(&spa->spa_props_lock);
				fnvlist_add_nvlist(pools, spa_name(spa),
				    spa->spa_config);
				mutex_exit(&spa->spa_props_lock);
			}
		}
		*generation = spa_config_generation;
		mutex_exit(&spa_namespace_lock);

		return (pools);
	}

	/*
	 * Install 'config' as the pool's cached configuration.  The previously
	 * cached nvlist is freed and the pool keeps the new pointer; both steps
	 * happen under spa_props_lock.
	 */
	void
	spa_config_set(spa_t *spa, nvlist_t *config)
	{
		mutex_enter(&spa->spa_props_lock);
		nvlist_free(spa->spa_config);
		spa->spa_config = config;
		mutex_exit(&spa->spa_props_lock);
	}

	/*
	 * Generate the pool's configuration based on the current in-core state.
	 *
	 * We infer whether to generate a complete config or just one top-level config
	 * based on whether vd is the root vdev.
	 *
	 * spa - pool to describe.
	 * vd - vdev to generate the config for.  NULL means the root vdev; in
	 * that case SCL_CONFIG and SCL_STATE are taken as reader here and
	 * dropped before returning.  Otherwise the caller must already hold
	 * them (this is ASSERTed).
	 * txg - value stored as ZPOOL_CONFIG_POOL_TXG; -1ULL selects
	 * spa->spa_config_txg.
	 * getstats - passed through to vdev_config_generate(); when non-zero and
	 * the pool is not in the middle of a load, the dedup histogram, object
	 * stats and stats are added as well.
	 *
	 * Returns a newly allocated nvlist.
	 */
	nvlist_t *
	spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
	{
		nvlist_t *config, *nvroot;
		vdev_t *rvd = spa->spa_root_vdev;
		unsigned long hostid = 0;
		boolean_t locked = B_FALSE;
		uint64_t split_guid;

		if (vd == NULL) {
			vd = rvd;
			locked = B_TRUE;
			spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
		}

		ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
		    (SCL_CONFIG | SCL_STATE));

		/*
		 * If txg is -1, report the current value of spa->spa_config_txg.
		 */
		if (txg == -1ULL)
			txg = spa->spa_config_txg;

		config = fnvlist_alloc();

		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
		if (spa->spa_comment != NULL) {
			fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
			    spa->spa_comment);
		}

	#ifdef _KERNEL
		hostid = zone_get_hostid(NULL);
	#else	/* _KERNEL */
		/*
		 * We're emulating the system's hostid in userland, so we can't use
		 * zone_get_hostid().
		 */
		(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
	#endif	/* _KERNEL */
		if (hostid != 0) {
			fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
		}
		fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);

		int config_gen_flags = 0;
		if (vd != rvd) {
			fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
			    vd->vdev_top->vdev_guid);
			fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
			    vd->vdev_guid);
			if (vd->vdev_isspare) {
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_IS_SPARE, 1ULL);
			}
			if (vd->vdev_islog) {
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_IS_LOG, 1ULL);
			}
			vd = vd->vdev_top;		/* label contains top config */
		} else {
			/*
			 * Only add the (potentially large) split information
			 * in the mos config, and not in the vdev labels
			 */
			if (spa->spa_config_splitting != NULL)
				fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
				    spa->spa_config_splitting);
			fnvlist_add_boolean(config,
			    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);

			config_gen_flags |= VDEV_CONFIG_MOS;
		}

		/*
		 * Add the top-level config.  We even add this on pools which
		 * don't support holes in the namespace.
		 */
		vdev_top_config_generate(spa, config);

		/*
		 * If we're splitting, record the original pool's guid.
		 */
		if (spa->spa_config_splitting != NULL &&
		    nvlist_lookup_uint64(spa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
			fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
			    split_guid);
		}

		nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
		fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
		nvlist_free(nvroot);

		/*
		 * Store what's necessary for reading the MOS in the label.
		 */
		fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    spa->spa_label_features);

		if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
			ddt_histogram_t *ddh;
			ddt_stat_t *dds;
			ddt_object_t *ddo;

			/*
			 * Each structure is exported as a flat uint64_t array,
			 * so the element count must come from the size of the
			 * structure itself, not of the pointer to it.
			 */
			ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
			ddt_get_dedup_histogram(spa, ddh);
			fnvlist_add_uint64_array(config,
			    ZPOOL_CONFIG_DDT_HISTOGRAM,
			    (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
			kmem_free(ddh, sizeof (ddt_histogram_t));

			ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
			ddt_get_dedup_object_stats(spa, ddo);
			fnvlist_add_uint64_array(config,
			    ZPOOL_CONFIG_DDT_OBJ_STATS,
			    (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
			kmem_free(ddo, sizeof (ddt_object_t));

			dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
			ddt_get_dedup_stats(spa, dds);
			fnvlist_add_uint64_array(config,
			    ZPOOL_CONFIG_DDT_STATS,
			    (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
			kmem_free(dds, sizeof (ddt_stat_t));
		}

		/* Only drop the config locks if they were taken above. */
		if (locked)
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);

		return (config);
	}

	/*
	* Update all disk labels, generate a fresh config based on the current
	* in-core state, and sync the global config cache (do not sync the config
	* cache if this is a booting rootpool).
	*
	* spa - pool to update; the namespace lock must be held (ASSERTed).
	* what - SPA_CONFIG_UPDATE_POOL dirties the root vdev's config; any other
	* value takes the per-top-level-vdev path below. After a
	* SPA_CONFIG_UPDATE_POOL pass the function calls itself once more with
	* SPA_CONFIG_UPDATE_VDEVS.
	*/
	void
	spa_config_update(spa_t *spa, int what)
	{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t txg;
	int c;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/* The changes made below go out with the txg after the last synced one. */
	txg = spa_last_synced_txg(spa) + 1;
	if (what == SPA_CONFIG_UPDATE_POOL) {
	vdev_config_dirty(rvd);
	} else {
	/*
	* If we have top-level vdevs that were added but have
	* not yet been prepared for allocation, do that now.
	* (It's safe now because the config cache is up to date,
	* so it will be able to translate the new DVAs.)
	* See comments in spa_vdev_add() for full details.
	*/
	for (c = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];
	if (tvd->vdev_ms_array == 0) {
	vdev_ashift_optimize(tvd);
	vdev_metaslab_set_size(tvd);
	}
	vdev_expand(tvd, txg);
	}
	}
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	* Wait for the mosconfig to be regenerated and synced.
	*/
	txg_wait_synced(spa->spa_dsl_pool, txg);

	/*
	* Update the global config cache to reflect the new mosconfig.
	* A sysevent is requested only when this is not the
	* SPA_CONFIG_UPDATE_POOL pass.
	*/
	- spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
	+ spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);

	if (what == SPA_CONFIG_UPDATE_POOL)
	spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 332525)
	@@ -1,2207 +1,2276 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
	* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2013 Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright (c) 2017 Datto Inc.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa_impl.h>
	#include <sys/spa_boot.h>
	#include <sys/zio.h>
	#include <sys/zio_checksum.h>
	#include <sys/zio_compress.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/zap.h>
	#include <sys/zil.h>
	#include <sys/vdev_impl.h>
	#include <sys/vdev_file.h>
	#include <sys/metaslab.h>
	#include <sys/uberblock_impl.h>
	#include <sys/txg.h>
	#include <sys/avl.h>
	#include <sys/unique.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_scan.h>
	#include <sys/fs/zfs.h>
	#include <sys/metaslab_impl.h>
	#include <sys/arc.h>
	#include <sys/ddt.h>
	#include "zfs_prop.h"
	#include <sys/zfeature.h>

	#if defined(__FreeBSD__) && defined(_KERNEL)
	#include <sys/types.h>
	#include <sys/sysctl.h>
	#endif

	/*
	* SPA locking
	*
	* There are four basic locks for managing spa_t structures:
	*
	* spa_namespace_lock (global mutex)
	*
	* This lock must be acquired to do any of the following:
	*
	* - Lookup a spa_t by name
	* - Add or remove a spa_t from the namespace
	* - Increase spa_refcount from non-zero
	* - Check if spa_refcount is zero
	* - Rename a spa_t
	* - add/remove/attach/detach devices
	* - Held for the duration of create/destroy/import/export
	*
	* It does not need to handle recursion. A create or destroy may
	* reference objects (files or zvols) in other pools, but by
	* definition they must have an existing reference, and will never need
	* to lookup a spa_t by name.
	*
	* spa_refcount (per-spa refcount_t protected by mutex)
	*
	* This reference count keeps track of any active users of the spa_t. The
	* spa_t cannot be destroyed or freed while this is non-zero. Internally,
	* the refcount is never really 'zero' - opening a pool implicitly keeps
	* some references in the DMU. Internally we check against spa_minref, but
	* present the image of a zero/non-zero value to consumers.
	*
	* spa_config_lock[] (per-spa array of rwlocks)
	*
	* This protects the spa_t from config changes, and must be held in
	* the following circumstances:
	*
	* - RW_READER to perform I/O to the spa
	* - RW_WRITER to change the vdev config
	*
	* The locking order is fairly straightforward:
	*
	* spa_namespace_lock -> spa_refcount
	*
	* The namespace lock must be acquired to increase the refcount from 0
	* or to check if it is zero.
	*
	* spa_refcount -> spa_config_lock[]
	*
	* There must be at least one valid reference on the spa_t to acquire
	* the config lock.
	*
	* spa_namespace_lock -> spa_config_lock[]
	*
	* The namespace lock must always be taken before the config lock.
	*
	*
	* The spa_namespace_lock can be acquired directly and is globally visible.
	*
	* The namespace is manipulated using the following functions, all of which
	* require the spa_namespace_lock to be held.
	*
	* spa_lookup() Lookup a spa_t by name.
	*
	* spa_add() Create a new spa_t in the namespace.
	*
	* spa_remove() Remove a spa_t from the namespace. This also
	* frees up any memory associated with the spa_t.
	*
	* spa_next() Returns the next spa_t in the system, or the
	* first if NULL is passed.
	*
	* spa_evict_all() Shutdown and remove all spa_t structures in
	* the system.
	*
	* spa_guid_exists() Determine whether a pool/device guid exists.
	*
	* The spa_refcount is manipulated using the following functions:
	*
	* spa_open_ref() Adds a reference to the given spa_t. Must be
	* called with spa_namespace_lock held if the
	* refcount is currently zero.
	*
	* spa_close() Remove a reference from the spa_t. This will
	* not free the spa_t or remove it from the
	* namespace. No locking is required.
	*
	* spa_refcount_zero() Returns true if the refcount is currently
	* zero. Must be called with spa_namespace_lock
	* held.
	*
	* The spa_config_lock[] is an array of rwlocks, ordered as follows:
	* SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
	* spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
	*
	* To read the configuration, it suffices to hold one of these locks as reader.
	* To modify the configuration, you must hold all locks as writer. To modify
	* vdev state without altering the vdev tree's topology (e.g. online/offline),
	* you must hold SCL_STATE and SCL_ZIO as writer.
	*
	* We use these distinct config locks to avoid recursive lock entry.
	* For example, spa_sync() (which holds SCL_CONFIG as reader) induces
	* block allocations (SCL_ALLOC), which may require reading space maps
	* from disk (dmu_read() -> zio_read() -> SCL_ZIO).
	*
	* The spa config locks cannot be normal rwlocks because we need the
	* ability to hand off ownership. For example, SCL_ZIO is acquired
	* by the issuing thread and later released by an interrupt thread.
	* They do, however, obey the usual write-wanted semantics to prevent
	* writer (i.e. system administrator) starvation.
	*
	* The lock acquisition rules are as follows:
	*
	* SCL_CONFIG
	* Protects changes to the vdev tree topology, such as vdev
	* add/remove/attach/detach. Protects the dirty config list
	* (spa_config_dirty_list) and the set of spares and l2arc devices.
	*
	* SCL_STATE
	* Protects changes to pool state and vdev state, such as vdev
	* online/offline/fault/degrade/clear. Protects the dirty state list
	* (spa_state_dirty_list) and global pool state (spa_state).
	*
	* SCL_ALLOC
	* Protects changes to metaslab groups and classes.
	* Held as reader by metaslab_alloc() and metaslab_claim().
	*
	* SCL_ZIO
	* Held by bp-level zios (those which have no io_vd upon entry)
	* to prevent changes to the vdev tree. The bp-level zio implicitly
	* protects all of its vdev child zios, which do not hold SCL_ZIO.
	*
	* SCL_FREE
	* Protects changes to metaslab groups and classes.
	* Held as reader by metaslab_free(). SCL_FREE is distinct from
	* SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
	* blocks in zio_done() while another i/o that holds either
	* SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
	*
	* SCL_VDEV
	* Held as reader to prevent changes to the vdev tree during trivial
	* inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
	* other locks, and lower than all of them, to ensure that it's safe
	* to acquire regardless of caller context.
	*
	* In addition, the following rules apply:
	*
	* (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
	* The lock ordering is SCL_CONFIG > spa_props_lock.
	*
	* (b) I/O operations on leaf vdevs. For any zio operation that takes
	* an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
	* or zio_write_phys() -- the caller must ensure that the config
	* cannot change in the interim, and that the vdev cannot be reopened.
	* SCL_STATE as reader suffices for both.
	*
	* The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
	*
	* spa_vdev_enter() Acquire the namespace lock and the config lock
	* for writing.
	*
	* spa_vdev_exit() Release the config lock, wait for all I/O
	* to complete, sync the updated configs to the
	* cache, and release the namespace lock.
	*
	* vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
	* Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
	* locking is, always, based on spa_namespace_lock and spa_config_lock[].
	*
	* spa_rename() is also implemented within this file since it requires
	* manipulation of the namespace.
	*/

	/* Pool namespace: spa_add() inserts into and spa_lookup() searches this tree. */
	static avl_tree_t spa_namespace_avl;
	kmutex_t spa_namespace_lock;
	/* Broadcast by spa_remove() once a pool has left the namespace tree. */
	static kcondvar_t spa_namespace_cv;
	/*
	 * In this section: bumped by spa_add() for a pool given an altroot and
	 * dropped by spa_remove() for a pool that has spa_root set.
	 */
	static int spa_active_count;
	int spa_max_replication_override = SPA_DVAS_PER_BP;

	/* Global trackers for hot spares and L2ARC devices, each with its own lock. */
	static kmutex_t spa_spare_lock;
	static avl_tree_t spa_spare_avl;
	static kmutex_t spa_l2cache_lock;
	static avl_tree_t spa_l2cache_avl;

	kmem_cache_t *spa_buffer_pool;
	int spa_mode_global;

	#ifdef ZFS_DEBUG
	-/* Everything except dprintf and spa is on by default in debug builds */
	-int zfs_flags = ~(ZFS_DEBUG_DPRINTF \| ZFS_DEBUG_SPA);
	+/*
	+ * Everything except dprintf, spa, and indirect_remap is on by default
	+ * in debug builds.
	+ */
	+int zfs_flags = ~(ZFS_DEBUG_DPRINTF \| ZFS_DEBUG_SPA \| ZFS_DEBUG_INDIRECT_REMAP);
	#else /* !ZFS_DEBUG */
	int zfs_flags = 0;
	#endif /* ZFS_DEBUG */

	/*
	* zfs_recover can be set to nonzero to attempt to recover from
	* otherwise-fatal errors, typically caused by on-disk corruption. When
	* set, calls to zfs_panic_recover() will turn into warning messages.
	* This should only be used as a last resort, as it typically results
	* in leaked space, or worse.
	*/
	boolean_t zfs_recover = B_FALSE;	/* recovery mode is off by default */

	/*
	* If destroy encounters an EIO while reading metadata (e.g. indirect
	* blocks), space referenced by the missing metadata can not be freed.
	* Normally this causes the background destroy to become "stalled", as
	* it is unable to make forward progress. While in this stalled state,
	* all remaining space to free from the error-encountering filesystem is
	* "temporarily leaked". Set this flag to cause it to ignore the EIO,
	* permanently leak the space from indirect blocks that can not be read,
	* and continue to free everything else that it can.
	*
	* The default, "stalling" behavior is useful if the storage partially
	* fails (i.e. some but not all i/os fail), and then later recovers. In
	* this case, we will be able to continue pool operations while it is
	* partially failed, and when it recovers, we can continue to free the
	* space, with no leaks. However, note that this case is actually
	* fairly rare.
	*
	* Typically pools either (a) fail completely (but perhaps temporarily,
	* e.g. a top-level vdev going offline), or (b) have localized,
	* permanent errors (e.g. disk returns the wrong data due to bit flip or
	* firmware bug). In case (a), this setting does not matter because the
	* pool will be suspended and the sync thread will not be able to make
	* forward progress regardless. In case (b), because the error is
	* permanent, the best we can do is leak the minimum amount of space,
	* which is what setting this flag will do. Therefore, it is reasonable
	* for this flag to normally be set, but we chose the more conservative
	* approach of not setting it, so that there is no possibility of
	* leaking space in the "partial temporary" failure case.
	*/
	boolean_t zfs_free_leak_on_eio = B_FALSE;	/* default: stall rather than leak */

	/*
	* Expiration time in milliseconds. This value has two meanings. First it is
	* used to determine when the spa_deadman() logic should fire. By default the
	* spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
	* Secondly, the value determines if an I/O is considered "hung". Any I/O that
	* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
	* in a system panic.
	*/
	uint64_t zfs_deadman_synctime_ms = 1000000ULL;	/* 1000 seconds */

	/*
	* Check time in milliseconds. This defines the frequency at which we check
	* for hung I/O.
	*/
	uint64_t zfs_deadman_checktime_ms = 5000ULL;	/* 5 seconds */

	/*
	* Default value of -1 for zfs_deadman_enabled is resolved in
	* zfs_deadman_init()
	*/
	int zfs_deadman_enabled = -1;	/* -1: not yet resolved to 0 or 1 */

	/*
	* The worst case is single-sector max-parity RAID-Z blocks, in which
	* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
	* times the size; so just assume that. Add to this the fact that
	* we can have up to 3 DVAs per bp, and one more factor of 2 because
	* the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
	* the worst case is:
	* (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
	*/
	int spa_asize_inflation = 24;	/* worst-case multiplier; runtime-tunable on FreeBSD */

	#if defined(__FreeBSD__) && defined(_KERNEL)
	/* Declare the vfs.zfs sysctl node so the tunables below can attach to it. */
	SYSCTL_DECL(_vfs_zfs);
	/* vfs.zfs.recover: read/write view of zfs_recover. */
	SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
	"Try to recover from otherwise-fatal errors.");

	static int
	sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
	{
	int err, val;

	val = zfs_flags;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	/*
	* ZFS_DEBUG_MODIFY must be enabled prior to boot so all
	* arc buffers in the system have the necessary additional
	* checksum data. However, it is safe to disable at any
	* time.
	*/
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
	val &= ~ZFS_DEBUG_MODIFY;
	zfs_flags = val;

	return (0);
	}

	/* Both debug-flag nodes share one handler; debug_flags is the old name. */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
	    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
	    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
	    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
	    sysctl_vfs_zfs_debug_flags, "IU",
	    "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");

	/* Deadman settings are declared CTLFLAG_RDTUN. */
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
	    &zfs_deadman_synctime_ms, 0,
	    "Stalled ZFS I/O expiration time in milliseconds");
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
	    &zfs_deadman_checktime_ms, 0,
	    "Period of checks for stalled ZFS I/O in milliseconds");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
	    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
	SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
	    &spa_asize_inflation, 0,
	    "Worst case inflation factor for single sector writes");
	#endif

	#ifndef illumos
	#ifdef _KERNEL
	/*
	 * Resolve the "undecided" (-1) setting of zfs_deadman_enabled.
	 *
	 * On amd64/i386 the deadman is enabled only when vm_guest reports
	 * VM_GUEST_NO; on every other architecture it is disabled.  A value
	 * other than -1 is left untouched.
	 */
	static void
	zfs_deadman_init(void)
	{
		/*
		 * If we are not i386 or amd64 or in a virtual machine,
		 * disable ZFS deadman thread by default
		 */
		if (zfs_deadman_enabled == -1) {
	#if defined(__amd64__) || defined(__i386__)
			zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
	#else
			zfs_deadman_enabled = 0;
	#endif
		}
	}
	#endif /* _KERNEL */
	#endif /* !illumos */

	/*
	* Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
	* the pool to be consumed. This ensures that we don't run the pool
	* completely out of space, due to unaccounted changes (e.g. to the MOS).
	* It also limits the worst-case time to allocate space. If we have
	* less than this amount of free space, most ZPL operations (e.g. write,
	* create) will return ENOSPC.
	*
	* Certain operations (e.g. file removal, most administrative actions) can
	* use half the slop space. They will only return ENOSPC if less than half
	* the slop space is free. Typically, once the pool has less than the slop
	* space free, the user will use these operations to free up space in the pool.
	* These are the operations that call dsl_pool_adjustedsize() with the netfree
	* argument set to TRUE.
	*
	* A very restricted set of operations are always permitted, regardless of
	* the amount of free space. These are the operations that call
	* dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
	* operations result in a net increase in the amount of space used,
	* it is possible to run the pool completely out of space, causing it to
	* be permanently read-only.
	*
	* Note that on very small pools, the slop space will be larger than
	* 3.2%, in an effort to have it be at least spa_min_slop (128MB),
	* but we never allow it to be more than half the pool size.
	*
	* See also the comments in zfs_space_check_t.
	*/
	int spa_slop_shift = 5;	/* reserve 1/(2^5) = 1/32 of the pool */
	SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
	&spa_slop_shift, 0,
	"Shift value of reserved space (1/(2^spa_slop_shift)).");
	uint64_t spa_min_slop = 128 * 1024 * 1024;	/* 128MB, in bytes */
	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
	&spa_min_slop, 0,
	"Minimal value of reserved space");

	/*
	* ==========================================================================
	* SPA config locking
	* ==========================================================================
	*/
	/*
	 * Put each of the pool's SCL_LOCKS config locks into its idle state:
	 * no writer, no waiting writers, and freshly created mutex, condition
	 * variable and (untracked) hold count.
	 */
	static void
	spa_config_lock_init(spa_t *spa)
	{
		int idx = 0;

		while (idx < SCL_LOCKS) {
			spa_config_lock_t *lock = &spa->spa_config_lock[idx];

			mutex_init(&lock->scl_lock, NULL, MUTEX_DEFAULT, NULL);
			cv_init(&lock->scl_cv, NULL, CV_DEFAULT, NULL);
			refcount_create_untracked(&lock->scl_count);
			lock->scl_writer = NULL;
			lock->scl_write_wanted = 0;
			idx++;
		}
	}

	/*
	 * Tear down every config lock of the pool.  Each lock is expected to be
	 * idle by now: the asserts check that no writer owns it and that no
	 * writer is waiting for it.
	 */
	static void
	spa_config_lock_destroy(spa_t *spa)
	{
		int idx = 0;

		while (idx < SCL_LOCKS) {
			spa_config_lock_t *lock = &spa->spa_config_lock[idx];

			mutex_destroy(&lock->scl_lock);
			cv_destroy(&lock->scl_cv);
			refcount_destroy(&lock->scl_count);
			ASSERT(lock->scl_writer == NULL);
			ASSERT(lock->scl_write_wanted == 0);
			idx++;
		}
	}

	/*
	 * Non-blocking variant of spa_config_enter().
	 *
	 * spa    pool whose config locks are wanted
	 * locks  bitmask of lock indices (bit i selects spa_config_lock[i])
	 * tag    holder tag recorded in each lock's refcount
	 * rw     RW_READER or RW_WRITER
	 *
	 * Returns 1 if every requested lock was acquired.  Returns 0 as soon as
	 * one of them is unavailable; the locks taken so far are released again
	 * before returning.
	 */
	int
	spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
	{
		for (int i = 0; i < SCL_LOCKS; i++) {
			spa_config_lock_t *scl = &spa->spa_config_lock[i];
			if (!(locks & (1 << i)))
				continue;
			mutex_enter(&scl->scl_lock);
			if (rw == RW_READER) {
				/* Readers yield to an active or waiting writer. */
				if (scl->scl_writer || scl->scl_write_wanted) {
					mutex_exit(&scl->scl_lock);
					/* Undo only the lower-numbered locks. */
					spa_config_exit(spa, locks & ((1 << i) - 1),
					    tag);
					return (0);
				}
			} else {
				ASSERT(scl->scl_writer != curthread);
				/* A writer needs the lock with no holders at all. */
				if (!refcount_is_zero(&scl->scl_count)) {
					mutex_exit(&scl->scl_lock);
					spa_config_exit(spa, locks & ((1 << i) - 1),
					    tag);
					return (0);
				}
				scl->scl_writer = curthread;
			}
			(void) refcount_add(&scl->scl_count, tag);
			mutex_exit(&scl->scl_lock);
		}
		return (1);
	}

	/*
	 * Acquire the config locks selected by the 'locks' bitmask, blocking as
	 * needed.  Locks are taken in ascending index order.
	 *
	 * spa    pool whose config locks are wanted
	 * locks  bitmask of lock indices (bit i selects spa_config_lock[i])
	 * tag    holder tag recorded in each lock's refcount
	 * rw     RW_READER waits until no writer holds or wants the lock;
	 *        RW_WRITER waits until the lock has no holders at all
	 */
	void
	spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
	{
		int wlocks_held = 0;

		ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

		for (int i = 0; i < SCL_LOCKS; i++) {
			spa_config_lock_t *scl = &spa->spa_config_lock[i];
			/* Note locks this thread already owns as writer. */
			if (scl->scl_writer == curthread)
				wlocks_held |= (1 << i);
			if (!(locks & (1 << i)))
				continue;
			mutex_enter(&scl->scl_lock);
			if (rw == RW_READER) {
				while (scl->scl_writer || scl->scl_write_wanted) {
					cv_wait(&scl->scl_cv, &scl->scl_lock);
				}
			} else {
				ASSERT(scl->scl_writer != curthread);
				while (!refcount_is_zero(&scl->scl_count)) {
					/* Advertise the waiting writer to readers. */
					scl->scl_write_wanted++;
					cv_wait(&scl->scl_cv, &scl->scl_lock);
					scl->scl_write_wanted--;
				}
				scl->scl_writer = curthread;
			}
			(void) refcount_add(&scl->scl_count, tag);
			mutex_exit(&scl->scl_lock);
		}
	- ASSERT(wlocks_held <= locks);
	+ ASSERT3U(wlocks_held, <=, locks);
	}

	/*
	 * Release the config locks selected by the 'locks' bitmask, walking the
	 * lock array from the highest index down to the lowest.
	 *
	 * spa    pool whose config locks are being dropped
	 * locks  bitmask of lock indices (bit i selects spa_config_lock[i])
	 * tag    holder tag that was used to acquire the locks
	 */
	void
	spa_config_exit(spa_t *spa, int locks, void *tag)
	{
		for (int i = SCL_LOCKS - 1; i >= 0; i--) {
			spa_config_lock_t *scl = &spa->spa_config_lock[i];
			if (!(locks & (1 << i)))
				continue;
			mutex_enter(&scl->scl_lock);
			ASSERT(!refcount_is_zero(&scl->scl_count));
			/* Last hold gone: clear ownership and wake all waiters. */
			if (refcount_remove(&scl->scl_count, tag) == 0) {
				ASSERT(scl->scl_writer == NULL ||
				    scl->scl_writer == curthread);
				scl->scl_writer = NULL;	/* OK in either case */
				cv_broadcast(&scl->scl_cv);
			}
			mutex_exit(&scl->scl_lock);
		}
	}

	/*
	 * Report which of the requested config locks are held.
	 *
	 * spa    pool to inspect
	 * locks  bitmask of lock indices to test
	 * rw     RW_READER: a lock counts as held if its hold count is non-zero
	 *        (by any holder); RW_WRITER: only if the calling thread is the
	 *        writer
	 *
	 * Returns the subset of 'locks' that satisfies the test, as a bitmask.
	 * The per-lock mutex is not taken here.
	 */
	int
	spa_config_held(spa_t *spa, int locks, krw_t rw)
	{
		int locks_held = 0;

		for (int i = 0; i < SCL_LOCKS; i++) {
			spa_config_lock_t *scl = &spa->spa_config_lock[i];
			if (!(locks & (1 << i)))
				continue;
			if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
			    (rw == RW_WRITER && scl->scl_writer == curthread))
				locks_held |= 1 << i;
		}

		return (locks_held);
	}

	/*
	* ==========================================================================
	* SPA namespace functions
	* ==========================================================================
	*/

	/*
	 * Find the spa_t registered under 'name' in the namespace AVL tree.
	 * 'name' may be a pool name or a full dataset, snapshot or bookmark
	 * name; everything from the first '/', '@' or '#' onward is ignored.
	 * The caller must hold spa_namespace_lock.  Returns NULL when no pool
	 * matches.
	 */
	spa_t *
	spa_lookup(const char *name)
	{
		/* spa_t is large, so the search key lives in static storage. */
		static spa_t key;
		avl_index_t idx;
		char *sep;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		(void) strlcpy(key.spa_name, name, sizeof (key.spa_name));

		/* Truncate the key at the first dataset separator, if any. */
		sep = strpbrk(key.spa_name, "/@#");
		if (sep != NULL)
			*sep = '\0';

		return (avl_find(&spa_namespace_avl, &key, &idx));
	}

	/*
	 * Deadman handler: runs when spa_sync() has not completed within
	 * zfs_deadman_synctime_ms.  Logs how long the sync has been running
	 * and, if zfs_deadman_enabled is set, has vdev_deadman() examine the
	 * pool's vdev tree.  In the FreeBSD kernel it then re-arms itself to
	 * run again after zfs_deadman_checktime_ms.
	 */
	static void
	spa_deadman(void *arg, int pending)
	{
		spa_t *pool = arg;

		/* A suspended pool gets no further deadman checks. */
		if (spa_suspended(pool)) {
	#ifdef illumos
			VERIFY(cyclic_reprogram(pool->spa_deadman_cycid, CY_INFINITY));
	#else
			/* Nothing. just don't schedule any future callouts. */
	#endif
			return;
		}

		zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
		    (gethrtime() - pool->spa_sync_starttime) / NANOSEC,
		    ++pool->spa_deadman_calls);

		if (zfs_deadman_enabled)
			vdev_deadman(pool->spa_root_vdev);

	#if defined(__FreeBSD__) && defined(_KERNEL)
		callout_schedule(&pool->spa_deadman_cycid,
		    hz * zfs_deadman_checktime_ms / MILLISEC);
	#endif
	}

	#if defined(__FreeBSD__) && defined(_KERNEL)
	/*
	 * Callout entry point for the deadman timer.  It only queues the
	 * pool's spa_deadman_task on taskqueue_thread; the real work is done by
	 * that task's handler.
	 */
	static void
	spa_deadman_timeout(void *arg)
	{
		spa_t *pool = arg;

		taskqueue_enqueue(taskqueue_thread, &pool->spa_deadman_task);
	}
	#endif

	/*
	 * Create an uninitialized spa_t with the given name and insert it into
	 * the namespace.  Requires spa_namespace_lock.  The caller must ensure
	 * that the spa_t doesn't already exist by calling spa_lookup() first.
	 *
	 * name     pool name; copied into spa_name
	 * config   optional nvlist; when non-NULL it is duplicated into
	 *          spa_config, and its ZPOOL_CONFIG_FEATURES_FOR_READ list (if
	 *          present) into spa_label_features
	 * altroot  optional alternate root; when set it is duplicated into
	 *          spa_root and the default cachefile entry gets no path
	 *
	 * Returns the new spa_t in state POOL_STATE_UNINITIALIZED.
	 */
	spa_t *
	spa_add(const char *name, nvlist_t *config, const char *altroot)
	{
		spa_t *spa;
		spa_config_dirent_t *dp;
	#ifdef illumos
		cyc_handler_t hdlr;
		cyc_time_t when;
	#endif

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

		mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);

		cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
		cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
		cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
		cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
		cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

		for (int t = 0; t < TXG_SIZE; t++)
			bplist_create(&spa->spa_free_bplist[t]);

		(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
		spa->spa_state = POOL_STATE_UNINITIALIZED;
		spa->spa_freeze_txg = UINT64_MAX;
		spa->spa_final_txg = UINT64_MAX;
		spa->spa_load_max_txg = UINT64_MAX;
		spa->spa_proc = &p0;
		spa->spa_proc_state = SPA_PROC_NONE;

	#ifdef illumos
		hdlr.cyh_func = spa_deadman;
		hdlr.cyh_arg = spa;
		hdlr.cyh_level = CY_LOW_LEVEL;
	#endif

		spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

	#ifdef illumos
		/*
		 * This determines how often we need to check for hung I/Os after
		 * the cyclic has already fired.  Since checking for hung I/Os is
		 * an expensive operation we don't want to check too frequently.
		 * Instead wait for 5 seconds before checking again.
		 */
		when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
		when.cyt_when = CY_INFINITY;
		mutex_enter(&cpu_lock);
		spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
		mutex_exit(&cpu_lock);
	#else	/* !illumos */
	#ifdef _KERNEL
		/*
		 * callout(9) does not provide a way to initialize a callout with
		 * a function and an argument, so we use callout_reset() to schedule
		 * the callout in the very distant future.  Even if that event ever
		 * fires, it should be okay as we won't have any active zio-s.
		 * But normally spa_sync() will reschedule the callout with a proper
		 * timeout.
		 * callout(9) does not allow the callback function to sleep but
		 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
		 * emulated using sx(9).  For this reason spa_deadman_timeout()
		 * will schedule spa_deadman() as task on a taskqueue that allows
		 * sleeping.
		 */
		TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
		callout_init(&spa->spa_deadman_cycid, 1);
		callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
		    spa_deadman_timeout, spa, 0);
	#endif
	#endif
		refcount_create(&spa->spa_refcount);
		spa_config_lock_init(spa);

		/* From here on the pool is visible through the namespace tree. */
		avl_add(&spa_namespace_avl, spa);

		/*
		 * Set the alternate root, if there is one.
		 */
		if (altroot) {
			spa->spa_root = spa_strdup(altroot);
			spa_active_count++;
		}

		avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));

		/*
		 * Every pool starts with the default cachefile
		 */
		list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
		    offsetof(spa_config_dirent_t, scd_link));

		dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
		/* With an altroot the entry is created without a path. */
		dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
		list_insert_head(&spa->spa_config_list, dp);

		VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);

		if (config != NULL) {
			nvlist_t *features;

			if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
			    &features) == 0) {
				VERIFY(nvlist_dup(features, &spa->spa_label_features,
				    0) == 0);
			}

			VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
		}

		/* No feature list came from the config: start with an empty one. */
		if (spa->spa_label_features == NULL) {
			VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
			    KM_SLEEP) == 0);
		}

		spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);

		spa->spa_min_ashift = INT_MAX;
		spa->spa_max_ashift = 0;

		/*
		 * As a pool is being created, treat all features as disabled by
		 * setting SPA_FEATURE_DISABLED for all entries in the feature
		 * refcount cache.
		 */
		for (int i = 0; i < SPA_FEATURES; i++) {
			spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
		}

		return (spa);
	}

	/*
	* Removes a spa_t from the namespace, freeing up any memory used. Requires
	* spa_namespace_lock. This is called only after the spa_t has been closed and
	* deactivated.
	*/
	void
	spa_remove(spa_t *spa)
	{
	spa_config_dirent_t *dp;

	/* The pool must be uninitialized and carry no references at all. */
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);

	nvlist_free(spa->spa_config_splitting);

	/* Unlink from the namespace and wake everyone waiting on its cv. */
	avl_remove(&spa_namespace_avl, spa);
	cv_broadcast(&spa_namespace_cv);

	/* Mirrors spa_add(): a pool with a root string was counted there. */
	if (spa->spa_root) {
	spa_strfree(spa->spa_root);
	spa_active_count--;
	}

	/* Free every cachefile entry; scd_path may be NULL. */
	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
	list_remove(&spa->spa_config_list, dp);
	if (dp->scd_path != NULL)
	spa_strfree(dp->scd_path);
	kmem_free(dp, sizeof (spa_config_dirent_t));
	}

	avl_destroy(&spa->spa_alloc_tree);
	list_destroy(&spa->spa_config_list);

	nvlist_free(spa->spa_label_features);
	nvlist_free(spa->spa_load_info);
	spa_config_set(spa, NULL);

	/* Shut down the deadman timer before the spa_t memory goes away. */
	#ifdef illumos
	mutex_enter(&cpu_lock);
	if (spa->spa_deadman_cycid != CYCLIC_NONE)
	cyclic_remove(spa->spa_deadman_cycid);
	mutex_exit(&cpu_lock);
	spa->spa_deadman_cycid = CYCLIC_NONE;
	#else /* !illumos */
	#ifdef _KERNEL
	/* Drain the callout first, then the task it may have queued. */
	callout_drain(&spa->spa_deadman_cycid);
	taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
	#endif
	#endif

	refcount_destroy(&spa->spa_refcount);

	spa_config_lock_destroy(spa);

	for (int t = 0; t < TXG_SIZE; t++)
	bplist_destroy(&spa->spa_free_bplist[t]);

	zio_checksum_templates_free(spa);

	/* Destroy the condition variables and mutexes set up by spa_add(). */
	cv_destroy(&spa->spa_async_cv);
	cv_destroy(&spa->spa_evicting_os_cv);
	cv_destroy(&spa->spa_proc_cv);
	cv_destroy(&spa->spa_scrub_io_cv);
	cv_destroy(&spa->spa_suspend_cv);

	mutex_destroy(&spa->spa_alloc_lock);
	mutex_destroy(&spa->spa_async_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_evicting_os_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_proc_lock);
	mutex_destroy(&spa->spa_props_lock);
	mutex_destroy(&spa->spa_cksum_tmpls_lock);
	mutex_destroy(&spa->spa_scrub_lock);
	mutex_destroy(&spa->spa_suspend_lock);
	mutex_destroy(&spa->spa_vdev_top_lock);

	/* Finally release the spa_t itself; 'spa' is invalid after this. */
	kmem_free(spa, sizeof (spa_t));
	}

	/*
	 * Iterate over the pool namespace.  With prev == NULL the first pool in
	 * the tree is returned; otherwise the pool that follows 'prev'.  Returns
	 * NULL when there is no such pool.  The caller must hold
	 * spa_namespace_lock.
	 */
	spa_t *
	spa_next(spa_t *prev)
	{
		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		if (prev == NULL)
			return (avl_first(&spa_namespace_avl));

		return (AVL_NEXT(&spa_namespace_avl, prev));
	}

	/*
	* ==========================================================================
	* SPA refcount functions
	* ==========================================================================
	*/

	/*
	 * Add a reference to the given spa_t on behalf of 'tag'.  The pool must
	 * already have at least spa_minref references, or the caller must hold
	 * the namespace lock.
	 */
	void
	spa_open_ref(spa_t *spa, void *tag)
	{
		ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
		    MUTEX_HELD(&spa_namespace_lock));
		(void) refcount_add(&spa->spa_refcount, tag);
	}

	/*
	 * Remove the reference held by 'tag' from the given spa_t.  The pool
	 * must have more than spa_minref references, or the caller must hold
	 * the namespace lock.
	 */
	void
	spa_close(spa_t *spa, void *tag)
	{
		ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
		    MUTEX_HELD(&spa_namespace_lock));
		(void) refcount_remove(&spa->spa_refcount, tag);
	}

	/*
	 * Remove a reference to the given spa_t held by a dsl dir that is
	 * being asynchronously released.  Async releases occur from a taskq
	 * performing eviction of dsl datasets and dirs.  The namespace lock
	 * isn't held and the hold by the object being evicted may contribute to
	 * spa_minref (e.g. dataset or directory released during pool export),
	 * so the asserts in spa_close() do not apply and the reference is
	 * dropped unconditionally.
	 */
	void
	spa_async_close(spa_t *spa, void *tag)
	{
		(void) refcount_remove(&spa->spa_refcount, tag);
	}

	/*
	 * Tell whether the pool has no references beyond its baseline.  The
	 * count is compared against spa_minref, the number of references
	 * acquired when opening a pool, rather than against literal zero.
	 * Must be called with spa_namespace_lock held.
	 */
	boolean_t
	spa_refcount_zero(spa_t *spa)
	{
		boolean_t at_baseline;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		at_baseline = (refcount_count(&spa->spa_refcount) == spa->spa_minref);
		return (at_baseline);
	}

	/*
	* ==========================================================================
	* SPA spare and l2cache tracking
	* ==========================================================================
	*/

	/*
	* Hot spares and cache devices are tracked using the same code below,
	* for 'auxiliary' devices.
	*/

	/* One tracked auxiliary device, stored in spa_spare_avl or spa_l2cache_avl. */
	typedef struct spa_aux {
	uint64_t aux_guid;	/* vdev guid; the only field spa_aux_compare() reads */
	uint64_t aux_pool;	/* guid of the pool that activated it, 0 if none */
	avl_node_t aux_avl;	/* AVL linkage -- presumably for the tree above */
	int aux_count;	/* number of spa_aux_add() references */
	} spa_aux_t;

	/*
	 * AVL comparator for spa_aux_t entries: orders by aux_guid only.
	 * Returns -1, 0 or 1.
	 */
	static int
	spa_aux_compare(const void *a, const void *b)
	{
		const spa_aux_t *sa = a;
		const spa_aux_t *sb = b;

		if (sa->aux_guid < sb->aux_guid)
			return (-1);
		else if (sa->aux_guid > sb->aux_guid)
			return (1);
		else
			return (0);
	}

	/*
	 * Record one more reference to the auxiliary device 'vd' in tree 'avl'.
	 * An existing entry with the same guid has its count bumped; otherwise
	 * a new entry with a count of 1 is inserted.
	 */
	void
	spa_aux_add(vdev_t *vd, avl_tree_t *avl)
	{
		avl_index_t where;
		spa_aux_t search;
		spa_aux_t *aux;

		/* Only the guid is set: it is all the comparator looks at. */
		search.aux_guid = vd->vdev_guid;
		if ((aux = avl_find(avl, &search, &where)) != NULL) {
			aux->aux_count++;
		} else {
			aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
			aux->aux_guid = vd->vdev_guid;
			aux->aux_count = 1;
			avl_insert(avl, aux, where);
		}
	}

	/*
	 * Drop one reference to the auxiliary device 'vd' in tree 'avl'.  The
	 * entry must exist.  When the last reference goes away the entry is
	 * removed and freed; otherwise, if the entry was marked active for
	 * vd's pool, that marking is cleared.
	 */
	void
	spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
	{
		spa_aux_t search;
		spa_aux_t *aux;
		avl_index_t where;

		search.aux_guid = vd->vdev_guid;
		aux = avl_find(avl, &search, &where);

		ASSERT(aux != NULL);

		if (--aux->aux_count == 0) {
			avl_remove(avl, aux);
			kmem_free(aux, sizeof (spa_aux_t));
		} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
			aux->aux_pool = 0ULL;
		}
	}

	/*
	 * Look up the auxiliary device with the given guid in tree 'avl'.
	 *
	 * pool    optional out-parameter; receives the entry's aux_pool, or 0
	 *         when the device is not tracked
	 * refcnt  optional out-parameter; receives the entry's aux_count, or 0
	 *         when the device is not tracked
	 *
	 * Returns B_TRUE if an entry exists.
	 */
	boolean_t
	spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
	{
		spa_aux_t search, *found;

		search.aux_guid = guid;
		found = avl_find(avl, &search, NULL);

		if (pool) {
			if (found)
				*pool = found->aux_pool;
			else
				*pool = 0ULL;
		}

		if (refcnt) {
			if (found)
				*refcnt = found->aux_count;
			else
				*refcnt = 0;
		}

		return (found != NULL);
	}

	/*
	 * Mark the tracked auxiliary device 'vd' as in active use by its pool:
	 * store that pool's guid in the entry.  The entry must exist and must
	 * not already be marked active.
	 */
	void
	spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
	{
		spa_aux_t search, *found;
		avl_index_t where;

		search.aux_guid = vd->vdev_guid;
		found = avl_find(avl, &search, &where);
		ASSERT(found != NULL);
		ASSERT(found->aux_pool == 0ULL);

		found->aux_pool = spa_guid(vd->vdev_spa);
	}

	/*
	* Spares are tracked globally due to the following constraints:
	*
	* - A spare may be part of multiple pools.
	* - A spare may be added to a pool even if it's actively in use within
	* another pool.
	* - A spare in use in any pool can only be the source of a replacement if
	* the target is a spare in the same pool.
	*
	* We keep track of all spares on the system through the use of a reference
	* counted AVL tree. When a vdev is added as a spare, or used as a replacement
	* spare, then we bump the reference count in the AVL tree. In addition, we set
	* the 'vdev_isspare' member to indicate that the device is a spare (active or
	* inactive). When a spare is made active (used to replace a device in the
	* pool), we also keep track of which pool it's been made a part of.
	*
	* The 'spa_spare_lock' protects the AVL tree. These functions are normally
	* called under the spa_namespace lock as part of vdev reconfiguration. The
	* separate spare lock exists for the status query path, which does not need to
	* be completely consistent with respect to other vdev configuration changes.
	*/

	/* Comparator for the spare tree; defers to spa_aux_compare(). */
	static int
	spa_spare_compare(const void *a, const void *b)
	{
		return (spa_aux_compare(a, b));
	}

	/*
	 * Register 'vd' in the global spare tree and flag it as a spare.
	 * The vdev must not already be flagged.
	 */
	void
	spa_spare_add(vdev_t *vd)
	{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
	}

	/*
	 * Drop vd's reference in the global spare tree and clear its spare
	 * flag.  The vdev must currently be flagged as a spare.
	 */
	void
	spa_spare_remove(vdev_t *vd)
	{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
	}

	/*
	 * Look 'guid' up in the spare AVL tree while holding spa_spare_lock.
	 * 'pool' and 'refcnt' are optional out-parameters that are passed
	 * straight through to spa_aux_exists().  Returns whether an entry was
	 * found.
	 */
	boolean_t
	spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
	{
		boolean_t found;

		mutex_enter(&spa_spare_lock);
		found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
		mutex_exit(&spa_spare_lock);

		return (found);
	}

	/*
	 * Run spa_aux_activate() on vd against the spare AVL tree while
	 * holding spa_spare_lock.  The vdev must already be flagged as a spare.
	 */
	void
	spa_spare_activate(vdev_t *vd)
	{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
	}

	/*
	* Level 2 ARC devices are tracked globally for the same reasons as spares.
	* Cache devices currently only support one pool per cache device, and so
	* for these devices the aux reference count is currently unused beyond 1.
	*/

	/* Comparator for the l2cache AVL tree; defers to spa_aux_compare(). */
	static int
	spa_l2cache_compare(const void *a, const void *b)
	{
		return (spa_aux_compare(a, b));
	}

	/*
	 * Hand vd to spa_aux_add() for the l2cache AVL tree and set its
	 * vdev_isl2cache flag, all while holding spa_l2cache_lock.  The vdev
	 * must not already be flagged as an l2cache device.
	 */
	void
	spa_l2cache_add(vdev_t *vd)
	{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
	}

	/*
	 * Hand vd to spa_aux_remove() for the l2cache AVL tree and clear its
	 * vdev_isl2cache flag, all while holding spa_l2cache_lock.  The vdev
	 * must currently be flagged as an l2cache device.
	 */
	void
	spa_l2cache_remove(vdev_t *vd)
	{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
	}

	/*
	 * Look 'guid' up in the l2cache AVL tree while holding
	 * spa_l2cache_lock.  'pool' is passed through to spa_aux_exists(); no
	 * reference count is requested.  Returns whether an entry was found.
	 */
	boolean_t
	spa_l2cache_exists(uint64_t guid, uint64_t *pool)
	{
		boolean_t present;

		mutex_enter(&spa_l2cache_lock);
		present = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
		mutex_exit(&spa_l2cache_lock);

		return (present);
	}

	/*
	 * Run spa_aux_activate() on vd against the l2cache AVL tree while
	 * holding spa_l2cache_lock.  The vdev must already be flagged as an
	 * l2cache device.
	 */
	void
	spa_l2cache_activate(vdev_t *vd)
	{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
	}

	/*
	* ==========================================================================
	* SPA vdev locking
	* ==========================================================================
	*/

	/*
	* Lock the given spa_t for the purpose of adding or removing a vdev.
	* Grabs the global spa_namespace_lock plus the spa config lock for writing.
	* It returns the next transaction group for the spa_t.
	*/
	uint64_t
	spa_vdev_enter(spa_t *spa)
	{
		uint64_t next_txg;

		/* Per-pool vdev lock first, then the global namespace lock. */
		mutex_enter(&spa->spa_vdev_top_lock);
		mutex_enter(&spa_namespace_lock);

		next_txg = spa_vdev_config_enter(spa);

		return (next_txg);
	}

	/*
	* Internal implementation for spa_vdev_enter(). Used when a vdev
	* operation requires multiple syncs (i.e. removing a device) while
	* keeping the spa_namespace_lock held.
	*/
	uint64_t
	spa_vdev_config_enter(spa_t *spa)
	{
		uint64_t next_txg;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		/* Take every config lock as writer, tagged with the spa itself. */
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

		/* The txg returned is the one after the last synced txg. */
		next_txg = spa_last_synced_txg(spa) + 1;

		return (next_txg);
	}

	/*
	* Used in combination with spa_vdev_config_enter() to allow the syncing
	* of multiple transactions without releasing the spa_namespace_lock.
	*/
	void
	spa_vdev_config_exit(spa_t spa, vdev_t vd, uint64_t txg, int error, char *tag)
	{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	int config_changed = B_FALSE;

	ASSERT(txg > spa_last_synced_txg(spa));

	spa->spa_pending_vdev = NULL;

	/*
	* Reassess the DTLs.
	*/
	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
	config_changed = B_TRUE;
	spa->spa_config_generation++;
	}

	/*
	* Verify the metaslab classes.
	*/
	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

	spa_config_exit(spa, SCL_ALL, spa);

	/*
	* Panic the system if the specified tag requires it. This
	* is useful for ensuring that configurations are updated
	* transactionally.
	*/
	if (zio_injection_enabled)
	zio_handle_panic_injection(spa, tag, 0);

	/*
	* Note: this txg_wait_synced() is important because it ensures
	* that there won't be more than one config change per txg.
	* This allows us to use the txg as the generation number.
	*/
	if (error == 0)
	txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
	ASSERT(!vd->vdev_detached \|\| vd->vdev_dtl_sm == NULL);
	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
	vdev_free(vd);
	spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	* If the config changed, update the config cache.
	*/
	if (config_changed)
	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	}

	/*
	* Unlock the spa_t after adding or removing a vdev. Besides undoing the
	* locking of spa_vdev_enter(), we also want to make sure the transactions have
	* synced to disk, and then update the global configuration cache with the new
	* information.
	*/
	int
	spa_vdev_exit(spa_t spa, vdev_t vd, uint64_t txg, int error)
	{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
	}

	/*
	* Lock the given spa_t for the purpose of changing vdev state.
	*/
	void
	spa_vdev_state_enter(spa_t *spa, int oplocks)
	{
		int locks = SCL_STATE_ALL | oplocks;

		/*
		 * Root pools may need to read from the underlying devfs
		 * filesystem when opening up a vdev. Unfortunately if we're
		 * holding the SCL_ZIO lock it will result in a deadlock when we
		 * try to issue the read from the root filesystem. Instead we
		 * "prefetch" the associated vnodes that we need prior to opening
		 * the underlying devices and cache them so that we can prevent
		 * any I/O when we are doing the actual open.
		 */
		if (spa_is_root(spa)) {
			/* 'low' keeps the SCL_ZIO bit and above; 'high' the rest. */
			int low = locks & ~(SCL_ZIO - 1);
			int high = locks & ~low;

			spa_config_enter(spa, high, spa, RW_WRITER);
			vdev_hold(spa->spa_root_vdev);
			spa_config_enter(spa, low, spa, RW_WRITER);
		} else {
			spa_config_enter(spa, locks, spa, RW_WRITER);
		}

		/* Remembered so spa_vdev_state_exit() drops the same set. */
		spa->spa_vdev_locks = locks;
	}

	int
	spa_vdev_state_exit(spa_t spa, vdev_t vd, int error)
	{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL \|\| error == 0)
	vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
	0, 0, B_FALSE);

	if (vd != NULL) {
	vdev_state_dirty(vd->vdev_top);
	config_changed = B_TRUE;
	spa->spa_config_generation++;
	}

	if (spa_is_root(spa))
	vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	* If anything changed, wait for it to sync. This ensures that,
	* from the system administrator's perspective, zpool(1M) commands
	* are synchronous. This is important for things like zpool offline:
	* when the command completes, you expect no further I/O from ZFS.
	*/
	if (vd != NULL)
	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	* If the config changed, update the config cache.
	*/
	if (config_changed) {
	mutex_enter(&spa_namespace_lock);
	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	mutex_exit(&spa_namespace_lock);
	}

	return (error);
	}

	/*
	* ==========================================================================
	* Miscellaneous functions
	* ==========================================================================
	*/

	/*
	 * Add 'feature' to spa_label_features if it is not already present,
	 * and dirty the root vdev config unless 'tx' belongs to TXG_INITIAL.
	 */
	void
	spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
	{
		if (!nvlist_exists(spa->spa_label_features, feature)) {
			fnvlist_add_boolean(spa->spa_label_features, feature);
			/*
			 * When we are creating the pool (tx_txg==TXG_INITIAL),
			 * we can't dirty the vdev config because lock
			 * SCL_CONFIG is not held. Thankfully, in this case we
			 * don't need to dirty the config because it will be
			 * written out anyway when we finish creating the pool.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
		}
	}

	/*
	 * Remove 'feature' from spa_label_features; the root vdev config is
	 * dirtied only when nvlist_remove_all() reports success (returns 0).
	 */
	void
	spa_deactivate_mos_feature(spa_t *spa, const char *feature)
	{
		if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
			vdev_config_dirty(spa->spa_root_vdev);
	}

	/*
	* Rename a spa_t.
	*/
	int
	spa_rename(const char *name, const char *newname)
	{
	spa_t *spa;
	int err;

	/*
	* Lookup the spa_t and grab the config lock for writing. We need to
	* actually open the pool so that we can sync out the necessary labels.
	* It's OK to call spa_open() with the namespace lock held because we
	* allow recursive calls for other reasons.
	*/
	mutex_enter(&spa_namespace_lock);
	if ((err = spa_open(name, &spa, FTAG)) != 0) {
	mutex_exit(&spa_namespace_lock);
	return (err);
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	* The namespace tree is ordered by spa_name, so the entry is taken
	* out before the name changes and re-added afterwards.
	*/
	avl_remove(&spa_namespace_avl, spa);
	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
	avl_add(&spa_namespace_avl, spa);

	/*
	* Sync all labels to disk with the new names by marking the root vdev
	* dirty and waiting for it to sync. It will pick up the new pool name
	* during the sync.
	*/
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	* Sync the updated config cache.
	*/
	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);

	spa_close(spa, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (0);
	}

	/*
	* Return the spa_t associated with given pool_guid, if it exists. If
	* device_guid is non-zero, determine whether the pool exists and contains
	* a device with the specified device_guid.
	*/
	spa_t *
	spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
	{
		avl_tree_t *tree = &spa_namespace_avl;
		spa_t *cur;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		for (cur = avl_first(tree); cur != NULL; cur = AVL_NEXT(tree, cur)) {
			/* Skip pools that are uninitialized or lack a root vdev. */
			if (cur->spa_state == POOL_STATE_UNINITIALIZED ||
			    cur->spa_root_vdev == NULL)
				continue;

			if (spa_guid(cur) != pool_guid)
				continue;

			/* A zero device guid means any device will do. */
			if (device_guid == 0)
				return (cur);

			if (vdev_lookup_by_guid(cur->spa_root_vdev,
			    device_guid) != NULL)
				return (cur);

			/*
			 * Check any devices we may be in the process of adding.
			 */
			if (cur->spa_pending_vdev &&
			    vdev_lookup_by_guid(cur->spa_pending_vdev,
			    device_guid) != NULL)
				return (cur);
		}

		return (NULL);
	}

	/*
	* Determine whether a pool with the given pool_guid exists.
	*/
	boolean_t
	spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
	{
		spa_t *match = spa_by_guid(pool_guid, device_guid);

		return (match != NULL);
	}

	/*
	 * Return a kmem_alloc()ed copy of the NUL-terminated string 's'.  The
	 * allocation is strlen(s) + 1 bytes, which is the size spa_strfree()
	 * hands back to kmem_free().
	 */
	char *
	spa_strdup(const char *s)
	{
		size_t nbytes = strlen(s) + 1;
		char *copy = kmem_alloc(nbytes, KM_SLEEP);

		/* Copy the terminator along with the characters. */
		bcopy(s, copy, nbytes);

		return (copy);
	}

	/*
	 * Free a string with kmem_free(), using strlen(s) + 1 as the size.
	 */
	void
	spa_strfree(char *s)
	{
		size_t nbytes = strlen(s) + 1;

		kmem_free(s, nbytes);
	}

	/*
	 * Return a value in [0, range) by reducing 64 bits obtained from
	 * random_get_pseudo_bytes() modulo 'range'.  'range' must be non-zero.
	 */
	uint64_t
	spa_get_random(uint64_t range)
	{
		uint64_t raw;

		ASSERT(range != 0);

		(void) random_get_pseudo_bytes((void *)&raw, sizeof (uint64_t));

		return (raw % range);
	}

	/*
	 * Pick a random non-zero guid that spa_guid_exists() does not report as
	 * taken.  With a pool given, the candidate is checked as a device guid
	 * within that pool; with spa == NULL it is checked as a pool guid.
	 */
	uint64_t
	spa_generate_guid(spa_t *spa)
	{
		uint64_t guid = spa_get_random(-1ULL);

		if (spa != NULL) {
			while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		} else {
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		}

		return (guid);
	}

	/*
	 * Format 'bp' into 'buf' (at most 'buflen' bytes) via the
	 * SNPRINTF_BLKPTR macro.  The type, checksum and compression names are
	 * looked up here; for a NULL bp none of them are looked up.
	 */
	void
	snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
	{
		char type[256];
		char *checksum = NULL;
		char *compress = NULL;

		if (bp != NULL) {
			if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
				dmu_object_byteswap_t bswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
				(void) snprintf(type, sizeof (type), "bswap %s %s",
				    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
				    "metadata" : "data",
				    dmu_ot_byteswap[bswap].ob_name);
			} else {
				(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
				    sizeof (type));
			}
			/* Embedded block pointers get no checksum name. */
			if (!BP_IS_EMBEDDED(bp)) {
				checksum =
				    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
			}
			compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
		}

		SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
		    compress);
	}

	/*
	 * If the pool has no freeze txg yet (spa_freeze_txg == UINT64_MAX), set
	 * it to the last synced txg plus TXG_SIZE under the config locks, then
	 * wait for that txg to sync.  Otherwise nothing is changed or waited
	 * for.
	 */
	void
	spa_freeze(spa_t *spa)
	{
	uint64_t freeze_txg = 0;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	if (spa->spa_freeze_txg == UINT64_MAX) {
	freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
	spa->spa_freeze_txg = freeze_txg;
	}
	spa_config_exit(spa, SCL_ALL, FTAG);
	/* Non-zero only if this call set the freeze txg above. */
	if (freeze_txg != 0)
	txg_wait_synced(spa_get_dsl(spa), freeze_txg);
	}

	/*
	 * Report a printf-style message through vcmn_err(): at CE_WARN when
	 * zfs_recover is set, at CE_PANIC otherwise.
	 */
	void
	zfs_panic_recover(const char *fmt, ...)
	{
		int level = zfs_recover ? CE_WARN : CE_PANIC;
		va_list args;

		va_start(args, fmt);
		vcmn_err(level, fmt, args);
		va_end(args);
	}

	/*
	* This is a stripped-down version of strtoull, suitable only for converting
	* lowercase hexadecimal numbers that don't overflow.
	*/
	uint64_t
	zfs_strtonum(const char *str, char **nptr)
	{
		uint64_t val = 0;
		char c;
		int digit;

		/* Consume [0-9a-f] digits; stop at the first other character. */
		while ((c = *str) != '\0') {
			if (c >= '0' && c <= '9')
				digit = c - '0';
			else if (c >= 'a' && c <= 'f')
				digit = 10 + c - 'a';
			else
				break;

			val *= 16;
			val += digit;

			str++;
		}

		/* Optionally report where parsing stopped. */
		if (nptr)
			*nptr = (char *)str;

		return (val);
	}

	/*
	* ==========================================================================
	* Accessor functions
	* ==========================================================================
	*/

	/* Return the pool's spa_async_suspended value as a boolean. */
	boolean_t
	spa_shutting_down(spa_t *spa)
	{
		boolean_t suspended = spa->spa_async_suspended;

		return (suspended);
	}

	/* Return the pool's dsl_pool_t (spa_dsl_pool). */
	dsl_pool_t *
	spa_get_dsl(spa_t *spa)
	{
		dsl_pool_t *dp = spa->spa_dsl_pool;

		return (dp);
	}

	/* Return the pool's spa_is_initializing flag. */
	boolean_t
	spa_is_initializing(spa_t *spa)
	{
		boolean_t initializing = spa->spa_is_initializing;

		return (initializing);
	}

	+boolean_t
	+spa_indirect_vdevs_loaded(spa_t *spa)
	+{
	+ return (spa->spa_indirect_vdevs_loaded);
	+}
	+
	/* Return a pointer to the root block pointer inside spa_ubsync. */
	blkptr_t *
	spa_get_rootblkptr(spa_t *spa)
	{
		blkptr_t *rootbp = &spa->spa_ubsync.ub_rootbp;

		return (rootbp);
	}

	/*
	 * Copy *bp into the root block pointer of spa_uberblock.  Note that
	 * spa_get_rootblkptr() reads from spa_ubsync, a different member.
	 */
	void
	spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
	{
		spa->spa_uberblock.ub_rootbp = *bp;
	}

	/*
	 * Copy the pool's alternate root (spa_root) into 'buf', or store an
	 * empty string when no alternate root is set.
	 */
	void
	spa_altroot(spa_t *spa, char *buf, size_t buflen)
	{
		if (spa->spa_root == NULL)
			buf[0] = '\0';
		else
			/*
			 * strlcpy() always NUL-terminates; strncpy() would
			 * leave 'buf' unterminated when spa_root is buflen
			 * bytes or longer.
			 */
			(void) strlcpy(buf, spa->spa_root, buflen);
	}

	/* Return the pool's spa_sync_pass value. */
	int
	spa_sync_pass(spa_t *spa)
	{
		int pass = spa->spa_sync_pass;

		return (pass);
	}

	/* Return the pool's name buffer (spa_name); nothing is copied. */
	char *
	spa_name(spa_t *spa)
	{
		char *name = spa->spa_name;

		return (name);
	}

	/*
	 * Return the pool guid: spa_config_guid when there is no root vdev, the
	 * root vdev's guid in syncing context, and otherwise the last synced
	 * guid (falling back to the root vdev's guid when that is zero).
	 */
	uint64_t
	spa_guid(spa_t *spa)
	{
		dsl_pool_t *dp = spa_get_dsl(spa);

		/*
		 * If we fail to parse the config during spa_load(), we can go
		 * through the error path (which posts an ereport) and end up
		 * here with no root vdev. We stash the original pool guid in
		 * 'spa_config_guid' to handle this case.
		 */
		if (spa->spa_root_vdev == NULL)
			return (spa->spa_config_guid);

		/*
		 * Return the most recently synced out guid unless we're
		 * in syncing context.
		 */
		if (dp && dsl_pool_sync_context(dp))
			return (spa->spa_root_vdev->vdev_guid);

		if (spa->spa_last_synced_guid != 0)
			return (spa->spa_last_synced_guid);

		return (spa->spa_root_vdev->vdev_guid);
	}

	uint64_t
	spa_load_guid(spa_t *spa)
	{
		/*
		 * This is a GUID that exists solely as a reference for the
		 * purposes of the arc. It is generated at load time, and
		 * is never written to persistent storage.
		 */
		uint64_t load_guid = spa->spa_load_guid;

		return (load_guid);
	}

	/* Return the txg recorded in spa_ubsync (ub_txg). */
	uint64_t
	spa_last_synced_txg(spa_t *spa)
	{
		uint64_t txg = spa->spa_ubsync.ub_txg;

		return (txg);
	}

	/* Return the pool's spa_first_txg value. */
	uint64_t
	spa_first_txg(spa_t *spa)
	{
		uint64_t txg = spa->spa_first_txg;

		return (txg);
	}

	/* Return the pool's spa_syncing_txg value. */
	uint64_t
	spa_syncing_txg(spa_t *spa)
	{
		uint64_t txg = spa->spa_syncing_txg;

		return (txg);
	}

	/*
	* Return the last txg where data can be dirtied. The final txgs
	* will be used to just clear out any deferred frees that remain.
	*/
	uint64_t
	spa_final_dirty_txg(spa_t *spa)
	{
		/* TXG_DEFER_SIZE txgs before spa_final_txg. */
		uint64_t last_dirty = spa->spa_final_txg - TXG_DEFER_SIZE;

		return (last_dirty);
	}

	/* Return the pool's spa_state value. */
	pool_state_t
	spa_state(spa_t *spa)
	{
		pool_state_t state = spa->spa_state;

		return (state);
	}

	/* Return the pool's spa_load_state value. */
	spa_load_state_t
	spa_load_state(spa_t *spa)
	{
		spa_load_state_t state = spa->spa_load_state;

		return (state);
	}

	/* Return the pool's spa_freeze_txg value. */
	uint64_t
	spa_freeze_txg(spa_t *spa)
	{
		uint64_t txg = spa->spa_freeze_txg;

		return (txg);
	}

	/* ARGSUSED */
	uint64_t
	spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
	{
		/* 'spa' is unused; lsize is scaled by spa_asize_inflation. */
		uint64_t worst = lsize * spa_asize_inflation;

		return (worst);
	}

	/*
	* Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
	* or at least 128MB, unless that would cause it to be more than half the
	* pool size.
	*
	* See the comment above spa_slop_shift for details.
	*/
	uint64_t
	spa_get_slop_space(spa_t *spa)
	{
	uint64_t space = spa_get_dspace(spa);
	/*
	 * The larger of (space >> spa_slop_shift) and spa_min_slop, where
	 * spa_min_slop is first capped at half of 'space'.
	 */
	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
	}

	/* Return the cached spa_dspace value (set by spa_update_dspace()). */
	uint64_t
	spa_get_dspace(spa_t *spa)
	{
		uint64_t dspace = spa->spa_dspace;

		return (dspace);
	}

	/*
	 * Recompute spa_dspace as the normal class's dspace plus the dedup
	 * dspace reported by ddt_get_dedup_dspace().
	 */
	void
	spa_update_dspace(spa_t *spa)
	{
	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
	ddt_get_dedup_dspace(spa);
	+ if (spa->spa_vdev_removal != NULL) {
	+ /*
	+ * We can't allocate from the removing device, so
	+ * subtract its size. This prevents the DMU/DSL from
	+ * filling up the (now smaller) pool while we are in the
	+ * middle of removing the device.
	+ *
	+ * Note that the DMU/DSL doesn't actually know or care
	+ * how much space is allocated (it does its own tracking
	+ * of how much space has been logically used). So it
	+ * doesn't matter that the data we are moving may be
	+ * allocated twice (on the old device and the new
	+ * device).
	+ */
	+ vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
	+ spa->spa_dspace -= spa_deflate(spa) ?
	+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
	+ }
	}

	/*
	* Return the failure mode that has been set to this pool. The default
	* behavior will be to block all I/Os when a complete failure occurs.
	*/
	uint8_t
	spa_get_failmode(spa_t *spa)
	{
		uint8_t mode = spa->spa_failmode;

		return (mode);
	}

	/* Return the pool's spa_suspended flag. */
	boolean_t
	spa_suspended(spa_t *spa)
	{
		boolean_t suspended = spa->spa_suspended;

		return (suspended);
	}

	/* Return the version recorded in spa_ubsync (ub_version). */
	uint64_t
	spa_version(spa_t *spa)
	{
		uint64_t version = spa->spa_ubsync.ub_version;

		return (version);
	}

	/* Return the pool's spa_deflate flag. */
	boolean_t
	spa_deflate(spa_t *spa)
	{
		boolean_t deflate = spa->spa_deflate;

		return (deflate);
	}

	/* Return the pool's normal metaslab class (spa_normal_class). */
	metaslab_class_t *
	spa_normal_class(spa_t *spa)
	{
		metaslab_class_t *mc = spa->spa_normal_class;

		return (mc);
	}

	/* Return the pool's log metaslab class (spa_log_class). */
	metaslab_class_t *
	spa_log_class(spa_t *spa)
	{
		metaslab_class_t *mc = spa->spa_log_class;

		return (mc);
	}

	/*
	 * Insert 'os' at the head of the pool's spa_evicting_os_list while
	 * holding spa_evicting_os_lock.
	 */
	void
	spa_evicting_os_register(spa_t *spa, objset_t *os)
	{
		mutex_enter(&spa->spa_evicting_os_lock);
		list_insert_head(&spa->spa_evicting_os_list, os);
		mutex_exit(&spa->spa_evicting_os_lock);
	}

	/*
	 * Remove 'os' from the pool's spa_evicting_os_list and broadcast on
	 * spa_evicting_os_cv, which spa_evicting_os_wait() sleeps on, all while
	 * holding spa_evicting_os_lock.
	 */
	void
	spa_evicting_os_deregister(spa_t *spa, objset_t *os)
	{
		mutex_enter(&spa->spa_evicting_os_lock);
		list_remove(&spa->spa_evicting_os_list, os);
		cv_broadcast(&spa->spa_evicting_os_cv);
		mutex_exit(&spa->spa_evicting_os_lock);
	}

	/*
	 * Block until spa_evicting_os_list is empty, then call
	 * dmu_buf_user_evict_wait().
	 */
	void
	spa_evicting_os_wait(spa_t *spa)
	{
	mutex_enter(&spa->spa_evicting_os_lock);
	/* Re-check the list after every wakeup. */
	while (!list_is_empty(&spa->spa_evicting_os_list))
	cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
	mutex_exit(&spa->spa_evicting_os_lock);

	dmu_buf_user_evict_wait();
	}

	/*
	 * Return the maximum number of DVAs to allocate per block pointer for
	 * this pool.
	 */
	int
	spa_max_replication(spa_t *spa)
	{
		/*
		 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
		 * handle BPs with more than one DVA allocated. Set our max
		 * replication level accordingly.
		 */
		if (spa_version(spa) >= SPA_VERSION_DITTO_BLOCKS)
			return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));

		/* Older pool versions get a single copy. */
		return (1);
	}

	/* Return the pool's spa_prev_software_version value. */
	int
	spa_prev_software_version(spa_t *spa)
	{
		int version = spa->spa_prev_software_version;

		return (version);
	}

	/* Return the pool's spa_deadman_synctime value. */
	uint64_t
	spa_deadman_synctime(spa_t *spa)
	{
		uint64_t synctime = spa->spa_deadman_synctime;

		return (synctime);
	}

	/*
	 * Return the deflated size of one DVA.  When the DVA's asize is zero or
	 * the pool does not deflate, the asize is returned unchanged; otherwise
	 * it is scaled by the owning top-level vdev's deflate ratio.  Panics if
	 * the DVA names a top-level vdev that cannot be found.  The caller must
	 * hold at least one config lock as reader.
	 */
	uint64_t
	dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
	{
		uint64_t asize = DVA_GET_ASIZE(dva);
		uint64_t dsize = asize;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

		if (asize != 0 && spa->spa_deflate) {
			uint64_t vdev = DVA_GET_VDEV(dva);
			vdev_t *vd = vdev_lookup_top(spa, vdev);
			if (vd == NULL) {
				panic(
				    "dva_get_dsize_sync(): bad DVA %llu:%llu",
				    (u_longlong_t)vdev, (u_longlong_t)asize);
			}
			dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
		}

		return (dsize);
	}

	/*
	 * Sum dva_get_dsize_sync() over the BP_GET_NDVAS(bp) DVAs of 'bp'.  No
	 * config lock is taken here; dva_get_dsize_sync() asserts that the
	 * caller already holds one.
	 */
	uint64_t
	bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
	{
		uint64_t dsize = 0;

		for (int d = 0; d < BP_GET_NDVAS(bp); d++)
			dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

		return (dsize);
	}

	/*
	 * Same sum as bp_get_dsize_sync(), but takes SCL_VDEV as reader around
	 * the per-DVA lookups itself.
	 */
	uint64_t
	bp_get_dsize(spa_t *spa, const blkptr_t *bp)
	{
		uint64_t dsize = 0;

		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

		for (int d = 0; d < BP_GET_NDVAS(bp); d++)
			dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

		spa_config_exit(spa, SCL_VDEV, FTAG);

		return (dsize);
	}

	/*
	* ==========================================================================
	* Initialization and Termination
	* ==========================================================================
	*/

	/*
	 * Comparator for the namespace AVL tree: orders spa_t entries by
	 * strcmp() of spa_name, with the result normalized to -1, 0 or 1.
	 */
	static int
	spa_name_compare(const void *a1, const void *a2)
	{
		const spa_t *s1 = a1;
		const spa_t *s2 = a2;
		int s;

		s = strcmp(s1->spa_name, s2->spa_name);
		if (s > 0)
			return (1);
		if (s < 0)
			return (-1);
		return (0);
	}

	/* Return the global spa_active_count. */
	int
	spa_busy(void)
	{
		int active = spa_active_count;

		return (active);
	}

	/*
	 * Calls spa_config_load().  Under _KERNEL this function is registered
	 * as a "mountroot" event handler just below.
	 */
	void
	spa_boot_init()
	{
	spa_config_load();
	}

	#ifdef _KERNEL
	EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
	#endif

	/*
	 * Global SPA setup: create the namespace/spare/l2cache locks, the
	 * namespace condition variable and the three AVL trees, record 'mode'
	 * in spa_mode_global, then initialize the subsystems listed below.
	 * spa_fini() is the counterpart.
	 */
	void
	spa_init(int mode)
	{
	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

	/* Pools are keyed by name; spares and l2cache devices by aux entry. */
	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
	offsetof(spa_t, spa_avl));

	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
	offsetof(spa_aux_t, aux_avl));

	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
	offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

	#ifdef illumos
	#ifdef _KERNEL
	spa_arch_init();
	#else
	/*
	 * Userland only: when not read-only and "watch" debugging is
	 * requested, open /proc/self/ctl and enable arc_watch on success.
	 */
	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
	arc_procfd = open("/proc/self/ctl", O_WRONLY);
	if (arc_procfd == -1) {
	perror("could not enable watchpoints: "
	"opening /proc/self/ctl failed: ");
	} else {
	arc_watch = B_TRUE;
	}
	}
	#endif
	#endif /* illumos */
	refcount_sysinit();
	unique_init();
	range_tree_init();
	metaslab_alloc_trace_init();
	zio_init();
	lz4_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	vdev_file_init();
	zfs_prop_init();
	zpool_prop_init();
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
	#ifndef illumos
	#ifdef _KERNEL
	zfs_deadman_init();
	#endif
	#endif /* !illumos */
	}

	/*
	 * Global SPA teardown: stop l2arc, evict all pools, finalize the
	 * subsystems below, then destroy the AVL trees, the condition variable
	 * and the locks that spa_init() created.
	 */
	void
	spa_fini(void)
	{
	l2arc_stop();

	spa_evict_all();

	vdev_file_fini();
	vdev_cache_stat_fini();
	zil_fini();
	dmu_fini();
	lz4_fini();
	zio_fini();
	metaslab_alloc_trace_fini();
	range_tree_fini();
	unique_fini();
	refcount_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
	}

	/*
	* Return whether this pool has slogs. No locking needed.
	* It's not a problem if the wrong answer is returned as it's only for
	* performance and not correctness.
	*/
	boolean_t
	spa_has_slogs(spa_t *spa)
	{
		metaslab_class_t *log_class = spa->spa_log_class;

		/* A non-NULL rotor on the log class is the test used. */
		return (log_class->mc_rotor != NULL);
	}

	/* Return the pool's spa_log_state value. */
	spa_log_state_t
	spa_get_log_state(spa_t *spa)
	{
		spa_log_state_t state = spa->spa_log_state;

		return (state);
	}

	/* Store 'state' in the pool's spa_log_state; no locking is done here. */
	void
	spa_set_log_state(spa_t *spa, spa_log_state_t state)
	{
	spa->spa_log_state = state;
	}

	/* Return the pool's spa_is_root flag. */
	boolean_t
	spa_is_root(spa_t *spa)
	{
		boolean_t is_root = spa->spa_is_root;

		return (is_root);
	}

	/* Return whether the FWRITE bit is set in the pool's spa_mode. */
	boolean_t
	spa_writeable(spa_t *spa)
	{
		return ((spa->spa_mode & FWRITE) != 0);
	}

	/*
	* Returns true if there is a pending sync task in any of the current
	* syncing txg, the current quiescing txg, or the current open txg.
	*/
	boolean_t
	spa_has_pending_synctask(spa_t *spa)
	{
		/* Pending means the dp_sync_tasks lists are not all empty. */
		if (txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks))
			return (B_FALSE);

		return (B_TRUE);
	}

	/* Return the pool's spa_mode value. */
	int
	spa_mode(spa_t *spa)
	{
		int mode = spa->spa_mode;

		return (mode);
	}

	/* Return the pool's spa_bootfs value. */
	uint64_t
	spa_bootfs(spa_t *spa)
	{
		uint64_t bootfs = spa->spa_bootfs;

		return (bootfs);
	}

	/* Return the pool's spa_delegation value. */
	uint64_t
	spa_delegation(spa_t *spa)
	{
		uint64_t delegation = spa->spa_delegation;

		return (delegation);
	}

	/* Return the pool's meta objset (spa_meta_objset). */
	objset_t *
	spa_meta_objset(spa_t *spa)
	{
		objset_t *mos = spa->spa_meta_objset;

		return (mos);
	}

	/* Return the pool's spa_dedup_checksum value. */
	enum zio_checksum
	spa_dedup_checksum(spa_t *spa)
	{
		enum zio_checksum cksum = spa->spa_dedup_checksum;

		return (cksum);
	}

	/*
	* Reset pool scan stat per scan pass (or reboot).
	*/
	void
	spa_scan_stat_init(spa_t *spa)
	{
		/* data not stored on disk */
		spa->spa_scan_pass_start = gethrestime_sec();

		/* A paused scrub starts its pause clock at the pass start. */
		spa->spa_scan_pass_scrub_pause =
		    dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan) ?
		    spa->spa_scan_pass_start : 0;

		spa->spa_scan_pass_scrub_spent_paused = 0;
		spa->spa_scan_pass_exam = 0;
		vdev_scan_stat_init(spa->spa_root_vdev);
	}

	/*
	* Get scan stats for zpool status reports
	*/
	/*
	 * Fill '*ps' from the pool's scan state.  Returns ENOENT, leaving
	 * '*ps' untouched, when the pool has no dsl pool, no scan structure,
	 * or no scan function set (POOL_SCAN_NONE); returns 0 otherwise.
	 */
	int
	spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
	{
		dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;

		if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
			return (SET_ERROR(ENOENT));
		bzero(ps, sizeof (pool_scan_stat_t));

		/* data stored on disk */
		ps->pss_func = scn->scn_phys.scn_func;
		ps->pss_start_time = scn->scn_phys.scn_start_time;
		ps->pss_end_time = scn->scn_phys.scn_end_time;
		ps->pss_to_examine = scn->scn_phys.scn_to_examine;
		ps->pss_examined = scn->scn_phys.scn_examined;
		ps->pss_to_process = scn->scn_phys.scn_to_process;
		ps->pss_processed = scn->scn_phys.scn_processed;
		ps->pss_errors = scn->scn_phys.scn_errors;
		ps->pss_state = scn->scn_phys.scn_state;

		/* data not stored on disk */
		ps->pss_pass_start = spa->spa_scan_pass_start;
		ps->pss_pass_exam = spa->spa_scan_pass_exam;
		ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
		ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;

		return (0);
	}

	/* Return the pool's spa_debug flag. */
	boolean_t
	spa_debug_enabled(spa_t *spa)
	{
		boolean_t debug = spa->spa_debug;

		return (debug);
	}

	int
	spa_maxblocksize(spa_t *spa)
	{
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
	return (SPA_MAXBLOCKSIZE);
	else
	return (SPA_OLD_MAXBLOCKSIZE);
	+}
	+
	+/*
	+ * Returns the txg that the last device removal completed. No indirect mappings
	+ * have been added since this txg.
	+ */
	+uint64_t
	+spa_get_last_removal_txg(spa_t *spa)
	+{
	+ uint64_t vdevid;
	+ uint64_t ret = -1ULL;
	+
	+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+ /*
	+ * sr_prev_indirect_vdev is only modified while holding all the
	+ * config locks, so it is sufficient to hold SCL_VDEV as reader when
	+ * examining it.
	+ */
	+ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
	+
	+ while (vdevid != -1ULL) {
	+ vdev_t *vd = vdev_lookup_top(spa, vdevid);
	+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
	+
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+
	+ /*
	+ * If the removal did not remap any data, we don't care.
	+ */
	+ if (vdev_indirect_births_count(vib) != 0) {
	+ ret = vdev_indirect_births_last_entry_txg(vib);
	+ break;
	+ }
	+
	+ vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
	+ }
	+ spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	+ IMPLY(ret != -1ULL,
	+ spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
	+
	+ return (ret);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c (revision 332525)
	@@ -1,548 +1,566 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/
	/*
	* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/dnode.h>
	#include <sys/dsl_pool.h>
	#include <sys/zio.h>
	#include <sys/space_map.h>
	#include <sys/refcount.h>
	#include <sys/zfeature.h>

	SYSCTL_DECL(_vfs_zfs);

	/*
	* The data for a given space map can be kept on blocks of any size.
	* Larger blocks entail fewer i/o operations, but they also cause the
	* DMU to keep more data in-core, and also to waste more i/o bandwidth
	* when only a few blocks have changed since the last transaction group.
	*/
	int space_map_blksz = (1 << 12);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
	"Maximum block size for space map. Must be power of 2 and greater than 4096.");

	/*
	- * Load the space map disk into the specified range tree. Segments of maptype
	- * are added to the range tree, other segment types are removed.
	- *
	- * Note: space_map_load() will drop sm_lock across dmu_read() calls.
	- * The caller must be OK with this.
	+ * Iterate through the space map, invoking the callback on each (non-debug)
	+ * space map entry.
	*/
	int
	-space_map_load(space_map_t sm, range_tree_t rt, maptype_t maptype)
	+space_map_iterate(space_map_t sm, sm_cb_t callback, void arg)
	{
	uint64_t entry, entry_map, *entry_map_end;
	- uint64_t bufsize, size, offset, end, space;
	+ uint64_t bufsize, size, offset, end;
	int error = 0;

	- ASSERT(MUTEX_HELD(sm->sm_lock));
	-
	end = space_map_length(sm);
	- space = space_map_allocated(sm);

	- VERIFY0(range_tree_space(rt));
	-
	- if (maptype == SM_FREE) {
	- range_tree_add(rt, sm->sm_start, sm->sm_size);
	- space = sm->sm_size - space;
	- }
	-
	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	entry_map = zio_buf_alloc(bufsize);

	- mutex_exit(sm->sm_lock);
	if (end > bufsize) {
	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
	end - bufsize, ZIO_PRIORITY_SYNC_READ);
	}
	- mutex_enter(sm->sm_lock);

	- for (offset = 0; offset < end; offset += bufsize) {
	+ for (offset = 0; offset < end && error == 0; offset += bufsize) {
	size = MIN(end - offset, bufsize);
	VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
	VERIFY(size != 0);
	ASSERT3U(sm->sm_blksz, !=, 0);

	dprintf("object=%llu offset=%llx size=%llx\n",
	space_map_object(sm), offset, size);

	- mutex_exit(sm->sm_lock);
	error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
	entry_map, DMU_READ_PREFETCH);
	- mutex_enter(sm->sm_lock);
	if (error != 0)
	break;

	entry_map_end = entry_map + (size / sizeof (uint64_t));
	- for (entry = entry_map; entry < entry_map_end; entry++) {
	+ for (entry = entry_map; entry < entry_map_end && error == 0;
	+ entry++) {
	uint64_t e = *entry;
	uint64_t offset, size;

	- if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
	+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
	continue;

	offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
	sm->sm_start;
	size = SM_RUN_DECODE(e) << sm->sm_shift;

	VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
	VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
	VERIFY3U(offset, >=, sm->sm_start);
	VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
	- if (SM_TYPE_DECODE(e) == maptype) {
	- VERIFY3U(range_tree_space(rt) + size, <=,
	- sm->sm_size);
	- range_tree_add(rt, offset, size);
	- } else {
	- range_tree_remove(rt, offset, size);
	- }
	+ error = callback(SM_TYPE_DECODE(e), offset, size, arg);
	}
	}

	- if (error == 0)
	+ zio_buf_free(entry_map, bufsize);
	+ return (error);
	+}
	+
	+typedef struct space_map_load_arg {
	+ space_map_t *smla_sm;
	+ range_tree_t *smla_rt;
	+ maptype_t smla_type;
	+} space_map_load_arg_t;
	+
	+static int
	+space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
	+ void *arg)
	+{
	+ space_map_load_arg_t *smla = arg;
	+ if (type == smla->smla_type) {
	+ VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
	+ smla->smla_sm->sm_size);
	+ range_tree_add(smla->smla_rt, offset, size);
	+ } else {
	+ range_tree_remove(smla->smla_rt, offset, size);
	+ }
	+
	+ return (0);
	+}
	+
	+/*
	+ * Load the space map disk into the specified range tree. Segments of maptype
	+ * are added to the range tree, other segment types are removed.
	+ */
	+int
	+space_map_load(space_map_t sm, range_tree_t rt, maptype_t maptype)
	+{
	+ uint64_t space;
	+ int err;
	+ space_map_load_arg_t smla;
	+
	+ VERIFY0(range_tree_space(rt));
	+ space = space_map_allocated(sm);
	+
	+ if (maptype == SM_FREE) {
	+ range_tree_add(rt, sm->sm_start, sm->sm_size);
	+ space = sm->sm_size - space;
	+ }
	+
	+ smla.smla_rt = rt;
	+ smla.smla_sm = sm;
	+ smla.smla_type = maptype;
	+ err = space_map_iterate(sm, space_map_load_callback, &smla);
	+
	+ if (err == 0) {
	VERIFY3U(range_tree_space(rt), ==, space);
	- else
	+ } else {
	range_tree_vacate(rt, NULL, NULL);
	+ }

	- zio_buf_free(entry_map, bufsize);
	- return (error);
	+ return (err);
	}

	void
	space_map_histogram_clear(space_map_t *sm)
	{
	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
	return;

	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
	}

	boolean_t
	space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
	{
	/*
	* Verify that the in-core range tree does not have any
	* ranges smaller than our sm_shift size.
	*/
	for (int i = 0; i < sm->sm_shift; i++) {
	if (rt->rt_histogram[i] != 0)
	return (B_FALSE);
	}
	return (B_TRUE);
	}

	void
	space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
	{
	int idx = 0;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(space_map_object(sm), !=, 0);

	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
	return;

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	ASSERT(space_map_histogram_verify(sm, rt));
	/*
	* Transfer the content of the range tree histogram to the space
	* map histogram. The space map histogram contains 32 buckets ranging
	* between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
	* however, can represent ranges from 2^0 to 2^63. Since the space
	* map only cares about allocatable blocks (minimum of sm_shift) we
	* can safely ignore all ranges in the range tree smaller than sm_shift.
	*/
	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {

	/*
	* Since the largest histogram bucket in the space map is
	* 2^(32+sm_shift-1), we need to normalize the values in
	* the range tree for any bucket larger than that size. For
	* example given an sm_shift of 9, ranges larger than 2^40
	* would get normalized as if they were 1TB ranges. Assume
	* the range tree had a count of 5 in the 2^44 (16TB) bucket,
	* the calculation below would normalize this to 5 * 2^4 (16).
	*/
	ASSERT3U(i, >=, idx + sm->sm_shift);
	sm->sm_phys->smp_histogram[idx] +=
	rt->rt_histogram[i] << (i - idx - sm->sm_shift);

	/*
	* Increment the space map's index as long as we haven't
	* reached the maximum bucket size. Accumulate all ranges
	* larger than the max bucket size into the last bucket.
	*/
	if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
	ASSERT3U(idx + sm->sm_shift, ==, i);
	idx++;
	ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
	}
	}
	}

	uint64_t
	space_map_entries(space_map_t *sm, range_tree_t *rt)
	{
	avl_tree_t *t = &rt->rt_root;
	range_seg_t *rs;
	uint64_t size, entries;

	/*
	* All space_maps always have a debug entry so account for it here.
	*/
	entries = 1;

	/*
	* Traverse the range tree and calculate the number of space map
	* entries that would be required to write out the range tree.
	*/
	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	entries += howmany(size, SM_RUN_MAX);
	}
	return (entries);
	}

	-/*
	- * Note: space_map_write() will drop sm_lock across dmu_write() calls.
	- */
	void
	space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
	dmu_tx_t *tx)
	{
	objset_t *os = sm->sm_os;
	spa_t *spa = dmu_objset_spa(os);
	avl_tree_t *t = &rt->rt_root;
	range_seg_t *rs;
	uint64_t size, total, rt_space, nodes;
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t expected_entries, actual_entries = 1;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	VERIFY3U(space_map_object(sm), !=, 0);
	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
	* This field is no longer necessary since the in-core space map
	* now contains the object number but is maintained for backwards
	* compatibility.
	*/
	sm->sm_phys->smp_object = sm->sm_object;

	if (range_tree_space(rt) == 0) {
	VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
	return;
	}

	if (maptype == SM_ALLOC)
	sm->sm_phys->smp_alloc += range_tree_space(rt);
	else
	sm->sm_phys->smp_alloc -= range_tree_space(rt);

	expected_entries = space_map_entries(sm, rt);

	entry_map = zio_buf_alloc(sm->sm_blksz);
	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
	entry = entry_map;

	*entry++ = SM_DEBUG_ENCODE(1) |
	SM_DEBUG_ACTION_ENCODE(maptype) |
	SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
	SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	total = 0;
	nodes = avl_numnodes(&rt->rt_root);
	rt_space = range_tree_space(rt);
	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
	uint64_t start;

	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;

	total += size << sm->sm_shift;

	while (size != 0) {
	uint64_t run_len;

	run_len = MIN(size, SM_RUN_MAX);

	if (entry == entry_map_end) {
	- mutex_exit(rt->rt_lock);
	dmu_write(os, space_map_object(sm),
	sm->sm_phys->smp_objsize, sm->sm_blksz,
	entry_map, tx);
	- mutex_enter(rt->rt_lock);
	sm->sm_phys->smp_objsize += sm->sm_blksz;
	entry = entry_map;
	}

	*entry++ = SM_OFFSET_ENCODE(start) |
	SM_TYPE_ENCODE(maptype) |
	SM_RUN_ENCODE(run_len);

	start += run_len;
	size -= run_len;
	actual_entries++;
	}
	}

	if (entry != entry_map) {
	size = (entry - entry_map) * sizeof (uint64_t);
	- mutex_exit(rt->rt_lock);
	dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
	size, entry_map, tx);
	- mutex_enter(rt->rt_lock);
	sm->sm_phys->smp_objsize += size;
	}
	ASSERT3U(expected_entries, ==, actual_entries);

	/*
	* Ensure that the space_map's accounting wasn't changed
	* while we were in the middle of writing it out.
	*/
	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
	VERIFY3U(range_tree_space(rt), ==, rt_space);
	VERIFY3U(range_tree_space(rt), ==, total);

	zio_buf_free(entry_map, sm->sm_blksz);
	}

	static int
	space_map_open_impl(space_map_t *sm)
	{
	int error;
	u_longlong_t blocks;

	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
	if (error)
	return (error);

	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
	sm->sm_phys = sm->sm_dbuf->db_data;
	return (0);
	}

	int
	space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
	- uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
	+ uint64_t start, uint64_t size, uint8_t shift)
	{
	space_map_t *sm;
	int error;

	ASSERT(*smp == NULL);
	ASSERT(os != NULL);
	ASSERT(object != 0);

	sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	- sm->sm_lock = lp;
	sm->sm_os = os;
	sm->sm_object = object;

	error = space_map_open_impl(sm);
	if (error != 0) {
	space_map_close(sm);
	return (error);
	}

	*smp = sm;

	return (0);
	}

	void
	space_map_close(space_map_t *sm)
	{
	if (sm == NULL)
	return;

	if (sm->sm_dbuf != NULL)
	dmu_buf_rele(sm->sm_dbuf, sm);
	sm->sm_dbuf = NULL;
	sm->sm_phys = NULL;

	kmem_free(sm, sizeof (*sm));
	}

	void
	space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
	{
	objset_t *os = sm->sm_os;
	spa_t *spa = dmu_objset_spa(os);
	dmu_object_info_t doi;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(dmu_tx_is_syncing(tx));
	VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));

	dmu_object_info_from_db(sm->sm_dbuf, &doi);

	/*
	* If the space map has the wrong bonus size (because
	* SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
	* the wrong block size (because space_map_blksz has changed),
	* free and re-allocate its object with the updated sizes.
	*
	* Otherwise, just truncate the current object.
	*/
	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
	doi.doi_data_block_size != space_map_blksz) {
	zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
	"object[%llu]: old bonus %u, old blocksz %u",
	dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
	doi.doi_bonus_size, doi.doi_data_block_size);

	space_map_free(sm, tx);
	dmu_buf_rele(sm->sm_dbuf, sm);

	sm->sm_object = space_map_alloc(sm->sm_os, tx);
	VERIFY0(space_map_open_impl(sm));
	} else {
	VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

	/*
	* If the spacemap is reallocated, its histogram
	* will be reset. Do the same in the common case so that
	* bugs related to the uncommon case do not go unnoticed.
	*/
	bzero(sm->sm_phys->smp_histogram,
	sizeof (sm->sm_phys->smp_histogram));
	}

	dmu_buf_will_dirty(sm->sm_dbuf, tx);
	sm->sm_phys->smp_objsize = 0;
	sm->sm_phys->smp_alloc = 0;
	}

	/*
	* Update the in-core space_map allocation and length values.
	*/
	void
	space_map_update(space_map_t *sm)
	{
	if (sm == NULL)
	return;

	- ASSERT(MUTEX_HELD(sm->sm_lock));
	-
	sm->sm_alloc = sm->sm_phys->smp_alloc;
	sm->sm_length = sm->sm_phys->smp_objsize;
	}

	uint64_t
	space_map_alloc(objset_t *os, dmu_tx_t *tx)
	{
	spa_t *spa = dmu_objset_spa(os);
	uint64_t object;
	int bonuslen;

	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
	spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
	bonuslen = sizeof (space_map_phys_t);
	ASSERT3U(bonuslen, <=, dmu_bonus_max());
	} else {
	bonuslen = SPACE_MAP_SIZE_V0;
	}

	object = dmu_object_alloc(os,
	DMU_OT_SPACE_MAP, space_map_blksz,
	DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

	return (object);
	}

	void
	-space_map_free(space_map_t *sm, dmu_tx_t *tx)
	+space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
	{
	- spa_t *spa;
	-
	- if (sm == NULL)
	- return;
	-
	- spa = dmu_objset_spa(sm->sm_os);
	+ spa_t *spa = dmu_objset_spa(os);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
	dmu_object_info_t doi;

	- dmu_object_info_from_db(sm->sm_dbuf, &doi);
	+ VERIFY0(dmu_object_info(os, smobj, &doi));
	if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
	- VERIFY(spa_feature_is_active(spa,
	- SPA_FEATURE_SPACEMAP_HISTOGRAM));
	spa_feature_decr(spa,
	SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
	}
	}

	- VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
	+ VERIFY0(dmu_object_free(os, smobj, tx));
	+}
	+
	+void
	+space_map_free(space_map_t *sm, dmu_tx_t *tx)
	+{
	+ if (sm == NULL)
	+ return;
	+
	+ space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
	sm->sm_object = 0;
	}

	uint64_t
	space_map_object(space_map_t *sm)
	{
	return (sm != NULL ? sm->sm_object : 0);
	}

	/*
	* Returns the already synced, on-disk allocated space.
	*/
	uint64_t
	space_map_allocated(space_map_t *sm)
	{
	return (sm != NULL ? sm->sm_alloc : 0);
	}

	/*
	* Returns the already synced, on-disk length;
	*/
	uint64_t
	space_map_length(space_map_t *sm)
	{
	return (sm != NULL ? sm->sm_length : 0);
	}

	/*
	* Returns the allocated space that is currently syncing.
	*/
	int64_t
	space_map_alloc_delta(space_map_t *sm)
	{
	if (sm == NULL)
	return (0);
	ASSERT(sm->sm_dbuf != NULL);
	return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c (revision 332525)
	@@ -1,159 +1,155 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/
	/*
	* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/range_tree.h>
	#include <sys/space_reftree.h>

	/*
	* Space reference trees.
	*
	* A range tree is a collection of integers. Every integer is either
	* in the tree, or it's not. A space reference tree generalizes
	* the idea: it allows its members to have arbitrary reference counts,
	* as opposed to the implicit reference count of 0 or 1 in a range tree.
	* This representation comes in handy when computing the union or
	* intersection of multiple space maps. For example, the union of
	* N range trees is the subset of the reference tree with refcnt >= 1.
	* The intersection of N range trees is the subset with refcnt >= N.
	*
	* [It's very much like a Fourier transform. Unions and intersections
	* are hard to perform in the 'range tree domain', so we convert the trees
	* into the 'reference count domain', where it's trivial, then invert.]
	*
	* vdev_dtl_reassess() uses computations of this form to determine
	* DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
	* has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
	* has an outage wherever refcnt >= vdev_children.
	*/
	static int
	space_reftree_compare(const void *x1, const void *x2)
	{
	const space_ref_t *sr1 = x1;
	const space_ref_t *sr2 = x2;

	if (sr1->sr_offset < sr2->sr_offset)
	return (-1);
	if (sr1->sr_offset > sr2->sr_offset)
	return (1);

	if (sr1 < sr2)
	return (-1);
	if (sr1 > sr2)
	return (1);

	return (0);
	}

	void
	space_reftree_create(avl_tree_t *t)
	{
	avl_create(t, space_reftree_compare,
	sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
	}

	void
	space_reftree_destroy(avl_tree_t *t)
	{
	space_ref_t *sr;
	void *cookie = NULL;

	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
	kmem_free(sr, sizeof (*sr));

	avl_destroy(t);
	}

	static void
	space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
	{
	space_ref_t *sr;

	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
	sr->sr_offset = offset;
	sr->sr_refcnt = refcnt;

	avl_add(t, sr);
	}

	void
	space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
	int64_t refcnt)
	{
	space_reftree_add_node(t, start, refcnt);
	space_reftree_add_node(t, end, -refcnt);
	}

	/*
	* Convert (or add) a range tree into a reference tree.
	*/
	void
	space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
	{
	range_seg_t *rs;

	- ASSERT(MUTEX_HELD(rt->rt_lock));
	-
	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
	space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
	}

	/*
	* Convert a reference tree into a range tree. The range tree will contain
	* all members of the reference tree for which refcnt >= minref.
	*/
	void
	space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
	{
	uint64_t start = -1ULL;
	int64_t refcnt = 0;
	space_ref_t *sr;
	-
	- ASSERT(MUTEX_HELD(rt->rt_lock));

	range_tree_vacate(rt, NULL, NULL);

	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
	refcnt += sr->sr_refcnt;
	if (refcnt >= minref) {
	if (start == -1ULL) {
	start = sr->sr_offset;
	}
	} else {
	if (start != -1ULL) {
	uint64_t end = sr->sr_offset;
	ASSERT(start <= end);
	if (end > start)
	range_tree_add(rt, start, end - start);
	start = -1ULL;
	}
	}
	}
	ASSERT(refcnt == 0);
	ASSERT(start == -1ULL);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h (revision 332525)
	@@ -1,93 +1,95 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_BPOBJ_H
	#define _SYS_BPOBJ_H

	#include <sys/dmu.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/zio.h>
	#include <sys/zfs_context.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	typedef struct bpobj_phys {
	/*
	* This is the bonus buffer for the dead lists. The object's
	* contents is an array of bpo_entries blkptr_t's, representing
	* a total of bpo_bytes physical space.
	*/
	uint64_t bpo_num_blkptrs;
	uint64_t bpo_bytes;
	uint64_t bpo_comp;
	uint64_t bpo_uncomp;
	uint64_t bpo_subobjs;
	uint64_t bpo_num_subobjs;
	} bpobj_phys_t;

	#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
	#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))

	typedef struct bpobj {
	kmutex_t bpo_lock;
	objset_t *bpo_os;
	uint64_t bpo_object;
	int bpo_epb;
	uint8_t bpo_havecomp;
	uint8_t bpo_havesubobj;
	bpobj_phys_t *bpo_phys;
	dmu_buf_t *bpo_dbuf;
	dmu_buf_t *bpo_cached_dbuf;
	} bpobj_t;

	typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

	uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
	uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
	void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
	void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);

	int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
	void bpobj_close(bpobj_t *bpo);
	+boolean_t bpobj_is_open(const bpobj_t *bpo);

	int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
	int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);

	void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
	void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);

	int bpobj_space(bpobj_t *bpo,
	uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
	int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
	uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
	+boolean_t bpobj_is_empty(bpobj_t *bpo);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_BPOBJ_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 332525)
	@@ -1,403 +1,405 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	#ifndef _SYS_DBUF_H
	#define _SYS_DBUF_H

	#include <sys/dmu.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/zio.h>
	#include <sys/arc.h>
	#include <sys/zfs_context.h>
	#include <sys/refcount.h>
	#include <sys/zrlock.h>
	#include <sys/multilist.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	#define IN_DMU_SYNC 2

	/*
	* define flags for dbuf_read
	*/

	#define DB_RF_MUST_SUCCEED (1 << 0)
	#define DB_RF_CANFAIL (1 << 1)
	#define DB_RF_HAVESTRUCT (1 << 2)
	#define DB_RF_NOPREFETCH (1 << 3)
	#define DB_RF_NEVERWAIT (1 << 4)
	#define DB_RF_CACHED (1 << 5)

	/*
	* The simplified state transition diagram for dbufs looks like:
	*
	*              +----> READ ----+
	*              |               |
	*              |               V
	*  (alloc)-->UNCACHED     CACHED-->EVICTING-->(free)
	*              |               ^        ^
	*              |               |        |
	*              +----> FILL ----+        |
	*              |                        |
	*              |                        |
	*              +--------> NOFILL -------+
	*
	* DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
	* to find all dbufs in a range of a dnode and must be less than any other
	* dbuf_states_t (see comment on dn_dbufs in dnode.h).
	*/
	typedef enum dbuf_states {
	DB_SEARCH = -1,
	DB_UNCACHED,
	DB_FILL,
	DB_NOFILL,
	DB_READ,
	DB_CACHED,
	DB_EVICTING
	} dbuf_states_t;

	struct dnode;
	struct dmu_tx;

	/*
	* level = 0 means the user data
	* level = 1 means the single indirect block
	* etc.
	*/

	struct dmu_buf_impl;

	typedef enum override_states {
	DR_NOT_OVERRIDDEN,
	DR_IN_DMU_SYNC,
	DR_OVERRIDDEN
	} override_states_t;

	typedef struct dbuf_dirty_record {
	/* link on our parents dirty list */
	list_node_t dr_dirty_node;

	/* transaction group this data will sync in */
	uint64_t dr_txg;

	/* zio of outstanding write IO */
	zio_t *dr_zio;

	/* pointer back to our dbuf */
	struct dmu_buf_impl *dr_dbuf;

	/* pointer to next dirty record */
	struct dbuf_dirty_record *dr_next;

	/* pointer to parent dirty record */
	struct dbuf_dirty_record *dr_parent;

	/* How much space was changed to dsl_pool_dirty_space() for this? */
	unsigned int dr_accounted;

	/* A copy of the bp that points to us */
	blkptr_t dr_bp_copy;

	union dirty_types {
	struct dirty_indirect {

	/* protect access to list */
	kmutex_t dr_mtx;

	/* Our list of dirty children */
	list_t dr_children;
	} di;
	struct dirty_leaf {

	/*
	* dr_data is set when we dirty the buffer
	* so that we can retain the pointer even if it
	* gets COW'd in a subsequent transaction group.
	*/
	arc_buf_t *dr_data;
	blkptr_t dr_overridden_by;
	override_states_t dr_override_state;
	uint8_t dr_copies;
	boolean_t dr_nopwrite;
	} dl;
	} dt;
	} dbuf_dirty_record_t;

	typedef struct dmu_buf_impl {
	/*
	* The following members are immutable, with the exception of
	* db.db_data, which is protected by db_mtx.
	*/

	/* the publicly visible structure */
	dmu_buf_t db;

	/* the objset we belong to */
	struct objset *db_objset;

	/*
	* handle to safely access the dnode we belong to (NULL when evicted)
	*/
	struct dnode_handle *db_dnode_handle;

	/*
	* our parent buffer; if the dnode points to us directly,
	* db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
	* only accessed by sync thread ???
	* (NULL when evicted)
	* May change from NULL to non-NULL under the protection of db_mtx
	* (see dbuf_check_blkptr())
	*/
	struct dmu_buf_impl *db_parent;

	/*
	* link for hash table of all dmu_buf_impl_t's
	*/
	struct dmu_buf_impl *db_hash_next;

	/* our block number */
	uint64_t db_blkid;

	/*
	* Pointer to the blkptr_t which points to us. May be NULL if we
	* don't have one yet. (NULL when evicted)
	*/
	blkptr_t *db_blkptr;

	/*
	* Our indirection level. Data buffers have db_level==0.
	* Indirect buffers which point to data buffers have
	* db_level==1. etc. Buffers which contain dnodes have
	* db_level==0, since the dnodes are stored in a file.
	*/
	uint8_t db_level;

	/* db_mtx protects the members below */
	kmutex_t db_mtx;

	/*
	* Current state of the buffer
	*/
	dbuf_states_t db_state;

	/*
	* Refcount accessed by dmu_buf_{hold,rele}.
	* If nonzero, the buffer can't be destroyed.
	* Protected by db_mtx.
	*/
	refcount_t db_holds;

	/* buffer holding our data */
	arc_buf_t *db_buf;

	kcondvar_t db_changed;
	dbuf_dirty_record_t *db_data_pending;

	/* pointer to most recent dirty record for this buffer */
	dbuf_dirty_record_t *db_last_dirty;

	/*
	* Our link on the owner dnodes's dn_dbufs list.
	* Protected by its dn_dbufs_mtx.
	*/
	avl_node_t db_link;

	/*
	* Link in dbuf_cache.
	*/
	multilist_node_t db_cache_link;

	/* Data which is unique to data (leaf) blocks: */

	/* User callback information. */
	dmu_buf_user_t *db_user;

	/*
	* Evict user data as soon as the dirty and reference
	* counts are equal.
	*/
	uint8_t db_user_immediate_evict;

	/*
	* This block was freed while a read or write was
	* active.
	*/
	uint8_t db_freed_in_flight;

	/*
	* dnode_evict_dbufs() or dnode_evict_bonus() tried to
	* evict this dbuf, but couldn't due to outstanding
	* references. Evict once the refcount drops to 0.
	*/
	uint8_t db_pending_evict;

	uint8_t db_dirtycnt;
	} dmu_buf_impl_t;

	/* Note: the dbuf hash table is exposed only for the mdb module */
	#define DBUF_MUTEXES 256
	#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
	typedef struct dbuf_hash_table {
	uint64_t hash_table_mask;
	dmu_buf_impl_t **hash_table;
	kmutex_t hash_mutexes[DBUF_MUTEXES];
	} dbuf_hash_table_t;

	uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);

	dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
	void dbuf_create_bonus(struct dnode *dn);
	int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
	void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);

	void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);

	dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
	dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
	void *tag);
	int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
	boolean_t fail_sparse, boolean_t fail_uncached,
	void *tag, dmu_buf_impl_t **dbp);

	void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
	zio_priority_t prio, arc_flags_t aflags);

	void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
	boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
	uint64_t blkid, void *tag);
	uint64_t dbuf_refcount(dmu_buf_impl_t *db);

	void dbuf_rele(dmu_buf_impl_t *db, void *tag);
	void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);

	dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
	uint64_t blkid);

	int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
	void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
	void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
	void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
	void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
	dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
	arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
	void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
	bp_embedded_type_t etype, enum zio_compress comp,
	int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);

	void dbuf_destroy(dmu_buf_impl_t *db);

	void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
	void dbuf_unoverride(dbuf_dirty_record_t *dr);
	void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
	void dbuf_release_bp(dmu_buf_impl_t *db);

	+boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
	+
	void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
	struct dmu_tx *);

	void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);

	#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
	#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
	#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
	#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
	#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))

	void dbuf_init(void);
	void dbuf_fini(void);

	boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);

	#define DBUF_GET_BUFC_TYPE(_db) \
	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)

	#define DBUF_IS_CACHEABLE(_db) \
	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
	(dbuf_is_metadata(_db) && \
	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))

	#define DBUF_IS_L2CACHEABLE(_db) \
	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
	(dbuf_is_metadata(_db) && \
	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))

	#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
	(((_level) > 0 || \
	DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))

	#ifdef ZFS_DEBUG

	/*
	* There should be a ## between the string literal and fmt, to make it
	* clear that we're joining two strings together, but gcc does not
	* support that preprocessor token.
	*/
	#define dprintf_dbuf(dbuf, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char __db_buf[32]; \
	uint64_t __db_obj = (dbuf)->db.db_object; \
	if (__db_obj == DMU_META_DNODE_OBJECT) \
	(void) strcpy(__db_buf, "mdn"); \
	else \
	(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
	(u_longlong_t)__db_obj); \
	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
	"obj=%s lvl=%u blkid=%lld " fmt, \
	__db_buf, (dbuf)->db_level, \
	(u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
	} \
	_NOTE(CONSTCOND) } while (0)

	#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
	kmem_free(__blkbuf, BP_SPRINTF_LEN); \
	} \
	_NOTE(CONSTCOND) } while (0)

	#define DBUF_VERIFY(db) dbuf_verify(db)

	#else

	#define dprintf_dbuf(db, fmt, ...)
	#define dprintf_dbuf_bp(db, bp, fmt, ...)
	#define DBUF_VERIFY(db)

	#endif


	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DBUF_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 332525)
	@@ -1,977 +1,984 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
	* Copyright 2013 DEY Storage Systems, Inc.
	* Copyright 2014 HybridCluster. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2013 Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_DMU_H
	#define _SYS_DMU_H

	/*
	* This file describes the interface that the DMU provides for its
	* consumers.
	*
	* The DMU also interacts with the SPA. That interface is described in
	* dmu_spa.h.
	*/

	#include <sys/zfs_context.h>
	#include <sys/cred.h>
	#include <sys/fs/zfs.h>
	#include <sys/zio_compress.h>
	#include <sys/zio_priority.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct uio;
	struct xuio;
	struct page;
	struct vnode;
	struct spa;
	struct zilog;
	struct zio;
	struct blkptr;
	struct zap_cursor;
	struct dsl_dataset;
	struct dsl_pool;
	struct dnode;
	struct drr_begin;
	struct drr_end;
	struct zbookmark_phys;
	struct spa;
	struct nvlist;
	struct arc_buf;
	struct zio_prop;
	struct sa_handle;
	struct file;

	typedef struct objset objset_t;
	typedef struct dmu_tx dmu_tx_t;
	typedef struct dsl_dir dsl_dir_t;
	typedef struct dnode dnode_t;

	/*
	* Selects how an object's data is byteswapped. For object types built
	* with DMU_OT() the selector is carried in the bits covered by
	* DMU_OT_BYTESWAP_MASK. DMU_BSWAP_NUMFUNCS is the number of selectors
	* and is the size of the dmu_ot_byteswap[] table.
	*/
	typedef enum dmu_object_byteswap {
	DMU_BSWAP_UINT8,
	DMU_BSWAP_UINT16,
	DMU_BSWAP_UINT32,
	DMU_BSWAP_UINT64,
	DMU_BSWAP_ZAP,
	DMU_BSWAP_DNODE,
	DMU_BSWAP_OBJSET,
	DMU_BSWAP_ZNODE,
	DMU_BSWAP_OLDACL,
	DMU_BSWAP_ACL,
	/*
	* Allocating a new byteswap type number makes the on-disk format
	* incompatible with any other format that uses the same number.
	*
	* Data can usually be structured to work with one of the
	* DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
	*/
	DMU_BSWAP_NUMFUNCS
	} dmu_object_byteswap_t;

	/*
	* Bit layout of an object type built with DMU_OT(): DMU_OT_NEWTYPE marks
	* the encoding, DMU_OT_METADATA flags metadata, and the bits covered by
	* DMU_OT_BYTESWAP_MASK hold a dmu_object_byteswap_t value.
	*/
	#define DMU_OT_NEWTYPE 0x80
	#define DMU_OT_METADATA 0x40
	#define DMU_OT_BYTESWAP_MASK 0x3f

	/*
	* Defines a uint8_t object type. Object types specify if the data
	* in the object is metadata (boolean) and how to byteswap the data
	* (dmu_object_byteswap_t).
	*/
	#define DMU_OT(byteswap, metadata) \
	(DMU_OT_NEWTYPE | \
	((metadata) ? DMU_OT_METADATA : 0) | \
	((byteswap) & DMU_OT_BYTESWAP_MASK))

	#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
	(ot) < DMU_OT_NUMTYPES)

	#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_METADATA) : \
	dmu_ot[(ot)].ot_metadata)

	/*
	* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
	* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
	* is repurposed for embedded BPs.
	*/
	#define DMU_OT_HAS_FILL(ot) \
	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)

	#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_BYTESWAP_MASK) : \
	dmu_ot[(ot)].ot_byteswap)

	/*
	* Object type numbers. Values below DMU_OT_NUMTYPES are assigned by
	* their position in this list and have an entry in the dmu_ot[] table;
	* the trailing comment on each names how its data is interpreted.
	* The DMU_OTN_* values that follow are composed with DMU_OT().
	*/
	typedef enum dmu_object_type {
	DMU_OT_NONE,
	/* general: */
	DMU_OT_OBJECT_DIRECTORY, /* ZAP */
	DMU_OT_OBJECT_ARRAY, /* UINT64 */
	DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
	DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
	DMU_OT_BPOBJ, /* UINT64 */
	DMU_OT_BPOBJ_HDR, /* UINT64 */
	/* spa: */
	DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
	DMU_OT_SPACE_MAP, /* UINT64 */
	/* zil: */
	DMU_OT_INTENT_LOG, /* UINT64 */
	/* dmu: */
	DMU_OT_DNODE, /* DNODE */
	DMU_OT_OBJSET, /* OBJSET */
	/* dsl: */
	DMU_OT_DSL_DIR, /* UINT64 */
	DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
	DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
	DMU_OT_DSL_PROPS, /* ZAP */
	DMU_OT_DSL_DATASET, /* UINT64 */
	/* zpl: */
	DMU_OT_ZNODE, /* ZNODE */
	DMU_OT_OLDACL, /* Old ACL */
	DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
	DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
	DMU_OT_MASTER_NODE, /* ZAP */
	DMU_OT_UNLINKED_SET, /* ZAP */
	/* zvol: */
	DMU_OT_ZVOL, /* UINT8 */
	DMU_OT_ZVOL_PROP, /* ZAP */
	/* other; for testing only! */
	DMU_OT_PLAIN_OTHER, /* UINT8 */
	DMU_OT_UINT64_OTHER, /* UINT64 */
	DMU_OT_ZAP_OTHER, /* ZAP */
	/* new object types: */
	DMU_OT_ERROR_LOG, /* ZAP */
	DMU_OT_SPA_HISTORY, /* UINT8 */
	DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
	DMU_OT_POOL_PROPS, /* ZAP */
	DMU_OT_DSL_PERMS, /* ZAP */
	DMU_OT_ACL, /* ACL */
	DMU_OT_SYSACL, /* SYSACL */
	DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
	DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
	DMU_OT_NEXT_CLONES, /* ZAP */
	DMU_OT_SCAN_QUEUE, /* ZAP */
	DMU_OT_USERGROUP_USED, /* ZAP */
	DMU_OT_USERGROUP_QUOTA, /* ZAP */
	DMU_OT_USERREFS, /* ZAP */
	DMU_OT_DDT_ZAP, /* ZAP */
	DMU_OT_DDT_STATS, /* ZAP */
	DMU_OT_SA, /* System attr */
	DMU_OT_SA_MASTER_NODE, /* ZAP */
	DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
	DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
	DMU_OT_SCAN_XLATE, /* ZAP */
	DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
	DMU_OT_DEADLIST, /* ZAP */
	DMU_OT_DEADLIST_HDR, /* UINT64 */
	DMU_OT_DSL_CLONES, /* ZAP */
	DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
	/*
	* Do not allocate new object types here. Doing so makes the on-disk
	* format incompatible with any other format that uses the same object
	* type number.
	*
	* When creating an object which does not have one of the above types
	* use the DMU_OTN_* type with the correct byteswap and metadata
	* values.
	*
	* The DMU_OTN_* types do not have entries in the dmu_ot table,
	* use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
	* of indexing into dmu_ot directly (this works for both DMU_OT_* types
	* and DMU_OTN_* types).
	*/
	DMU_OT_NUMTYPES,

	/*
	* Names for valid types declared with DMU_OT().
	*/
	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
	} dmu_object_type_t;

	/*
	* These flags are intended to be used to specify the "txg_how"
	* parameter when calling the dmu_tx_assign() function. See the comment
	* above dmu_tx_assign() for more details on the meaning of these flags.
	*/
	#define TXG_NOWAIT (0ULL)
	#define TXG_WAIT (1ULL<<0)
	#define TXG_NOTHROTTLE (1ULL<<1)

	void byteswap_uint64_array(void *buf, size_t size);
	void byteswap_uint32_array(void *buf, size_t size);
	void byteswap_uint16_array(void *buf, size_t size);
	void byteswap_uint8_array(void *buf, size_t size);
	void zap_byteswap(void *buf, size_t size);
	void zfs_oldacl_byteswap(void *buf, size_t size);
	void zfs_acl_byteswap(void *buf, size_t size);
	void zfs_znode_byteswap(void *buf, size_t size);

	#define DS_FIND_SNAPSHOTS (1<<0)
	#define DS_FIND_CHILDREN (1<<1)
	#define DS_FIND_SERIALIZE (1<<2)

	/*
	* The maximum number of bytes that can be accessed as part of one
	* operation, including metadata.
	*/
	#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
	#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */

	#define DMU_USERUSED_OBJECT (-1ULL)
	#define DMU_GROUPUSED_OBJECT (-2ULL)

	/*
	* artificial blkids for bonus buffer and spill blocks
	*/
	#define DMU_BONUS_BLKID (-1ULL)
	#define DMU_SPILL_BLKID (-2ULL)
	/*
	* Public routines to create, destroy, open, and close objsets.
	*/
	int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
	int dmu_objset_own(const char *name, dmu_objset_type_t type,
	boolean_t readonly, void *tag, objset_t **osp);
	void dmu_objset_rele(objset_t *os, void *tag);
	void dmu_objset_disown(objset_t *os, void *tag);
	int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);

	void dmu_objset_evict_dbufs(objset_t *os);
	int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
	void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
	int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
	struct nvlist *snaps);
	int dmu_objset_clone(const char *name, const char *origin);
	int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
	struct nvlist *errlist);
	int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
	int dmu_objset_snapshot_tmp(const char *, const char *, int);
	int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
	int flags);
	void dmu_objset_byteswap(void *buf, size_t size);
	int dsl_dataset_rename_snapshot(const char *fsname,
	const char *oldsnapname, const char *newsnapname, boolean_t recursive);
	+int dmu_objset_remap_indirects(const char *fsname);

	/*
	* Public view of a DMU buffer: which object and byte range it covers,
	* and a pointer to the buffered data.
	*/
	typedef struct dmu_buf {
	uint64_t db_object; /* object that this buffer is part of */
	uint64_t db_offset; /* byte offset in this object */
	uint64_t db_size; /* size of buffer in bytes */
	void *db_data; /* data in buffer */
	} dmu_buf_t;

	/*
	* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
	*/
	#define DMU_POOL_DIRECTORY_OBJECT 1
	#define DMU_POOL_CONFIG "config"
	#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
	#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
	#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
	#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
	#define DMU_POOL_ROOT_DATASET "root_dataset"
	#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
	#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
	#define DMU_POOL_ERRLOG_LAST "errlog_last"
	#define DMU_POOL_SPARES "spares"
	#define DMU_POOL_DEFLATE "deflate"
	#define DMU_POOL_HISTORY "history"
	#define DMU_POOL_PROPS "pool_props"
	#define DMU_POOL_L2CACHE "l2cache"
	#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
	#define DMU_POOL_DDT "DDT-%s-%s-%s"
	#define DMU_POOL_DDT_STATS "DDT-statistics"
	#define DMU_POOL_CREATION_VERSION "creation_version"
	#define DMU_POOL_SCAN "scan"
	#define DMU_POOL_FREE_BPOBJ "free_bpobj"
	#define DMU_POOL_BPTREE_OBJ "bptree_obj"
	#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
	#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
	#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
	+#define DMU_POOL_REMOVING "com.delphix:removing"
	+#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
	+#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"

	/*
	* Allocate an object from this objset. The range of object numbers
	* available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
	*
	* The transaction must be assigned to a txg. The newly allocated
	* object will be "held" in the transaction (ie. you can modify the
	* newly allocated object in this transaction).
	*
	* dmu_object_alloc() chooses an object and returns it in *objectp.
	*
	* dmu_object_claim() allocates a specific object number. If that
	* number is already allocated, it fails and returns EEXIST.
	*
	* Return 0 on success, or ENOSPC or EEXIST as specified above.
	*/
	uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
	int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
	int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);

	/*
	* Free an object from this objset.
	*
	* The object's data will be freed as well (ie. you don't need to call
	* dmu_free(object, 0, -1, tx)).
	*
	* The object need not be held in the transaction.
	*
	* If there are any holds on this object's buffers (via dmu_buf_hold()),
	* or tx holds on the object (via dmu_tx_hold_object()), you can not
	* free it; it fails and returns EBUSY.
	*
	* If the object is not allocated, it fails and returns ENOENT.
	*
	* Return 0 on success, or EBUSY or ENOENT as specified above.
	*/
	int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);

	/*
	* Find the next allocated or free object.
	*
	* The objectp parameter is in-out. It will be updated to be the next
	* object which is allocated. Ignore objects which have not been
	* modified since txg.
	*
	* XXX Can only be called on an objset with no dirty data.
	*
	* Returns 0 on success, or ENOENT if there are no more objects.
	*/
	int dmu_object_next(objset_t *os, uint64_t *objectp,
	boolean_t hole, uint64_t txg);

	/*
	* Set the data blocksize for an object.
	*
	* The object cannot have any blocks allocated beyond the first. If
	* the first block is allocated already, the new size must be greater
	* than the current block size. If these conditions are not met,
	* ENOTSUP will be returned.
	*
	* Returns 0 on success, or EBUSY if there are any holds on the object
	* contents, or ENOTSUP as described above.
	*/
	int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
	int ibs, dmu_tx_t *tx);

	/*
	* Set the checksum property on a dnode. The new checksum algorithm will
	* apply to all newly written blocks; existing blocks will not be affected.
	*/
	void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
	dmu_tx_t *tx);

	/*
	* Set the compress property on a dnode. The new compression algorithm will
	* apply to all newly written blocks; existing blocks will not be affected.
	*/
	void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
	dmu_tx_t *tx);

	+int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
	+
	void
	dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
	void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
	int compressed_size, int byteorder, dmu_tx_t *tx);

	/*
	* Decide how to write a block: checksum, compression, number of copies, etc.
	*/
	#define WP_NOFILL 0x1
	#define WP_DMU_SYNC 0x2
	#define WP_SPILL 0x4

	void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
	struct zio_prop *zp);
	/*
	* The bonus data is accessed more or less like a regular buffer.
	* You must dmu_bonus_hold() to get the buffer, which will give you a
	* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
	- * data. As with any normal buffer, you must call dmu_buf_read() to
	- * read db_data, dmu_buf_will_dirty() before modifying it, and the
	+ * data. As with any normal buffer, you must call dmu_buf_will_dirty()
	+ * before modifying it, and the
	* object must be held in an assigned transaction before calling
	* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
	* buffer as well. You must release your hold with dmu_buf_rele().
	*
	* Returns ENOENT, EIO, or 0.
	*/
	int dmu_bonus_hold(objset_t os, uint64_t object, void tag, dmu_buf_t **);
	int dmu_bonus_max(void);
	int dmu_set_bonus(dmu_buf_t , int, dmu_tx_t );
	int dmu_set_bonustype(dmu_buf_t , dmu_object_type_t, dmu_tx_t );
	dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
	int dmu_rm_spill(objset_t , uint64_t, dmu_tx_t );

	/*
	* Special spill buffer support used by "SA" framework
	*/

	int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
	int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
	void *tag, dmu_buf_t **dbp);
	int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);

	/*
	* Obtain the DMU buffer from the specified object which contains the
	* specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
	* that it will remain in memory. You must release the hold with
	* dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
	* hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
	*
	* You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
	* on the returned buffer before reading or writing the buffer's
	* db_data. The comments for those routines describe what particular
	* operations are valid after calling them.
	*
	* The object number must be a valid, allocated object number.
	*/
	int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
	void *tag, dmu_buf_t **, int flags);
	int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
	void *tag, dmu_buf_t **dbp, int flags);

	/*
	* Add a reference to a dmu buffer that has already been held via
	* dmu_buf_hold() in the current context.
	*/
	void dmu_buf_add_ref(dmu_buf_t *db, void *tag);

	/*
	* Attempt to add a reference to a dmu buffer that is in an unknown state,
	* using a pointer that may have been invalidated by eviction processing.
	* The request will succeed if the passed in dbuf still represents the
	* same os/object/blkid, is ineligible for eviction, and has at least
	* one hold by a user other than the syncer.
	*/
	boolean_t dmu_buf_try_add_ref(dmu_buf_t , objset_t os, uint64_t object,
	uint64_t blkid, void *tag);

	void dmu_buf_rele(dmu_buf_t db, void tag);
	uint64_t dmu_buf_refcount(dmu_buf_t *db);

	/*
	* dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
	* range of an object. A pointer to an array of dmu_buf_t*'s is
	* returned (in *dbpp).
	*
	* dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
	* frees the array. The hold on the array of buffers MUST be released
	* with dmu_buf_rele_array. You can NOT release the hold on each buffer
	* individually with dmu_buf_rele.
	*/
	int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
	uint64_t length, boolean_t read, void *tag,
	int *numbufsp, dmu_buf_t ***dbpp);
	void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);

	typedef void dmu_buf_evict_func_t(void *user_ptr);

	/*
	* A DMU buffer user object may be associated with a dbuf for the
	* duration of its lifetime. This allows the user of a dbuf (client)
	* to attach private data to a dbuf (e.g. in-core only data such as a
	* dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
	* when that dbuf has been evicted. Clients typically respond to the
	* eviction notification by freeing their private data, thus ensuring
	* the same lifetime for both dbuf and private data.
	*
	* The mapping from a dmu_buf_user_t to any client private data is the
	* client's responsibility. All current consumers of the API with private
	* data embed a dmu_buf_user_t as the first member of the structure for
	* their private data. This allows conversions between the two types
	* with a simple cast. Since the DMU buf user API never needs access
	* to the private data, other strategies can be employed if necessary
	* or convenient for the client (e.g. using container_of() to do the
	* conversion for private data that cannot have the dmu_buf_user_t as
	* its first member).
	*
	* Eviction callbacks are executed without the dbuf mutex held or any
	* other type of mechanism to guarantee that the dbuf is still available.
	* For this reason, users must assume the dbuf has already been freed
	* and not reference the dbuf from the callback context.
	*
	* Users requesting "immediate eviction" are notified as soon as the dbuf
	* is only referenced by dirty records (dirties == holds). Otherwise the
	* notification occurs after eviction processing for the dbuf begins.
	*/
	typedef struct dmu_buf_user {
	/*
	* Asynchronous user eviction callback state.
	*/
	taskq_ent_t dbu_tqent;

	/*
	* This instance's eviction function pointers.
	*
	* dbu_evict_func_sync is called synchronously and then
	* dbu_evict_func_async is executed asynchronously on a taskq.
	*/
	dmu_buf_evict_func_t *dbu_evict_func_sync;
	dmu_buf_evict_func_t *dbu_evict_func_async;
	#ifdef ZFS_DEBUG
	/*
	* Pointer to user's dbuf pointer. NULL for clients that do
	* not associate a dbuf with their user data.
	*
	* The dbuf pointer is cleared upon eviction so as to catch
	* use-after-evict bugs in clients.
	*/
	dmu_buf_t **dbu_clear_on_evict_dbufp;
	#endif
	} dmu_buf_user_t;

	/*
	* Initialize the given dmu_buf_user_t instance with the eviction function
	* evict_func, to be called when the user is evicted.
	*
	* NOTE: This function should only be called once on a given dmu_buf_user_t.
	* To allow enforcement of this, dbu must already be zeroed on entry.
	*/
	/ARGSUSED/
	inline void
	dmu_buf_init_user(dmu_buf_user_t dbu, dmu_buf_evict_func_t evict_func_sync,
	dmu_buf_evict_func_t evict_func_async, dmu_buf_t *clear_on_evict_dbufp)
	{
	ASSERT(dbu->dbu_evict_func_sync == NULL);
	ASSERT(dbu->dbu_evict_func_async == NULL);

	/* must have at least one evict func */
	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
	dbu->dbu_evict_func_sync = evict_func_sync;
	dbu->dbu_evict_func_async = evict_func_async;
	#ifdef ZFS_DEBUG
	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
	#endif
	}

	/*
	* Attach user data to a dbuf and mark it for normal (when the dbuf's
	* data is cleared or its reference count goes to zero) eviction processing.
	*
	* Returns NULL on success, or the existing user if another user currently
	* owns the buffer.
	*/
	void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	* Attach user data to a dbuf and mark it for immediate (its dirty and
	* reference counts are equal) eviction processing.
	*
	* Returns NULL on success, or the existing user if another user currently
	* owns the buffer.
	*/
	void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	* Replace the current user of a dbuf.
	*
	* If given the current user of a dbuf, replaces the dbuf's user with
	* "new_user" and returns the user data pointer that was replaced.
	* Otherwise returns the current, and unmodified, dbuf user pointer.
	*/
	void *dmu_buf_replace_user(dmu_buf_t *db,
	dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);

	/*
	* Remove the specified user data for a DMU buffer.
	*
	* Returns the user that was removed on success, or the current user if
	* another user currently owns the buffer.
	*/
	void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
	*/
	void *dmu_buf_get_user(dmu_buf_t *db);

	objset_t *dmu_buf_get_objset(dmu_buf_t *db);
	dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
	void dmu_buf_dnode_exit(dmu_buf_t *db);

	/* Block until any in-progress dmu buf user evictions complete. */
	void dmu_buf_user_evict_wait(void);

	/*
	* Returns the blkptr associated with this dbuf, or NULL if not set.
	*/
	struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);

	/*
	* Indicate that you are going to modify the buffer's data (db_data).
	*
	* The transaction (tx) must be assigned to a txg (ie. you've called
	* dmu_tx_assign()). The buffer's object must be held in the tx
	* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
	*/
	void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);

	/*
	* You must create a transaction, then hold the objects which you will
	* (or might) modify as part of this transaction. Then you must assign
	* the transaction to a transaction group. Once the transaction has
	* been assigned, you can modify buffers which belong to held objects as
	* part of this transaction. You can't modify buffers before the
	* transaction has been assigned; you can't modify buffers which don't
	* belong to objects which this transaction holds; you can't hold
	* objects once the transaction has been assigned. You may hold an
	* object which you are going to free (with dmu_object_free()), but you
	* don't have to.
	*
	* You can abort the transaction before it has been assigned.
	*
	* Note that you may hold buffers (with dmu_buf_hold) at any time,
	* regardless of transaction state.
	*/

	#define DMU_NEW_OBJECT (-1ULL)
	#define DMU_OBJECT_END (-1ULL)

	dmu_tx_t *dmu_tx_create(objset_t *os);
	void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
	void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
	int len);
	void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
	uint64_t len);
	void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
	uint64_t len);
	+void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
	void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
	void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
	const char *name);
	void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
	void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
	void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
	void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
	void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
	void dmu_tx_abort(dmu_tx_t *tx);
	int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
	void dmu_tx_wait(dmu_tx_t *tx);
	void dmu_tx_commit(dmu_tx_t *tx);
	void dmu_tx_mark_netfree(dmu_tx_t *tx);

	/*
	* To register a commit callback, dmu_tx_callback_register() must be called.
	*
	* dcb_data is a pointer to caller private data that is passed on as a
	* callback parameter. The caller is responsible for properly allocating and
	* freeing it.
	*
	* When registering a callback, the transaction must be already created, but
	* it cannot be committed or aborted. It can be assigned to a txg or not.
	*
	* The callback will be called after the transaction has been safely written
	* to stable storage and will also be called if the dmu_tx is aborted.
	* If there is any error which prevents the transaction from being committed to
	* disk, the callback will be called with a value of error != 0.
	*/
	typedef void dmu_tx_callback_func_t(void *dcb_data, int error);

	void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
	void *dcb_data);

	/*
	* Free up the data blocks for a defined range of a file. If size is
	* -1, the range from offset to end-of-file is freed.
	*/
	int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size, dmu_tx_t *tx);
	int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size);
	int dmu_free_long_object(objset_t *os, uint64_t object);

	/*
	* Convenience functions.
	*
	* Canfail routines will return 0 on success, or an errno if there is a
	* nonrecoverable I/O error.
	*/
	#define DMU_READ_PREFETCH 0 /* prefetch */
	#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
	int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	void *buf, uint32_t flags);
	int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
	uint32_t flags);
	void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	const void *buf, dmu_tx_t *tx);
	void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
	const void *buf, dmu_tx_t *tx);
	void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	dmu_tx_t *tx);
	int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
	int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
	int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
	dmu_tx_t *tx);
	int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
	dmu_tx_t *tx);
	#ifdef _KERNEL
	#ifdef illumos
	int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size, struct page *pp, dmu_tx_t *tx);
	#else
	int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
	int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
	int *rbehind, int *rahead, int last_size);
	#endif
	#endif
	struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
	void dmu_return_arcbuf(struct arc_buf *buf);
	void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
	dmu_tx_t *tx);
	int dmu_xuio_init(struct xuio *uio, int niov);
	void dmu_xuio_fini(struct xuio *uio);
	int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
	size_t n);
	int dmu_xuio_cnt(struct xuio *uio);
	struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
	void dmu_xuio_clear(struct xuio *uio, int i);
	void xuio_stat_wbuf_copied(void);
	void xuio_stat_wbuf_nocopy(void);

	extern boolean_t zfs_prefetch_disable;
	extern int zfs_max_recordsize;

	/*
	* Asynchronously try to read in the data.
	*/
	void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
	uint64_t len, enum zio_priority pri);

	typedef struct dmu_object_info {
	/* All sizes are in bytes unless otherwise indicated. */
	uint32_t doi_data_block_size;
	uint32_t doi_metadata_block_size;
	dmu_object_type_t doi_type;
	dmu_object_type_t doi_bonus_type;
	uint64_t doi_bonus_size;
	uint8_t doi_indirection; /* 2 = dnode->indirect->data */
	uint8_t doi_checksum;
	uint8_t doi_compress;
	uint8_t doi_nblkptr;
	uint8_t doi_pad[4];
	uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
	uint64_t doi_max_offset;
	uint64_t doi_fill_count; /* number of non-empty blocks */
	} dmu_object_info_t;

	typedef void arc_byteswap_func_t(void *buf, size_t size);

	/*
	* Attributes of one object type. The dmu_ot[] table holds one of these
	* per type below DMU_OT_NUMTYPES; DMU_OT_IS_METADATA() and
	* DMU_OT_BYTESWAP() read ot_metadata and ot_byteswap from it.
	*/
	typedef struct dmu_object_type_info {
	dmu_object_byteswap_t ot_byteswap;
	boolean_t ot_metadata;
	char *ot_name;
	} dmu_object_type_info_t;

	/*
	* One byteswap routine and its name. The dmu_ot_byteswap[] table holds
	* DMU_BSWAP_NUMFUNCS of these.
	*/
	typedef struct dmu_object_byteswap_info {
	arc_byteswap_func_t *ob_func;
	char *ob_name;
	} dmu_object_byteswap_info_t;

	extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
	extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];

	/*
	* Get information on a DMU object.
	*
	* Return 0 on success or ENOENT if object is not allocated.
	*
	* If doi is NULL, just indicates whether the object exists.
	*/
	int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
	/* Like dmu_object_info, but faster if you have a held dnode in hand. */
	void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
	/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
	void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
	/*
	* Like dmu_object_info_from_db, but faster still when you only care about
	* the size. This is specifically optimized for zfs_getattr().
	*/
	void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
	u_longlong_t *nblk512);

	typedef struct dmu_objset_stats {
	uint64_t dds_num_clones; /* number of clones of this */
	uint64_t dds_creation_txg;
	uint64_t dds_guid;
	dmu_objset_type_t dds_type;
	uint8_t dds_is_snapshot;
	uint8_t dds_inconsistent;
	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
	} dmu_objset_stats_t;

	/*
	* Get stats on a dataset.
	*/
	void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);

	/*
	* Add entries to the nvlist for all the objset's properties. See
	* zfs_prop_table[] and zfs(1m) for details on the properties.
	*/
	void dmu_objset_stats(objset_t *os, struct nvlist *nv);

	/*
	* Get the space usage statistics for statvfs().
	*
	* refdbytes is the amount of space "referenced" by this objset.
	* availbytes is the amount of space available to this objset, taking
	* into account quotas & reservations, assuming that no other objsets
	* use the space first. These values correspond to the 'referenced' and
	* 'available' properties, described in the zfs(1m) manpage.
	*
	* usedobjs and availobjs are the number of objects currently allocated,
	* and available.
	*/
	void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
	uint64_t *usedobjsp, uint64_t *availobjsp);

	/*
	* The fsid_guid is a 56-bit ID that can change to avoid collisions.
	* (Contrast with the ds_guid which is a 64-bit ID that will never
	* change, so there is a small probability that it will collide.)
	*/
	uint64_t dmu_objset_fsid_guid(objset_t *os);

	/*
	* Get the [cm]time for an objset's snapshot dir
	*/
	timestruc_t dmu_objset_snap_cmtime(objset_t *os);

	int dmu_objset_is_snapshot(objset_t *os);

	extern struct spa *dmu_objset_spa(objset_t *os);
	extern struct zilog *dmu_objset_zil(objset_t *os);
	extern struct dsl_pool *dmu_objset_pool(objset_t *os);
	extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
	extern void dmu_objset_name(objset_t *os, char *buf);
	extern dmu_objset_type_t dmu_objset_type(objset_t *os);
	extern uint64_t dmu_objset_id(objset_t *os);
	extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
	extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
	extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
	uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
	extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
	int maxlen, boolean_t *conflict);
	extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
	uint64_t *idp, uint64_t *offp);

	typedef int objset_used_cb_t(dmu_object_type_t bonustype,
	void *bonus, uint64_t *userp, uint64_t *groupp);
	extern void dmu_objset_register_type(dmu_objset_type_t ost,
	objset_used_cb_t *cb);
	extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
	extern void *dmu_objset_get_user(objset_t *os);

	/*
	* Return the txg number for the given assigned transaction.
	*/
	uint64_t dmu_tx_get_txg(dmu_tx_t *tx);

	/*
	* Synchronous write.
	* If a parent zio is provided this function initiates a write on the
	* provided buffer as a child of the parent zio.
	* In the absence of a parent zio, the write is completed synchronously.
	* At write completion, blk is filled with the bp of the written block.
	* Note that while the data covered by this function will be on stable
	* storage when the write completes this new data does not become a
	* permanent part of the file until the associated transaction commits.
	*/

	/*
	* {zfs,zvol,ztest}_get_done() args
	*/
	typedef struct zgd {
	struct lwb *zgd_lwb;
	struct blkptr *zgd_bp;
	dmu_buf_t *zgd_db;
	struct rl *zgd_rl;
	void *zgd_private;
	} zgd_t;

	typedef void dmu_sync_cb_t(zgd_t *arg, int error);
	int dmu_sync(struct zio zio, uint64_t txg, dmu_sync_cb_t done, zgd_t *zgd);

	/*
	* Find the next hole or data block in file starting at *off
	* Return found offset in *off. Return ESRCH for end of file.
	*/
	int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
	uint64_t *off);

	/*
	* Check if a DMU object has any dirty blocks. If so, sync out
	* all pending transaction groups. Otherwise, this function
	* does not alter DMU state. This could be improved to only sync
	* out the necessary transaction groups for this particular
	* object.
	*/
	int dmu_object_wait_synced(objset_t *os, uint64_t object);

	/*
	* Initial setup and final teardown.
	*/
	extern void dmu_init(void);
	extern void dmu_fini(void);

	/*
	* dmu_traverse_cb_t is a pointer-to-function type: dmu_traverse_objset()
	* takes it by value as 'cb' together with an opaque 'arg'.
	*/
	typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
	uint64_t object, uint64_t offset, int len);
	void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
	dmu_traverse_cb_t cb, void *arg);
	int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
	struct file *fp, offset_t *offp);

	/* CRC64 table */
	#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
	extern uint64_t zfs_crc64_table[256];

	extern int zfs_mdcomp_disable;

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DMU_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 332525)
	@@ -1,362 +1,363 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	#ifndef _SYS_DNODE_H
	#define _SYS_DNODE_H

	#include <sys/zfs_context.h>
	#include <sys/avl.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/zio.h>
	#include <sys/refcount.h>
	#include <sys/dmu_zfetch.h>
	#include <sys/zrlock.h>
	#include <sys/multilist.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* dnode_hold() flags.
	*/
	#define DNODE_MUST_BE_ALLOCATED 1
	#define DNODE_MUST_BE_FREE 2

	/*
	* dnode_next_offset() flags.
	*/
	#define DNODE_FIND_HOLE 1
	#define DNODE_FIND_BACKWARDS 2
	#define DNODE_FIND_HAVELOCK 4

	/*
	* Fixed constants.
	*/
	#define DNODE_SHIFT 9 /* 512 bytes */
	#define DN_MIN_INDBLKSHIFT 12 /* 4k */
	/*
	* If we ever increase this value beyond 20, we need to revisit all logic that
	* does x << level * ebps to handle overflow. With a 1M indirect block size,
	* 4 levels of indirect blocks would not be able to guarantee addressing an
	* entire object, so 5 levels will be used, but 5 * (20 - 7) = 65.
	*/
	#define DN_MAX_INDBLKSHIFT 17 /* 128k */
	#define DNODE_BLOCK_SHIFT 14 /* 16k */
	#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
	#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
	#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */

	/*
	* dnode id flags
	*
	* Note: a file will never ever have its
	* ids moved from bonus->spill
	* and only in a crypto environment would it be on spill
	*/
	#define DN_ID_CHKED_BONUS 0x1
	#define DN_ID_CHKED_SPILL 0x2
	#define DN_ID_OLD_EXIST 0x4
	#define DN_ID_NEW_EXIST 0x8

	/*
	* Derived constants.
	*/
	#define DNODE_SIZE (1 << DNODE_SHIFT)
	#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
	#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
	#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
	#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
	#define DN_KILL_SPILLBLK (1)

	#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
	#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)

	/*
	* This is inaccurate if the indblkshift of the particular object is not the
	* max. But it's only used by userland to calculate the zvol reservation.
	*/
	#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
	#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)

	/* The +2 here is a cheesy way to round up */
	#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
	(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))

	#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
	(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))

	#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
	(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)

	/*
	* 1 << (blkshift - typeshift). Both arguments are parenthesized so that
	* expression arguments (e.g. a conditional) expand with the intended
	* precedence.
	*/
	#define EPB(blkshift, typeshift) (1 << ((blkshift) - (typeshift)))

	struct dmu_buf_impl;
	struct objset;
	struct zio;

	enum dnode_dirtycontext {
	DN_UNDIRTIED,
	DN_DIRTY_OPEN,
	DN_DIRTY_SYNC
	};

	/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
	#define DNODE_FLAG_USED_BYTES (1<<0)
	#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)

	/* Does dnode have a SA spill blkptr in bonus? */
	#define DNODE_FLAG_SPILL_BLKPTR (1<<2)

	typedef struct dnode_phys {
	uint8_t dn_type; /* dmu_object_type_t */
	uint8_t dn_indblkshift; /* ln2(indirect block size) */
	uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
	uint8_t dn_nblkptr; /* length of dn_blkptr */
	uint8_t dn_bonustype; /* type of data in bonus buffer */
	uint8_t dn_checksum; /* ZIO_CHECKSUM type */
	uint8_t dn_compress; /* ZIO_COMPRESS type */
	uint8_t dn_flags; /* DNODE_FLAG_* */
	uint16_t dn_datablkszsec; /* data block size in 512b sectors */
	uint16_t dn_bonuslen; /* length of dn_bonus */
	uint8_t dn_pad2[4];

	/* accounting is protected by dn_dirty_mtx */
	uint64_t dn_maxblkid; /* largest allocated block ID */
	uint64_t dn_used; /* bytes (or sectors) of disk space */

	uint64_t dn_pad3[4];

	blkptr_t dn_blkptr[1];
	uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
	blkptr_t dn_spill;
	} dnode_phys_t;

	struct dnode {
	/*
	* Protects the structure of the dnode, including the number of levels
	* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
	*/
	krwlock_t dn_struct_rwlock;

	/* Our link on dn_objset->os_dnodes list; protected by os_lock. */
	list_node_t dn_link;

	/* immutable: */
	struct objset *dn_objset;
	uint64_t dn_object;
	struct dmu_buf_impl *dn_dbuf;
	struct dnode_handle *dn_handle;
	dnode_phys_t dn_phys; / pointer into dn->dn_dbuf->db.db_data */

	/*
	* Copies of stuff in dn_phys. They're valid in the open
	* context (eg. even before the dnode is first synced).
	* Where necessary, these are protected by dn_struct_rwlock.
	*/
	dmu_object_type_t dn_type; /* object type */
	uint16_t dn_bonuslen; /* bonus length */
	uint8_t dn_bonustype; /* bonus type */
	uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
	uint8_t dn_checksum; /* ZIO_CHECKSUM type */
	uint8_t dn_compress; /* ZIO_COMPRESS type */
	uint8_t dn_nlevels;
	uint8_t dn_indblkshift;
	uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
	uint8_t dn_moved; /* Has this dnode been moved? */
	uint16_t dn_datablkszsec; /* in 512b sectors */
	uint32_t dn_datablksz; /* in bytes */
	uint64_t dn_maxblkid;
	uint8_t dn_next_type[TXG_SIZE];
	uint8_t dn_next_nblkptr[TXG_SIZE];
	uint8_t dn_next_nlevels[TXG_SIZE];
	uint8_t dn_next_indblkshift[TXG_SIZE];
	uint8_t dn_next_bonustype[TXG_SIZE];
	uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
	uint16_t dn_next_bonuslen[TXG_SIZE];
	uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */

	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
	uint32_t dn_dbufs_count; /* count of dn_dbufs */

	/* protected by os_lock: */
	multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */

	/* protected by dn_mtx: */
	kmutex_t dn_mtx;
	list_t dn_dirty_records[TXG_SIZE];
	struct range_tree *dn_free_ranges[TXG_SIZE];
	uint64_t dn_allocated_txg;
	uint64_t dn_free_txg;
	uint64_t dn_assigned_txg;
	kcondvar_t dn_notxholds;
	enum dnode_dirtycontext dn_dirtyctx;
	uint8_t dn_dirtyctx_firstset; / dbg: contents meaningless */

	/* protected by own devices */
	refcount_t dn_tx_holds;
	refcount_t dn_holds;

	kmutex_t dn_dbufs_mtx;
	/*
	* Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
	* can contain multiple dbufs of the same (level, blkid) when a
	* dbuf is marked DB_EVICTING without being removed from
	* dn_dbufs. To maintain the avl invariant that there cannot be
	* duplicate entries, we order the dbufs by an arbitrary value -
	* their address in memory. This means that dn_dbufs cannot be used to
	* directly look up a dbuf. Instead, callers must use avl_walk, have
	* a reference to the dbuf, or look up a non-existant node with
	* db_state = DB_SEARCH (see dbuf_free_range for an example).
	*/
	avl_tree_t dn_dbufs;

	/* protected by dn_struct_rwlock */
	struct dmu_buf_impl dn_bonus; / bonus buffer dbuf */

	boolean_t dn_have_spill; /* have spill or are spilling */

	/* parent IO for current sync write */
	zio_t *dn_zio;

	/* used in syncing context */
	uint64_t dn_oldused; /* old phys used bytes */
	uint64_t dn_oldflags; /* old phys dn_flags */
	uint64_t dn_olduid, dn_oldgid;
	uint64_t dn_newuid, dn_newgid;
	int dn_id_flags;

	/* holds prefetch structure */
	struct zfetch dn_zfetch;
	};

	/*
	* Adds a level of indirection between the dbuf and the dnode to avoid
	* iterating descendent dbufs in dnode_move(). Handles are not allocated
	* individually, but as an array of child dnodes in dnode_hold_impl().
	*/
	typedef struct dnode_handle {
	/* Protects dnh_dnode from modification by dnode_move(). */
	zrlock_t dnh_zrlock;
	dnode_t *dnh_dnode;
	} dnode_handle_t;

	typedef struct dnode_children {
	dmu_buf_user_t dnc_dbu; /* User evict data */
	size_t dnc_count; /* number of children */
	dnode_handle_t dnc_children[]; /* sized dynamically */
	} dnode_children_t;

	typedef struct free_range {
	avl_node_t fr_node;
	uint64_t fr_blkid;
	uint64_t fr_nblks;
	} free_range_t;

	/*
	* dnode entry points. All of them operate on a dnode (or objset) passed by
	* pointer; dnode_hold()/dnode_hold_impl() return the dnode through **dnp.
	*/
	void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
	uint64_t object, dnode_handle_t *dnh);
	void dnode_special_close(dnode_handle_t *dnh);

	void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
	void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
	void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);

	int dnode_hold(struct objset *dd, uint64_t object,
	void *ref, dnode_t **dnp);
	int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
	void *ref, dnode_t **dnp);
	boolean_t dnode_add_ref(dnode_t *dn, void *ref);
	void dnode_rele(dnode_t *dn, void *ref);
	void dnode_rele_and_unlock(dnode_t *dn, void *tag);
	void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
	void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
	void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
	dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
	void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
	dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
	void dnode_free(dnode_t *dn, dmu_tx_t *tx);
	void dnode_byteswap(dnode_phys_t *dnp);
	void dnode_buf_byteswap(void *buf, size_t size);
	void dnode_verify(dnode_t *dn);
	int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
	void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
	void dnode_diduse_space(dnode_t *dn, int64_t space);
	void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
	uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
	void dnode_init(void);
	void dnode_fini(void);
	int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
	int minlvl, uint64_t blkfill, uint64_t txg);
	void dnode_evict_dbufs(dnode_t *dn);
	void dnode_evict_bonus(dnode_t *dn);
	+boolean_t dnode_needs_remap(const dnode_t *dn);

	/*
	* True when the objset's os_primary_cache is ZFS_CACHE_ALL, or when it is
	* ZFS_CACHE_METADATA and the dnode's type is a metadata type.
	*/
	#define DNODE_IS_CACHEABLE(_dn) \
	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
	(DMU_OT_IS_METADATA((_dn)->dn_type) && \
	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))

	/*
	* True when the objset's os_primary_cache is either ZFS_CACHE_ALL or
	* ZFS_CACHE_METADATA.
	*/
	#define DNODE_META_IS_CACHEABLE(_dn) \
	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)

	#ifdef ZFS_DEBUG

	/*
	* There should be a ## between the string literal and fmt, to make it
	* clear that we're joining two strings together, but that piece of shit
	* gcc doesn't support that preprocessor token.
	*/
	#define dprintf_dnode(dn, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char __db_buf[32]; \
	uint64_t __db_obj = (dn)->dn_object; \
	if (__db_obj == DMU_META_DNODE_OBJECT) \
	(void) strcpy(__db_buf, "mdn"); \
	else \
	(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
	(u_longlong_t)__db_obj);\
	dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
	__db_buf, __VA_ARGS__); \
	} \
	_NOTE(CONSTCOND) } while (0)

	#define DNODE_VERIFY(dn) dnode_verify(dn)
	#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)

	#else

	#define dprintf_dnode(db, fmt, ...)
	#define DNODE_VERIFY(dn)
	#define FREE_VERIFY(db, start, end, tx)

	#endif

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DNODE_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 332525)
	@@ -1,420 +1,450 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#ifndef _SYS_DSL_DATASET_H
	#define _SYS_DSL_DATASET_H

	#include <sys/dmu.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/zio.h>
	#include <sys/bplist.h>
	#include <sys/dsl_synctask.h>
	#include <sys/zfs_context.h>
	#include <sys/dsl_deadlist.h>
	#include <sys/refcount.h>
	#include <sys/rrwlock.h>
	#include <zfeature_common.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct dsl_dataset;
	struct dsl_dir;
	struct dsl_pool;

	#define DS_FLAG_INCONSISTENT (1ULL<<0)
	#define DS_IS_INCONSISTENT(ds) \
	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)

	/*
	* Do not allow this dataset to be promoted.
	*/
	#define DS_FLAG_NOPROMOTE (1ULL<<1)

	/*
	* DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
	* calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
	* refquota/refreservations).
	*/
	#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)

	/*
	* DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
	* on a dataset. This allows the dataset to be destroyed using 'zfs release'.
	*/
	#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
	#define DS_IS_DEFER_DESTROY(ds) \
	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)

	/*
	* DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
	* They should be of the format <reverse-dns>:<field>.
	*/

	/*
	* This field's value is the object ID of a zap object which contains the
	* bookmarks of this dataset. If it is present, then this dataset is counted
	* in the refcount of the SPA_FEATURES_BOOKMARKS feature.
	*/
	#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"

	/*
	* These fields are set on datasets that are in the middle of a resumable
	* receive, and allow the sender to resume the send if it is interrupted.
	*/
	#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
	#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
	#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
	#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
	#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
	#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
	#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
	#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
	#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"

	/*
	+ * This field is set to the object number of the remap deadlist if one exists.
	+ */
	+#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist"
	+
	+/*
	* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
	* name lookups should be performed case-insensitively.
	*/
	#define DS_FLAG_CI_DATASET (1ULL<<16)

	#define DS_CREATE_FLAG_NODIRTY (1ULL<<24)

	typedef struct dsl_dataset_phys {
	uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */
	uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */
	uint64_t ds_prev_snap_txg;
	uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */
	uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
	uint64_t ds_num_children; /* clone/snap children; ==0 for head */
	uint64_t ds_creation_time; /* seconds since 1970 */
	uint64_t ds_creation_txg;
	uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
	/*
	* ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
	* include all blocks referenced by this dataset, including those
	* shared with any other datasets.
	*/
	uint64_t ds_referenced_bytes;
	uint64_t ds_compressed_bytes;
	uint64_t ds_uncompressed_bytes;
	uint64_t ds_unique_bytes; /* only relevant to snapshots */
	/*
	* The ds_fsid_guid is a 56-bit ID that can change to avoid
	* collisions. The ds_guid is a 64-bit ID that will never
	* change, so there is a small probability that it will collide.
	*/
	uint64_t ds_fsid_guid;
	uint64_t ds_guid;
	uint64_t ds_flags; /* DS_FLAG_* */
	blkptr_t ds_bp;
	uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
	uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
	uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
	uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
	} dsl_dataset_phys_t;

	typedef struct dsl_dataset {
	dmu_buf_user_t ds_dbu;
	rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */

	/* Immutable: */
	struct dsl_dir *ds_dir;
	dmu_buf_t *ds_dbuf;
	uint64_t ds_object;
	uint64_t ds_fsid_guid;
	boolean_t ds_is_snapshot;

	/* only used in syncing context, only valid for non-snapshots: */
	struct dsl_dataset *ds_prev;
	uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */

	/* has internal locking: */
	dsl_deadlist_t ds_deadlist;
	bplist_t ds_pending_deadlist;

	+ /*
	+ * The remap deadlist contains blocks (DVA's, really) that are
	+ * referenced by the previous snapshot and point to indirect vdevs,
	+ * but in this dataset they have been remapped to point to concrete
	+ * (or at least, less-indirect) vdevs. In other words, the
	+ * physical DVA is referenced by the previous snapshot but not by
	+ * this dataset. Logically, the DVA continues to be referenced,
	+ * but we are using a different (less indirect) physical DVA.
	+ * This deadlist is used to determine when physical DVAs that
	+ * point to indirect vdevs are no longer referenced anywhere,
	+ * and thus should be marked obsolete.
	+ *
	+ * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
	+ */
	+ dsl_deadlist_t ds_remap_deadlist;
	+ /* protects creation of the ds_remap_deadlist */
	+ kmutex_t ds_remap_deadlist_lock;
	+
	/* protected by lock on pool's dp_dirty_datasets list */
	txg_node_t ds_dirty_link;
	list_node_t ds_synced_link;

	/*
	* ds_phys->ds_<accounting> is also protected by ds_lock.
	* Protected by ds_lock:
	*/
	kmutex_t ds_lock;
	objset_t *ds_objset;
	uint64_t ds_userrefs;
	void *ds_owner;

	/*
	* Long holds prevent the ds from being destroyed; they allow the
	* ds to remain held even after dropping the dp_config_rwlock.
	* Owning counts as a long hold. See the comments above
	* dsl_pool_hold() for details.
	*/
	refcount_t ds_longholds;

	/* no locking; only for making guesses */
	uint64_t ds_trysnap_txg;

	/* for objset_open() */
	kmutex_t ds_opening_lock;

	uint64_t ds_reserved; /* cached refreservation */
	uint64_t ds_quota; /* cached refquota */

	kmutex_t ds_sendstream_lock;
	list_t ds_sendstreams;

	/*
	* When in the middle of a resumable receive, tracks how much
	* progress we have made.
	*/
	uint64_t ds_resume_object[TXG_SIZE];
	uint64_t ds_resume_offset[TXG_SIZE];
	uint64_t ds_resume_bytes[TXG_SIZE];

	/* Protected by our dsl_dir's dd_lock */
	list_t ds_prop_cbs;

	/*
	* For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
	* uses this feature.
	*/
	uint8_t ds_feature_inuse[SPA_FEATURES];

	/*
	* Set if we need to activate the feature on this dataset this txg
	* (used only in syncing context).
	*/
	uint8_t ds_feature_activation_needed[SPA_FEATURES];

	/* Protected by ds_lock; keep at end of struct for better locality */
	char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
	} dsl_dataset_t;

	/*
	* Return the dataset's on-disk phys structure, i.e. the data pointer of
	* the dbuf referenced by ds->ds_dbuf.
	*/
	inline dsl_dataset_phys_t *
	dsl_dataset_phys(dsl_dataset_t *ds)
	{
	void *phys_buf = ds->ds_dbuf->db_data;

	return (phys_buf);
	}

	typedef struct dsl_dataset_promote_arg {
	const char *ddpa_clonename;
	dsl_dataset_t *ddpa_clone;
	list_t shared_snaps, origin_snaps, clone_snaps;
	dsl_dataset_t origin_origin; / origin of the origin */
	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
	nvlist_t *err_ds;
	cred_t *cr;
	} dsl_dataset_promote_arg_t;

	typedef struct dsl_dataset_rollback_arg {
	const char *ddra_fsname;
	const char *ddra_tosnap;
	void *ddra_owner;
	nvlist_t *ddra_result;
	} dsl_dataset_rollback_arg_t;

	typedef struct dsl_dataset_snapshot_arg {
	nvlist_t *ddsa_snaps;
	nvlist_t *ddsa_props;
	nvlist_t *ddsa_errors;
	cred_t *ddsa_cr;
	} dsl_dataset_snapshot_arg_t;

	/*
	* The max length of a temporary tag prefix is the number of hex digits
	* required to express UINT64_MAX plus one for the hyphen.
	*/
	#define MAX_TAG_PREFIX_LEN 17

	#define dsl_dataset_is_snapshot(ds) \
	(dsl_dataset_phys(ds)->ds_num_children != 0)

	#define DS_UNIQUE_IS_ACCURATE(ds) \
	((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)

	/*
	* Hold/own, naming, creation, snapshot and promote entry points. The
	* hold/own functions return the dataset through a dsl_dataset_t ** argument.
	*/
	int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
	dsl_dataset_t **dsp);
	boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
	void *tag);
	int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
	dsl_dataset_t **);
	void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
	int dsl_dataset_own(struct dsl_pool *dp, const char *name,
	void *tag, dsl_dataset_t **dsp);
	int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
	void *tag, dsl_dataset_t **dsp);
	void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
	void dsl_dataset_name(dsl_dataset_t *ds, char *name);
	boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
	int dsl_dataset_namelen(dsl_dataset_t *ds);
	boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
	uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
	dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
	uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
	uint64_t flags, dmu_tx_t *tx);
	void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx);
	int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx);
	int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
	void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx);
	int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx);
	int dsl_dataset_promote(const char *name, char *conflsnap);
	int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
	boolean_t force);
	int dsl_dataset_rename_snapshot(const char *fsname,
	const char *oldsnapname, const char *newsnapname, boolean_t recursive);
	int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
	minor_t cleanup_minor, const char *htag);

	blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);

	spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);

	boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
	dsl_dataset_t *snap);

	/* Sync and block born/kill accounting hooks; all arguments are pointers. */
	void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
	void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);

	void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
	dmu_tx_t *tx);
	int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
	dmu_tx_t *tx, boolean_t async);
	+void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev,
	+ uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx);

	void dsl_dataset_dirty(dsl_dataset_t ds, dmu_tx_t tx);

	int get_clones_stat_impl(dsl_dataset_t ds, nvlist_t val);
	char get_receive_resume_stats_impl(dsl_dataset_t ds);
	char get_child_receive_stats(dsl_dataset_t ds);
	uint64_t dsl_get_refratio(dsl_dataset_t *ds);
	uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds);
	uint64_t dsl_get_compressratio(dsl_dataset_t *ds);
	uint64_t dsl_get_used(dsl_dataset_t *ds);
	uint64_t dsl_get_creation(dsl_dataset_t *ds);
	uint64_t dsl_get_creationtxg(dsl_dataset_t *ds);
	uint64_t dsl_get_refquota(dsl_dataset_t *ds);
	uint64_t dsl_get_refreservation(dsl_dataset_t *ds);
	uint64_t dsl_get_guid(dsl_dataset_t *ds);
	uint64_t dsl_get_unique(dsl_dataset_t *ds);
	uint64_t dsl_get_objsetid(dsl_dataset_t *ds);
	uint64_t dsl_get_userrefs(dsl_dataset_t *ds);
	uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds);
	uint64_t dsl_get_referenced(dsl_dataset_t *ds);
	uint64_t dsl_get_numclones(dsl_dataset_t *ds);
	uint64_t dsl_get_inconsistent(dsl_dataset_t *ds);
	uint64_t dsl_get_available(dsl_dataset_t *ds);
	int dsl_get_written(dsl_dataset_t ds, uint64_t written);
	int dsl_get_prev_snap(dsl_dataset_t ds, char snap);
	int dsl_get_mountpoint(dsl_dataset_t ds, const char dsname, char *value,
	char *source);

	void get_clones_stat(dsl_dataset_t ds, nvlist_t nv);

	void dsl_dataset_stats(dsl_dataset_t os, nvlist_t nv);

	void dsl_dataset_fast_stat(dsl_dataset_t ds, dmu_objset_stats_t stat);
	void dsl_dataset_space(dsl_dataset_t *ds,
	uint64_t refdbytesp, uint64_t availbytesp,
	uint64_t usedobjsp, uint64_t availobjsp);
	uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
	int dsl_dataset_space_written(dsl_dataset_t oldsnap, dsl_dataset_t new,
	uint64_t usedp, uint64_t compp, uint64_t *uncompp);
	int dsl_dataset_space_wouldfree(dsl_dataset_t firstsnap, dsl_dataset_t last,
	uint64_t usedp, uint64_t compp, uint64_t *uncompp);
	boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);

	int dsl_dsobj_to_dsname(char pname, uint64_t obj, char buf);

	/*
	* Quota/reservation, long-hold, clone-swap, snapshot-impl and rollback
	* entry points. Datasets, transactions, names and tags are all passed by
	* pointer.
	*/
	int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
	uint64_t asize, uint64_t inflight, uint64_t *used,
	uint64_t *ref_rsrv);
	int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
	uint64_t quota);
	int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
	uint64_t reservation);

	boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
	uint64_t earlier_txg);
	void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
	void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
	boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);

	int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
	dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
	void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
	dsl_dataset_t *origin_head, dmu_tx_t *tx);
	int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
	dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
	void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
	dmu_tx_t *tx);

	void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
	dmu_tx_t *tx);
	void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
	int dsl_dataset_get_snapname(dsl_dataset_t *ds);
	int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
	uint64_t *value);
	int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
	boolean_t adj_cnt);
	void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
	zprop_source_t source, uint64_t value, dmu_tx_t *tx);
	void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
	boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
	boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);

	int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx);
	void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx);
	int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
	nvlist_t *result);
	+
	+uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds);
	+void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);
	+boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds);
	+void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx);

	void dsl_dataset_deactivate_feature(uint64_t dsobj,
	spa_feature_t f, dmu_tx_t *tx);

	#ifdef ZFS_DEBUG
	#define dprintf_ds(ds, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
	dsl_dataset_name(ds, __ds_name); \
	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
	} \
	_NOTE(CONSTCOND) } while (0)
	#else
	#define dprintf_ds(dd, fmt, ...)
	#endif

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_DATASET_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h (revision 332525)
	@@ -1,87 +1,89 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
	+ * Copyright (c) 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_DSL_DEADLIST_H
	#define _SYS_DSL_DEADLIST_H

	#include <sys/bpobj.h>
	#include <sys/zfs_context.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct dmu_buf;
	struct dsl_dataset;

	typedef struct dsl_deadlist_phys {
	uint64_t dl_used;
	uint64_t dl_comp;
	uint64_t dl_uncomp;
	uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
	} dsl_deadlist_phys_t;

	typedef struct dsl_deadlist {
	objset_t *dl_os;
	uint64_t dl_object;
	avl_tree_t dl_tree;
	boolean_t dl_havetree;
	struct dmu_buf *dl_dbuf;
	dsl_deadlist_phys_t *dl_phys;
	kmutex_t dl_lock;

	/* if it's the old on-disk format: */
	bpobj_t dl_bpobj;
	boolean_t dl_oldfmt;
	} dsl_deadlist_t;

	typedef struct dsl_deadlist_entry {
	avl_node_t dle_node;
	uint64_t dle_mintxg;
	bpobj_t dle_bpobj;
	} dsl_deadlist_entry_t;

	void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
	void dsl_deadlist_close(dsl_deadlist_t *dl);
	uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
	void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
	void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
	void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
	void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
	uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
	uint64_t mrs_obj, dmu_tx_t *tx);
	void dsl_deadlist_space(dsl_deadlist_t *dl,
	uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
	void dsl_deadlist_space_range(dsl_deadlist_t *dl,
	uint64_t mintxg, uint64_t maxtxg,
	uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
	void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
	void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
	dmu_tx_t *tx);
	+boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_DEADLIST_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h (revision 332525)
	@@ -1,80 +1,81 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2013 by Delphix. All rights reserved.
	+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_DSL_DELEG_H
	#define _SYS_DSL_DELEG_H

	#include <sys/dmu.h>
	#include <sys/dsl_pool.h>
	#include <sys/zfs_context.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	#define ZFS_DELEG_PERM_NONE ""
	#define ZFS_DELEG_PERM_CREATE "create"
	#define ZFS_DELEG_PERM_DESTROY "destroy"
	#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
	#define ZFS_DELEG_PERM_ROLLBACK "rollback"
	#define ZFS_DELEG_PERM_CLONE "clone"
	#define ZFS_DELEG_PERM_PROMOTE "promote"
	#define ZFS_DELEG_PERM_RENAME "rename"
	#define ZFS_DELEG_PERM_MOUNT "mount"
	#define ZFS_DELEG_PERM_SHARE "share"
	#define ZFS_DELEG_PERM_SEND "send"
	#define ZFS_DELEG_PERM_RECEIVE "receive"
	#define ZFS_DELEG_PERM_ALLOW "allow"
	#define ZFS_DELEG_PERM_USERPROP "userprop"
	#define ZFS_DELEG_PERM_VSCAN "vscan"
	#define ZFS_DELEG_PERM_USERQUOTA "userquota"
	#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
	#define ZFS_DELEG_PERM_USERUSED "userused"
	#define ZFS_DELEG_PERM_GROUPUSED "groupused"
	#define ZFS_DELEG_PERM_HOLD "hold"
	#define ZFS_DELEG_PERM_RELEASE "release"
	#define ZFS_DELEG_PERM_DIFF "diff"
	#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
	+#define ZFS_DELEG_PERM_REMAP "remap"

	/*
	* Note: the names of properties that are marked delegatable are also
	* valid delegated permissions
	*/

	int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
	int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
	int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
	int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
	void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
	int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
	int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
	int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
	boolean_t dsl_delegation_on(objset_t *os);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_DELEG_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 332525)
	@@ -1,206 +1,208 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	* Copyright (c) 2014, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	#ifndef _SYS_DSL_DIR_H
	#define _SYS_DSL_DIR_H

	#include <sys/dmu.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_synctask.h>
	#include <sys/refcount.h>
	#include <sys/zfs_context.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct dsl_dataset;

	/*
	* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
	* They should be of the format <reverse-dns>:<field>.
	*/

	#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
	#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
	+#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"

	typedef enum dd_used {
	DD_USED_HEAD,
	DD_USED_SNAP,
	DD_USED_CHILD,
	DD_USED_CHILD_RSRV,
	DD_USED_REFRSRV,
	DD_USED_NUM
	} dd_used_t;

	#define DD_FLAG_USED_BREAKDOWN (1<<0)

	typedef struct dsl_dir_phys {
	uint64_t dd_creation_time; /* not actually used */
	uint64_t dd_head_dataset_obj;
	uint64_t dd_parent_obj;
	uint64_t dd_origin_obj;
	uint64_t dd_child_dir_zapobj;
	/*
	* how much space our children are accounting for; for leaf
	* datasets, == physical space used by fs + snaps
	*/
	uint64_t dd_used_bytes;
	uint64_t dd_compressed_bytes;
	uint64_t dd_uncompressed_bytes;
	/* Administrative quota setting */
	uint64_t dd_quota;
	/* Administrative reservation setting */
	uint64_t dd_reserved;
	uint64_t dd_props_zapobj;
	uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
	uint64_t dd_flags;
	uint64_t dd_used_breakdown[DD_USED_NUM];
	uint64_t dd_clones; /* dsl_dir objects */
	uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
	} dsl_dir_phys_t;

	struct dsl_dir {
	dmu_buf_user_t dd_dbu;

	/* These are immutable; no lock needed: */
	uint64_t dd_object;
	dsl_pool_t *dd_pool;

	/* Stable until user eviction; no lock needed: */
	dmu_buf_t *dd_dbuf;

	/* protected by lock on pool's dp_dirty_dirs list */
	txg_node_t dd_dirty_link;

	/* protected by dp_config_rwlock */
	dsl_dir_t *dd_parent;

	/* Protected by dd_lock */
	kmutex_t dd_lock;
	list_t dd_props; /* list of dsl_prop_record_t's */
	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
	uint64_t dd_origin_txg;

	/* gross estimate of space used by in-flight tx's */
	uint64_t dd_tempreserved[TXG_SIZE];
	/* amount of space we expect to write; == amount of dirty data */
	int64_t dd_space_towrite[TXG_SIZE];

	/* protected by dd_lock; keep at end of struct for better locality */
	char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
	};

	inline dsl_dir_phys_t *
	dsl_dir_phys(dsl_dir_t *dd)
	{
	return (dd->dd_dbuf->db_data);
	}

	void dsl_dir_rele(dsl_dir_t *dd, void *tag);
	void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
	int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
	dsl_dir_t **, const char **tail);
	int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
	const char *tail, void *tag, dsl_dir_t **);
	void dsl_dir_name(dsl_dir_t *dd, char *buf);
	int dsl_dir_namelen(dsl_dir_t *dd);
	uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
	const char *name, dmu_tx_t *tx);

	uint64_t dsl_dir_get_used(dsl_dir_t *dd);
	uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
	uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
	uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
	uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd);
	uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd);
	uint64_t dsl_dir_get_usedds(dsl_dir_t *dd);
	uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd);
	uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd);
	void dsl_dir_get_origin(dsl_dir_t *dd, char *buf);
	int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count);
	int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count);
	+int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count);

	void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
	uint64_t dsl_dir_space_available(dsl_dir_t *dd,
	dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
	void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
	void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
	int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
	uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
	void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
	void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
	void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
	int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
	void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
	dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
	int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
	uint64_t quota);
	int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
	uint64_t reservation);
	int dsl_dir_activate_fs_ss_limit(const char *);
	int dsl_fs_ss_limit_check(dsl_dir_t , uint64_t, zfs_prop_t, dsl_dir_t ,
	cred_t *);
	void dsl_fs_ss_count_adjust(dsl_dir_t , int64_t, const char , dmu_tx_t *);
	+int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t);
	int dsl_dir_rename(const char oldname, const char newname);
	int dsl_dir_transfer_possible(dsl_dir_t sdd, dsl_dir_t tdd,
	uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
	boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
	void dsl_dir_new_refreservation(dsl_dir_t dd, struct dsl_dataset ds,
	uint64_t reservation, cred_t cr, dmu_tx_t tx);
	void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
	timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
	void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
	dmu_tx_t *tx);
	void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
	boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);

	/* internal reserved dir name */
	#define MOS_DIR_NAME "$MOS"
	#define ORIGIN_DIR_NAME "$ORIGIN"
	-#define XLATION_DIR_NAME "$XLATION"
	#define FREE_DIR_NAME "$FREE"
	#define LEAK_DIR_NAME "$LEAK"

	#ifdef ZFS_DEBUG
	#define dprintf_dd(dd, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
	dsl_dir_name(dd, __ds_name); \
	dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
	} \
	_NOTE(CONSTCOND) } while (0)
	#else
	#define dprintf_dd(dd, fmt, ...)
	#endif

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_DIR_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (revision 332525)
	@@ -1,180 +1,183 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
	* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
	*/

	#ifndef _SYS_DSL_POOL_H
	#define _SYS_DSL_POOL_H

	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/txg_impl.h>
	#include <sys/zfs_context.h>
	#include <sys/zio.h>
	#include <sys/dnode.h>
	#include <sys/ddt.h>
	#include <sys/arc.h>
	#include <sys/bpobj.h>
	#include <sys/bptree.h>
	#include <sys/rrwlock.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct objset;
	struct dsl_dir;
	struct dsl_dataset;
	struct dsl_pool;
	struct dmu_tx;
	struct dsl_scan;

	extern uint64_t zfs_dirty_data_max;
	extern uint64_t zfs_dirty_data_max_max;
	extern uint64_t zfs_dirty_data_sync;
	extern int zfs_dirty_data_max_percent;
	extern int zfs_delay_min_dirty_percent;
	extern uint64_t zfs_delay_scale;

	/* These macros are for indexing into the zfs_all_blkstats_t. */
	#define DMU_OT_DEFERRED DMU_OT_NONE
	#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
	#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1)

	typedef struct zfs_blkstat {
	uint64_t zb_count;
	uint64_t zb_asize;
	uint64_t zb_lsize;
	uint64_t zb_psize;
	uint64_t zb_gangs;
	uint64_t zb_ditto_2_of_2_samevdev;
	uint64_t zb_ditto_2_of_3_samevdev;
	uint64_t zb_ditto_3_of_3_samevdev;
	} zfs_blkstat_t;

	typedef struct zfs_all_blkstats {
	zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
	} zfs_all_blkstats_t;


	typedef struct dsl_pool {
	/* Immutable */
	spa_t *dp_spa;
	struct objset *dp_meta_objset;
	struct dsl_dir *dp_root_dir;
	struct dsl_dir *dp_mos_dir;
	struct dsl_dir *dp_free_dir;
	struct dsl_dir *dp_leak_dir;
	struct dsl_dataset *dp_origin_snap;
	uint64_t dp_root_dir_obj;
	struct taskq *dp_vnrele_taskq;

	/* No lock needed - sync context only */
	blkptr_t dp_meta_rootbp;
	uint64_t dp_tmp_userrefs_obj;
	bpobj_t dp_free_bpobj;
	uint64_t dp_bptree_obj;
	uint64_t dp_empty_bpobj;
	+ bpobj_t dp_obsolete_bpobj;

	struct dsl_scan *dp_scan;

	/* Uses dp_lock */
	kmutex_t dp_lock;
	kcondvar_t dp_spaceavail_cv;
	uint64_t dp_dirty_pertxg[TXG_SIZE];
	uint64_t dp_dirty_total;
	uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
	uint64_t dp_mos_used_delta;
	uint64_t dp_mos_compressed_delta;
	uint64_t dp_mos_uncompressed_delta;

	/*
	* Time of most recently scheduled (furthest in the future)
	* wakeup for delayed transactions.
	*/
	hrtime_t dp_last_wakeup;

	/* Has its own locking */
	tx_state_t dp_tx;
	txg_list_t dp_dirty_datasets;
	txg_list_t dp_dirty_zilogs;
	txg_list_t dp_dirty_dirs;
	txg_list_t dp_sync_tasks;
	taskq_t *dp_sync_taskq;
	taskq_t *dp_zil_clean_taskq;

	/*
	* Protects administrative changes (properties, namespace)
	*
	* It is only held for write in syncing context. Therefore
	* syncing context does not need to ever have it for read, since
	* nobody else could possibly have it for write.
	*/
	rrwlock_t dp_config_rwlock;

	zfs_all_blkstats_t *dp_blkstats;
	} dsl_pool_t;

	int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
	int dsl_pool_open(dsl_pool_t *dp);
	void dsl_pool_close(dsl_pool_t *dp);
	dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
	void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
	void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
	int dsl_pool_sync_context(dsl_pool_t *dp);
	uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
	-uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
	void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
	void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
	void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
	void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
	const blkptr_t *bpp);
	void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
	int64_t used, int64_t comp, int64_t uncomp);
	void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
	void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
	void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
	boolean_t dsl_pool_config_held(dsl_pool_t *dp);
	boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
	boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);

	taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);

	int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
	const char *tag, uint64_t now, dmu_tx_t *tx);
	int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
	const char *tag, dmu_tx_t *tx);
	void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
	int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
	int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
	void dsl_pool_rele(dsl_pool_t *dp, void *tag);
	+
	+void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);
	+void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_POOL_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h (revision 332525)
	@@ -1,148 +1,151 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2017 Datto Inc.
	*/

	#ifndef _SYS_DSL_SCAN_H
	#define _SYS_DSL_SCAN_H

	#include <sys/zfs_context.h>
	#include <sys/zio.h>
	#include <sys/ddt.h>
	#include <sys/bplist.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct objset;
	struct dsl_dir;
	struct dsl_dataset;
	struct dsl_pool;
	struct dmu_tx;

	/*
	* All members of this structure must be uint64_t, for byteswap
	* purposes.
	*/
	typedef struct dsl_scan_phys {
	uint64_t scn_func; /* pool_scan_func_t */
	uint64_t scn_state; /* dsl_scan_state_t */
	uint64_t scn_queue_obj;
	uint64_t scn_min_txg;
	uint64_t scn_max_txg;
	uint64_t scn_cur_min_txg;
	uint64_t scn_cur_max_txg;
	uint64_t scn_start_time;
	uint64_t scn_end_time;
	uint64_t scn_to_examine; /* total bytes to be scanned */
	uint64_t scn_examined; /* bytes scanned so far */
	uint64_t scn_to_process;
	uint64_t scn_processed;
	uint64_t scn_errors; /* scan I/O error count */
	uint64_t scn_ddt_class_max;
	ddt_bookmark_t scn_ddt_bookmark;
	zbookmark_phys_t scn_bookmark;
	uint64_t scn_flags; /* dsl_scan_flags_t */
	} dsl_scan_phys_t;

	#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))

	typedef enum dsl_scan_flags {
	DSF_VISIT_DS_AGAIN = 1<<0,
	DSF_SCRUB_PAUSED = 1<<1,
	} dsl_scan_flags_t;

	/*
	* Every pool will have one dsl_scan_t and this structure will contain
	* in-memory information about the scan and a pointer to the on-disk
	* representation (i.e. dsl_scan_phys_t). Most of the state of the scan
	* is contained on-disk to allow the scan to resume in the event of a reboot
	* or panic. This structure maintains information about the behavior of a
	* running scan, some caching information, and how it should traverse the pool.
	*
	* The following members of this structure direct the behavior of the scan:
	*
	* scn_suspending - a scan that cannot be completed in a single txg or
	* has exceeded its allotted time will need to suspend.
	* When this flag is set the scanner will stop traversing
	* the pool and write out the current state to disk.
	*
	* scn_restart_txg - directs the scanner to either restart or start a
	* scan at the specified txg value.
	*
	* scn_done_txg - when a scan completes its traversal it will set
	* the completion txg to the next txg. This is necessary
	* to ensure that any blocks that were freed during
	* the scan but have not yet been processed (i.e. deferred
	* frees) are accounted for.
	*
	* This structure also maintains information about deferred frees which are
	* a special kind of traversal. Deferred free can exist in either a bptree or
	* a bpobj structure. The scn_is_bptree flag will indicate the type of
	* deferred free that is in progress. If the deferred free is part of an
	* asynchronous destroy then the scn_async_destroying flag will be set.
	*/
	typedef struct dsl_scan {
	struct dsl_pool *scn_dp;

	boolean_t scn_suspending;
	uint64_t scn_restart_txg;
	uint64_t scn_done_txg;
	uint64_t scn_sync_start_time;
	zio_t *scn_zio_root;

	/* for freeing blocks */
	boolean_t scn_is_bptree;
	boolean_t scn_async_destroying;
	boolean_t scn_async_stalled;
	+ uint64_t scn_async_block_min_time_ms;
	+
	+ /* for debugging / information */
	uint64_t scn_visited_this_txg;

	dsl_scan_phys_t scn_phys;
	} dsl_scan_t;

	int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
	void dsl_scan_fini(struct dsl_pool *dp);
	void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
	int dsl_scan_cancel(struct dsl_pool *);
	int dsl_scan(struct dsl_pool *, pool_scan_func_t);
	boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
	int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
	void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
	boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
	boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
	void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
	ddt_entry_t *dde, dmu_tx_t *tx);
	void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
	void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
	void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
	struct dmu_tx *tx);
	boolean_t dsl_scan_active(dsl_scan_t *scn);
	boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_SCAN_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h (revision 332525)
	@@ -1,112 +1,119 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_METASLAB_H
	#define _SYS_METASLAB_H

	#include <sys/spa.h>
	#include <sys/space_map.h>
	#include <sys/txg.h>
	#include <sys/zio.h>
	#include <sys/avl.h>

	#ifdef __cplusplus
	extern "C" {
	#endif


	typedef struct metaslab_ops {
	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
	} metaslab_ops_t;


	extern metaslab_ops_t *zfs_metaslab_ops;

	int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
	metaslab_t **);
	void metaslab_fini(metaslab_t *);

	void metaslab_load_wait(metaslab_t *);
	int metaslab_load(metaslab_t *);
	void metaslab_unload(metaslab_t *);

	void metaslab_sync(metaslab_t *, uint64_t);
	void metaslab_sync_done(metaslab_t *, uint64_t);
	void metaslab_sync_reassess(metaslab_group_t *);
	uint64_t metaslab_block_maxsize(metaslab_t *);

	#define METASLAB_HINTBP_FAVOR 0x0
	#define METASLAB_HINTBP_AVOID 0x1
	#define METASLAB_GANG_HEADER 0x2
	#define METASLAB_GANG_CHILD 0x4
	#define METASLAB_ASYNC_ALLOC 0x8
	#define METASLAB_DONT_THROTTLE 0x10

	int metaslab_alloc(spa_t , metaslab_class_t , uint64_t,
	blkptr_t , int, uint64_t, blkptr_t , int, zio_alloc_list_t , zio_t );
	+int metaslab_alloc_dva(spa_t , metaslab_class_t , uint64_t,
	+ dva_t , int, dva_t , uint64_t, int, zio_alloc_list_t *);
	void metaslab_free(spa_t , const blkptr_t , uint64_t, boolean_t);
	+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
	+void metaslab_free_dva(spa_t , const dva_t , uint64_t);
	+void metaslab_free_impl_cb(uint64_t, vdev_t , uint64_t, uint64_t, void );
	+void metaslab_unalloc_dva(spa_t , const dva_t , uint64_t);
	int metaslab_claim(spa_t , const blkptr_t , uint64_t);
	+int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
	void metaslab_check_free(spa_t , const blkptr_t );

	void metaslab_alloc_trace_init(void);
	void metaslab_alloc_trace_fini(void);
	void metaslab_trace_init(zio_alloc_list_t *);
	void metaslab_trace_fini(zio_alloc_list_t *);

	metaslab_class_t metaslab_class_create(spa_t , metaslab_ops_t *);
	void metaslab_class_destroy(metaslab_class_t *);
	int metaslab_class_validate(metaslab_class_t *);
	void metaslab_class_histogram_verify(metaslab_class_t *);
	uint64_t metaslab_class_fragmentation(metaslab_class_t *);
	uint64_t metaslab_class_expandable_space(metaslab_class_t *);
	boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
	zio_t *, int);
	void metaslab_class_throttle_unreserve(metaslab_class_t , int, zio_t );

	void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
	int64_t, int64_t);
	uint64_t metaslab_class_get_alloc(metaslab_class_t *);
	uint64_t metaslab_class_get_space(metaslab_class_t *);
	uint64_t metaslab_class_get_dspace(metaslab_class_t *);
	uint64_t metaslab_class_get_deferred(metaslab_class_t *);
	uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);

	metaslab_group_t metaslab_group_create(metaslab_class_t , vdev_t *);
	void metaslab_group_destroy(metaslab_group_t *);
	void metaslab_group_activate(metaslab_group_t *);
	void metaslab_group_passivate(metaslab_group_t *);
	boolean_t metaslab_group_initialized(metaslab_group_t *);
	uint64_t metaslab_group_get_space(metaslab_group_t *);
	void metaslab_group_histogram_verify(metaslab_group_t *);
	uint64_t metaslab_group_fragmentation(metaslab_group_t *);
	void metaslab_group_histogram_remove(metaslab_group_t , metaslab_t );
	void metaslab_group_alloc_decrement(spa_t , uint64_t, void , int);
	void metaslab_group_alloc_verify(spa_t , const blkptr_t , void *);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_METASLAB_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h (revision 332525)
	@@ -1,376 +1,376 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_METASLAB_IMPL_H
	#define _SYS_METASLAB_IMPL_H

	#include <sys/metaslab.h>
	#include <sys/space_map.h>
	#include <sys/range_tree.h>
	#include <sys/vdev.h>
	#include <sys/txg.h>
	#include <sys/avl.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Metaslab allocation tracing record.
	*/
	typedef struct metaslab_alloc_trace {
	list_node_t mat_list_node;
	metaslab_group_t *mat_mg;
	metaslab_t *mat_msp;
	uint64_t mat_size;
	uint64_t mat_weight;
	uint32_t mat_dva_id;
	uint64_t mat_offset;
	} metaslab_alloc_trace_t;

	/*
	* Used by the metaslab allocation tracing facility to indicate
	* error conditions. These errors are stored to the offset member
	* of the metaslab_alloc_trace_t record and displayed by mdb.
	*/
	typedef enum trace_alloc_type {
	/*
	* Each value is a negated unsigned long long literal, so it wraps to
	* the top of the 64-bit range (-1ULL == UINT64_MAX). That matches the
	* uint64_t mat_offset field these codes are stored in.
	* NOTE(review): ISO C before C23 requires enumerators to fit in an
	* int; this relies on the compiler accepting wider values -- confirm.
	*/
	TRACE_ALLOC_FAILURE = -1ULL,
	TRACE_TOO_SMALL = -2ULL,
	TRACE_FORCE_GANG = -3ULL,
	TRACE_NOT_ALLOCATABLE = -4ULL,
	TRACE_GROUP_FAILURE = -5ULL,
	TRACE_ENOSPC = -6ULL,
	TRACE_CONDENSING = -7ULL,
	TRACE_VDEV_ERROR = -8ULL
	} trace_alloc_type_t;

	/*
	* Flag bits kept at the top of a 64-bit metaslab weight: bit 63 marks
	* primary activation, bit 62 secondary activation, and bit 61 selects
	* the weight type. METASLAB_ACTIVE_MASK covers both activation bits.
	*/
	#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
	#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
	#define METASLAB_WEIGHT_TYPE (1ULL << 61)
	#define METASLAB_ACTIVE_MASK \
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

	/*
	* The metaslab weight is used to encode the amount of free space in a
	* metaslab, such that the "best" metaslab appears first when sorting the
	* metaslabs by weight. The weight (and therefore the "best" metaslab) can
	* be determined in two different ways: by computing a weighted sum of all
	* the free space in the metaslab (a space based weight) or by counting only
	* the free segments of the largest size (a segment based weight). We prefer
	* the segment based weight because it reflects how the free space is
	* comprised, but we cannot always use it -- legacy pools do not have the
	* space map histogram information necessary to determine the largest
	* contiguous regions. Pools that have the space map histogram determine
	* the segment weight by looking at each bucket in the histogram and
	* determining the free space whose size in bytes is in the range:
	* [2^i, 2^(i+1))
	* We then encode the largest index, i, that contains regions into the
	* segment-weighted value.
	*
	* Space-based weight:
	*
	*     64      56      48      40      32      24      16      8       0
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*     |PS1|                    weighted-free space                    |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*
	*     PS - indicates primary and secondary activation
	*     space - the fragmentation-weighted space
	*
	* Segment-based weight:
	*
	*     64      56      48      40      32      24      16      8       0
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*     |PS0| idx|             count of segments in region              |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*
	*     PS - indicates primary and secondary activation
	*     idx - index for the highest bucket in the histogram
	*     count - number of segments in the specified bucket
	*/
	/* Activation state: the two bits starting at bit 62. */
	#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
	#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)

	/*
	* Bit 61 marks a space-based weight; a weight of zero is also treated
	* as space-based.
	*/
	#define WEIGHT_IS_SPACEBASED(weight) \
	((weight) == 0 || BF64_GET((weight), 61, 1))
	#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)

	/*
	* These macros are only applicable to segment-based weighting.
	* The index is the 6 bits starting at bit 55; the count is the low
	* 55 bits.
	*/
	#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
	#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
	#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
	#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)

	/*
	* A metaslab class encompasses a category of allocatable top-level vdevs.
	* Each top-level vdev is associated with a metaslab group which defines
	* the allocatable region for that vdev. Examples of these categories include
	* "normal" for data block allocations (i.e. main pool allocations) or "log"
	* for allocations designated for intent log devices (i.e. slog devices).
	* When a block allocation is requested from the SPA it is associated with a
	* metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
	* to the class can be used to satisfy that request. Allocations are done
	* by traversing the metaslab groups that are linked off of the mc_rotor field.
	* This rotor points to the next metaslab group where allocations will be
	* attempted. Allocating a block is a 3 step process -- select the metaslab
	* group, select the metaslab, and then allocate the block. The metaslab
	* class defines the low-level block allocator that will be used as the
	* final step in allocation. These allocators are pluggable allowing each class
	* to use a block allocator that best suits that class.
	*/
	struct metaslab_class {
	kmutex_t mc_lock;
	spa_t *mc_spa;
	metaslab_group_t *mc_rotor; /* next group to try allocating from */
	metaslab_ops_t *mc_ops; /* pluggable block allocator for the class */
	uint64_t mc_aliquot;

	/*
	* Track the number of metaslab groups that have been initialized
	* and can accept allocations. An initialized metaslab group is
	* one that has been completely added to the config (i.e. we have
	* updated the MOS config and the space has been added to the pool).
	*/
	uint64_t mc_groups;

	/*
	* Toggle to enable/disable the allocation throttle.
	*/
	boolean_t mc_alloc_throttle_enabled;

	/*
	* The allocation throttle works on a reservation system. Whenever
	* an asynchronous zio wants to perform an allocation it must
	* first reserve the number of blocks that it wants to allocate.
	* If there aren't sufficient slots available for the pending zio
	* then that I/O is throttled until more slots free up. The current
	* number of reserved allocations is maintained by the mc_alloc_slots
	* refcount. The mc_alloc_max_slots value determines the maximum
	* number of allocations that the system allows. Gang blocks are
	* allowed to reserve slots even if we've reached the maximum
	* number of allocations allowed.
	*/
	uint64_t mc_alloc_max_slots;
	refcount_t mc_alloc_slots;

	uint64_t mc_alloc_groups; /* # of allocatable groups */

	uint64_t mc_alloc; /* total allocated space */
	uint64_t mc_deferred; /* total deferred frees */
	uint64_t mc_space; /* total space (alloc + free) */
	uint64_t mc_dspace; /* total deflated space */
	uint64_t mc_minblocksize;
	/* One bucket per power-of-two size, RANGE_TREE_HISTOGRAM_SIZE in all. */
	uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
	};

	/*
	* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
	* of a top-level vdev. They are linked together to form a circular linked
	* list and can belong to only one metaslab class. Metaslab groups may become
	* ineligible for allocations for a number of reasons such as limited free
	* space, fragmentation, or going offline. When this happens the allocator will
	* simply find the next metaslab group in the linked list and attempt
	* to allocate from that group instead.
	*/
	struct metaslab_group {
	kmutex_t mg_lock;
	avl_tree_t mg_metaslab_tree;
	uint64_t mg_aliquot;
	boolean_t mg_allocatable; /* can we allocate? */

	/*
	* A metaslab group is considered to be initialized only after
	* we have updated the MOS config and added the space to the pool.
	* We only allow allocation attempts to a metaslab group if it
	* has been initialized.
	*/
	boolean_t mg_initialized;

	uint64_t mg_free_capacity; /* percentage free */
	int64_t mg_bias;
	int64_t mg_activation_count;
	metaslab_class_t *mg_class; /* the one class this group belongs to */
	vdev_t *mg_vd; /* presumably the owning top-level vdev -- confirm */
	taskq_t *mg_taskq;
	metaslab_group_t *mg_prev; /* circular list of groups */
	metaslab_group_t *mg_next;

	/*
	* Each metaslab group can handle mg_max_alloc_queue_depth allocations
	* which are tracked by mg_alloc_queue_depth. It's possible for a
	* metaslab group to handle more allocations than its max. This
	* can occur when gang blocks are required or when other groups
	* are unable to handle their share of allocations.
	*/
	uint64_t mg_max_alloc_queue_depth;
	refcount_t mg_alloc_queue_depth;

	/*
	* A metaslab group that can no longer allocate the minimum block
	* size will set mg_no_free_space. Once a metaslab group is out
	* of space then its share of work must be distributed to other
	* groups.
	*/
	boolean_t mg_no_free_space;

	uint64_t mg_allocations;
	uint64_t mg_failed_allocations;
	uint64_t mg_fragmentation;
	/* One bucket per power-of-two size, RANGE_TREE_HISTOGRAM_SIZE in all. */
	uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
	};

	/*
	* This value defines the number of elements in the ms_lbas array. The value
	* of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
	* This is the equivalent of highbit(UINT64_MAX).
	*/
	#define MAX_LBAS 64

	/*
	* Each metaslab maintains a set of in-core trees to track metaslab
	* operations. The in-core free tree (ms_tree) contains the list of
	* free segments which are eligible for allocation. As blocks are
	- * allocated, the allocated segments are removed from the ms_tree and
	- * added to a per txg allocation tree (ms_alloctree). This allows us to
	- * process all allocations in syncing context where it is safe to update
	- * the on-disk space maps. Frees are also processed in syncing context.
	- * Most frees are generated from syncing context, and those that are not
	- * are held in the spa_free_bplist for processing in syncing context.
	- * An additional set of in-core trees is maintained to track deferred
	- * frees (ms_defertree). Once a block is freed it will move from the
	+ * allocated, the allocated segments are removed from the ms_tree and
	+ * added to a per txg allocation tree (ms_alloctree). As blocks are
	+ * freed, they are added to the freeing tree (ms_freeingtree). These trees
	+ * allow us to process all allocations and frees in syncing context
	+ * where it is safe to update the on-disk space maps. An additional set
	+ * of in-core trees is maintained to track deferred frees
	+ * (ms_defertree). Once a block is freed it will move from the
	* ms_freedtree to the ms_defertree. A deferred free means that a block
	* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
	* transaction groups later. For example, a block that is freed in txg
	* 50 will not be available for reallocation until txg 52 (50 +
	* TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
	* A pool could be safely rolled back TXG_DEFER_SIZE transaction
	* groups and ensure that no block has been reallocated.
	*
	* The simplified transition diagram looks like this:
	*
	*
	*      ALLOCATE
	*         |
	*         V
	*    free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
	*         ^
	*         |                           ms_freeingtree <--- FREE
	*         |                                 |
	*         |                                 v
	*         |                           ms_freedtree
	*         |                                 |
	*         +-------- ms_defertree[2] <-------+---------> (write to space map)
	*
	*
	* Each metaslab's space is tracked in a single space map in the MOS,
	* which is only updated in syncing context. Each time we sync a txg,
	* we append the allocs and frees from that txg to the space map. The
	* pool space is only updated once all metaslabs have finished syncing.
	*
	* To load the in-core free tree we read the space map from disk. This
	* object contains a series of alloc and free records that are combined
	* to make up the list of all free segments in this metaslab. These
	* segments are represented in-core by the ms_tree and are stored in an
	* AVL tree.
	*
	* As the space map grows (as a result of the appends) it will
	* eventually become space-inefficient. When the metaslab's in-core
	* free tree is zfs_condense_pct/100 times the size of the minimal
	* on-disk representation, we rewrite it in its minimized form. If a
	* metaslab needs to condense then we must set the ms_condensing flag to
	* ensure that allocations are not performed on the metaslab that is
	* being written.
	*/
	struct metaslab {
	kmutex_t ms_lock;
	+ kmutex_t ms_sync_lock;
	kcondvar_t ms_load_cv;
	space_map_t *ms_sm;
	uint64_t ms_id;
	uint64_t ms_start;
	uint64_t ms_size;
	uint64_t ms_fragmentation;

	range_tree_t *ms_alloctree[TXG_SIZE]; /* per-txg allocation trees */
	range_tree_t *ms_tree; /* in-core free tree */

	/*
	* The following range trees are accessed only from syncing context.
	* ms_free*tree only have entries while syncing, and are empty
	* between syncs.
	*/
	range_tree_t *ms_freeingtree; /* to free this syncing txg */
	range_tree_t *ms_freedtree; /* already freed this syncing txg */
	range_tree_t *ms_defertree[TXG_DEFER_SIZE];

	boolean_t ms_condensing; /* condensing? */
	boolean_t ms_condense_wanted;

	/*
	* We must hold both ms_lock and ms_group->mg_lock in order to
	* modify ms_loaded.
	*/
	boolean_t ms_loaded;
	boolean_t ms_loading;

	int64_t ms_deferspace; /* sum of ms_defertree[] space */
	uint64_t ms_weight; /* weight vs. others in group */
	uint64_t ms_activation_weight; /* activation weight */

	/*
	* Keep track of whenever a metaslab is selected for loading or
	* allocation. We use this value to determine how long the metaslab
	* should stay cached.
	*/
	uint64_t ms_selected_txg;

	uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
	uint64_t ms_max_size; /* maximum allocatable size */

	/*
	* The metaslab block allocators can optionally use a size-ordered
	* range tree and/or an array of LBAs. Not all allocators use
	* this functionality. The ms_size_tree should always contain the
	* same number of segments as the ms_tree. The only difference
	* is that the ms_size_tree is ordered by segment sizes.
	*/
	avl_tree_t ms_size_tree;
	uint64_t ms_lbas[MAX_LBAS];

	metaslab_group_t *ms_group; /* metaslab group */
	avl_node_t ms_group_node; /* node in metaslab group tree */
	txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
	};

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_METASLAB_IMPL_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h (revision 332525)
	@@ -1,97 +1,100 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_RANGE_TREE_H
	#define _SYS_RANGE_TREE_H

	#include <sys/avl.h>
	#include <sys/dmu.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	#define RANGE_TREE_HISTOGRAM_SIZE 64

	typedef struct range_tree_ops range_tree_ops_t;

	+/*
	+ * Note: the range_tree may not be accessed concurrently; consumers
	+ * must provide external locking if required.
	+ */
	typedef struct range_tree {
	avl_tree_t rt_root; /* offset-ordered segment AVL tree */
	uint64_t rt_space; /* sum of all segments in the map */
	range_tree_ops_t *rt_ops; /* callback table, see range_tree_ops */
	void *rt_arg; /* presumably handed to the rt_ops callbacks -- confirm */

	/*
	* The rt_histogram maintains a histogram of ranges. Each bucket,
	* rt_histogram[i], contains the number of ranges whose size is:
	* 2^i <= size of range in bytes < 2^(i+1)
	*/
	uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
	- kmutex_t *rt_lock; /* pointer to lock that protects map */
	} range_tree_t;

	typedef struct range_seg {
	avl_node_t rs_node; /* AVL node */
	avl_node_t rs_pp_node; /* AVL picker-private node */
	uint64_t rs_start; /* starting offset of this segment */
	uint64_t rs_end; /* ending offset (non-inclusive) */
	} range_seg_t;

	/*
	* Callback table a range tree points at through rt_ops. Every callback
	* receives the tree and an opaque arg; the add and remove callbacks
	* also receive the segment involved.
	*/
	struct range_tree_ops {
	void (*rtop_create)(range_tree_t *rt, void *arg);
	void (*rtop_destroy)(range_tree_t *rt, void *arg);
	void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
	void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
	void (*rtop_vacate)(range_tree_t *rt, void *arg);
	};

	typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);

	void range_tree_init(void);
	void range_tree_fini(void);
	-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
	+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
	void range_tree_destroy(range_tree_t *rt);
	boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
	uint64_t range_tree_space(range_tree_t *rt);
	void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
	void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
	void range_tree_stat_verify(range_tree_t *rt);

	/*
	* range_tree_add() and range_tree_remove() take the tree as a void *
	* and so have the same signature as range_tree_func_t; presumably this
	* lets them be passed as walk/vacate callbacks -- confirm.
	*/
	void range_tree_add(void *arg, uint64_t start, uint64_t size);
	void range_tree_remove(void *arg, uint64_t start, uint64_t size);
	void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);

	void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
	void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_RANGE_TREE_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 332525)
	@@ -1,931 +1,945 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2013 Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Joyent, Inc.
	* Copyright (c) 2017 Datto Inc.
	*/

	#ifndef _SYS_SPA_H
	#define _SYS_SPA_H

	#include <sys/avl.h>
	#include <sys/zfs_context.h>
	#include <sys/nvpair.h>
	+#include <sys/sysevent.h>
	#include <sys/sysmacros.h>
	#include <sys/types.h>
	#include <sys/fs/zfs.h>
	#include <sys/dmu.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Forward references that lots of things need.
	*/
	typedef struct spa spa_t;
	typedef struct vdev vdev_t;
	typedef struct metaslab metaslab_t;
	typedef struct metaslab_group metaslab_group_t;
	typedef struct metaslab_class metaslab_class_t;
	typedef struct zio zio_t;
	typedef struct zilog zilog_t;
	typedef struct spa_aux_vdev spa_aux_vdev_t;
	typedef struct ddt ddt_t;
	typedef struct ddt_entry ddt_entry_t;
	struct dsl_pool;
	struct dsl_dataset;

	/*
	* General-purpose 32-bit and 64-bit bitfield encodings.
	*/
	#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
	#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
	#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
	#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))

	#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
	#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)

	/*
	* BFnn_SET stores val into the len-bit field of x that starts at bit
	* low, after asserting that val fits and that the field lies inside
	* the word. It XORs x with the encoded difference between the old and
	* new field values, so bits outside the field are left untouched.
	* Arguments are expanded more than once, so pass side-effect-free
	* expressions. Every argument is parenthesized where it is used in an
	* expression so that compound arguments keep their meaning.
	*/
	#define BF32_SET(x, low, len, val) do { \
	ASSERT3U(val, <, 1U << (len)); \
	ASSERT3U((low) + (len), <=, 32); \
	(x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len); \
	_NOTE(CONSTCOND) } while (0)

	#define BF64_SET(x, low, len, val) do { \
	ASSERT3U(val, <, 1ULL << (len)); \
	ASSERT3U((low) + (len), <=, 64); \
	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len)); \
	_NOTE(CONSTCOND) } while (0)

	/*
	* The _SB ("shift and bias") variants store (val >> shift) - bias in
	* the field, and the getters return (field + bias) << shift.
	*/
	#define BF32_GET_SB(x, low, len, shift, bias) \
	((BF32_GET(x, low, len) + (bias)) << (shift))
	#define BF64_GET_SB(x, low, len, shift, bias) \
	((BF64_GET(x, low, len) + (bias)) << (shift))

	#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
	ASSERT(IS_P2ALIGNED(val, 1U << (shift))); \
	ASSERT3S((val) >> (shift), >=, bias); \
	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
	_NOTE(CONSTCOND) } while (0)
	#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
	ASSERT(IS_P2ALIGNED(val, 1ULL << (shift))); \
	ASSERT3S((val) >> (shift), >=, bias); \
	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
	_NOTE(CONSTCOND) } while (0)

	/*
	* We currently support block sizes from 512 bytes to 16MB.
	* The benefits of larger blocks, and thus larger IO, need to be weighed
	* against the cost of COWing a giant block to modify one byte, and the
	* large latency of reading or writing a large block.
	*
	* Note that although blocks up to 16MB are supported, the recordsize
	* property can not be set larger than zfs_max_recordsize (default 1MB).
	* See the comment near zfs_max_recordsize in dsl_dataset.c for details.
	*
	* Note that although the LSIZE field of the blkptr_t can store sizes up
	* to 32MB, the dnode's dn_datablkszsec can only store sizes up to
	* 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
	*/
	#define SPA_MINBLOCKSHIFT 9
	#define SPA_OLD_MAXBLOCKSHIFT 17
	#define SPA_MAXBLOCKSHIFT 24
	#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
	#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
	#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)

	/*
	* Default maximum supported logical ashift.
	*
	* The current 8k allocation block size limit is due to the 8k
	* aligned/sized operations performed by vdev_probe() on
	* vdev_label->vl_pad2. Using another "safe region" for these tests
	* would allow the limit to be raised to 16k, at the expense of
	* only having 8 available uberblocks in the label area.
	*/
	#define SPA_MAXASHIFT 13

	/*
	* Default minimum supported logical ashift.
	*/
	#define SPA_MINASHIFT SPA_MINBLOCKSHIFT

	/*
	* Size of block to hold the configuration data (a packed nvlist)
	*/
	#define SPA_CONFIG_BLOCKSIZE (1ULL << 14)

	/*
	* The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
	* The ASIZE encoding should be at least 64 times larger (6 more bits)
	* to support up to 4-way RAID-Z mirror mode with worst-case gang block
	* overhead, three DVAs per bp, plus one more bit in case we do anything
	* else that expands the ASIZE.
	*/
	#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
	#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
	#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */

	#define SPA_COMPRESSBITS 7

	/*
	* All SPA data is represented by 128-bit data virtual addresses (DVAs).
	* The members of the dva_t should be considered opaque outside the SPA.
	*/
	typedef struct dva {
	uint64_t dva_word[2];
	} dva_t;

	/*
	* Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
	*/
	typedef struct zio_cksum {
	uint64_t zc_word[4];
	} zio_cksum_t;

	/*
	* Some checksums/hashes need a 256-bit initialization salt. This salt is kept
	* secret and is suitable for use in MAC algorithms as the key.
	*/
	typedef struct zio_cksum_salt {
	uint8_t zcs_bytes[32];
	} zio_cksum_salt_t;

	/*
	* Each block is described by its DVAs, time of birth, checksum, etc.
	* The word-by-word, bit-by-bit layout of the blkptr is as follows:
	*
	*     64      56      48      40      32      24      16      8       0
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 0   |             vdev1             | GRID  |         ASIZE         |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 1   |G|                           offset1                           |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 2   |             vdev2             | GRID  |         ASIZE         |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 3   |G|                           offset2                           |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 4   |             vdev3             | GRID  |         ASIZE         |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 5   |G|                           offset3                           |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 6   |BDX|lvl| type  | cksum |E| comp|     PSIZE     |     LSIZE     |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 7   |                            padding                            |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 8   |                            padding                            |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 9   |                      physical birth txg                       |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* a   |                       logical birth txg                       |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* b   |                          fill count                           |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* c   |                          checksum[0]                          |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* d   |                          checksum[1]                          |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* e   |                          checksum[2]                          |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* f   |                          checksum[3]                          |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*
	* Legend:
	*
	* vdev virtual device ID
	* offset offset into virtual device
	* LSIZE logical size
	* PSIZE physical size (after compression)
	* ASIZE allocated size (including RAID-Z parity and gang block headers)
	* GRID RAID-Z layout information (reserved for future use)
	* cksum checksum function
	* comp compression function
	* G gang block indicator
	* B byteorder (endianness)
	* D dedup
	* X encryption (on version 30, which is not supported)
	* E blkptr_t contains embedded data (see below)
	* lvl level of indirection
	* type DMU object type
	- * phys birth txg of block allocation; zero if same as logical birth txg
	+ * phys birth txg when dva[0] was written; zero if same as logical birth txg
	+ * note that typically all the dva's would be written in this
	+ * txg, but they could be different if they were moved by
	+ * device removal.
	* log. birth transaction group in which the block was logically born
	* fill count number of non-zero blocks under this bp
	* checksum[4] 256-bit checksum of the data this bp describes
	*/

	/*
	* "Embedded" blkptr_t's don't actually point to a block, instead they
	* have a data payload embedded in the blkptr_t itself. See the comment
	* in blkptr.c for more details.
	*
	* The blkptr_t is laid out as follows:
	*
	*     64      56      48      40      32      24      16      8       0
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 0   |                            payload                            |
	* 1   |                            payload                            |
	* 2   |                            payload                            |
	* 3   |                            payload                            |
	* 4   |                            payload                            |
	* 5   |                            payload                            |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 6   |BDX|lvl| type  | etype |E| comp| PSIZE|         LSIZE          |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* 7   |                            payload                            |
	* 8   |                            payload                            |
	* 9   |                            payload                            |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* a   |                       logical birth txg                       |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	* b   |                            payload                            |
	* c   |                            payload                            |
	* d   |                            payload                            |
	* e   |                            payload                            |
	* f   |                            payload                            |
	*     +-------+-------+-------+-------+-------+-------+-------+-------+
	*
	* Legend:
	*
	* payload contains the embedded data
	* B (byteorder) byteorder (endianness)
	* D (dedup) padding (set to zero)
	* X encryption (set to zero; see above)
	* E (embedded) set to one
	* lvl indirection level
	* type DMU object type
	* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
	* comp compression function of payload
	* PSIZE size of payload after compression, in bytes
	* LSIZE logical size of payload, in bytes
	* note that 25 bits is enough to store the largest
	* "normal" BP's LSIZE (2^16 * 2^9) in bytes
	* log. birth transaction group in which the block was logically born
	*
	* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
	* bp's they are stored in units of SPA_MINBLOCKSHIFT.
	* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
	* The B, D, X, lvl, type, and comp fields are stored the same as with normal
	* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
	* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
	* other macros, as they assert that they are only used on BP's of the correct
	* "embedded-ness".
	*/

	/*
	* Accessors for fields that exist only in embedded BPs. Each one
	* asserts BP_IS_EMBEDDED(bp) before touching blk_prop.
	* etype: the 8 bits of blk_prop starting at bit 40.
	* LSIZE: the 25 bits starting at bit 0, with a bias of 1 and no shift,
	* so the field holds the size minus one.
	* PSIZE: the 7 bits starting at bit 25, with the same bias of 1.
	*/
	#define BPE_GET_ETYPE(bp) \
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET((bp)->blk_prop, 40, 8))
	#define BPE_SET_ETYPE(bp, t) do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET((bp)->blk_prop, 40, 8, t); \
	_NOTE(CONSTCOND) } while (0)

	#define BPE_GET_LSIZE(bp) \
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
	#define BPE_SET_LSIZE(bp, x) do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
	_NOTE(CONSTCOND) } while (0)

	#define BPE_GET_PSIZE(bp) \
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
	#define BPE_SET_PSIZE(bp, x) do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
	_NOTE(CONSTCOND) } while (0)

	typedef enum bp_embedded_type {
	BP_EMBEDDED_TYPE_DATA,
	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
	NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
	} bp_embedded_type_t;

	#define BPE_NUM_WORDS 14
	#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
	#define BPE_IS_PAYLOADWORD(bp, wp) \
	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)

	#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
	#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */

	/*
	* A block is a hole when it has either 1) never been written to, or
	* 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
	* without physically allocating disk space. Holes are represented in the
	* blkptr_t structure by zeroed blk_dva. Correct checking for holes is
	* done through the BP_IS_HOLE macro. The logical size, level, DMU object
	* type, and birth times are also stored for holes that were written to
	* at some point (i.e. were punched after having been filled).
	*/
	typedef struct blkptr {
	dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
	uint64_t blk_prop; /* size, compression, type, etc */
	uint64_t blk_pad[2]; /* Extra space for the future */
	uint64_t blk_phys_birth; /* txg when block was allocated */
	uint64_t blk_birth; /* transaction group at birth */
	uint64_t blk_fill; /* fill count */
	zio_cksum_t blk_cksum; /* 256-bit checksum */
	} blkptr_t;

	/*
	* Macros to get and set fields in a bp or DVA.
	*/
	#define DVA_GET_ASIZE(dva) \
	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
	#define DVA_SET_ASIZE(dva, x) \
	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
	SPA_MINBLOCKSHIFT, 0, x)

	#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
	#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)

	#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
	#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)

	#define DVA_GET_OFFSET(dva) \
	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
	#define DVA_SET_OFFSET(dva, x) \
	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)

	#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
	#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)

	#define BP_GET_LSIZE(bp) \
	(BP_IS_EMBEDDED(bp) ? \
	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
	#define BP_SET_LSIZE(bp, x) do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, \
	0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
	_NOTE(CONSTCOND) } while (0)

	#define BP_GET_PSIZE(bp) \
	(BP_IS_EMBEDDED(bp) ? 0 : \
	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
	#define BP_SET_PSIZE(bp, x) do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, \
	16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
	_NOTE(CONSTCOND) } while (0)

	/* Compression: SPA_COMPRESSBITS-wide field at bit 32 of blk_prop. */
	#define BP_GET_COMPRESS(bp) \
	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
	#define BP_SET_COMPRESS(bp, x) \
	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)

	/* Embedded flag: bit 39 of blk_prop. */
	#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
	#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)

	/*
	* Checksum: reported as ZIO_CHECKSUM_OFF for an embedded bp, otherwise the
	* 8-bit field at bit 40 of blk_prop. The setter asserts a non-embedded bp.
	*/
	#define BP_GET_CHECKSUM(bp) \
	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
	BF64_GET((bp)->blk_prop, 40, 8))
	#define BP_SET_CHECKSUM(bp, x) do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET((bp)->blk_prop, 40, 8, x); \
	_NOTE(CONSTCOND) } while (0)

	/* Type: 8-bit field at bit 48 of blk_prop. */
	#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
	#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)

	/* Level: 5-bit field at bit 56 of blk_prop. */
	#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
	#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)

	/* Dedup flag: bit 62 of blk_prop. */
	#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
	#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)

	/* Byte-order flag: bit 63 of blk_prop. */
	#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
	#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)

	/*
	* Physical birth txg of a bp: 0 for an embedded bp; otherwise
	* blk_phys_birth when that is non-zero, falling back to blk_birth.
	*/
	#define BP_PHYSICAL_BIRTH(bp) \
		(BP_IS_EMBEDDED(bp) ? 0 : \
		((bp)->blk_phys_birth != 0 ? \
		(bp)->blk_phys_birth : (bp)->blk_birth))

	/*
	* Record the logical and physical birth txgs of a non-embedded bp (asserted).
	* blk_phys_birth is stored as 0 when the physical birth equals the logical
	* one, so BP_PHYSICAL_BIRTH() then falls back to blk_birth.
	*
	* Wrapped in do { } while (0), like the other BP_SET_* statement macros, so
	* that "BP_SET_BIRTH(...);" is exactly one statement and can be used in an
	* unbraced if/else.
	*/
	#define BP_SET_BIRTH(bp, logical, physical) do { \
		ASSERT(!BP_IS_EMBEDDED(bp)); \
		(bp)->blk_birth = (logical); \
		(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
	_NOTE(CONSTCOND) } while (0)

	/* Fill count of a bp: always 1 for an embedded bp, otherwise blk_fill. */
	#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)

	/*
	* A bp is treated as metadata when its level is above 0 or when
	* DMU_OT_IS_METADATA() accepts its type.
	*/
	#define BP_IS_METADATA(bp) \
		(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))

	/*
	* Total allocated size of a bp: the sum of the asize of its three DVAs,
	* or 0 for an embedded bp.
	*/
	#define BP_GET_ASIZE(bp) \
	(BP_IS_EMBEDDED(bp) ? 0 : \
	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	DVA_GET_ASIZE(&(bp)->blk_dva[2]))

	/* Physical size for metadata bps, logical size for all other bps. */
	#define BP_GET_UCSIZE(bp) \
	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))

	/*
	* Number of DVAs in a bp that have a non-zero asize (0 for an embedded bp).
	* The "!!" turns each asize into 0 or 1 before the three are summed.
	*/
	#define BP_GET_NDVAS(bp) \
	(BP_IS_EMBEDDED(bp) ? 0 : \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))

	/* Number of DVAs in a bp with the gang bit set (0 for an embedded bp). */
	#define BP_COUNT_GANG(bp) \
	(BP_IS_EMBEDDED(bp) ? 0 : \
	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
	DVA_GET_GANG(&(bp)->blk_dva[2])))

	/*
	* Two DVAs are equal when both of their 64-bit words match. Word 1 is
	* compared first, then word 0. Both arguments are pointers.
	*/
	#define DVA_EQUAL(dva1, dva2) \
		(!((dva1)->dva_word[1] != (dva2)->dva_word[1] || \
		(dva1)->dva_word[0] != (dva2)->dva_word[0]))

	/*
	* Two bps are equal when their physical birth, their blk_birth and all
	* three DVAs match. No other field (blk_prop, fill, checksum) is compared.
	*/
	#define BP_EQUAL(bp1, bp2) \
	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
	(bp1)->blk_birth == (bp2)->blk_birth && \
	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))

	/*
	* Compare two checksums word by word without branching: the OR of the four
	* word differences is zero only when every word matches.
	* Both arguments are checksum objects (accessed with "."), not pointers.
	*/
	#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
		(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
		((zc1).zc_word[1] - (zc2).zc_word[1]) | \
		((zc1).zc_word[2] - (zc2).zc_word[2]) | \
		((zc1).zc_word[3] - (zc2).zc_word[3])))

	/*
	* True when all four words of the checksum are zero. Takes a pointer
	* (accessed with "->").
	*/
	#define ZIO_CHECKSUM_IS_ZERO(zc) \
		(0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
		(zc)->zc_word[2] | (zc)->zc_word[3]))

	/*
	* Apply BSWAP_64() in place to each of the four checksum words.
	* Wrapped in do { } while (0) so that "ZIO_CHECKSUM_BSWAP(zcp);" is a single
	* statement and is safe in an unbraced if/else.
	*/
	#define ZIO_CHECKSUM_BSWAP(zcp) do { \
		(zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \
		(zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \
		(zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \
		(zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \
	_NOTE(CONSTCOND) } while (0)


	/* A DVA counts as valid when its allocated size is non-zero. */
	#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)

	/*
	* Store the four 64-bit words w0..w3 into the checksum pointed to by zcp.
	* The word arguments are parenthesized so that expression arguments expand
	* safely, and the body is wrapped in do { } while (0) so that an invocation
	* followed by ';' is a single statement.
	*/
	#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) do { \
		(zcp)->zc_word[0] = (w0); \
		(zcp)->zc_word[1] = (w1); \
		(zcp)->zc_word[2] = (w2); \
		(zcp)->zc_word[3] = (w3); \
	_NOTE(CONSTCOND) } while (0)

	/*
	* BP_IDENTITY: pointer to the first DVA of a bp. The comma expression first
	* asserts that the bp is not embedded.
	*/
	#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
	/* Gang bit of the first DVA; B_FALSE for an embedded bp. */
	#define BP_IS_GANG(bp) \
	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
	/* A DVA is empty when both of its words are zero. */
	#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
	(dva)->dva_word[1] == 0ULL)
	/* A hole is a non-embedded bp whose first DVA is empty. */
	#define BP_IS_HOLE(bp) \
	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))

	/*
	* BP_IS_RAIDZ(bp) assumes no block compression.
	* True when the first DVA's allocated size exceeds the bp's physical size.
	*/
	#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
	BP_GET_PSIZE(bp))

	/*
	* Clear every field of a bp: all three DVAs, blk_prop, both pad words, both
	* birth txgs, the fill count and the checksum.
	* Wrapped in do { } while (0) so that "BP_ZERO(bp);" is a single statement
	* and is safe in an unbraced if/else.
	*/
	#define BP_ZERO(bp) do { \
		(bp)->blk_dva[0].dva_word[0] = 0; \
		(bp)->blk_dva[0].dva_word[1] = 0; \
		(bp)->blk_dva[1].dva_word[0] = 0; \
		(bp)->blk_dva[1].dva_word[1] = 0; \
		(bp)->blk_dva[2].dva_word[0] = 0; \
		(bp)->blk_dva[2].dva_word[1] = 0; \
		(bp)->blk_prop = 0; \
		(bp)->blk_pad[0] = 0; \
		(bp)->blk_pad[1] = 0; \
		(bp)->blk_phys_birth = 0; \
		(bp)->blk_birth = 0; \
		(bp)->blk_fill = 0; \
		ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
	_NOTE(CONSTCOND) } while (0)

	/*
	* Byte-order value of the host as stored in a bp's byte-order bit:
	* 0 on big-endian hosts, 1 otherwise.
	*/
	#if BYTE_ORDER == _BIG_ENDIAN
	#define ZFS_HOST_BYTEORDER (0ULL)
	#else
	#define ZFS_HOST_BYTEORDER (1ULL)
	#endif

	/* True when the bp's byte-order bit differs from the host's byte order. */
	#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)

	#define BP_SPRINTF_LEN 320

	/*
	* This macro allows code sharing between zfs, libzpool, and mdb.
	* 'func' is either snprintf() or mdb_snprintf().
	* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
	*
	* It expands to a brace-enclosed block that formats 'bp' into 'buf', passing
	* 'size' minus the length written so far as the limit on every 'func' call:
	* "<NULL>" for a NULL bp, a one-line HOLE or EMBEDDED summary for those
	* kinds, and otherwise one "DVA[n]=<vdev:offset:asize>" entry per DVA
	* followed by the property, size, birth, fill and checksum fields.
	* 'type', 'checksum' and 'compress' are strings printed with %s.
	* The block ends by asserting that the accumulated length is below 'size'.
	*
	* 'bp', 'buf' and 'size' are parenthesized at each use so that expression
	* arguments expand with the intended precedence, and every branch
	* accumulates into 'len' with "+=".
	*/
	#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
	{ \
		static const char *copyname[] = \
		{ "zero", "single", "double", "triple" }; \
		int len = 0; \
		int copies = 0; \
		\
		if ((bp) == NULL) { \
			len += func((buf) + len, (size) - len, "<NULL>"); \
		} else if (BP_IS_HOLE(bp)) { \
			len += func((buf) + len, (size) - len, \
			"HOLE [L%llu %s] " \
			"size=%llxL birth=%lluL", \
			(u_longlong_t)BP_GET_LEVEL(bp), \
			type, \
			(u_longlong_t)BP_GET_LSIZE(bp), \
			(u_longlong_t)(bp)->blk_birth); \
		} else if (BP_IS_EMBEDDED(bp)) { \
			len += func((buf) + len, (size) - len, \
			"EMBEDDED [L%llu %s] et=%u %s " \
			"size=%llxL/%llxP birth=%lluL", \
			(u_longlong_t)BP_GET_LEVEL(bp), \
			type, \
			(int)BPE_GET_ETYPE(bp), \
			compress, \
			(u_longlong_t)BPE_GET_LSIZE(bp), \
			(u_longlong_t)BPE_GET_PSIZE(bp), \
			(u_longlong_t)(bp)->blk_birth); \
		} else { \
			for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
				const dva_t *dva = &(bp)->blk_dva[d]; \
				if (DVA_IS_VALID(dva)) \
					copies++; \
				len += func((buf) + len, (size) - len, \
				"DVA[%d]=<%llu:%llx:%llx>%c", d, \
				(u_longlong_t)DVA_GET_VDEV(dva), \
				(u_longlong_t)DVA_GET_OFFSET(dva), \
				(u_longlong_t)DVA_GET_ASIZE(dva), \
				ws); \
			} \
			/* a gang bp whose third DVA is at most half the second counts one copy less */ \
			if (BP_IS_GANG(bp) && \
			DVA_GET_ASIZE(&(bp)->blk_dva[2]) <= \
			DVA_GET_ASIZE(&(bp)->blk_dva[1]) / 2) \
				copies--; \
			len += func((buf) + len, (size) - len, \
			"[L%llu %s] %s %s %s %s %s %s%c" \
			"size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
			"cksum=%llx:%llx:%llx:%llx", \
			(u_longlong_t)BP_GET_LEVEL(bp), \
			type, \
			checksum, \
			compress, \
			BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
			BP_IS_GANG(bp) ? "gang" : "contiguous", \
			BP_GET_DEDUP(bp) ? "dedup" : "unique", \
			copyname[copies], \
			ws, \
			(u_longlong_t)BP_GET_LSIZE(bp), \
			(u_longlong_t)BP_GET_PSIZE(bp), \
			(u_longlong_t)(bp)->blk_birth, \
			(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
			(u_longlong_t)BP_GET_FILL(bp), \
			ws, \
			(u_longlong_t)(bp)->blk_cksum.zc_word[0], \
			(u_longlong_t)(bp)->blk_cksum.zc_word[1], \
			(u_longlong_t)(bp)->blk_cksum.zc_word[2], \
			(u_longlong_t)(bp)->blk_cksum.zc_word[3]); \
		} \
		ASSERT(len < (size)); \
	}

	/* Buffer-contents type for a bp: ARC_BUFC_METADATA for metadata bps, ARC_BUFC_DATA otherwise. */
	#define BP_GET_BUFC_TYPE(bp) \
	(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)

	/* Import type selector; the enumerators are numbered from 0 in this order. */
	typedef enum spa_import_type {
		SPA_IMPORT_EXISTING = 0,
		SPA_IMPORT_ASSEMBLE = 1
	} spa_import_type_t;

	/* state manipulation functions */
	extern int spa_open(const char pool, spa_t , void tag);
	extern int spa_open_rewind(const char pool, spa_t , void tag,
	nvlist_t policy, nvlist_t *config);
	extern int spa_get_stats(const char pool, nvlist_t config, char altroot,
	size_t buflen);
	extern int spa_create(const char pool, nvlist_t config, nvlist_t *props,
	nvlist_t *zplprops);
	#ifdef illumos
	extern int spa_import_rootpool(char devpath, char devid);
	#else
	extern int spa_import_rootpool(const char *name);
	#endif
	extern int spa_import(const char pool, nvlist_t config, nvlist_t *props,
	uint64_t flags);
	extern nvlist_t spa_tryimport(nvlist_t tryconfig);
	extern int spa_destroy(char *pool);
	extern int spa_export(char pool, nvlist_t *oldconfig, boolean_t force,
	boolean_t hardforce);
	extern int spa_reset(char *pool);
	extern void spa_async_request(spa_t *spa, int flag);
	extern void spa_async_unrequest(spa_t *spa, int flag);
	extern void spa_async_suspend(spa_t *spa);
	extern void spa_async_resume(spa_t *spa);
	extern spa_t spa_inject_addref(char pool);
	extern void spa_inject_delref(spa_t *spa);
	extern void spa_scan_stat_init(spa_t *spa);
	extern int spa_scan_get_stats(spa_t spa, pool_scan_stat_t ps);

	/*
	* Async task flags. Each value is a distinct single bit (0x01 .. 0x80), so
	* several can be OR-ed into one mask.
	* NOTE(review): presumably the 'flag' argument of spa_async_request() and
	* spa_async_unrequest() -- confirm against their implementation.
	*/
	#define SPA_ASYNC_CONFIG_UPDATE 0x01
	#define SPA_ASYNC_REMOVE 0x02
	#define SPA_ASYNC_PROBE 0x04
	#define SPA_ASYNC_RESILVER_DONE 0x08
	#define SPA_ASYNC_RESILVER 0x10
	#define SPA_ASYNC_AUTOEXPAND 0x20
	#define SPA_ASYNC_REMOVE_DONE 0x40
	#define SPA_ASYNC_REMOVE_STOP 0x80

	/*
	* Controls the behavior of spa_vdev_remove().
	*/
	#define SPA_REMOVE_UNSPARE 0x01
	#define SPA_REMOVE_DONE 0x02

	/* device manipulation */
	extern int spa_vdev_add(spa_t spa, nvlist_t nvroot);
	extern int spa_vdev_attach(spa_t spa, uint64_t guid, nvlist_t nvroot,
	int replacing);
	extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
	int replace_done);
	extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
	extern boolean_t spa_vdev_remove_active(spa_t *spa);
	extern int spa_vdev_setpath(spa_t spa, uint64_t guid, const char newpath);
	extern int spa_vdev_setfru(spa_t spa, uint64_t guid, const char newfru);
	extern int spa_vdev_split_mirror(spa_t spa, char newname, nvlist_t *config,
	nvlist_t *props, boolean_t exp);

	/* spare state (which is global across all pools) */
	extern void spa_spare_add(vdev_t *vd);
	extern void spa_spare_remove(vdev_t *vd);
	extern boolean_t spa_spare_exists(uint64_t guid, uint64_t pool, int refcnt);
	extern void spa_spare_activate(vdev_t *vd);

	/* L2ARC state (which is global across all pools) */
	extern void spa_l2cache_add(vdev_t *vd);
	extern void spa_l2cache_remove(vdev_t *vd);
	extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
	extern void spa_l2cache_activate(vdev_t *vd);
	extern void spa_l2cache_drop(spa_t *spa);

	/* scanning */
	extern int spa_scan(spa_t *spa, pool_scan_func_t func);
	extern int spa_scan_stop(spa_t *spa);
	extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);

	/* spa syncing */
	extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
	extern void spa_sync_allpools(void);

	/* spa namespace global mutex */
	extern kmutex_t spa_namespace_lock;

	/*
	* SPA configuration functions in spa_config.c
	*/

	#define SPA_CONFIG_UPDATE_POOL 0
	#define SPA_CONFIG_UPDATE_VDEVS 1

	-extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
	+extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
	extern void spa_config_load(void);
	extern nvlist_t spa_all_configs(uint64_t );
	extern void spa_config_set(spa_t spa, nvlist_t config);
	extern nvlist_t spa_config_generate(spa_t spa, vdev_t *vd, uint64_t txg,
	int getstats);
	extern void spa_config_update(spa_t *spa, int what);

	/*
	* Miscellaneous SPA routines in spa_misc.c
	*/

	/* Namespace manipulation */
	extern spa_t spa_lookup(const char name);
	extern spa_t spa_add(const char name, nvlist_t config, const char altroot);
	extern void spa_remove(spa_t *spa);
	extern spa_t spa_next(spa_t prev);

	/* Refcount functions */
	extern void spa_open_ref(spa_t spa, void tag);
	extern void spa_close(spa_t spa, void tag);
	extern void spa_async_close(spa_t spa, void tag);
	extern boolean_t spa_refcount_zero(spa_t *spa);

	/*
	* Lock identifiers passed as the 'locks' argument of the spa_config_*()
	* functions. Each SCL_* lock is a distinct bit so that sets of locks can be
	* OR-ed together. SCL_LOCKS is the number of lock bits, SCL_ALL is the mask
	* with all of them set, and SCL_STATE_ALL is the STATE, L2ARC and ZIO subset.
	*/
	#define SCL_NONE 0x00
	#define SCL_CONFIG 0x01
	#define SCL_STATE 0x02
	#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
	#define SCL_ALLOC 0x08
	#define SCL_ZIO 0x10
	#define SCL_FREE 0x20
	#define SCL_VDEV 0x40
	#define SCL_LOCKS 7
	#define SCL_ALL ((1 << SCL_LOCKS) - 1)
	#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)

	/* Pool configuration locks */
	extern int spa_config_tryenter(spa_t spa, int locks, void tag, krw_t rw);
	extern void spa_config_enter(spa_t spa, int locks, void tag, krw_t rw);
	extern void spa_config_exit(spa_t spa, int locks, void tag);
	extern int spa_config_held(spa_t *spa, int locks, krw_t rw);

	/* Pool vdev add/remove lock */
	extern uint64_t spa_vdev_enter(spa_t *spa);
	extern uint64_t spa_vdev_config_enter(spa_t *spa);
	extern void spa_vdev_config_exit(spa_t spa, vdev_t vd, uint64_t txg,
	int error, char *tag);
	extern int spa_vdev_exit(spa_t spa, vdev_t vd, uint64_t txg, int error);

	/* Pool vdev state change lock */
	extern void spa_vdev_state_enter(spa_t *spa, int oplock);
	extern int spa_vdev_state_exit(spa_t spa, vdev_t vd, int error);

	/* Log state */
	typedef enum spa_log_state {
		SPA_LOG_UNKNOWN = 0, /* unknown log state */
		SPA_LOG_MISSING = 1, /* missing log(s) */
		SPA_LOG_CLEAR = 2, /* clear the log(s) */
		SPA_LOG_GOOD = 3 /* log(s) are good */
	} spa_log_state_t;

	extern spa_log_state_t spa_get_log_state(spa_t *spa);
	extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
	-extern int spa_offline_log(spa_t *spa);
	+extern int spa_reset_logs(spa_t *spa);

	/* Log claim callback */
	extern void spa_claim_notify(zio_t *zio);

	/* Accessor functions */
	extern boolean_t spa_shutting_down(spa_t *spa);
	extern struct dsl_pool spa_get_dsl(spa_t spa);
	extern boolean_t spa_is_initializing(spa_t *spa);
	+extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
	extern blkptr_t spa_get_rootblkptr(spa_t spa);
	extern void spa_set_rootblkptr(spa_t spa, const blkptr_t bp);
	extern void spa_altroot(spa_t , char , size_t);
	extern int spa_sync_pass(spa_t *spa);
	extern char spa_name(spa_t spa);
	extern uint64_t spa_guid(spa_t *spa);
	extern uint64_t spa_load_guid(spa_t *spa);
	extern uint64_t spa_last_synced_txg(spa_t *spa);
	extern uint64_t spa_first_txg(spa_t *spa);
	extern uint64_t spa_syncing_txg(spa_t *spa);
	extern uint64_t spa_final_dirty_txg(spa_t *spa);
	extern uint64_t spa_version(spa_t *spa);
	extern pool_state_t spa_state(spa_t *spa);
	extern spa_load_state_t spa_load_state(spa_t *spa);
	extern uint64_t spa_freeze_txg(spa_t *spa);
	extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
	extern uint64_t spa_get_dspace(spa_t *spa);
	extern uint64_t spa_get_slop_space(spa_t *spa);
	extern void spa_update_dspace(spa_t *spa);
	extern uint64_t spa_version(spa_t *spa);
	extern boolean_t spa_deflate(spa_t *spa);
	extern metaslab_class_t spa_normal_class(spa_t spa);
	extern metaslab_class_t spa_log_class(spa_t spa);
	extern void spa_evicting_os_register(spa_t , objset_t os);
	extern void spa_evicting_os_deregister(spa_t , objset_t os);
	extern void spa_evicting_os_wait(spa_t *spa);
	extern int spa_max_replication(spa_t *spa);
	extern int spa_prev_software_version(spa_t *spa);
	extern int spa_busy(void);
	extern uint8_t spa_get_failmode(spa_t *spa);
	extern boolean_t spa_suspended(spa_t *spa);
	extern uint64_t spa_bootfs(spa_t *spa);
	extern uint64_t spa_delegation(spa_t *spa);
	extern objset_t spa_meta_objset(spa_t spa);
	extern uint64_t spa_deadman_synctime(spa_t *spa);

	/* Miscellaneous support routines */
	extern void spa_activate_mos_feature(spa_t spa, const char feature,
	dmu_tx_t *tx);
	extern void spa_deactivate_mos_feature(spa_t spa, const char feature);
	extern int spa_rename(const char oldname, const char newname);
	extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
	extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
	extern char spa_strdup(const char );
	extern void spa_strfree(char *);
	extern uint64_t spa_get_random(uint64_t range);
	extern uint64_t spa_generate_guid(spa_t *spa);
	extern void snprintf_blkptr(char buf, size_t buflen, const blkptr_t bp);
	extern void spa_freeze(spa_t *spa);
	extern int spa_change_guid(spa_t *spa);
	extern void spa_upgrade(spa_t *spa, uint64_t version);
	extern void spa_evict_all(void);
	extern vdev_t spa_lookup_by_guid(spa_t spa, uint64_t guid,
	boolean_t l2cache);
	extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
	extern uint64_t dva_get_dsize_sync(spa_t spa, const dva_t dva);
	extern uint64_t bp_get_dsize_sync(spa_t spa, const blkptr_t bp);
	extern uint64_t bp_get_dsize(spa_t spa, const blkptr_t bp);
	extern boolean_t spa_has_slogs(spa_t *spa);
	extern boolean_t spa_is_root(spa_t *spa);
	extern boolean_t spa_writeable(spa_t *spa);
	extern boolean_t spa_has_pending_synctask(spa_t *spa);
	extern int spa_maxblocksize(spa_t *spa);
	extern void zfs_blkptr_verify(spa_t spa, const blkptr_t bp);
	+typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
	+ void *arg);
	+extern boolean_t spa_remap_blkptr(spa_t spa, blkptr_t bp,
	+ spa_remap_cb_t callback, void *arg);
	+extern uint64_t spa_get_last_removal_txg(spa_t *spa);

	extern int spa_mode(spa_t *spa);
	extern uint64_t zfs_strtonum(const char str, char *nptr);

	extern char *spa_his_ievent_table[];

	extern void spa_history_create_obj(spa_t spa, dmu_tx_t tx);
	extern int spa_history_get(spa_t spa, uint64_t offset, uint64_t *len_read,
	char *his_buf);
	extern int spa_history_log(spa_t spa, const char his_buf);
	extern int spa_history_log_nvl(spa_t spa, nvlist_t nvl);
	extern void spa_history_log_version(spa_t spa, const char operation);
	extern void spa_history_log_internal(spa_t spa, const char operation,
	dmu_tx_t tx, const char fmt, ...);
	extern void spa_history_log_internal_ds(struct dsl_dataset ds, const char op,
	dmu_tx_t tx, const char fmt, ...);
	extern void spa_history_log_internal_dd(dsl_dir_t dd, const char operation,
	dmu_tx_t tx, const char fmt, ...);

	/* error handling */
	struct zbookmark_phys;
	extern void spa_log_error(spa_t spa, zio_t zio);
	extern void zfs_ereport_post(const char cls, spa_t spa, vdev_t *vd,
	zio_t *zio, uint64_t stateoroffset, uint64_t length);
	extern void zfs_post_remove(spa_t spa, vdev_t vd);
	extern void zfs_post_state_change(spa_t spa, vdev_t vd);
	extern void zfs_post_autoreplace(spa_t spa, vdev_t vd);
	extern uint64_t spa_get_errlog_size(spa_t *spa);
	extern int spa_get_errlog(spa_t spa, void uaddr, size_t *count);
	extern void spa_errlog_rotate(spa_t *spa);
	extern void spa_errlog_drain(spa_t *spa);
	extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
	extern void spa_get_errlists(spa_t spa, avl_tree_t last, avl_tree_t *scrub);

	/* vdev cache */
	extern void vdev_cache_stat_init(void);
	extern void vdev_cache_stat_fini(void);

	/* Initialization and termination */
	extern void spa_init(int flags);
	extern void spa_fini(void);
	extern void spa_boot_init(void);

	/* properties */
	extern int spa_prop_set(spa_t spa, nvlist_t nvp);
	extern int spa_prop_get(spa_t spa, nvlist_t *nvp);
	extern void spa_prop_clear_bootfs(spa_t spa, uint64_t obj, dmu_tx_t tx);
	extern void spa_configfile_set(spa_t , nvlist_t , boolean_t);

	/* asynchronous event notification */
	extern void spa_event_notify(spa_t spa, vdev_t vdev, nvlist_t *hist_nvl,
	const char *name);
	+extern sysevent_t spa_event_create(spa_t spa, vdev_t vd, nvlist_t hist_nvl,
	+ const char *name);
	+extern void spa_event_post(sysevent_t *ev);
	+extern void spa_event_discard(sysevent_t *ev);

	/*
	* dprintf_bp(bp, fmt, ...): debug-print a message followed by a formatted bp.
	* With ZFS_DEBUG defined and ZFS_DEBUG_DPRINTF set in zfs_flags, it allocates
	* a BP_SPRINTF_LEN-byte buffer, formats 'bp' into it with snprintf_blkptr(),
	* calls dprintf() with 'fmt' extended by " %s\n", and frees the buffer.
	* 'fmt' must be a string literal (it is concatenated with " %s\n") and at
	* least one variadic argument is required (it is followed by a comma).
	* Without ZFS_DEBUG the macro expands to nothing.
	*/
	#ifdef ZFS_DEBUG
	#define dprintf_bp(bp, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
	kmem_free(__blkbuf, BP_SPRINTF_LEN); \
	} \
	_NOTE(CONSTCOND) } while (0)
	#else
	#define dprintf_bp(bp, fmt, ...)
	#endif

	extern boolean_t spa_debug_enabled(spa_t *spa);
	/*
	* Forward the variadic arguments to zfs_dbgmsg() only when
	* spa_debug_enabled(spa) is true.
	* Wrapped in do { } while (0) so that "spa_dbgmsg(...);" is a single
	* statement and is safe in an unbraced if/else.
	*/
	#define spa_dbgmsg(spa, ...) do { \
		if (spa_debug_enabled(spa)) \
			zfs_dbgmsg(__VA_ARGS__); \
	_NOTE(CONSTCOND) } while (0)

	extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_SPA_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 332525)
	@@ -1,320 +1,389 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2013 Saso Kiselkov. All rights reserved.
	* Copyright (c) 2017 Datto Inc.
	*/

	#ifndef _SYS_SPA_IMPL_H
	#define _SYS_SPA_IMPL_H

	#include <sys/spa.h>
	#include <sys/vdev.h>
	+#include <sys/vdev_removal.h>
	#include <sys/metaslab.h>
	#include <sys/dmu.h>
	#include <sys/dsl_pool.h>
	#include <sys/uberblock_impl.h>
	#include <sys/zfs_context.h>
	#include <sys/avl.h>
	#include <sys/refcount.h>
	#include <sys/bplist.h>
	#include <sys/bpobj.h>
	#include <sys/zfeature.h>
	#include <zfeature_common.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* One error record: a bookmark, a name string and the AVL node that links
	* the record into a tree.
	* NOTE(review): presumably the element type of the spa_errlist_* AVL trees
	* in struct spa -- confirm against the error-log code.
	*/
	typedef struct spa_error_entry {
	zbookmark_phys_t se_bookmark;
	char *se_name;
	avl_node_t se_avl;
	} spa_error_entry_t;

	/*
	* Bookkeeping offsets and counters for the pool history log; every member
	* is a uint64_t.
	* NOTE(review): the "_phys" suffix suggests this is the on-disk layout --
	* confirm before reordering or resizing members.
	*/
	typedef struct spa_history_phys {
	uint64_t sh_pool_create_len; /* ending offset of zpool create */
	uint64_t sh_phys_max_off; /* physical EOF */
	uint64_t sh_bof; /* logical BOF */
	uint64_t sh_eof; /* logical EOF */
	uint64_t sh_records_lost; /* num of records overwritten */
	} spa_history_phys_t;

	+/*
	+ * All members must be uint64_t, for byteswap purposes.
	+ */
	+typedef struct spa_removing_phys {
	+ uint64_t sr_state; /* dsl_scan_state_t */
	+
	+ /*
	+ * The vdev ID that we most recently attempted to remove,
	+ * or -1 if no removal has been attempted.
	+ */
	+ uint64_t sr_removing_vdev;
	+
	+ /*
	+ * The vdev ID that we most recently successfully removed,
	+ * or -1 if no devices have been removed.
	+ */
	+ uint64_t sr_prev_indirect_vdev;
	+
	+ uint64_t sr_start_time; /* NOTE(review): presumably when the removal started; units not visible here */
	+ uint64_t sr_end_time; /* NOTE(review): presumably when the removal ended; units not visible here */
	+
	+ /*
	+ * Note that we can not use the space map's or indirect mapping's
	+ * accounting as a substitute for these values, because we need to
	+ * count frees of not-yet-copied data as though it did the copy.
	+ * Otherwise, we could get into a situation where copied > to_copy,
	+ * or we complete before copied == to_copy.
	+ */
	+ uint64_t sr_to_copy; /* bytes that need to be copied */
	+ uint64_t sr_copied; /* bytes that have been copied or freed */
	+} spa_removing_phys_t;
	+
	+/*
	+ * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT
	+ * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense
	+ * of an indirect vdev's mapping object is in progress.
	+ * All three members are uint64_t.
	+ */
	+typedef struct spa_condensing_indirect_phys {
	+ /*
	+ * The vdev ID of the indirect vdev whose indirect mapping is
	+ * being condensed.
	+ */
	+ uint64_t scip_vdev;
	+
	+ /*
	+ * The vdev's old obsolete spacemap. This spacemap's contents are
	+ * being integrated into the new mapping.
	+ */
	+ uint64_t scip_prev_obsolete_sm_object;
	+
	+ /*
	+ * The new mapping object that is being created.
	+ */
	+ uint64_t scip_next_mapping_object;
	+} spa_condensing_indirect_phys_t;
	+
	struct spa_aux_vdev {
	uint64_t sav_object; /* MOS object for device list */
	nvlist_t sav_config; / cached device config */
	vdev_t *sav_vdevs; / devices */
	int sav_count; /* number devices */
	boolean_t sav_sync; /* sync the device list */
	nvlist_t *sav_pending; / pending device additions */
	uint_t sav_npending; /* # pending devices */
	};

	/*
	* State of one pool configuration lock.
	* NOTE(review): the per-field notes below are inferred from the member
	* names and types only -- confirm against the spa_config_*() code.
	*/
	typedef struct spa_config_lock {
	kmutex_t scl_lock; /* mutex; presumably guards the fields below */
	kthread_t *scl_writer; /* presumably the thread holding the lock as writer */
	int scl_write_wanted; /* presumably a count of waiting writers */
	kcondvar_t scl_cv; /* condition variable paired with scl_lock */
	refcount_t scl_count; /* presumably the number of current holders */
	} spa_config_lock_t;

	/*
	* List element carrying one path string.
	* NOTE(review): presumably an entry of spa_config_list ("previous cache
	* file(s)") in struct spa -- confirm.
	*/
	typedef struct spa_config_dirent {
	list_node_t scd_link;
	char *scd_path;
	} spa_config_dirent_t;

	/*
	* Taskq kinds. ZIO_TASKQ_TYPES is the number of kinds and sizes the second
	* dimension of spa_zio_taskq[][] in struct spa.
	*/
	typedef enum zio_taskq_type {
		ZIO_TASKQ_ISSUE = 0,
		ZIO_TASKQ_ISSUE_HIGH = 1,
		ZIO_TASKQ_INTERRUPT = 2,
		ZIO_TASKQ_INTERRUPT_HIGH = 3,
		ZIO_TASKQ_TYPES = 4
	} zio_taskq_type_t;

	/*
	* State machine for the zpool-poolname process. The state transitions
	* happen as follows:
	*
	* From To Routine
	* PROC_NONE -> PROC_CREATED spa_activate()
	* PROC_CREATED -> PROC_ACTIVE spa_thread()
	* PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
	* PROC_DEACTIVATE -> PROC_GONE spa_thread()
	* PROC_GONE -> PROC_NONE spa_deactivate()
	*/
	typedef enum spa_proc_state {
		SPA_PROC_NONE = 0, /* spa_proc = &p0, no process created */
		SPA_PROC_CREATED = 1, /* spa_activate() has proc, is waiting */
		SPA_PROC_ACTIVE = 2, /* taskqs created, spa_proc set */
		SPA_PROC_DEACTIVATE = 3, /* spa_deactivate() requests process exit */
		SPA_PROC_GONE = 4 /* spa_thread() is exiting, spa_proc = &p0 */
	} spa_proc_state_t;

	/*
	* A set of taskqs; struct spa keeps one per (ZIO type, taskq type) pair in
	* spa_zio_taskq[][].
	*/
	typedef struct spa_taskqs {
	uint_t stqs_count; /* presumably the number of entries in stqs_taskq -- confirm */
	taskq_t **stqs_taskq; /* array of taskq pointers */
	} spa_taskqs_t;

	/* Pending action on the all-vdev ZAP (AVZ); stored in spa_avz_action. */
	typedef enum spa_all_vdev_zap_action {
		AVZ_ACTION_NONE = 0,
		AVZ_ACTION_DESTROY = 1, /* Destroy all per-vdev ZAPs and the AVZ. */
		AVZ_ACTION_REBUILD = 2, /* Populate the new AVZ, see spa_avz_rebuild */
		AVZ_ACTION_INITIALIZE = 3
	} spa_avz_action_t;

	struct spa {
	/*
	* Fields protected by spa_namespace_lock.
	*/
	char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
	char spa_comment; / comment */
	avl_node_t spa_avl; /* node in spa_namespace_avl */
	nvlist_t spa_config; / last synced config */
	nvlist_t spa_config_syncing; / currently syncing config */
	nvlist_t spa_config_splitting; / config for splitting */
	nvlist_t spa_load_info; / info and errors from load */
	uint64_t spa_config_txg; /* txg of last config change */
	int spa_sync_pass; /* iterate-to-convergence */
	pool_state_t spa_state; /* pool state */
	int spa_inject_ref; /* injection references */
	uint8_t spa_sync_on; /* sync threads are running */
	spa_load_state_t spa_load_state; /* current load operation */
	+ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
	uint64_t spa_import_flags; /* import specific flags */
	spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
	dsl_pool_t *spa_dsl_pool;
	boolean_t spa_is_initializing; /* true while opening pool */
	metaslab_class_t spa_normal_class; / normal data class */
	metaslab_class_t spa_log_class; / intent log data class */
	uint64_t spa_first_txg; /* first txg after spa_open() */
	uint64_t spa_final_txg; /* txg of export/destroy */
	uint64_t spa_freeze_txg; /* freeze pool at this txg */
	uint64_t spa_load_max_txg; /* best initial ub_txg */
	uint64_t spa_claim_max_txg; /* highest claimed birth txg */
	timespec_t spa_loaded_ts; /* 1st successful open time */
	objset_t spa_meta_objset; / copy of dp->dp_meta_objset */
	kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
	list_t spa_evicting_os_list; /* Objsets being evicted. */
	kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
	txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
	vdev_t spa_root_vdev; / top-level vdev container */
	int spa_min_ashift; /* of vdevs in normal class */
	int spa_max_ashift; /* of vdevs in normal class */
	uint64_t spa_config_guid; /* config pool guid */
	uint64_t spa_load_guid; /* spa_load initialized guid */
	uint64_t spa_last_synced_guid; /* last synced guid */
	list_t spa_config_dirty_list; /* vdevs with dirty config */
	list_t spa_state_dirty_list; /* vdevs with dirty state */
	kmutex_t spa_alloc_lock;
	avl_tree_t spa_alloc_tree;
	spa_aux_vdev_t spa_spares; /* hot spares */
	spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
	nvlist_t spa_label_features; / Features for reading MOS */
	uint64_t spa_config_object; /* MOS object for pool config */
	uint64_t spa_config_generation; /* config generation number */
	uint64_t spa_syncing_txg; /* txg currently syncing */
	bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
	bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
	zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
	/* checksum context templates */
	kmutex_t spa_cksum_tmpls_lock;
	void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
	uberblock_t spa_ubsync; /* last synced uberblock */
	uberblock_t spa_uberblock; /* current uberblock */
	boolean_t spa_extreme_rewind; /* rewind past deferred frees */
	uint64_t spa_last_io; /* lbolt of last non-scan I/O */
	kmutex_t spa_scrub_lock; /* resilver/scrub lock */
	uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
	kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
	uint8_t spa_scrub_active; /* active or suspended? */
	uint8_t spa_scrub_type; /* type of scrub we're doing */
	uint8_t spa_scrub_finished; /* indicator to rotate logs */
	uint8_t spa_scrub_started; /* started since last boot */
	uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
	uint64_t spa_scan_pass_start; /* start time per pass/reboot */
	uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
	uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
	uint64_t spa_scan_pass_exam; /* examined bytes per pass */
	kmutex_t spa_async_lock; /* protect async state */
	kthread_t spa_async_thread; / thread doing async task */
	kthread_t spa_async_thread_vd; / thread doing vd async task */
	int spa_async_suspended; /* async tasks suspended */
	kcondvar_t spa_async_cv; /* wait for thread_exit() */
	uint16_t spa_async_tasks; /* async task mask */
	+
	+ spa_removing_phys_t spa_removing_phys;
	+ spa_vdev_removal_t *spa_vdev_removal;
	+
	+ spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
	+ spa_condensing_indirect_t *spa_condensing_indirect;
	+ kthread_t spa_condense_thread; / thread doing condense. */
	+
	char spa_root; / alternate root directory */
	uint64_t spa_ena; /* spa-wide ereport ENA */
	int spa_last_open_failed; /* error if last open failed */
	uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
	uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
	uint64_t spa_load_txg; /* ub txg that loaded */
	uint64_t spa_load_txg_ts; /* timestamp from that ub */
	uint64_t spa_load_meta_errors; /* verify metadata err count */
	uint64_t spa_load_data_errors; /* verify data err count */
	uint64_t spa_verify_min_txg; /* start txg of verify scrub */
	kmutex_t spa_errlog_lock; /* error log lock */
	uint64_t spa_errlog_last; /* last error log object */
	uint64_t spa_errlog_scrub; /* scrub error log object */
	kmutex_t spa_errlist_lock; /* error list/ereport lock */
	avl_tree_t spa_errlist_last; /* last error list */
	avl_tree_t spa_errlist_scrub; /* scrub error list */
	uint64_t spa_deflate; /* should we deflate? */
	uint64_t spa_history; /* history object */
	kmutex_t spa_history_lock; /* history lock */
	vdev_t spa_pending_vdev; / pending vdev additions */
	kmutex_t spa_props_lock; /* property lock */
	uint64_t spa_pool_props_object; /* object for properties */
	uint64_t spa_bootfs; /* default boot filesystem */
	uint64_t spa_failmode; /* failure mode for the pool */
	uint64_t spa_delegation; /* delegation on/off */
	list_t spa_config_list; /* previous cache file(s) */
	/* per-CPU array of root of async I/O: */
	zio_t **spa_async_zio_root;
	zio_t spa_suspend_zio_root; / root of all suspended I/O */
	+ zio_t spa_txg_zio[TXG_SIZE]; / spa_sync() waits for this */
	kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
	kcondvar_t spa_suspend_cv; /* notification of resume */
	uint8_t spa_suspended; /* pool is suspended */
	uint8_t spa_claiming; /* pool is doing zil_claim() */
	boolean_t spa_debug; /* debug enabled? */
	boolean_t spa_is_root; /* pool is root */
	int spa_minref; /* num refs when first opened */
	int spa_mode; /* FREAD | FWRITE */
	spa_log_state_t spa_log_state; /* log state */
	uint64_t spa_autoexpand; /* lun expansion on/off */
	uint64_t spa_bootsize; /* efi system partition size */
	ddt_t spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; / in-core DDTs */
	uint64_t spa_ddt_stat_object; /* DDT statistics */
	uint64_t spa_dedup_ditto; /* dedup ditto threshold */
	uint64_t spa_dedup_checksum; /* default dedup checksum */
	uint64_t spa_dspace; /* dspace in normal class */
	kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
	kmutex_t spa_proc_lock; /* protects spa_proc* */
	kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
	spa_proc_state_t spa_proc_state; /* see definition */
	struct proc spa_proc; / "zpool-poolname" process */
	uint64_t spa_did; /* if procp != p0, did of t1 */
	kthread_t spa_trim_thread; / thread sending TRIM I/Os */
	kmutex_t spa_trim_lock; /* protects spa_trim_cv */
	kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
	boolean_t spa_autoreplace; /* autoreplace set in open */
	int spa_vdev_locks; /* locks grabbed */
	uint64_t spa_creation_version; /* version at pool creation */
	uint64_t spa_prev_software_version; /* See ub_software_version */
	uint64_t spa_feat_for_write_obj; /* required to write to pool */
	uint64_t spa_feat_for_read_obj; /* required to read from pool */
	uint64_t spa_feat_desc_obj; /* Feature descriptions */
	uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
	/* cache feature refcounts */
	uint64_t spa_feat_refcount_cache[SPA_FEATURES];
	#ifdef illumos
	cyclic_id_t spa_deadman_cycid; /* cyclic id */
	#else /* !illumos */
	#ifdef _KERNEL
	struct callout spa_deadman_cycid; /* callout id */
	struct task spa_deadman_task;
	#endif
	#endif /* illumos */
	uint64_t spa_deadman_calls; /* number of deadman calls */
	hrtime_t spa_sync_starttime; /* starting time fo spa_sync */
	uint64_t spa_deadman_synctime; /* deadman expiration timer */
	uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
	spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */

	#ifdef illumos
	/*
	* spa_iokstat_lock protects spa_iokstat and
	* spa_queue_stats[].
	*/
	kmutex_t spa_iokstat_lock;
	struct kstat spa_iokstat; / kstat of io to this pool */
	struct {
	int spa_active;
	int spa_queued;
	} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
	#endif
	hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */

	/*
	* spa_refcount & spa_config_lock must be the last elements
	* because refcount_t changes size based on compilation options.
	* In order for the MDB module to function correctly, the other
	* fields must remain in the same location.
	*/
	spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
	refcount_t spa_refcount; /* number of opens */
	#ifndef illumos
	boolean_t spa_splitting_newspa; /* creating new spa in split */
	#endif
	};

	extern const char *spa_config_path;

	extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
	task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
	+extern void spa_load_spares(spa_t *spa);
	+extern void spa_load_l2cache(spa_t *spa);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_SPA_IMPL_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h (revision 332525)
	@@ -1,164 +1,171 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_SPACE_MAP_H
	#define _SYS_SPACE_MAP_H

	#include <sys/avl.h>
	#include <sys/range_tree.h>
	#include <sys/dmu.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* The size of the space map object has increased to include a histogram.
	* The SPACE_MAP_SIZE_V0 designates the original size and is used to
	* maintain backward compatibility.
	*/
	#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
	#define SPACE_MAP_HISTOGRAM_SIZE 32

	/*
	* The space_map_phys is the on-disk representation of the space map.
	* Consumers of space maps should never reference any of the members of this
	* structure directly. These members may only be updated in syncing context.
	*
	* Note the smp_object is no longer used but remains in the structure
	* for backward compatibility.
	*/
	typedef struct space_map_phys {
	uint64_t smp_object; /* on-disk space map object */
	uint64_t smp_objsize; /* size of the object */
	uint64_t smp_alloc; /* space allocated from the map */
	uint64_t smp_pad[5]; /* reserved */

	/*
	* The smp_histogram maintains a histogram of free regions. Each
	* bucket, smp_histogram[i], contains the number of free regions
	* whose size is:
	* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
	*/
	uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
	} space_map_phys_t;

	/*
	* The space map object defines a region of space, its size, how much is
	* allocated, and the on-disk object that stores this information.
	* Consumers of space maps may only access the members of this structure.
	+ *
	+ * Note: the space_map may not be accessed concurrently; consumers
	+ * must provide external locking if required.
	*/
	typedef struct space_map {
	uint64_t sm_start; /* start of map */
	uint64_t sm_size; /* size of map */
	uint8_t sm_shift; /* unit shift */
	uint64_t sm_length; /* synced length */
	uint64_t sm_alloc; /* synced space allocated */
	objset_t *sm_os; /* objset for this map */
	uint64_t sm_object; /* object id for this map */
	uint32_t sm_blksz; /* block size for space map */
	dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
	space_map_phys_t *sm_phys; /* on-disk space map */
	- kmutex_t *sm_lock; /* pointer to lock that protects map */
	} space_map_t;

	/*
	* debug entry
	*
	*    1      3         10                     50
	*  ,---+--------+------------+---------------------------------.
	*  | 1 | action |  syncpass  |        txg (lower bits)         |
	*  `---+--------+------------+---------------------------------'
	*   63  62    60 59        50 49                               0
	*
	*
	* non-debug entry
	*
	*    1               47                   1           15
	*  ,-----------------------------------------------------------.
	*  | 0 |   offset (sm_shift units)    | type |       run       |
	*  `-----------------------------------------------------------'
	*   63  62                          17   16   15               0
	*/

	/* All this stuff takes and returns bytes */
	#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
	#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
	#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
	#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
	#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
	#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
	#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
	#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)

	#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
	#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)

	#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
	#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)

	#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
	#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)

	#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)

	typedef enum {
	SM_ALLOC,
	SM_FREE
	} maptype_t;

	+typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
	+ void *arg);
	+
	int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
	+int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);

	void space_map_histogram_clear(space_map_t *sm);
	void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
	dmu_tx_t *tx);

	void space_map_update(space_map_t *sm);

	uint64_t space_map_object(space_map_t *sm);
	uint64_t space_map_allocated(space_map_t *sm);
	uint64_t space_map_length(space_map_t *sm);

	void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
	dmu_tx_t *tx);
	void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
	uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
	void space_map_free(space_map_t *sm, dmu_tx_t *tx);
	+void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);

	int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
	- uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
	+ uint64_t start, uint64_t size, uint8_t shift);
	void space_map_close(space_map_t *sm);

	int64_t space_map_alloc_delta(space_map_t *sm);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_SPACE_MAP_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h (revision 332525)
	@@ -1,176 +1,180 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2013 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_VDEV_H
	#define _SYS_VDEV_H

	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu.h>
	#include <sys/space_map.h>
	#include <sys/fs/zfs.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	typedef enum vdev_dtl_type {
	DTL_MISSING, /* 0% replication: no copies of the data */
	DTL_PARTIAL, /* less than 100% replication: some copies missing */
	DTL_SCRUB, /* unable to fully repair during scrub/resilver */
	DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
	DTL_TYPES
	} vdev_dtl_type_t;

	extern boolean_t zfs_nocacheflush;
	extern boolean_t zfs_trim_enabled;

	extern int vdev_open(vdev_t *);
	extern void vdev_open_children(vdev_t *);
	extern boolean_t vdev_uses_zvols(vdev_t *);
	extern int vdev_validate(vdev_t *, boolean_t);
	extern void vdev_close(vdev_t *);
	extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
	extern void vdev_reopen(vdev_t *);
	extern int vdev_validate_aux(vdev_t *vd);
	extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
	-
	+extern boolean_t vdev_is_concrete(vdev_t *vd);
	extern boolean_t vdev_is_bootable(vdev_t *vd);
	extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
	extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
	extern int vdev_count_leaves(spa_t *spa);
	extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
	uint64_t txg, uint64_t size);
	extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
	uint64_t txg, uint64_t size);
	extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
	extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
	int scrub_done);
	extern boolean_t vdev_dtl_required(vdev_t *vd);
	extern boolean_t vdev_resilver_needed(vdev_t *vd,
	uint64_t *minp, uint64_t *maxp);
	extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
	dmu_tx_t *tx);
	extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
	extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
	+extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
	+extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
	+ uint64_t size, uint64_t txg);
	+extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
	+ uint64_t offset, uint64_t size, dmu_tx_t *tx);

	extern void vdev_hold(vdev_t *);
	extern void vdev_rele(vdev_t *);

	extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
	extern void vdev_metaslab_fini(vdev_t *vd);
	extern void vdev_metaslab_set_size(vdev_t *);
	extern void vdev_ashift_optimize(vdev_t *);
	extern void vdev_expand(vdev_t *vd, uint64_t txg);
	extern void vdev_split(vdev_t *vd);
	extern void vdev_deadman(vdev_t *vd);
	-

	extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
	extern void vdev_clear_stats(vdev_t *vd);
	extern void vdev_stat_update(zio_t *zio, uint64_t psize);
	extern void vdev_scan_stat_init(vdev_t *vd);
	extern void vdev_propagate_state(vdev_t *vd);
	extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
	vdev_aux_t aux);

	extern void vdev_space_update(vdev_t *vd,
	int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);

	extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);

	extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
	extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
	extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
	vdev_state_t *);
	extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
	extern void vdev_clear(spa_t *spa, vdev_t *vd);

	extern boolean_t vdev_is_dead(vdev_t *vd);
	extern boolean_t vdev_readable(vdev_t *vd);
	extern boolean_t vdev_writeable(vdev_t *vd);
	extern boolean_t vdev_allocatable(vdev_t *vd);
	extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);

	extern void vdev_cache_init(vdev_t *vd);
	extern void vdev_cache_fini(vdev_t *vd);
	extern boolean_t vdev_cache_read(zio_t *zio);
	extern void vdev_cache_write(zio_t *zio);
	extern void vdev_cache_purge(vdev_t *vd);

	extern void vdev_queue_init(vdev_t *vd);
	extern void vdev_queue_fini(vdev_t *vd);
	extern zio_t *vdev_queue_io(zio_t *zio);
	extern void vdev_queue_io_done(zio_t *zio);
	extern int vdev_queue_length(vdev_t *vd);
	extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
	extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);

	extern void vdev_config_dirty(vdev_t *vd);
	extern void vdev_config_clean(vdev_t *vd);
	extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);

	extern void vdev_state_dirty(vdev_t *vd);
	extern void vdev_state_clean(vdev_t *vd);

	typedef enum vdev_config_flag {
	VDEV_CONFIG_SPARE = 1 << 0,
	VDEV_CONFIG_L2CACHE = 1 << 1,
	VDEV_CONFIG_REMOVING = 1 << 2,
	VDEV_CONFIG_MOS = 1 << 3
	} vdev_config_flag_t;

	extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
	extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
	boolean_t getstats, vdev_config_flag_t flags);

	/*
	* Label routines
	*/
	struct uberblock;
	extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
	extern int vdev_label_number(uint64_t psise, uint64_t offset);
	extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
	extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);

	typedef enum {
	VDEV_LABEL_CREATE, /* create/add a new device */
	VDEV_LABEL_REPLACE, /* replace an existing device */
	VDEV_LABEL_SPARE, /* add a new hot spare */
	VDEV_LABEL_REMOVE, /* remove an existing device */
	VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
	VDEV_LABEL_SPLIT /* generating new label for split-off dev */
	} vdev_labeltype_t;

	extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);

	extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_VDEV_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 332525)
	@@ -1,403 +1,496 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_VDEV_IMPL_H
	#define _SYS_VDEV_IMPL_H

	#include <sys/avl.h>
	+#include <sys/bpobj.h>
	#include <sys/dmu.h>
	#include <sys/metaslab.h>
	#include <sys/nvpair.h>
	#include <sys/space_map.h>
	#include <sys/vdev.h>
	#include <sys/dkio.h>
	#include <sys/uberblock_impl.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/vdev_indirect_births.h>
	+#include <sys/vdev_removal.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Virtual device descriptors.
	*
	* All storage pool operations go through the virtual device framework,
	* which provides data replication and I/O scheduling.
	*/

	/*
	* Forward declarations that lots of things need.
	*/
	typedef struct vdev_queue vdev_queue_t;
	typedef struct vdev_cache vdev_cache_t;
	typedef struct vdev_cache_entry vdev_cache_entry_t;
	struct abd;

	extern int zfs_vdev_queue_depth_pct;
	extern uint32_t zfs_vdev_async_write_max_active;

	/*
	* Virtual device operations
	*/
	typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
	uint64_t *logical_ashift, uint64_t *physical_ashift);
	typedef void vdev_close_func_t(vdev_t *vd);
	typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
	typedef void vdev_io_start_func_t(zio_t *zio);
	typedef void vdev_io_done_func_t(zio_t *zio);
	typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
	typedef void vdev_hold_func_t(vdev_t *vd);
	typedef void vdev_rele_func_t(vdev_t *vd);

	+typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
	+ uint64_t offset, uint64_t size, void *arg);
	+typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
	+ vdev_remap_cb_t callback, void *arg);
	+
	typedef struct vdev_ops {
	vdev_open_func_t *vdev_op_open;
	vdev_close_func_t *vdev_op_close;
	vdev_asize_func_t *vdev_op_asize;
	vdev_io_start_func_t *vdev_op_io_start;
	vdev_io_done_func_t *vdev_op_io_done;
	vdev_state_change_func_t *vdev_op_state_change;
	vdev_hold_func_t *vdev_op_hold;
	vdev_rele_func_t *vdev_op_rele;
	+ vdev_remap_func_t *vdev_op_remap;
	char vdev_op_type[16];
	boolean_t vdev_op_leaf;
	} vdev_ops_t;

	/*
	* Virtual device properties
	*/
	struct vdev_cache_entry {
	struct abd *ve_abd;
	uint64_t ve_offset;
	uint64_t ve_lastused;
	avl_node_t ve_offset_node;
	avl_node_t ve_lastused_node;
	uint32_t ve_hits;
	uint16_t ve_missed_update;
	zio_t *ve_fill_io;
	};

	struct vdev_cache {
	avl_tree_t vc_offset_tree;
	avl_tree_t vc_lastused_tree;
	kmutex_t vc_lock;
	};

	typedef struct vdev_queue_class {
	uint32_t vqc_active;

	/*
	* Sorted by offset or timestamp, depending on if the queue is
	* LBA-ordered vs FIFO.
	*/
	avl_tree_t vqc_queued_tree;
	} vdev_queue_class_t;

	struct vdev_queue {
	vdev_t *vq_vdev;
	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
	avl_tree_t vq_active_tree;
	avl_tree_t vq_read_offset_tree;
	avl_tree_t vq_write_offset_tree;
	uint64_t vq_last_offset;
	hrtime_t vq_io_complete_ts; /* time last i/o completed */
	kmutex_t vq_lock;
	uint64_t vq_lastoffset;
	};

	/*
	+ * On-disk indirect vdev state.
	+ *
	+ * An indirect vdev is described exclusively in the MOS config of a pool.
	+ * The config for an indirect vdev includes several fields, which are
	+ * accessed in memory by a vdev_indirect_config_t.
	+ */
	+typedef struct vdev_indirect_config {
	+ /*
	+ * Object (in MOS) which contains the indirect mapping. This object
	+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
	+ * vimep_src. The bonus buffer for this object is a
	+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
	+ * removal is initiated.
	+ *
	+ * Note that this object can be empty if none of the data on the vdev
	+ * has been copied yet.
	+ */
	+ uint64_t vic_mapping_object;
	+
	+ /*
	+ * Object (in MOS) which contains the birth times for the mapping
	+ * entries. This object contains an array of
	+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
	+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
	+ * is allocated when a vdev removal is initiated.
	+ *
	+ * Note that this object can be empty if none of the vdev has yet been
	+ * copied.
	+ */
	+ uint64_t vic_births_object;
	+
	+ /*
	+ * This is the vdev ID which was removed previous to this vdev, or
	+ * UINT64_MAX if there are no previously removed vdevs.
	+ */
	+ uint64_t vic_prev_indirect_vdev;
	+} vdev_indirect_config_t;
	+
	+/*
	* Virtual device descriptor
	*/
	struct vdev {
	/*
	* Common to all vdev types.
	*/
	uint64_t vdev_id; /* child number in vdev parent */
	uint64_t vdev_guid; /* unique ID for this vdev */
	uint64_t vdev_guid_sum; /* self guid + all child guids */
	uint64_t vdev_orig_guid; /* orig. guid prior to remove */
	uint64_t vdev_asize; /* allocatable device capacity */
	uint64_t vdev_min_asize; /* min acceptable asize */
	uint64_t vdev_max_asize; /* max acceptable asize */
	uint64_t vdev_ashift; /* block alignment shift */
	/*
	* Logical block alignment shift
	*
	* The smallest sized/aligned I/O supported by the device.
	*/
	uint64_t vdev_logical_ashift;
	/*
	* Physical block alignment shift
	*
	* The device supports logical I/Os with vdev_logical_ashift
	* size/alignment, but optimum performance will be achieved by
	* aligning/sizing requests to vdev_physical_ashift. Smaller
	* requests may be inflated or incur device level read-modify-write
	* operations.
	*
	* May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
	*/
	uint64_t vdev_physical_ashift;
	uint64_t vdev_state; /* see VDEV_STATE_* #defines */
	uint64_t vdev_prevstate; /* used when reopening a vdev */
	vdev_ops_t *vdev_ops; /* vdev operations */
	spa_t *vdev_spa; /* spa for this vdev */
	void *vdev_tsd; /* type-specific data */
	vnode_t *vdev_name_vp; /* vnode for pathname */
	vnode_t *vdev_devid_vp; /* vnode for devid */
	vdev_t *vdev_top; /* top-level vdev */
	vdev_t *vdev_parent; /* parent vdev */
	vdev_t **vdev_child; /* array of children */
	uint64_t vdev_children; /* number of children */
	vdev_stat_t vdev_stat; /* virtual device statistics */
	boolean_t vdev_expanding; /* expand the vdev? */
	boolean_t vdev_reopening; /* reopen in progress? */
	int vdev_open_error; /* error on last open */
	kthread_t *vdev_open_thread; /* thread opening children */
	uint64_t vdev_crtxg; /* txg when top-level was added */

	/*
	* Top-level vdev state.
	*/
	uint64_t vdev_ms_array; /* metaslab array object */
	uint64_t vdev_ms_shift; /* metaslab size shift */
	uint64_t vdev_ms_count; /* number of metaslabs */
	metaslab_group_t *vdev_mg; /* metaslab group */
	metaslab_t **vdev_ms; /* metaslab array */
	txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
	txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
	txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
	boolean_t vdev_remove_wanted; /* async remove wanted? */
	boolean_t vdev_probe_wanted; /* async probe wanted? */
	list_node_t vdev_config_dirty_node; /* config dirty list */
	list_node_t vdev_state_dirty_node; /* state dirty list */
	uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
	uint64_t vdev_islog; /* is an intent log device */
	uint64_t vdev_removing; /* device is being removed? */
	boolean_t vdev_ishole; /* is a hole in the namespace */
	kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
	uint64_t vdev_top_zap;

	/*
	+ * Values stored in the config for an indirect or removing vdev.
	+ */
	+ vdev_indirect_config_t vdev_indirect_config;
	+
	+ /*
	+ * The vdev_indirect_rwlock protects the vdev_indirect_mapping
	+ * pointer from changing on indirect vdevs (when it is condensed).
	+ * Note that removing (not yet indirect) vdevs have different
	+ * access patterns (the mapping is not accessed from open context,
	+ * e.g. from zio_read) and locking strategy (e.g. svr_lock).
	+ */
	+ krwlock_t vdev_indirect_rwlock;
	+ vdev_indirect_mapping_t *vdev_indirect_mapping;
	+ vdev_indirect_births_t *vdev_indirect_births;
	+
	+ /*
	+ * In memory data structures used to manage the obsolete sm, for
	+ * indirect or removing vdevs.
	+ *
	+ * The vdev_obsolete_segments is the in-core record of the segments
	+ * that are no longer referenced anywhere in the pool (due to
	+ * being freed or remapped and not referenced by any snapshots).
	+ * During a sync, segments are added to vdev_obsolete_segments
	+ * via vdev_indirect_mark_obsolete(); at the end of each sync
	+ * pass, this is appended to vdev_obsolete_sm via
	+ * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
	+ * protects against concurrent modifications of vdev_obsolete_segments
	+ * from multiple zio threads.
	+ */
	+ kmutex_t vdev_obsolete_lock;
	+ range_tree_t *vdev_obsolete_segments;
	+ space_map_t *vdev_obsolete_sm;
	+
	+ /*
	* The queue depth parameters determine how many async writes are
	* still pending (i.e. allocated by net yet issued to disk) per
	* top-level (vdev_async_write_queue_depth) and the maximum allowed
	* (vdev_max_async_write_queue_depth). These values only apply to
	* top-level vdevs.
	*/
	uint64_t vdev_async_write_queue_depth;
	uint64_t vdev_max_async_write_queue_depth;

	/*
	* Leaf vdev state.
	*/
	range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
	space_map_t *vdev_dtl_sm; /* dirty time log space map */
	txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
	uint64_t vdev_dtl_object; /* DTL object */
	uint64_t vdev_psize; /* physical device capacity */
	uint64_t vdev_wholedisk; /* true if this is a whole disk */
	uint64_t vdev_offline; /* persistent offline state */
	uint64_t vdev_faulted; /* persistent faulted state */
	uint64_t vdev_degraded; /* persistent degraded state */
	uint64_t vdev_removed; /* persistent removed state */
	uint64_t vdev_resilver_txg; /* persistent resilvering state */
	uint64_t vdev_nparity; /* number of parity devices for raidz */
	char *vdev_path; /* vdev path (if any) */
	char *vdev_devid; /* vdev devid (if any) */
	char *vdev_physpath; /* vdev device path (if any) */
	char *vdev_fru; /* physical FRU location */
	uint64_t vdev_not_present; /* not present during import */
	uint64_t vdev_unspare; /* unspare when resilvering done */
	boolean_t vdev_nowritecache; /* true if flushwritecache failed */
	boolean_t vdev_notrim; /* true if trim failed */
	boolean_t vdev_checkremove; /* temporary online test */
	boolean_t vdev_forcefault; /* force online fault */
	boolean_t vdev_splitting; /* split or repair in progress */
	boolean_t vdev_delayed_close; /* delayed device close? */
	boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
	boolean_t vdev_detached; /* device detached? */
	boolean_t vdev_cant_read; /* vdev is failing all reads */
	boolean_t vdev_cant_write; /* vdev is failing all writes */
	boolean_t vdev_isspare; /* was a hot spare */
	boolean_t vdev_isl2cache; /* was a l2cache device */
	vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
	vdev_cache_t vdev_cache; /* physical block cache */
	spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
	zio_t *vdev_probe_zio; /* root of current probe */
	vdev_aux_t vdev_label_aux; /* on-disk aux state */
	struct trim_map *vdev_trimmap; /* map on outstanding trims */
	uint16_t vdev_rotation_rate; /* rotational rate of the media */
	#define VDEV_RATE_UNKNOWN 0
	#define VDEV_RATE_NON_ROTATING 1
	uint64_t vdev_leaf_zap;

	/*
	* For DTrace to work in userland (libzpool) context, these fields must
	* remain at the end of the structure. DTrace will use the kernel's
	* CTF definition for 'struct vdev', and since the size of a kmutex_t is
	* larger in userland, the offsets for the rest of the fields would be
	* incorrect.
	*/
	kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
	kmutex_t vdev_stat_lock; /* vdev_stat */
	kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
	};

	#define VDEV_RAIDZ_MAXPARITY 3

	#define VDEV_PAD_SIZE (8 << 10)
	/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
	#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
	#define VDEV_PHYS_SIZE (112 << 10)
	#define VDEV_UBERBLOCK_RING (128 << 10)

	/* The largest uberblock we support is 8k. */
	#define MAX_UBERBLOCK_SHIFT (13)
	#define VDEV_UBERBLOCK_SHIFT(vd) \
	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
	MAX_UBERBLOCK_SHIFT)
	#define VDEV_UBERBLOCK_COUNT(vd) \
	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
	#define VDEV_UBERBLOCK_OFFSET(vd, n) \
	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
	#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))

	typedef struct vdev_phys {
	char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
	zio_eck_t vp_zbt;
	} vdev_phys_t;

	typedef struct vdev_label {
	char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
	char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
	vdev_phys_t vl_vdev_phys; /* 112K */
	char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
	} vdev_label_t; /* 256K total */

	/*
	* vdev_dirty() flags
	*/
	#define VDD_METASLAB 0x01
	#define VDD_DTL 0x02

	/* Offset of embedded boot loader region on each label */
	#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
	/*
	* Size of embedded boot loader region on each label.
	* The total size of the first two labels plus the boot area is 4MB.
	*/
	#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */

	/*
	* Size of label regions at the start and end of each leaf device.
	*/
	#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
	#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
	#define VDEV_LABELS 4
	#define VDEV_BEST_LABEL VDEV_LABELS

	#define VDEV_ALLOC_LOAD 0
	#define VDEV_ALLOC_ADD 1
	#define VDEV_ALLOC_SPARE 2
	#define VDEV_ALLOC_L2CACHE 3
	#define VDEV_ALLOC_ROOTPOOL 4
	#define VDEV_ALLOC_SPLIT 5
	#define VDEV_ALLOC_ATTACH 6

	/*
	* Allocate or free a vdev
	*/
	extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
	vdev_ops_t *ops);
	extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
	vdev_t *parent, uint_t id, int alloctype);
	extern void vdev_free(vdev_t *vd);

	/*
	* Add or remove children and parents
	*/
	extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
	extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
	extern void vdev_compact_children(vdev_t *pvd);
	extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
	extern void vdev_remove_parent(vdev_t *cvd);

	/*
	* vdev sync load and sync
	*/
	extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
	extern boolean_t vdev_log_state_valid(vdev_t *vd);
	-extern void vdev_load(vdev_t *vd);
	+extern int vdev_load(vdev_t *vd);
	extern int vdev_dtl_load(vdev_t *vd);
	extern void vdev_sync(vdev_t *vd, uint64_t txg);
	extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
	extern void vdev_dirty(vdev_t vd, int flags, void arg, uint64_t txg);
	extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);

	/*
	* Available vdev types.
	*/
	extern vdev_ops_t vdev_root_ops;
	extern vdev_ops_t vdev_mirror_ops;
	extern vdev_ops_t vdev_replacing_ops;
	extern vdev_ops_t vdev_raidz_ops;
	#ifdef _KERNEL
	extern vdev_ops_t vdev_geom_ops;
	#else
	extern vdev_ops_t vdev_disk_ops;
	#endif
	extern vdev_ops_t vdev_file_ops;
	extern vdev_ops_t vdev_missing_ops;
	extern vdev_ops_t vdev_hole_ops;
	extern vdev_ops_t vdev_spare_ops;
	+extern vdev_ops_t vdev_indirect_ops;

	/*
	* Common size functions
	*/
	extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
	extern uint64_t vdev_get_min_asize(vdev_t *vd);
	extern void vdev_set_min_asize(vdev_t *vd);

	/*
	* Global variables
	*/
	/* zdb uses this tunable, so it must be declared here to make lint happy. */
	extern int zfs_vdev_cache_size;
	extern uint_t zfs_geom_probe_vdev_key;
	+
	+/*
	+ * Functions from vdev_indirect.c
	+ */
	+extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
	+extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
	+extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
	+extern int vdev_obsolete_sm_object(vdev_t *vd);
	+extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);

	#ifdef illumos
	/*
	* The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
	*/
	typedef struct vdev_buf {
	buf_t vb_buf; /* buffer that describes the io */
	zio_t *vb_io; /* pointer back to the original zio_t */
	} vdev_buf_t;
	#endif

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_VDEV_IMPL_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h (revision 332525)
	@@ -0,0 +1,80 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2015 by Delphix. All rights reserved.
	+ */
	+
	+#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H
	+#define _SYS_VDEV_INDIRECT_BIRTHS_H
	+
	+#include <sys/dmu.h>
	+#include <sys/spa.h>
	+
	+#ifdef __cplusplus
	+extern "C" {
	+#endif
	+
	+typedef struct vdev_indirect_birth_entry_phys {
	+ uint64_t vibe_offset;
	+ uint64_t vibe_phys_birth_txg;
	+} vdev_indirect_birth_entry_phys_t;
	+
	+typedef struct vdev_indirect_birth_phys {
	+ uint64_t vib_count; /* count of v_i_b_entry_phys_t's */
	+} vdev_indirect_birth_phys_t;
	+
	+typedef struct vdev_indirect_births {
	+ uint64_t vib_object;
	+
	+ /*
	+ * Each entry indicates that everything up to but not including
	+ * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted
	+ * by increasing phys_birth, and also by increasing offset. See
	+ * vdev_indirect_births_physbirth for usage.
	+ */
	+ vdev_indirect_birth_entry_phys_t *vib_entries;
	+
	+ objset_t *vib_objset;
	+
	+ dmu_buf_t *vib_dbuf;
	+ vdev_indirect_birth_phys_t *vib_phys;
	+} vdev_indirect_births_t;
	+
	+extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os,
	+ uint64_t object);
	+extern void vdev_indirect_births_close(vdev_indirect_births_t *vib);
	+extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib);
	+extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx);
	+extern void vdev_indirect_births_free(objset_t *os, uint64_t object,
	+ dmu_tx_t *tx);
	+
	+extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib);
	+extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib);
	+
	+extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
	+ uint64_t offset, uint64_t txg, dmu_tx_t *tx);
	+
	+extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib,
	+ uint64_t offset, uint64_t asize);
	+
	+extern uint64_t vdev_indirect_births_last_entry_txg(
	+ vdev_indirect_births_t *vib);
	+
	+#ifdef __cplusplus
	+}
	+#endif
	+
	+#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h (revision 332525)
	@@ -0,0 +1,141 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2015 by Delphix. All rights reserved.
	+ */
	+
	+#ifndef _SYS_VDEV_INDIRECT_MAPPING_H
	+#define _SYS_VDEV_INDIRECT_MAPPING_H
	+
	+#include <sys/dmu.h>
	+#include <sys/list.h>
	+#include <sys/spa.h>
	+#include <sys/space_map.h>
	+
	+#ifdef __cplusplus
	+extern "C" {
	+#endif
	+
	+typedef struct vdev_indirect_mapping_entry_phys {
	+ /*
	+ * Decode with DVA_MAPPING_* macros.
	+ * Contains:
	+ * the source offset (low 63 bits)
	+ * the one-bit "mark", used for garbage collection (by zdb)
	+ */
	+ uint64_t vimep_src;
	+
	+ /*
	+ * Note: the DVA's asize is 24 bits, and can thus store ranges
	+ * up to 8GB.
	+ */
	+ dva_t vimep_dst;
	+} vdev_indirect_mapping_entry_phys_t;
	+
	+#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
	+ BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
	+#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
	+ BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
	+
	+typedef struct vdev_indirect_mapping_entry {
	+ vdev_indirect_mapping_entry_phys_t vime_mapping;
	+ uint32_t vime_obsolete_count;
	+ list_node_t vime_node;
	+} vdev_indirect_mapping_entry_t;
	+
	+/*
	+ * This is stored in the bonus buffer of the mapping object, see comment of
	+ * vdev_indirect_config for more details.
	+ */
	+typedef struct vdev_indirect_mapping_phys {
	+ uint64_t vimp_max_offset;
	+ uint64_t vimp_bytes_mapped;
	+ uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */
	+
	+ /*
	+ * For each entry in the mapping object, this object contains an
	+ * entry representing the number of bytes of that mapping entry
	+ * that were no longer in use by the pool at the time this indirect
	+ * vdev was last condensed.
	+ */
	+ uint64_t vimp_counts_object;
	+} vdev_indirect_mapping_phys_t;
	+
	+#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
	+
	+typedef struct vdev_indirect_mapping {
	+ uint64_t vim_object;
	+ boolean_t vim_havecounts;
	+
	+ /*
	+ * An ordered array of all mapping entries, sorted by source offset.
	+ * Note that vim_entries is needed during a removal (and contains
	+ * mappings that have been synced to disk so far) to handle frees
	+ * from the removing device.
	+ */
	+ vdev_indirect_mapping_entry_phys_t *vim_entries;
	+
	+ objset_t *vim_objset;
	+
	+ dmu_buf_t *vim_dbuf;
	+ vdev_indirect_mapping_phys_t *vim_phys;
	+} vdev_indirect_mapping_t;
	+
	+extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os,
	+ uint64_t object);
	+extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim);
	+extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx);
	+extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj,
	+ dmu_tx_t *tx);
	+
	+extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim);
	+extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim);
	+extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim);
	+extern uint64_t vdev_indirect_mapping_bytes_mapped(
	+ vdev_indirect_mapping_t *vim);
	+extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim);
	+
	+/*
	+ * Writes the given list of vdev_indirect_mapping_entry_t to the mapping
	+ * then updates internal state.
	+ */
	+extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
	+ list_t *vime_list, dmu_tx_t *tx);
	+
	+extern vdev_indirect_mapping_entry_phys_t *
	+ vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
	+ uint64_t offset);
	+
	+extern vdev_indirect_mapping_entry_phys_t *
	+ vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
	+ uint64_t offset);
	+
	+extern uint32_t *vdev_indirect_mapping_load_obsolete_counts(
	+ vdev_indirect_mapping_t *vim);
	+extern void vdev_indirect_mapping_load_obsolete_spacemap(
	+ vdev_indirect_mapping_t *vim,
	+ uint32_t *counts, space_map_t *obsolete_space_sm);
	+extern void vdev_indirect_mapping_increment_obsolete_count(
	+ vdev_indirect_mapping_t *vim,
	+ uint64_t offset, uint64_t asize, uint32_t *counts);
	+extern void vdev_indirect_mapping_free_obsolete_counts(
	+ vdev_indirect_mapping_t *vim, uint32_t *counts);
	+
	+#ifdef __cplusplus
	+}
	+#endif
	+
	+#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h (revision 332525)
	@@ -0,0 +1,93 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
	+ */
	+
	+#ifndef _SYS_VDEV_REMOVAL_H
	+#define _SYS_VDEV_REMOVAL_H
	+
	+#include <sys/spa.h>
	+#include <sys/bpobj.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/vdev_indirect_births.h>
	+
	+#ifdef __cplusplus
	+extern "C" {
	+#endif
	+
	+typedef struct spa_vdev_removal {
	+ vdev_t *svr_vdev;
	+ uint64_t svr_max_offset_to_sync[TXG_SIZE];
	+ /* Thread performing a vdev removal. */
	+ kthread_t *svr_thread;
	+ /* Segments left to copy from the current metaslab. */
	+ range_tree_t *svr_allocd_segs;
	+ kmutex_t svr_lock;
	+ kcondvar_t svr_cv;
	+ boolean_t svr_thread_exit;
	+
	+ /*
	+ * New mappings to write out each txg.
	+ */
	+ list_t svr_new_segments[TXG_SIZE];
	+
	+ /*
	+ * Ranges that were freed while a mapping was in flight. This is
	+ * a subset of the ranges covered by vdev_im_new_segments.
	+ */
	+ range_tree_t *svr_frees[TXG_SIZE];
	+
	+ /*
	+ * Number of bytes which we have finished our work for
	+ * in each txg. This could be data copied (which will be part of
	+ * the mappings in vdev_im_new_segments), or data freed before
	+ * we got around to copying it.
	+ */
	+ uint64_t svr_bytes_done[TXG_SIZE];
	+
	+ /* List of leaf zap objects to be unlinked */
	+ nvlist_t *svr_zaplist;
	+} spa_vdev_removal_t;
	+
	+typedef struct spa_condensing_indirect {
	+ /*
	+ * New mappings to write out each txg.
	+ */
	+ list_t sci_new_mapping_entries[TXG_SIZE];
	+
	+ vdev_indirect_mapping_t *sci_new_mapping;
	+} spa_condensing_indirect_t;
	+
	+extern int spa_remove_init(spa_t *);
	+extern void spa_restart_removal(spa_t *);
	+extern int spa_condense_init(spa_t *);
	+extern void spa_condense_fini(spa_t *);
	+extern void spa_condense_indirect_restart(spa_t *);
	+extern void spa_vdev_condense_suspend(spa_t *);
	+extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
	+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
	+extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
	+extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
	+extern void spa_vdev_remove_suspend(spa_t *);
	+extern int spa_vdev_remove_cancel(spa_t *);
	+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
	+
	+#ifdef __cplusplus
	+}
	+#endif
	+
	+#endif /* _SYS_VDEV_REMOVAL_H */

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h (revision 332525)
	@@ -1,98 +1,99 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_ZFS_DEBUG_H
	#define _SYS_ZFS_DEBUG_H

	#ifdef __cplusplus
	extern "C" {
	#endif

	#ifndef TRUE
	#define TRUE 1
	#endif

	#ifndef FALSE
	#define FALSE 0
	#endif

	/*
	* ZFS debugging
	*/

	#if defined(DEBUG) || !defined(_KERNEL)
	#if !defined(ZFS_DEBUG)
	#define ZFS_DEBUG
	#endif
	#endif

	extern int zfs_flags;
	extern boolean_t zfs_recover;
	extern boolean_t zfs_free_leak_on_eio;

	#define ZFS_DEBUG_DPRINTF (1 << 0)
	#define ZFS_DEBUG_DBUF_VERIFY (1 << 1)
	#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
	#define ZFS_DEBUG_SNAPNAMES (1 << 3)
	#define ZFS_DEBUG_MODIFY (1 << 4)
	#define ZFS_DEBUG_SPA (1 << 5)
	#define ZFS_DEBUG_ZIO_FREE (1 << 6)
	#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
	#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
	+#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9)

	#ifdef ZFS_DEBUG
	extern void __dprintf(const char *file, const char *func,
	int line, const char *fmt, ...);
	#define dprintf(...) \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) \
	__dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
	#else
	#define dprintf(...) ((void)0)
	#endif /* ZFS_DEBUG */

	extern void zfs_panic_recover(const char *fmt, ...);

	typedef struct zfs_dbgmsg {
	list_node_t zdm_node;
	time_t zdm_timestamp;
	char zdm_msg[1]; /* variable length allocation */
	} zfs_dbgmsg_t;

	extern void zfs_dbgmsg_init(void);
	extern void zfs_dbgmsg_fini(void);
	extern void zfs_dbgmsg(const char *fmt, ...);
	extern void zfs_dbgmsg_print(const char *tag);

	#ifdef illumos
	#ifndef _KERNEL
	extern int dprintf_find_string(const char *string);
	#endif
	#endif /* illumos */

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_ZFS_DEBUG_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h (revision 332525)
	@@ -1,448 +1,448 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_ZIL_H
	#define _SYS_ZIL_H

	#include <sys/types.h>
	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct dsl_pool;
	struct dsl_dataset;
	struct lwb;

	/*
	* Intent log format:
	*
	* Each objset has its own intent log. The log header (zil_header_t)
	* for objset N's intent log is kept in the Nth object of the SPA's
	* intent_log objset. The log header points to a chain of log blocks,
	* each of which contains log records (i.e., transactions) followed by
	* a log block trailer (zil_trailer_t). The format of a log record
	* depends on the record (or transaction) type, but all records begin
	* with a common structure that defines the type, length, and txg.
	*/

	/*
	* Intent log header - this on disk structure holds fields to manage
	* the log. All fields are 64 bit to easily handle cross architectures.
	*/
	typedef struct zil_header {
	uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
	uint64_t zh_replay_seq; /* highest replayed sequence number */
	blkptr_t zh_log; /* log chain */
	uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
	uint64_t zh_flags; /* header flags */
	uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
	uint64_t zh_pad[3];
	} zil_header_t;

	/*
	* zh_flags bit settings
	*/
	#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
	#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */

	/*
	* Log block chaining.
	*
	* Log blocks are chained together. Originally they were chained at the
	* end of the block. For performance reasons the chain was moved to the
	* beginning of the block which allows writes for only the data being used.
	* The older position is supported for backwards compatability.
	*
	* The zio_eck_t contains a zec_cksum which for the intent log is
	* the sequence number of this log block. A seq of 0 is invalid.
	* The zec_cksum is checked by the SPA against the sequence
	* number passed in the blk_cksum field of the blkptr_t
	*/
	typedef struct zil_chain {
	uint64_t zc_pad;
	blkptr_t zc_next_blk; /* next block in chain */
	uint64_t zc_nused; /* bytes in log block used */
	zio_eck_t zc_eck; /* block trailer */
	} zil_chain_t;

	#define ZIL_MIN_BLKSZ 4096ULL

	/*
	* ziltest is by and large an ugly hack, but very useful in
	* checking replay without tedious work.
	* When running ziltest we want to keep all itx's and so maintain
	* a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
	* We subtract TXG_CONCURRENT_STATES to allow for common code.
	*/
	#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)

	/*
	* The words of a log block checksum.
	*/
	#define ZIL_ZC_GUID_0 0
	#define ZIL_ZC_GUID_1 1
	#define ZIL_ZC_OBJSET 2
	#define ZIL_ZC_SEQ 3

	typedef enum zil_create {
	Z_FILE,
	Z_DIR,
	Z_XATTRDIR,
	} zil_create_t;

	/*
	* size of xvattr log section.
	* its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
	* for create time and a single 64 bit integer for all of the attributes,
	* and 4 64 bit integers (32 bytes) for the scanstamp.
	*
	*/

	#define ZIL_XVAT_SIZE(mapsize) \
	sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
	(sizeof (uint64_t) * 7)

	/*
	* Size of ACL in log. The ACE data is padded out to properly align
	* on 8 byte boundary.
	*/

	#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))

	/*
	* Intent log transaction types and record structures
	*/
	#define TX_COMMIT 0 /* Commit marker (no on-disk state) */
	#define TX_CREATE 1 /* Create file */
	#define TX_MKDIR 2 /* Make directory */
	#define TX_MKXATTR 3 /* Make XATTR directory */
	#define TX_SYMLINK 4 /* Create symbolic link to a file */
	#define TX_REMOVE 5 /* Remove file */
	#define TX_RMDIR 6 /* Remove directory */
	#define TX_LINK 7 /* Create hard link to a file */
	#define TX_RENAME 8 /* Rename a file */
	#define TX_WRITE 9 /* File write */
	#define TX_TRUNCATE 10 /* Truncate a file */
	#define TX_SETATTR 11 /* Set file attributes */
	#define TX_ACL_V0 12 /* Set old formatted ACL */
	#define TX_ACL 13 /* Set ACL */
	#define TX_CREATE_ACL 14 /* create with ACL */
	#define TX_CREATE_ATTR 15 /* create + attrs */
	#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
	#define TX_MKDIR_ACL 17 /* mkdir with ACL */
	#define TX_MKDIR_ATTR 18 /* mkdir with attr */
	#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
	#define TX_WRITE2 20 /* dmu_sync EALREADY write */
	#define TX_MAX_TYPE 21 /* Max transaction type */

	/*
	* The transactions for mkdir, symlink, remove, rmdir, link, and rename
	* may have the following bit set, indicating the original request
	* specified case-insensitive handling of names.
	*/
	#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */

	/*
	* Transactions for write, truncate, setattr, acl_v0, and acl can be logged
	* out of order. For convenience in the code, all such records must have
	* lr_foid at the same offset.
	*/
	#define TX_OOO(txtype) \
	((txtype) == TX_WRITE || \
	(txtype) == TX_TRUNCATE || \
	(txtype) == TX_SETATTR || \
	(txtype) == TX_ACL_V0 || \
	(txtype) == TX_ACL || \
	(txtype) == TX_WRITE2)

	/*
	* Format of log records.
	* The fields are carefully defined to allow them to be aligned
	* and sized the same on sparc & intel architectures.
	* Each log record has a common structure at the beginning.
	*
	* The log record on disk (lrc_seq) holds the sequence number of all log
	* records which is used to ensure we don't replay the same record.
	*/
	typedef struct { /* common log record header */
	uint64_t lrc_txtype; /* intent log transaction type */
	uint64_t lrc_reclen; /* transaction record length */
	uint64_t lrc_txg; /* dmu transaction group number */
	uint64_t lrc_seq; /* see comment above */
	} lr_t;

	/*
	* Common start of all out-of-order record types (TX_OOO() above).
	*/
	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* object id */
	} lr_ooo_t;

	/*
	* Handle option extended vattr attributes.
	*
	* Whenever new attributes are added the version number
	* will need to be updated as will code in
	* zfs_log.c and zfs_replay.c
	*/
	typedef struct {
	uint32_t lr_attr_masksize; /* number of elements in array */
	uint32_t lr_attr_bitmap; /* First entry of array */
	/* remainder of array and any additional fields */
	} lr_attr_t;

	/*
	* log record for creates without optional ACL.
	* This log record does support optional xvattr_t attributes.
	*/
	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* object id of directory */
	uint64_t lr_foid; /* object id of created file object */
	uint64_t lr_mode; /* mode of object */
	uint64_t lr_uid; /* uid of object */
	uint64_t lr_gid; /* gid of object */
	uint64_t lr_gen; /* generation (txg of creation) */
	uint64_t lr_crtime[2]; /* creation time */
	uint64_t lr_rdev; /* rdev of object to create */
	/* name of object to create follows this */
	/* for symlinks, link content follows name */
	/* for creates with xvattr data, the name follows the xvattr info */
	} lr_create_t;

	/*
	* FUID ACL record will be an array of ACEs from the original ACL.
	* If this array includes ephemeral IDs, the record will also include
	* an array of log-specific FUIDs to replace the ephemeral IDs.
	* Only one copy of each unique domain will be present, so the log-specific
	* FUIDs will use an index into a compressed domain table. On replay this
	* information will be used to construct real FUIDs (and bypass idmap,
	* since it may not be available).
	*/

	/*
	* Log record for creates with optional ACL
	* This log record is also used for recording any FUID
	* information needed for replaying the create. If the
	* file doesn't have any actual ACEs then the lr_aclcnt
	* would be zero.
	*
	* After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
	* If create is also setting xvattr's, then acl data follows xvattr.
	* If ACE FUIDs are needed then they will follow the xvattr_t. Following
	* the FUIDs will be the domain table information. The FUIDs for the owner
	* and group will be in lr_create. Name follows ACL data.
	*/
	typedef struct {
	lr_create_t lr_create; /* common create portion */
	uint64_t lr_aclcnt; /* number of ACEs in ACL */
	uint64_t lr_domcnt; /* number of unique domains */
	uint64_t lr_fuidcnt; /* number of real fuids */
	uint64_t lr_acl_bytes; /* number of bytes in ACL */
	uint64_t lr_acl_flags; /* ACL flags */
	} lr_acl_create_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* obj id of directory */
	/* name of object to remove follows this */
	} lr_remove_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* obj id of directory */
	uint64_t lr_link_obj; /* obj id of link */
	/* name of object to link follows this */
	} lr_link_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_sdoid; /* obj id of source directory */
	uint64_t lr_tdoid; /* obj id of target directory */
	/* 2 strings: names of source and destination follow this */
	} lr_rename_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* file object to write */
	uint64_t lr_offset; /* offset to write to */
	uint64_t lr_length; /* user data length to write */
	uint64_t lr_blkoff; /* no longer used */
	blkptr_t lr_blkptr; /* spa block pointer for replay */
	/* write data will follow for small writes */
	} lr_write_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* object id of file to truncate */
	uint64_t lr_offset; /* offset to truncate from */
	uint64_t lr_length; /* length to truncate */
	} lr_truncate_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* file object to change attributes */
	uint64_t lr_mask; /* mask of attributes to set */
	uint64_t lr_mode; /* mode to set */
	uint64_t lr_uid; /* uid to set */
	uint64_t lr_gid; /* gid to set */
	uint64_t lr_size; /* size to set */
	uint64_t lr_atime[2]; /* access time */
	uint64_t lr_mtime[2]; /* modification time */
	/* optional attribute lr_attr_t may be here */
	} lr_setattr_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* obj id of file */
	uint64_t lr_aclcnt; /* number of acl entries */
	/* lr_aclcnt number of ace_t entries follow this */
	} lr_acl_v0_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* obj id of file */
	uint64_t lr_aclcnt; /* number of ACEs in ACL */
	uint64_t lr_domcnt; /* number of unique domains */
	uint64_t lr_fuidcnt; /* number of real fuids */
	uint64_t lr_acl_bytes; /* number of bytes in ACL */
	uint64_t lr_acl_flags; /* ACL flags */
	/* lr_acl_bytes number of variable sized ace's follows */
	} lr_acl_t;

	/*
	* ZIL structure definitions, interface function prototype and globals.
	*/

	/*
	* Writes are handled in three different ways:
	*
	* WR_INDIRECT:
	* In this mode, if we need to commit the write later, then the block
	* is immediately written into the file system (using dmu_sync),
	* and a pointer to the block is put into the log record.
	* When the txg commits the block is linked in.
	* This saves additionally writing the data into the log record.
	* There are a few requirements for this to occur:
	* - write is greater than zfs/zvol_immediate_write_sz
	* - not using slogs (as slogs are assumed to always be faster
	* than writing into the main pool)
	* - the write occupies only one block
	* WR_COPIED:
	* If we know we'll immediately be committing the
	* transaction (FSYNC or FDSYNC), the we allocate a larger
	* log record here for the data and copy the data in.
	* WR_NEED_COPY:
	* Otherwise we don't allocate a buffer, and if we need to
	* flush the write later then a buffer is allocated and
	* we retrieve the data using the dmu.
	*/
	typedef enum {
	WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
	/* and put blkptr in log, rather than actual data) */
	WR_COPIED, /* immediate - data is copied into lr_write_t */
	WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
	WR_NUM_STATES /* number of states */
	} itx_wr_state_t;

	typedef struct itx {
	list_node_t itx_node; /* linkage on zl_itx_list */
	void itx_private; / type-specific opaque data */
	itx_wr_state_t itx_wr_state; /* write state */
	uint8_t itx_sync; /* synchronous transaction */
	uint64_t itx_oid; /* object id */
	lr_t itx_lr; /* common part of log record */
	/* followed by type-specific part of lr_xx_t and its immediate data */
	} itx_t;

	typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
	uint64_t txg);
	typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
	uint64_t txg);
	typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
	typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
	struct lwb *lwb, zio_t *zio);

	extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
	zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);

	extern void zil_init(void);
	extern void zil_fini(void);

	extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
	extern void zil_free(zilog_t *zilog);

	extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
	extern void zil_close(zilog_t *zilog);

	extern void zil_replay(objset_t os, void arg,
	zil_replay_func_t *replay_func[TX_MAX_TYPE]);
	extern boolean_t zil_replaying(zilog_t zilog, dmu_tx_t tx);
	extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
	extern void zil_destroy_sync(zilog_t zilog, dmu_tx_t tx);
	extern void zil_rollback_destroy(zilog_t zilog, dmu_tx_t tx);

	extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
	extern void zil_itx_destroy(itx_t *itx);
	extern void zil_itx_assign(zilog_t zilog, itx_t itx, dmu_tx_t *tx);

	extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
	extern void zil_commit(zilog_t *zilog, uint64_t oid);
	extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);

	-extern int zil_vdev_offline(const char *osname, void *txarg);
	+extern int zil_reset(const char *osname, void *txarg);
	/* zil_claim() and zil_check_log_chain() share the (pool, dataset, arg) shape. */
	extern int zil_claim(struct dsl_pool *dp,
	    struct dsl_dataset *ds, void *txarg);
	extern int zil_check_log_chain(struct dsl_pool *dp,
	    struct dsl_dataset *ds, void *tx);
	extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
	extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);

	/* zil_suspend() hands back a cookie through *cookiep; zil_resume() takes it. */
	extern int zil_suspend(const char *osname, void **cookiep);
	extern void zil_resume(void *cookie);

	extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
	extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
	extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);

	extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);

	extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);

	extern int zil_replay_disable;

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_ZIL_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 332525)
	@@ -1,663 +1,663 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright 2016 Toomas Soome <tsoome@me.com>
	*/

	#ifndef _ZIO_H
	#define _ZIO_H

	#include <sys/zio_priority.h>
	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/avl.h>
	#include <sys/kstat.h>
	#include <sys/fs/zfs.h>
	#include <sys/zio_impl.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Embedded checksum
	*/
	#define ZEC_MAGIC 0x210da7ab10c7a11ULL

	typedef struct zio_eck {
	uint64_t zec_magic; /* for validation, endianness */
	zio_cksum_t zec_cksum; /* 256-bit checksum */
	} zio_eck_t;

	/*
	* Gang block headers are self-checksumming and contain an array
	* of block pointers.
	*/
	#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
	#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
	sizeof (zio_eck_t)) / sizeof (blkptr_t))
	#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
	sizeof (zio_eck_t) - \
	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
	sizeof (uint64_t))

	typedef struct zio_gbh {
	blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
	uint64_t zg_filler[SPA_GBH_FILLER];
	zio_eck_t zg_tail;
	} zio_gbh_phys_t;

	enum zio_checksum {
	ZIO_CHECKSUM_INHERIT = 0,
	ZIO_CHECKSUM_ON,
	ZIO_CHECKSUM_OFF,
	ZIO_CHECKSUM_LABEL,
	ZIO_CHECKSUM_GANG_HEADER,
	ZIO_CHECKSUM_ZILOG,
	ZIO_CHECKSUM_FLETCHER_2,
	ZIO_CHECKSUM_FLETCHER_4,
	ZIO_CHECKSUM_SHA256,
	ZIO_CHECKSUM_ZILOG2,
	ZIO_CHECKSUM_NOPARITY,
	ZIO_CHECKSUM_SHA512,
	ZIO_CHECKSUM_SKEIN,
	#ifdef illumos
	ZIO_CHECKSUM_EDONR,
	#endif
	ZIO_CHECKSUM_FUNCTIONS
	};

	/*
	* The number of "legacy" checksum functions which can be set on individual
	* objects.
	*/
	#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2

	#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
	#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON

	#define ZIO_CHECKSUM_MASK 0xffULL
	#define ZIO_CHECKSUM_VERIFY (1 << 8)

	#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
	#define ZIO_DEDUPDITTO_MIN 100

	/*
	* The number of "legacy" compression functions which can be set on individual
	* objects.
	*/
	#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4

	/*
	* The meaning of "compress = on" selected by the compression features enabled
	* on a given pool.
	*/
	#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
	#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4

	#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF

	/*
	 * Evaluates to non-zero when compress is one of ZIO_COMPRESS_LZJB,
	 * ZIO_COMPRESS_LZ4, ZIO_COMPRESS_ON or ZIO_COMPRESS_OFF (going by the
	 * macro name, the settings accepted for a boot filesystem).
	 * The argument is expanded up to four times, so it must not have side
	 * effects.
	 */
	#define	BOOTFS_COMPRESS_VALID(compress)			\
		((compress) == ZIO_COMPRESS_LZJB ||		\
		(compress) == ZIO_COMPRESS_LZ4 ||		\
		(compress) == ZIO_COMPRESS_ON ||		\
		(compress) == ZIO_COMPRESS_OFF)

	#define ZIO_FAILURE_MODE_WAIT 0
	#define ZIO_FAILURE_MODE_CONTINUE 1
	#define ZIO_FAILURE_MODE_PANIC 2

	/*
	* I/O flags, one bit each, grouped by which kinds of child zio inherit
	* them. Each ZIO_FLAG_*_INHERIT mask is defined as (first flag of the
	* following group) - 1, i.e. every bit below that flag. That is why the
	* flags marked "must be first for INHERIT" have to stay the lowest bit
	* of their group. ZIO_FLAG_DDT_INHERIT and ZIO_FLAG_GANG_INHERIT are
	* defined identically.
	*/
	enum zio_flag {
	/*
	* Flags inherited by gang, ddt, and vdev children,
	* and that must be equal for two zios to aggregate
	*/
	ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
	ZIO_FLAG_IO_REPAIR = 1 << 1,
	ZIO_FLAG_SELF_HEAL = 1 << 2,
	ZIO_FLAG_RESILVER = 1 << 3,
	ZIO_FLAG_SCRUB = 1 << 4,
	ZIO_FLAG_SCAN_THREAD = 1 << 5,
	ZIO_FLAG_PHYSICAL = 1 << 6,

	#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

	/*
	* Flags inherited by ddt, gang, and vdev children.
	*/
	ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
	ZIO_FLAG_SPECULATIVE = 1 << 8,
	ZIO_FLAG_CONFIG_WRITER = 1 << 9,
	ZIO_FLAG_DONT_RETRY = 1 << 10,
	ZIO_FLAG_DONT_CACHE = 1 << 11,
	ZIO_FLAG_NODATA = 1 << 12,
	ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
	ZIO_FLAG_IO_ALLOCATING = 1 << 14,

	#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
	#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)

	/*
	* Flags inherited by vdev children.
	*/
	ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
	ZIO_FLAG_PROBE = 1 << 16,
	ZIO_FLAG_TRYHARD = 1 << 17,
	ZIO_FLAG_OPTIONAL = 1 << 18,

	#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)

	/*
	* Flags not inherited by any children.
	*/
	ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
	ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
	ZIO_FLAG_IO_BYPASS = 1 << 21,
	ZIO_FLAG_IO_REWRITE = 1 << 22,
	ZIO_FLAG_RAW = 1 << 23,
	ZIO_FLAG_GANG_CHILD = 1 << 24,
	ZIO_FLAG_DDT_CHILD = 1 << 25,
	ZIO_FLAG_GODFATHER = 1 << 26,
	ZIO_FLAG_NOPWRITE = 1 << 27,
	ZIO_FLAG_REEXECUTED = 1 << 28,
	ZIO_FLAG_DELEGATED = 1 << 29,
	};

	#define ZIO_FLAG_MUSTSUCCEED 0

	/*
	 * Flags for a DDT child of zio: the parent's io_flags restricted to
	 * ZIO_FLAG_DDT_INHERIT, plus ZIO_FLAG_DDT_CHILD and ZIO_FLAG_CANFAIL.
	 */
	#define	ZIO_DDT_CHILD_FLAGS(zio)				\
		(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
		ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)

	/*
	 * Flags for a gang child of zio: the parent's io_flags restricted to
	 * ZIO_FLAG_GANG_INHERIT, plus ZIO_FLAG_GANG_CHILD and ZIO_FLAG_CANFAIL.
	 */
	#define	ZIO_GANG_CHILD_FLAGS(zio)				\
		(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
		ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)

	/*
	* Flags for a vdev child of zio: the parent's io_flags restricted to
	* ZIO_FLAG_VDEV_INHERIT plus the flags OR-ed in below. The removed/added
	* pair is the change in this revision: ZIO_FLAG_DONT_PROPAGATE is now
	* set in addition to ZIO_FLAG_CANFAIL.
	*/
	#define ZIO_VDEV_CHILD_FLAGS(zio) \
	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
	- ZIO_FLAG_CANFAIL)
	+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)

	/*
	 * Child types and their bit masks. ZIO_CHILD_BIT(x) is the mask bit for
	 * child type x and ZIO_CHILD_BIT_IS_SET(val, x) tests that bit in val.
	 * ZIO_CHILD_TYPES is the number of child types, not a type itself.
	 */
	#define	ZIO_CHILD_BIT(x)		(1 << (x))
	#define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1 << (x)))

	enum zio_child {
		ZIO_CHILD_VDEV = 0,
		ZIO_CHILD_GANG,
		ZIO_CHILD_DDT,
		ZIO_CHILD_LOGICAL,
		ZIO_CHILD_TYPES
	};

	#define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
	#define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
	#define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
	#define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
	/* Union of the four per-type bits above. */
	#define	ZIO_CHILD_ALL_BITS					\
		(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT |		\
		ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)

	enum zio_wait_type {
	ZIO_WAIT_READY = 0,
	ZIO_WAIT_DONE,
	ZIO_WAIT_TYPES
	};

	/*
	* We'll take the numbers 122 and 123 to indicate checksum errors and
	* fragmentation. They don't collide with any errno values, as they
	* are greater than ELAST.
	*/
	#define ECKSUM 122
	#define EFRAGS 123

	typedef void zio_done_func_t(zio_t *zio);

	extern boolean_t zio_dva_throttle_enabled;
	extern const char *zio_type_name[ZIO_TYPES];

	/*
	* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
	* identifies any block in the pool. By convention, the meta-objset (MOS)
	* is objset 0, and the meta-dnode is object 0. This covers all blocks
	* except root blocks and ZIL blocks, which are defined as follows:
	*
	* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
	* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
	* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
	* dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
	*
	* Note: this structure is called a bookmark because its original purpose
	* was to remember where to resume a pool-wide traverse.
	*
	* Note: this structure is passed between userland and the kernel, and is
	* stored on disk (by virtue of being incorporated into other on-disk
	* structures, e.g. dsl_scan_phys_t).
	*/
	typedef struct zbookmark_phys {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t zb_level;
	uint64_t zb_blkid;
	} zbookmark_phys_t;

	/*
	 * Fill in all four members of the bookmark that zb points to.
	 *
	 * The body is wrapped in do { } while (0) so that the macro, followed
	 * by the caller's semicolon, forms exactly one statement and can be
	 * used as the unbraced body of an if/else. Value arguments are
	 * parenthesized; zb is expanded four times and must not have side
	 * effects.
	 */
	#define	SET_BOOKMARK(zb, objset, object, level, blkid)		\
	do {								\
		(zb)->zb_objset = (objset);				\
		(zb)->zb_object = (object);				\
		(zb)->zb_level = (level);				\
		(zb)->zb_blkid = (blkid);				\
	} while (0)

	#define ZB_DESTROYED_OBJSET (-1ULL)

	#define ZB_ROOT_OBJECT (0ULL)
	#define ZB_ROOT_LEVEL (-1LL)
	#define ZB_ROOT_BLKID (0ULL)

	#define ZB_ZIL_OBJECT (0ULL)
	#define ZB_ZIL_LEVEL (-2LL)

	#define ZB_DNODE_LEVEL (-3LL)
	#define ZB_DNODE_BLKID (0ULL)

	#define ZB_IS_ZERO(zb) \
	((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
	#define ZB_IS_ROOT(zb) \
	((zb)->zb_object == ZB_ROOT_OBJECT && \
	(zb)->zb_level == ZB_ROOT_LEVEL && \
	(zb)->zb_blkid == ZB_ROOT_BLKID)

	typedef struct zio_prop {
	enum zio_checksum zp_checksum;
	enum zio_compress zp_compress;
	dmu_object_type_t zp_type;
	uint8_t zp_level;
	uint8_t zp_copies;
	boolean_t zp_dedup;
	boolean_t zp_dedup_verify;
	boolean_t zp_nopwrite;
	} zio_prop_t;

	typedef struct zio_cksum_report zio_cksum_report_t;

	typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
	const void *good_data);
	typedef void zio_cksum_free_f(void *cbdata, size_t size);

	struct zio_bad_cksum; /* defined in zio_checksum.h */
	struct dnode_phys;
	struct abd;

	/*
	 * A checksum error report. zcr_next points to another report;
	 * zcr_finish and zcr_free are callbacks of type zio_cksum_finish_f and
	 * zio_cksum_free_f; zcr_cbinfo is the value handed to zcr_free().
	 * zcr_ckinfo is held by pointer: struct zio_bad_cksum is only
	 * forward-declared in this header.
	 */
	struct zio_cksum_report {
		struct zio_cksum_report *zcr_next;
		nvlist_t		*zcr_ereport;
		nvlist_t		*zcr_detector;
		void			*zcr_cbdata;
		size_t			zcr_cbinfo;	/* passed to zcr_free() */
		uint64_t		zcr_align;
		uint64_t		zcr_length;
		zio_cksum_finish_f	*zcr_finish;
		zio_cksum_free_f	*zcr_free;

		/* internal use only */
		struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
	};

	/*
	 * Signature of the vsd_cksum_report callback stored in zio_vsd_ops_t:
	 * receives the zio, the checksum report being built, and an opaque
	 * argument.
	 */
	typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
	    void *arg);

	zio_vsd_cksum_report_f zio_vsd_default_cksum_report;

	typedef struct zio_vsd_ops {
	zio_done_func_t *vsd_free;
	zio_vsd_cksum_report_f *vsd_cksum_report;
	} zio_vsd_ops_t;

	typedef struct zio_gang_node {
	zio_gbh_phys_t *gn_gbh;
	struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
	} zio_gang_node_t;

	/*
	 * Gang issue callback: takes a zio, a block pointer, a gang tree node,
	 * a data buffer and an offset, and returns a zio pointer.
	 */
	typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
	    zio_gang_node_t *gn, struct abd *data, uint64_t offset);

	/*
	 * Transform callback; zio_transform_t stores a pointer to one in
	 * zt_transform.
	 */
	typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);

	typedef struct zio_transform {
	struct abd *zt_orig_abd;
	uint64_t zt_orig_size;
	uint64_t zt_bufsize;
	zio_transform_func_t *zt_transform;
	struct zio_transform *zt_next;
	} zio_transform_t;

	typedef int zio_pipe_stage_t(zio_t *zio);

	/*
	* The io_reexecute flags are distinct from io_flags because the child must
	* be able to propagate them to the parent. The normal io_flags are local
	* to the zio, not protected by any lock, and not modifiable by children;
	* the reexecute flags are protected by io_lock, modifiable by children,
	* and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
	*/
	#define ZIO_REEXECUTE_NOW 0x01
	#define ZIO_REEXECUTE_SUSPEND 0x02

	typedef struct zio_alloc_list {
	list_t zal_list;
	uint64_t zal_size;
	} zio_alloc_list_t;

	typedef struct zio_link {
	zio_t *zl_parent;
	zio_t *zl_child;
	list_node_t zl_parent_node;
	list_node_t zl_child_node;
	} zio_link_t;

	/*
	* Used for TRIM kstat.
	*/
	typedef struct zio_trim_stats {
	/*
	* Number of bytes successfully TRIMmed.
	*/
	kstat_named_t bytes;

	/*
	* Number of successful TRIM requests.
	*/
	kstat_named_t success;

	/*
	* Number of TRIM requests that failed because TRIM is not
	* supported.
	*/
	kstat_named_t unsupported;

	/*
	* Number of TRIM requests that failed for other reasons.
	*/
	kstat_named_t failed;
	} zio_trim_stats_t;

	extern zio_trim_stats_t zio_trim_stats;

	/*
	 * Add val to (or, for _BUMP, add 1 to) the named counter in
	 * zio_trim_stats using atomic_add_64().
	 *
	 * The expansions deliberately carry no trailing semicolon: the caller
	 * writes it, so each use is a single statement and stays valid as the
	 * unbraced body of an if/else.
	 */
	#define	ZIO_TRIM_STAT_INCR(stat, val)				\
		atomic_add_64(&zio_trim_stats.stat.value.ui64, (val))
	#define	ZIO_TRIM_STAT_BUMP(stat)				\
		ZIO_TRIM_STAT_INCR(stat, 1)

	/*
	* Per-I/O state. Members are grouped by the section comments below.
	* Some arrays are sized by enum terminators declared earlier in this
	* header: io_state[] by ZIO_WAIT_TYPES, io_child_error[] by
	* ZIO_CHILD_TYPES, and io_children[][] by ZIO_CHILD_TYPES then
	* ZIO_WAIT_TYPES. io_bio exists only when __FreeBSD__ is defined.
	*/
	struct zio {
	/* Core information about this I/O */
	zbookmark_phys_t io_bookmark;
	zio_prop_t io_prop;
	zio_type_t io_type;
	enum zio_child io_child_type;
	int io_cmd;
	zio_priority_t io_priority;
	uint8_t io_reexecute;
	uint8_t io_state[ZIO_WAIT_TYPES];
	uint64_t io_txg;
	spa_t *io_spa;
	blkptr_t *io_bp;
	blkptr_t *io_bp_override;
	blkptr_t io_bp_copy;
	list_t io_parent_list;
	list_t io_child_list;
	zio_t *io_logical;
	zio_transform_t *io_transform_stack;

	/* Callback info */
	zio_done_func_t *io_ready;
	zio_done_func_t *io_children_ready;
	zio_done_func_t *io_physdone;
	zio_done_func_t *io_done;
	void *io_private;
	int64_t io_prev_space_delta; /* DMU private */
	blkptr_t io_bp_orig;

	/* Data represented by this I/O */
	struct abd *io_abd;
	struct abd *io_orig_abd;
	uint64_t io_size;
	uint64_t io_orig_size;
	/* io_lsize != io_orig_size iff this is a raw write */
	uint64_t io_lsize;

	/* Stuff for the vdev stack */
	vdev_t *io_vd;
	void *io_vsd;
	const zio_vsd_ops_t *io_vsd_ops;

	uint64_t io_offset;
	hrtime_t io_timestamp;
	hrtime_t io_queued_timestamp;
	hrtime_t io_target_timestamp;
	avl_node_t io_queue_node;
	avl_node_t io_offset_node;
	avl_node_t io_alloc_node;
	zio_alloc_list_t io_alloc_list;

	#ifdef __FreeBSD__
	struct bio *io_bio;
	#endif

	/* Internal pipeline state */
	enum zio_flag io_flags;
	enum zio_stage io_stage;
	enum zio_stage io_pipeline;
	enum zio_flag io_orig_flags;
	enum zio_stage io_orig_stage;
	enum zio_stage io_orig_pipeline;
	enum zio_stage io_pipeline_trace;
	int io_error;
	int io_child_error[ZIO_CHILD_TYPES];
	uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
	uint64_t io_child_count;
	uint64_t io_phys_children;
	uint64_t io_parent_count;
	uint64_t *io_stall;
	zio_t *io_gang_leader;
	zio_gang_node_t *io_gang_tree;
	void *io_executor;
	void *io_waiter;
	kmutex_t io_lock;
	kcondvar_t io_cv;

	/* FMA state */
	zio_cksum_report_t *io_cksum_report;
	uint64_t io_ena;

	/* Taskq dispatching state */
	taskq_ent_t io_tqent;

	/* Linkage used by the TRIM code (going by the member names). */
	avl_node_t io_trim_node;
	list_node_t io_trim_link;
	};

	/*
	 * zio constructors and related helpers. The constructors return a
	 * zio_t pointer; most take the parent zio (pio), a completion
	 * callback (done) and an opaque argument for it (priv).
	 */
	extern int zio_bookmark_compare(const void *, const void *);

	extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
	    zio_done_func_t *done, void *priv, enum zio_flag flags);

	extern zio_t *zio_root(spa_t *spa,
	    zio_done_func_t *done, void *priv, enum zio_flag flags);

	extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
	    struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
	    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);

	extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
	    struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
	    zio_done_func_t *ready, zio_done_func_t *children_ready,
	    zio_done_func_t *physdone, zio_done_func_t *done,
	    void *priv, zio_priority_t priority, enum zio_flag flags,
	    const zbookmark_phys_t *zb);

	extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
	    struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
	    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);

	extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
	    boolean_t nopwrite);

	extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

	extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
	    const blkptr_t *bp,
	    zio_done_func_t *done, void *priv, enum zio_flag flags);

	extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
	    uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
	    zio_priority_t priority, enum zio_flag flags);

	extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
	    uint64_t size, struct abd *data, int checksum,
	    zio_done_func_t *done, void *priv, zio_priority_t priority,
	    enum zio_flag flags, boolean_t labels);

	extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
	    uint64_t size, struct abd *data, int checksum,
	    zio_done_func_t *done, void *priv, zio_priority_t priority,
	    enum zio_flag flags, boolean_t labels);

	extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
	    const blkptr_t *bp, uint64_t size, enum zio_flag flags);

	extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
	    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
	extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
	extern void zio_flush(zio_t *zio, vdev_t *vd);
	extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
	    uint64_t size);
	extern void zio_shrink(zio_t *zio, uint64_t size);

	extern int zio_wait(zio_t *zio);
	extern void zio_nowait(zio_t *zio);
	extern void zio_execute(zio_t *zio);
	extern void zio_interrupt(zio_t *zio);
	extern void zio_delay_init(zio_t *zio);
	extern void zio_delay_interrupt(zio_t *zio);

	/*
	 * Parent/child relationship helpers. The walk functions take a
	 * zio_link_t ** in addition to the zio being walked.
	 */
	extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
	extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
	extern zio_t *zio_unique_parent(zio_t *cio);
	extern void zio_add_child(zio_t *pio, zio_t *cio);

	extern void *zio_buf_alloc(size_t size);
	extern void zio_buf_free(void *buf, size_t size);
	extern void *zio_data_buf_alloc(size_t size);
	extern void zio_data_buf_free(void *buf, size_t size);

	/* Push one transform onto a zio, or pop its transforms. */
	extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
	    uint64_t bufsize, zio_transform_func_t *transform);
	extern void zio_pop_transforms(zio_t *zio);

	extern void zio_resubmit_stage_async(void *);

	/*
	 * vdev-level zio constructors. Both return a zio_t pointer and take a
	 * completion callback (done) with its opaque argument (priv).
	 */
	extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
	    uint64_t offset, struct abd *data, uint64_t size, int type,
	    zio_priority_t priority, enum zio_flag flags,
	    zio_done_func_t *done, void *priv);

	extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
	    struct abd *data, uint64_t size, int type, zio_priority_t priority,
	    enum zio_flag flags, zio_done_func_t *done, void *priv);

	extern void zio_vdev_io_bypass(zio_t *zio);
	extern void zio_vdev_io_reissue(zio_t *zio);
	extern void zio_vdev_io_redone(zio_t *zio);

	extern void zio_checksum_verified(zio_t *zio);
	extern int zio_worst_error(int e1, int e2);

	extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
	enum zio_checksum parent);
	extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
	enum zio_checksum child, enum zio_checksum parent);
	extern enum zio_compress zio_compress_select(spa_t *spa,
	enum zio_compress child, enum zio_compress parent);

	/* Pool-wide I/O suspend and resume. */
	extern void zio_suspend(spa_t *spa, zio_t *zio);
	extern int zio_resume(spa_t *spa);
	extern void zio_resume_wait(spa_t *spa);

	/*
	* Initial setup and teardown.
	*/
	extern void zio_init(void);
	extern void zio_fini(void);

	/*
	* Fault injection
	*/
	struct zinject_record;
	extern uint32_t zio_injection_enabled;
	extern int zio_inject_fault(char name, int flags, int id,
	struct zinject_record *record);
	extern int zio_inject_list_next(int id, char name, size_t buflen,
	struct zinject_record *record);
	extern int zio_clear_fault(int id);
	extern void zio_handle_panic_injection(spa_t spa, char tag, uint64_t type);
	extern int zio_handle_fault_injection(zio_t *zio, int error);
	extern int zio_handle_device_injection(vdev_t vd, zio_t zio, int error);
	extern int zio_handle_label_injection(zio_t *zio, int error);
	extern void zio_handle_ignored_writes(zio_t *zio);
	extern hrtime_t zio_handle_io_delay(zio_t *zio);

	/*
	 * Checksum ereport functions. A report is handled through a
	 * zio_cksum_report_t pointer by the finish/send_interim/free functions.
	 */
	extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
	    uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
	extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
	    const void *good_data, const void *bad_data, boolean_t drop_if_identical);

	extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
	extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);

	/* If we have the good data in hand, this function can be used */
	extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
	    struct zio *zio, uint64_t offset, uint64_t length,
	    const void *good_data, const void *bad_data, struct zio_bad_cksum *info);

	/* Called from spa_sync(), but primarily an injection handler */
	extern void spa_handle_ignored_writes(spa_t *spa);

	/* zbookmark_phys functions; bookmarks are passed by const pointer. */
	boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
	    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
	int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
	    uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _ZIO_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h (revision 332525)
	@@ -1,41 +1,42 @@
	/*
	* CDDL HEADER START
	*
	* This file and its contents are supplied under the terms of the
	* Common Development and Distribution License ("CDDL"), version 1.0.
	* You may only use this file in accordance with the terms of version
	* 1.0 of the CDDL.
	*
	* A full copy of the text of the CDDL should have accompanied this
	* source. A copy of the CDDL is also available via the Internet at
	* http://www.illumos.org/license/CDDL.
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2014 by Delphix. All rights reserved.
	*/
	#ifndef _ZIO_PRIORITY_H
	#define _ZIO_PRIORITY_H

	#ifdef __cplusplus
	extern "C" {
	#endif

	typedef enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
	ZIO_PRIORITY_ASYNC_READ, /* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
	ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
	+ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
	ZIO_PRIORITY_NUM_QUEUEABLE,

	ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
	} zio_priority_t;

	#ifdef __cplusplus
	}
	#endif

	#endif /* _ZIO_PRIORITY_H */
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 332525)
	@@ -1,901 +1,903 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/txg_impl.h>
	#include <sys/dmu_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_scan.h>
	#include <sys/zil.h>
	#include <sys/callb.h>

	/*
	* ZFS Transaction Groups
	* ----------------------
	*
	* ZFS transaction groups are, as the name implies, groups of transactions
	* that act on persistent state. ZFS asserts consistency at the granularity of
	* these transaction groups. Each successive transaction group (txg) is
	* assigned a 64-bit consecutive identifier. There are three active
	* transaction group states: open, quiescing, or syncing. At any given time,
	* there may be an active txg associated with each state; each active txg may
	* either be processing, or blocked waiting to enter the next state. There may
	* be up to three active txgs, and there is always a txg in the open state
	* (though it may be blocked waiting to enter the quiescing state). In broad
	* strokes, transactions -- operations that change in-memory structures -- are
	* accepted into the txg in the open state, and are completed while the txg is
	* in the open or quiescing states. The accumulated changes are written to
	* disk in the syncing state.
	*
	* Open
	*
	* When a new txg becomes active, it first enters the open state. New
	* transactions -- updates to in-memory structures -- are assigned to the
	* currently open txg. There is always a txg in the open state so that ZFS can
	* accept new changes (though the txg may refuse new changes if it has hit
	* some limit). ZFS advances the open txg to the next state for a variety of
	* reasons such as it hitting a time or size threshold, or the execution of an
	* administrative action that must be completed in the syncing state.
	*
	* Quiescing
	*
	* After a txg exits the open state, it enters the quiescing state. The
	* quiescing state is intended to provide a buffer between accepting new
	* transactions in the open state and writing them out to stable storage in
	* the syncing state. While quiescing, transactions can continue their
	* operation without delaying either of the other states. Typically, a txg is
	* in the quiescing state very briefly since the operations are bounded by
	* software latencies rather than, say, slower I/O latencies. After all
	* transactions complete, the txg is ready to enter the next state.
	*
	* Syncing
	*
	* In the syncing state, the in-memory state built up during the open and (to
	* a lesser degree) the quiescing states is written to stable storage. The
	* process of writing out modified data can, in turn modify more data. For
	* example when we write new blocks, we need to allocate space for them; those
	* allocations modify metadata (space maps)... which themselves must be
	* written to stable storage. During the sync state, ZFS iterates, writing out
	* data until it converges and all in-memory changes have been written out.
	* The first such pass is the largest as it encompasses all the modified user
	* data (as opposed to filesystem metadata). Subsequent passes typically have
	* far less data to write as they consist exclusively of filesystem metadata.
	*
	* To ensure convergence, after a certain number of passes ZFS begins
	* overwriting locations on stable storage that had been allocated earlier in
	* the syncing state (and subsequently freed). ZFS usually allocates new
	* blocks to optimize for large, continuous, writes. For the syncing state to
	* converge however it must complete a pass where no new blocks are allocated
	* since each allocation requires a modification of persistent metadata.
	* Further, to hasten convergence, after a prescribed number of passes, ZFS
	* also defers frees, and stops compressing.
	*
	* In addition to writing out user data, we must also execute synctasks during
	* the syncing context. A synctask is the mechanism by which some
	* administrative activities work such as creating and destroying snapshots or
	* datasets. Note that when a synctask is initiated it enters the open txg,
	* and ZFS then pushes that txg as quickly as possible to completion of the
	* syncing state in order to reduce the latency of the administrative
	* activity. To complete the syncing state, ZFS writes out a new uberblock,
	* the root of the tree of blocks that comprise all state stored on the ZFS
	* pool. Finally, if there is a quiesced txg waiting, we signal that it can
	* now transition to the syncing state.
	*/

	static void txg_sync_thread(void *arg);
	static void txg_quiesce_thread(void *arg);

	int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */

	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
	SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
	"Maximum seconds worth of delta per txg");

	/*
	* Prepare the txg subsystem.
	*
	* Zeroes dp->dp_tx, allocates an array of max_ncpus tx_cpu_t entries and
	* initializes each entry's locks, condition variables and callback
	* lists, initializes tx_sync_lock and the sync/quiesce/exit condition
	* variables, and records txg as the open txg. No threads are created
	* here; txg_sync_start() does that.
	*/
	void
	txg_init(dsl_pool_t *dp, uint64_t txg)
	{
	tx_state_t *tx = &dp->dp_tx;
	int c;
	bzero(tx, sizeof (tx_state_t));

	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

	for (c = 0; c < max_ncpus; c++) {
	int i;

	mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
	NULL);
	/* One cv and one callback list per txg slot (TXG_SIZE of them). */
	for (i = 0; i < TXG_SIZE; i++) {
	cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
	NULL);
	list_create(&tx->tx_cpu[c].tc_callbacks[i],
	sizeof (dmu_tx_callback_t),
	offsetof(dmu_tx_callback_t, dcb_node));
	}
	}

	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

	tx->tx_open_txg = txg;
	}

	/*
	 * Tear down the txg state of a pool: destroy the sync lock and the
	 * condition variables, destroy every per-CPU lock, cv and callback
	 * list, destroy the commit-callback taskq if one exists, free the
	 * per-CPU array and zero the whole tx_state_t. Asserts that no txg
	 * threads remain.
	 */
	void
	txg_fini(dsl_pool_t *dp)
	{
		tx_state_t *tx = &dp->dp_tx;
		int cpu, slot;

		ASSERT0(tx->tx_threads);

		mutex_destroy(&tx->tx_sync_lock);

		cv_destroy(&tx->tx_sync_more_cv);
		cv_destroy(&tx->tx_sync_done_cv);
		cv_destroy(&tx->tx_quiesce_more_cv);
		cv_destroy(&tx->tx_quiesce_done_cv);
		cv_destroy(&tx->tx_exit_cv);

		for (cpu = 0; cpu < max_ncpus; cpu++) {
			tx_cpu_t *tc = &tx->tx_cpu[cpu];

			mutex_destroy(&tc->tc_open_lock);
			mutex_destroy(&tc->tc_lock);
			for (slot = 0; slot < TXG_SIZE; slot++) {
				cv_destroy(&tc->tc_cv[slot]);
				list_destroy(&tc->tc_callbacks[slot]);
			}
		}

		/* The taskq pointer may still be NULL. */
		if (tx->tx_commit_cb_taskq != NULL)
			taskq_destroy(tx->tx_commit_cb_taskq);

		kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));

		bzero(tx, sizeof (*tx));
	}

	/*
	* Start syncing transaction groups.
	*
	* Creates the quiesce thread and the sync thread for this pool while
	* holding tx_sync_lock. Asserts that no txg threads exist on entry.
	*/
	void
	txg_sync_start(dsl_pool_t *dp)
	{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT0(tx->tx_threads);

	/* Two threads are created below; txg_thread_exit() decrements this. */
	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	* The sync thread can need a larger-than-default stack size on
	* 32-bit x86. This is due in part to nested pools and
	* scrub_visitbp() recursion.
	*/
	/* 32<<10 is 32 KiB; the quiesce thread above passes 0 instead. */
	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
	dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
	}

	/*
	 * Thread-entry helper: initializes cpr with CALLB_CPR_INIT, tying it to
	 * tx_sync_lock, then acquires tx_sync_lock. Returns with tx_sync_lock
	 * held.
	 */
	static void
	txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
	{
		CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
		mutex_enter(&tx->tx_sync_lock);
	}

	/*
	 * Common exit sequence for the txg threads.
	 *
	 * Clears the caller's thread pointer (*tpp), decrements tx_threads,
	 * wakes anyone waiting on tx_exit_cv, and terminates the calling thread
	 * via thread_exit().
	 */
	static void
	txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
	{
		ASSERT(*tpp != NULL);
		*tpp = NULL;
		tx->tx_threads--;
		cv_broadcast(&tx->tx_exit_cv);
		CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
		thread_exit();
	}

	/*
	 * Wait on cv with tx_sync_lock, bracketed by the CPR safe-begin/safe-end
	 * markers.  A non-zero 'time' selects a timed wait; zero waits without
	 * a timeout.
	 */
	static void
	txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
	{
		CALLB_CPR_SAFE_BEGIN(cpr);

		if (time)
			(void) cv_timedwait(cv, &tx->tx_sync_lock, time);
		else
			cv_wait(cv, &tx->tx_sync_lock);

		CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
	}

	/*
	* Stop syncing transaction groups.
	*
	* Waits for txg tx_open_txg + TXG_DEFER_SIZE to be synced, then sets
	* tx_exiting, wakes the txg threads and blocks on tx_exit_cv until
	* tx_threads drops to zero. tx_exiting is cleared again before returning.
	*/
	void
	txg_sync_stop(dsl_pool_t *dp)
	{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);
	/*
	* Finish off any work in progress.
	*/
	ASSERT3U(tx->tx_threads, ==, 2);

	/*
	* We need to ensure that we've vacated the deferred space_maps.
	*/
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	* Wake all sync threads and wait for them to die.
	*/
	mutex_enter(&tx->tx_sync_lock);

	ASSERT3U(tx->tx_threads, ==, 2);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	/* Each exiting thread decrements tx_threads and signals tx_exit_cv. */
	while (tx->tx_threads != 0)
	cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
	}

	/*
	 * Take a hold on the currently open txg on behalf of the calling CPU.
	 *
	 * Increments the per-CPU hold count for the open txg, records the
	 * per-CPU state and the txg number in *th, and returns the txg number.
	 *
	 * Note: tc_open_lock is acquired here and is still held on return; the
	 * matching mutex_exit() is performed by txg_rele_to_quiesce().
	 */
	uint64_t
	txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
	{
		tx_state_t *tx = &dp->dp_tx;
		tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
		uint64_t txg;

		mutex_enter(&tc->tc_open_lock);
		txg = tx->tx_open_txg;

		mutex_enter(&tc->tc_lock);
		tc->tc_count[txg & TXG_MASK]++;
		mutex_exit(&tc->tc_lock);

		th->th_cpu = tc;
		th->th_txg = txg;

		return (txg);
	}

	/*
	 * Drop the per-CPU tc_open_lock recorded in the handle.  The caller must
	 * not be holding that CPU's tc_lock.
	 */
	void
	txg_rele_to_quiesce(txg_handle_t *th)
	{
		tx_cpu_t *cpu_state = th->th_cpu;

		ASSERT(!MUTEX_HELD(&cpu_state->tc_lock));
		mutex_exit(&cpu_state->tc_open_lock);
	}

	/*
	 * Move the commit callbacks on tx_callbacks onto the tail of the per-CPU
	 * callback list for the txg recorded in the handle, under tc_lock.
	 */
	void
	txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
	{
		tx_cpu_t *tc = th->th_cpu;
		int g = th->th_txg & TXG_MASK;

		mutex_enter(&tc->tc_lock);
		list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
		mutex_exit(&tc->tc_lock);
	}

	/*
	 * Release one hold on the handle's txg.  When the per-CPU hold count for
	 * that txg reaches zero, waiters on the matching condition variable are
	 * woken.  The handle's CPU pointer is cleared afterwards.
	 */
	void
	txg_rele_to_sync(txg_handle_t *th)
	{
		tx_cpu_t *cpu_state = th->th_cpu;
		int slot = th->th_txg & TXG_MASK;

		mutex_enter(&cpu_state->tc_lock);
		ASSERT(cpu_state->tc_count[slot] != 0);
		cpu_state->tc_count[slot]--;
		if (cpu_state->tc_count[slot] == 0) {
			cv_broadcast(&cpu_state->tc_cv[slot]);
		}
		mutex_exit(&cpu_state->tc_lock);

		th->th_cpu = NULL;	/* defensive */
	}

	/*
	* Blocks until all transactions in the group are committed.
	*
	* On return, the transaction group has reached a stable state in which it can
	* then be passed off to the syncing context.
	*
	* txg must be the currently open txg (asserted). As a side effect
	* tx_open_txg is advanced by one and tx_open_time is refreshed.
	*/
	static __noinline void
	txg_quiesce(dsl_pool_t *dp, uint64_t txg)
	{
	tx_state_t *tx = &dp->dp_tx;
	int g = txg & TXG_MASK;
	int c;

	/*
	* Grab all tc_open_locks so nobody else can get into this txg.
	*/
	for (c = 0; c < max_ncpus; c++)
	mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;
	tx->tx_open_time = gethrtime();

	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);

	/*
	* Now that we've incremented tx_open_txg, we can let threads
	* enter the next transaction group.
	*/
	for (c = 0; c < max_ncpus; c++)
	mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	/*
	* Quiesce the transaction group by waiting, CPU by CPU, for the
	* hold count of this txg to reach zero. txg_rele_to_sync() is what
	* decrements the count and broadcasts tc_cv[g].
	*/
	for (c = 0; c < max_ncpus; c++) {
	tx_cpu_t *tc = &tx->tx_cpu[c];
	mutex_enter(&tc->tc_lock);
	while (tc->tc_count[g] != 0)
	cv_wait(&tc->tc_cv[g], &tc->tc_lock);
	mutex_exit(&tc->tc_lock);
	}
	}

	/*
	 * Taskq entry point: run the commit callbacks on the list passed as
	 * 'arg' (error argument 0), then destroy and free the list itself.
	 */
	static void
	txg_do_callbacks(void *arg)
	{
		list_t *callbacks = arg;

		dmu_tx_do_callbacks(callbacks, 0);

		/* The list was heap-allocated by the dispatcher; release it. */
		list_destroy(callbacks);
		kmem_free(callbacks, sizeof (list_t));
	}

	/*
	 * Hand the commit callbacks registered on this txg to worker threads.
	 *
	 * For every CPU whose callback list for the txg is non-empty, the
	 * callbacks are moved onto a freshly allocated list which is dispatched
	 * to the pool's commit-callback taskq.  The taskq is created on first
	 * use.  CPUs with no callbacks are skipped.
	 */
	static void
	txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
	{
		tx_state_t *tx = &dp->dp_tx;
		int slot = txg & TXG_MASK;

		for (int cpu = 0; cpu < max_ncpus; cpu++) {
			/*
			 * No need to lock tx_cpu_t at this point, since this can
			 * only be called once a txg has been synced.
			 */
			tx_cpu_t *tc = &tx->tx_cpu[cpu];
			list_t *pending;

			if (list_is_empty(&tc->tc_callbacks[slot]))
				continue;

			/* Create the commit callback taskq lazily. */
			if (tx->tx_commit_cb_taskq == NULL) {
				tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
				    max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
				    TASKQ_PREPOPULATE);
			}

			pending = kmem_alloc(sizeof (list_t), KM_SLEEP);
			list_create(pending, sizeof (dmu_tx_callback_t),
			    offsetof(dmu_tx_callback_t, dcb_node));

			list_move_tail(pending, &tc->tc_callbacks[slot]);

			(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
			    txg_do_callbacks, pending, TQ_SLEEP);
		}
	}

	/*
	* Body of the per-pool sync thread ('arg' is the dsl_pool_t).
	*
	* Loops forever: waits for a reason to sync, obtains a quiesced txg from
	* the quiesce thread, calls spa_sync() on it with tx_sync_lock dropped,
	* publishes the result and dispatches commit callbacks. The loop is left
	* only through txg_thread_exit(), which ends in thread_exit().
	*/
	static void
	txg_sync_thread(void *arg)
	{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	/* Returns with tx_sync_lock held. */
	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
	uint64_t timeout = zfs_txg_timeout * hz;
	uint64_t timer;
	uint64_t txg;

	/*
	* We sync when we're scanning, there's someone waiting
	* on us, or the quiesce thread has handed off a txg to
	* us, or we have reached our timeout.
	*/
	/* delta is how long the previous spa_sync() took (0 first time). */
	timer = (delta >= timeout ? 0 : timeout - delta);
	while (!dsl_scan_active(dp->dp_scan) &&
	!tx->tx_exiting && timer > 0 &&
	tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
	tx->tx_quiesced_txg == 0 &&
	dp->dp_dirty_total < zfs_dirty_data_sync) {
	dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
	tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
	txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
	delta = ddi_get_lbolt() - start;
	timer = (delta > timeout ? 0 : timeout - delta);
	}

	/*
	* Wait until the quiesce thread hands off a txg to us,
	* prompting it to do so if necessary.
	*/
	while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
	if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
	tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
	cv_broadcast(&tx->tx_quiesce_more_cv);
	txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
	}

	if (tx->tx_exiting)
	txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

	/*
	* Consume the quiesced txg which has been handed off to
	* us. This may cause the quiescing thread to now be
	* able to quiesce another txg, so we must signal it.
	*/
	txg = tx->tx_quiesced_txg;
	tx->tx_quiesced_txg = 0;
	tx->tx_syncing_txg = txg;
	DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
	cv_broadcast(&tx->tx_quiesce_more_cv);

	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	/* spa_sync() runs without tx_sync_lock; time it for the next pass. */
	mutex_exit(&tx->tx_sync_lock);

	start = ddi_get_lbolt();
	spa_sync(spa, txg);
	delta = ddi_get_lbolt() - start;

	mutex_enter(&tx->tx_sync_lock);
	tx->tx_synced_txg = txg;
	tx->tx_syncing_txg = 0;
	DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
	cv_broadcast(&tx->tx_sync_done_cv);

	/*
	* Dispatch commit callbacks to worker threads.
	* (tx_sync_lock is held across this call.)
	*/
	txg_dispatch_callbacks(dp, txg);
	}
	}

	/*
	 * Body of the per-pool quiesce thread ('arg' is the dsl_pool_t).
	 *
	 * Loops forever: waits until someone wants the open txg quiesced and no
	 * previously quiesced txg is still pending, quiesces the open txg with
	 * tx_sync_lock dropped, then hands it to the sync thread.  The loop is
	 * left only through txg_thread_exit().
	 */
	static void
	txg_quiesce_thread(void *arg)
	{
		dsl_pool_t *dp = arg;
		tx_state_t *tx = &dp->dp_tx;
		callb_cpr_t cpr;

		txg_thread_enter(tx, &cpr);

		for (;;) {
			uint64_t txg;

			/*
			 * We quiesce when there's someone waiting on us.
			 * However, we can only have one txg in "quiescing" or
			 * "quiesced, waiting to sync" state.  So we wait until
			 * the "quiesced, waiting to sync" txg has been consumed
			 * by the sync thread.
			 */
			while (!tx->tx_exiting &&
			    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
			    tx->tx_quiesced_txg != 0))
				txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

			if (tx->tx_exiting)
				txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

			txg = tx->tx_open_txg;
			dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
			    txg, tx->tx_quiesce_txg_waiting,
			    tx->tx_sync_txg_waiting);
			mutex_exit(&tx->tx_sync_lock);
			txg_quiesce(dp, txg);
			mutex_enter(&tx->tx_sync_lock);

			/*
			 * Hand this txg off to the sync thread.
			 */
			dprintf("quiesce done, handing off txg %llu\n", txg);
			tx->tx_quiesced_txg = txg;
			DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
			cv_broadcast(&tx->tx_sync_more_cv);
			cv_broadcast(&tx->tx_quiesce_done_cv);
		}
	}

	/*
	 * Delay this thread by 'delay' nanoseconds if we are still in the open
	 * transaction group and there is already a waiting txg quiescing or
	 * quiesced.  Abort the delay if this txg stalls or enters the quiescing
	 * state.
	 *
	 * 'resolution' is passed through to cv_timedwait_hires().
	 */
	void
	txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
	{
		tx_state_t *tx = &dp->dp_tx;
		hrtime_t start = gethrtime();

		/* don't delay if this txg could transition to quiescing immediately */
		if (tx->tx_open_txg > txg ||
		    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
			return;

		/* Re-check under the lock; the unlocked test above is a fast path. */
		mutex_enter(&tx->tx_sync_lock);
		if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
			mutex_exit(&tx->tx_sync_lock);
			return;
		}

		while (gethrtime() - start < delay &&
		    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
			(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
			    &tx->tx_sync_lock, delay, resolution, 0);
		}

		mutex_exit(&tx->tx_sync_lock);
	}

	/*
	* Block until the given txg has been synced (tx_synced_txg >= txg).
	*
	* A txg of 0 means tx_open_txg + TXG_DEFER_SIZE. The caller must not
	* hold the pool config (asserted) and both txg threads must be running
	* (asserted).
	*/
	void
	txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
	{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
	txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	/* Record the highest txg anyone is waiting on; never lower it. */
	if (tx->tx_sync_txg_waiting < txg)
	tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
	dprintf("broadcasting sync more "
	"tx_synced=%llu waiting=%llu dp=%p\n",
	tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
	/* Wake the sync thread, then sleep until a sync completes. */
	cv_broadcast(&tx->tx_sync_more_cv);
	cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
	}

	/*
	* Block until the given txg is open (tx_open_txg >= txg).
	*
	* A txg of 0 means tx_open_txg + 1, i.e. wait for the next txg to open.
	* The caller must not hold the pool config (asserted) and both txg
	* threads must be running (asserted).
	*/
	void
	txg_wait_open(dsl_pool_t *dp, uint64_t txg)
	{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
	txg = tx->tx_open_txg + 1;
	/* Record the highest txg anyone wants opened; never lower it. */
	if (tx->tx_quiesce_txg_waiting < txg)
	tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
	/* Wake the quiesce thread, then sleep until a quiesce completes. */
	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
	}

	/*
	* If there isn't a txg syncing or in the pipeline, push another txg through
	* the pipeline by quiescing the open txg.
	*
	* The caller must not hold the pool config (asserted).
	*/
	void
	txg_kick(dsl_pool_t *dp)
	{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	/*
	* Only kick when nothing is syncing, nobody has already requested
	* a quiesce or a sync beyond what is done, and no quiesced txg is
	* pending.
	*/
	if (tx->tx_syncing_txg == 0 &&
	tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
	tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
	tx->tx_quiesced_txg <= tx->tx_synced_txg) {
	tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
	cv_broadcast(&tx->tx_quiesce_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
	}

	/*
	 * Report whether a quiesce of a txg beyond the currently open one has
	 * been requested.  Reads the fields without taking tx_sync_lock.
	 */
	boolean_t
	txg_stalled(dsl_pool_t *dp)
	{
		tx_state_t *state = &dp->dp_tx;

		return (state->tx_open_txg < state->tx_quiesce_txg_waiting);
	}

	/*
	 * Return non-zero if tx_syncing_txg is at or below tx_sync_txg_waiting,
	 * or if a quiesced txg is pending.  Reads the fields without taking
	 * tx_sync_lock.
	 */
	boolean_t
	txg_sync_waiting(dsl_pool_t *dp)
	{
		tx_state_t *tx = &dp->dp_tx;

		return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
		    tx->tx_quiesced_txg != 0);
	}

	/*
	 * Verify that this txg is active (open, quiescing, syncing).  Non-active
	 * txg's should not be manipulated.
	 *
	 * Txgs at or below TXG_INITIAL and the special ZILTEST_TXG are exempt.
	 * The checks are assertions only; nothing is returned.
	 */
	void
	txg_verify(spa_t *spa, uint64_t txg)
	{
		dsl_pool_t *dp = spa_get_dsl(spa);
		if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
			return;
		ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
		ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
		ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
	}

	/*
	 * Per-txg object lists.
	 *
	 * Initialise a txg list: set up its lock, remember the owning spa and
	 * the byte offset of the embedded txg_node_t within each object, and
	 * clear every per-txg head pointer.
	 */
	void
	txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
	{
		int t;

		mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

		tl->tl_offset = offset;
		tl->tl_spa = spa;

		for (t = 0; t < TXG_SIZE; t++)
			tl->tl_head[t] = NULL;
	}

	/*
	 * Destroy a txg list.  Every per-txg slot is expected to be empty
	 * (asserted) before the list lock is torn down.
	 */
	void
	txg_list_destroy(txg_list_t *tl)
	{
		for (int slot = 0; slot < TXG_SIZE; slot++) {
			ASSERT(txg_list_empty(tl, slot));
		}

		mutex_destroy(&tl->tl_lock);
	}

	/*
	 * Return true when the list has no entries for the given txg.  The head
	 * pointer is read without taking tl_lock.
	 */
	boolean_t
	txg_list_empty(txg_list_t *tl, uint64_t txg)
	{
		int slot;

		txg_verify(tl->tl_spa, txg);
		slot = txg & TXG_MASK;

		return (tl->tl_head[slot] == NULL);
	}

	/*
	 * Returns true if all txg lists are empty.
	 *
	 * Warning: this is inherently racy (an item could be added immediately
	 * after this function returns).  We don't bother with the lock because
	 * it wouldn't change the semantics.
	 */
	boolean_t
	txg_all_lists_empty(txg_list_t *tl)
	{
		int slot = 0;

		/* Stop at the first slot that still has an entry. */
		while (slot < TXG_SIZE && txg_list_empty(tl, slot))
			slot++;

		return (slot == TXG_SIZE ? B_TRUE : B_FALSE);
	}

	/*
	 * Add an entry to the list (unless it's already on the list).
	 * Returns B_TRUE if it was actually added.
	 *
	 * 'p' is the containing object; its embedded txg_node_t is found at
	 * tl_offset bytes into it.  New entries are pushed at the head.
	 */
	boolean_t
	txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
		boolean_t add;

		txg_verify(tl->tl_spa, txg);
		mutex_enter(&tl->tl_lock);
		add = (tn->tn_member[t] == 0);
		if (add) {
			tn->tn_member[t] = 1;
			tn->tn_next[t] = tl->tl_head[t];
			tl->tl_head[t] = tn;
		}
		mutex_exit(&tl->tl_lock);

		return (add);
	}

	/*
	 * Add an entry to the end of the list, unless it's already on the list.
	 * (walks list to find end)
	 * Returns B_TRUE if it was actually added.
	 *
	 * 'p' is the containing object; its embedded txg_node_t is found at
	 * tl_offset bytes into it.
	 */
	boolean_t
	txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
		boolean_t add;

		txg_verify(tl->tl_spa, txg);
		mutex_enter(&tl->tl_lock);
		add = (tn->tn_member[t] == 0);
		if (add) {
			txg_node_t **tp;

			/* Advance tp to the NULL link that terminates the list. */
			for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
				continue;

			tn->tn_member[t] = 1;
			tn->tn_next[t] = NULL;
			*tp = tn;
		}
		mutex_exit(&tl->tl_lock);

		return (add);
	}

	/*
	 * Remove the head of the list and return it.
	 *
	 * Returns a pointer to the containing object (node address minus
	 * tl_offset), or NULL if the list for this txg is empty.
	 */
	void *
	txg_list_remove(txg_list_t *tl, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn;
		void *p = NULL;

		txg_verify(tl->tl_spa, txg);
		mutex_enter(&tl->tl_lock);
		if ((tn = tl->tl_head[t]) != NULL) {
			/* Sanity-check membership flags of the head and its successor. */
			ASSERT(tn->tn_member[t]);
			ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
			p = (char *)tn - tl->tl_offset;
			tl->tl_head[t] = tn->tn_next[t];
			tn->tn_next[t] = NULL;
			tn->tn_member[t] = 0;
		}
		mutex_exit(&tl->tl_lock);

		return (p);
	}

	/*
	 * Remove a specific item from the list and return it.
	 *
	 * Walks the list for this txg looking for the node embedded in 'p'.
	 * Returns 'p' if it was found and unlinked, NULL otherwise.
	 */
	void *
	txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn, **tp;

		txg_verify(tl->tl_spa, txg);
		mutex_enter(&tl->tl_lock);

		/* tp always points at the link that refers to the current node. */
		for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
			if ((char *)tn - tl->tl_offset == p) {
				*tp = tn->tn_next[t];
				tn->tn_next[t] = NULL;
				tn->tn_member[t] = 0;
				mutex_exit(&tl->tl_lock);
				return (p);
			}
		}

		mutex_exit(&tl->tl_lock);

		return (NULL);
	}

	/*
	 * Return non-zero if the object 'p' is flagged as a member of the list
	 * for the given txg.  The flag is read without taking tl_lock.
	 */
	boolean_t
	txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

		txg_verify(tl->tl_spa, txg);
		return (tn->tn_member[t] != 0);
	}

	/*
	 * Walk a txg list -- only safe if you know it's not changing.
	 *
	 * Returns the first object on the list for the given txg, or NULL when
	 * that list is empty.
	 */
	void *
	txg_list_head(txg_list_t *tl, uint64_t txg)
	{
		txg_node_t *first = tl->tl_head[txg & TXG_MASK];

		txg_verify(tl->tl_spa, txg);

		if (first == NULL)
			return (NULL);

		/* Convert the embedded node back to its containing object. */
		return ((char *)first - tl->tl_offset);
	}

	/*
	 * Return the object that follows 'p' on the list for the given txg, or
	 * NULL if 'p' is the last entry.  No lock is taken.
	 */
	void *
	txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
	{
		int t = txg & TXG_MASK;
		txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

		txg_verify(tl->tl_spa, txg);
		tn = tn->tn_next[t];

		return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 332525)
	@@ -1,3644 +1,3821 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	* Copyright 2017 Nexenta Systems, Inc.
	* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Toomas Soome <tsoome@me.com>
	* Copyright 2017 Joyent, Inc.
	*/

	#include <sys/zfs_context.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	+#include <sys/bpobj.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	+#include <sys/dsl_dir.h>
	#include <sys/vdev_impl.h>
	#include <sys/uberblock_impl.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	#include <sys/space_map.h>
	#include <sys/space_reftree.h>
	#include <sys/zio.h>
	#include <sys/zap.h>
	#include <sys/fs/zfs.h>
	#include <sys/arc.h>
	#include <sys/zil.h>
	#include <sys/dsl_scan.h>
	#include <sys/abd.h>
	#include <sys/trim_map.h>

	/*
	* Sysctl plumbing: reference the _vfs_zfs node and add a "vdev" node
	* (description "ZFS VDEV") beneath it for the vdev tunables below.
	*/
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

	/*
	* Virtual device management.
	*/

	/*
	* The limit for ZFS to automatically increase a top-level vdev's ashift
	* from logical ashift to physical ashift.
	*
	* Example: one or more 512B emulation child vdevs
	* child->vdev_ashift = 9 (512 bytes)
	* child->vdev_physical_ashift = 12 (4096 bytes)
	* zfs_max_auto_ashift = 11 (2048 bytes)
	* zfs_min_auto_ashift = 9 (512 bytes)
	*
	* On pool creation or the addition of a new top-level vdev, ZFS will
	* increase the ashift of the top-level vdev to 2048 as limited by
	* zfs_max_auto_ashift.
	*
	* Example: one or more 512B emulation child vdevs
	* child->vdev_ashift = 9 (512 bytes)
	* child->vdev_physical_ashift = 12 (4096 bytes)
	* zfs_max_auto_ashift = 13 (8192 bytes)
	* zfs_min_auto_ashift = 9 (512 bytes)
	*
	* On pool creation or the addition of a new top-level vdev, ZFS will
	* increase the ashift of the top-level vdev to 4096 to match the
	* max vdev_physical_ashift.
	*
	* Example: one or more 512B emulation child vdevs
	* child->vdev_ashift = 9 (512 bytes)
	* child->vdev_physical_ashift = 9 (512 bytes)
	* zfs_max_auto_ashift = 13 (8192 bytes)
	* zfs_min_auto_ashift = 12 (4096 bytes)
	*
	* On pool creation or the addition of a new top-level vdev, ZFS will
	* increase the ashift of the top-level vdev to 4096 to match the
	* zfs_min_auto_ashift.
	*/
	/* Upper limit; updated only through sysctl_vfs_zfs_max_auto_ashift(). */
	static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
	/* Lower limit; updated only through sysctl_vfs_zfs_min_auto_ashift(). */
	static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;

	static int
	sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = zfs_max_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val > SPA_MAXASHIFT \|\| val < zfs_min_auto_ashift)
	return (EINVAL);

	zfs_max_auto_ashift = val;

	return (0);
	}
	/* Register max_auto_ashift under _vfs_zfs as a read-write 64-bit value. */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
	    sysctl_vfs_zfs_max_auto_ashift, "QU",
	    "Max ashift used when optimising for logical -> physical sectors size on "
	    "new top-level vdevs.");

	static int
	sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
	{
	uint64_t val;
	int err;

	val = zfs_min_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val < SPA_MINASHIFT \|\| val > zfs_max_auto_ashift)
	return (EINVAL);

	zfs_min_auto_ashift = val;

	return (0);
	}
	/* Register min_auto_ashift under _vfs_zfs as a read-write 64-bit value. */
	SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
	    sysctl_vfs_zfs_min_auto_ashift, "QU",
	    "Min ashift used when creating new top-level vdevs.");

	/*
	 * NULL-terminated table of every known vdev ops vector; searched by
	 * vdev_getops().  The leaf disk implementation depends on whether this
	 * is a kernel build.
	 */
	static vdev_ops_t *vdev_ops_table[] = {
		&vdev_root_ops,
		&vdev_raidz_ops,
		&vdev_mirror_ops,
		&vdev_replacing_ops,
		&vdev_spare_ops,
	#ifdef _KERNEL
		&vdev_geom_ops,
	#else
		&vdev_disk_ops,
	#endif
		&vdev_file_ops,
		&vdev_missing_ops,
		&vdev_hole_ops,
		&vdev_indirect_ops,
		NULL
	};


	/*
	* When a vdev is added, it will be divided into approximately (but no
	* more than) this number of metaslabs.
	*
	* Exported as the metaslabs_per_vdev sysctl under the vdev node with
	* CTLFLAG_RDTUN (presumably read-only at run time and settable as a
	* boot-time tunable; see sysctl(9)).
	*/
	int metaslabs_per_vdev = 200;
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
	&metaslabs_per_vdev, 0,
	"When a vdev is added, how many metaslabs the vdev should be divided into");

	/*
	* Given a vdev type, return the appropriate ops vector.
	*/
	static vdev_ops_t *
	vdev_getops(const char *type)
	{
	vdev_ops_t ops, *opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
	if (strcmp(ops->vdev_op_type, type) == 0)
	break;

	return (ops);
	}

	/*
	 * Default asize function: return the MAX of psize with the asize of
	 * all children.  This is what's used by anything other than RAID-Z.
	 *
	 * psize is first rounded up to the top-level vdev's ashift alignment.
	 */
	uint64_t
	vdev_default_asize(vdev_t *vd, uint64_t psize)
	{
		uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);

		for (int c = 0; c < vd->vdev_children; c++) {
			uint64_t child_asize;

			child_asize = vdev_psize_to_asize(vd->vdev_child[c], psize);
			if (child_asize > largest)
				largest = child_asize;
		}

		return (largest);
	}

	/*
	 * Get the minimum allocatable size.  We define the allocatable size as
	 * the vdev's asize rounded to the nearest metaslab.  This allows us to
	 * replace or attach devices which don't have the same physical size but
	 * can still satisfy the same number of allocations.
	 */
	uint64_t
	vdev_get_min_asize(vdev_t *vd)
	{
		vdev_t *parent = vd->vdev_parent;

		/*
		 * If our parent is NULL (inactive spare or cache) or is the root,
		 * just return our own asize.
		 */
		if (parent == NULL) {
			return (vd->vdev_asize);
		}

		/*
		 * The top-level vdev just returns the allocatable size rounded
		 * to the nearest metaslab.
		 */
		if (vd->vdev_top == vd) {
			return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
		}

		/* Every non-raidz child simply inherits its parent's minimum. */
		if (parent->vdev_ops != &vdev_raidz_ops) {
			return (parent->vdev_min_asize);
		}

		/*
		 * The allocatable space for a raidz vdev is N * sizeof(smallest
		 * child), so each child must provide at least 1/Nth of its asize
		 * (rounded up).
		 */
		return ((parent->vdev_min_asize + parent->vdev_children - 1) /
		    parent->vdev_children);
	}

	/*
	 * Recompute vdev_min_asize for this vdev and then, recursively, for all
	 * of its children.  The parent is updated first because
	 * vdev_get_min_asize() of a child reads its parent's vdev_min_asize.
	 */
	void
	vdev_set_min_asize(vdev_t *vd)
	{
		vd->vdev_min_asize = vdev_get_min_asize(vd);

		for (int child = 0; child < vd->vdev_children; child++) {
			vdev_set_min_asize(vd->vdev_child[child]);
		}
	}

	/*
	 * Return the top-level vdev with the given index under the pool's root
	 * vdev, or NULL if the index is out of range.  The caller must hold the
	 * spa config lock (asserted).
	 */
	vdev_t *
	vdev_lookup_top(spa_t *spa, uint64_t vdev)
	{
		vdev_t *root = spa->spa_root_vdev;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

		if (vdev >= root->vdev_children) {
			return (NULL);
		}

		ASSERT(root->vdev_child[vdev] != NULL);
		return (root->vdev_child[vdev]);
	}

	/*
	 * Depth-first search of the vdev tree rooted at 'vd' for a vdev whose
	 * guid matches.  Returns the matching vdev, or NULL if there is none.
	 */
	vdev_t *
	vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
	{
		if (vd->vdev_guid == guid) {
			return (vd);
		}

		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[c], guid);

			if (found != NULL) {
				return (found);
			}
		}

		return (NULL);
	}

	/*
	 * Count the leaf vdevs in the tree rooted at 'vd'.  A vdev whose ops
	 * vector is flagged as a leaf counts as one; otherwise the counts of
	 * its children are summed.
	 */
	static int
	vdev_count_leaves_impl(vdev_t *vd)
	{
		int leaves = 0;

		if (vd->vdev_ops->vdev_op_leaf) {
			return (1);
		}

		for (int child = 0; child < vd->vdev_children; child++) {
			leaves += vdev_count_leaves_impl(vd->vdev_child[child]);
		}

		return (leaves);
	}

	/*
	 * Return the number of leaf vdevs beneath the pool's root vdev.
	 */
	int
	vdev_count_leaves(spa_t *spa)
	{
		vdev_t *root = spa->spa_root_vdev;

		return (vdev_count_leaves_impl(root));
	}

	/*
	 * Attach 'cvd' as a child of 'pvd' at slot cvd->vdev_id.
	 *
	 * The caller must hold the full spa config lock as writer and cvd must
	 * not already have a parent (both asserted).  The parent's child array
	 * is reallocated, growing if the id is past its current end.  cvd's
	 * vdev_top is inherited from the parent, or set to cvd itself when the
	 * parent has none.  Finally cvd's guid sum is added to every ancestor.
	 *
	 * A NULL pvd only records the (NULL) parent and returns.
	 */
	void
	vdev_add_child(vdev_t *pvd, vdev_t *cvd)
	{
		size_t oldsize, newsize;
		uint64_t id = cvd->vdev_id;
		vdev_t **newchild;
		spa_t *spa = cvd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
		ASSERT(cvd->vdev_parent == NULL);

		cvd->vdev_parent = pvd;

		if (pvd == NULL)
			return;

		/* The target slot must be beyond the array or currently empty. */
		ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

		oldsize = pvd->vdev_children * sizeof (vdev_t *);
		pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
		newsize = pvd->vdev_children * sizeof (vdev_t *);

		newchild = kmem_zalloc(newsize, KM_SLEEP);
		if (pvd->vdev_child != NULL) {
			bcopy(pvd->vdev_child, newchild, oldsize);
			kmem_free(pvd->vdev_child, oldsize);
		}

		pvd->vdev_child = newchild;
		pvd->vdev_child[id] = cvd;

		cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
		ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

		/*
		 * Walk up all ancestors to update guid sum.
		 */
		for (; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += cvd->vdev_guid_sum;
	}

	/*
	 * Detach 'cvd' from its parent 'pvd'.
	 *
	 * The child's slot is cleared and its parent pointer reset.  If that
	 * leaves the parent with no children at all, the child array is freed.
	 * cvd's guid sum is then subtracted from every ancestor.
	 *
	 * cvd->vdev_parent must equal pvd (asserted); a NULL pvd is a no-op.
	 */
	void
	vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
	{
		int c;
		uint_t id = cvd->vdev_id;

		ASSERT(cvd->vdev_parent == pvd);

		if (pvd == NULL)
			return;

		ASSERT(id < pvd->vdev_children);
		ASSERT(pvd->vdev_child[id] == cvd);

		pvd->vdev_child[id] = NULL;
		cvd->vdev_parent = NULL;

		/* Look for any remaining child; c == vdev_children means none. */
		for (c = 0; c < pvd->vdev_children; c++)
			if (pvd->vdev_child[c])
				break;

		if (c == pvd->vdev_children) {
			kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
			pvd->vdev_child = NULL;
			pvd->vdev_children = 0;
		}

		/*
		 * Walk up all ancestors to update guid sum.
		 */
		for (; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
	}

	/*
	 * Remove any holes in the child array.
	 *
	 * Builds a new array containing only the non-NULL children, in their
	 * original order, and renumbers each child's vdev_id to its new index.
	 * The caller must hold the full spa config lock as writer (asserted).
	 */
	void
	vdev_compact_children(vdev_t *pvd)
	{
		vdev_t **newchild, *cvd;
		int oldc = pvd->vdev_children;
		int newc;

		ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/* First pass: count the surviving children. */
		for (int c = newc = 0; c < oldc; c++)
			if (pvd->vdev_child[c])
				newc++;

		newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

		/* Second pass: copy them over and assign the new ids. */
		for (int c = newc = 0; c < oldc; c++) {
			if ((cvd = pvd->vdev_child[c]) != NULL) {
				newchild[newc] = cvd;
				cvd->vdev_id = newc++;
			}
		}

		kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
		pvd->vdev_child = newchild;
		pvd->vdev_children = newc;
	}

	/*
	 * Allocate and minimally initialize a vdev_t.
	 *
	 * The first vdev allocated for a pool must use vdev_root_ops and becomes
	 * spa_root_vdev.  When 'guid' is 0 and the vdev is not a hole, a fresh
	 * guid is generated.  Locks, range trees, txg lists, the I/O queue and
	 * the cache are set up; the vdev is left in VDEV_STATE_CLOSED.
	 */
	vdev_t *
	vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
	{
		vdev_t *vd;
		vdev_indirect_config_t *vic;

		vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
		vic = &vd->vdev_indirect_config;

		if (spa->spa_root_vdev == NULL) {
			ASSERT(ops == &vdev_root_ops);
			spa->spa_root_vdev = vd;
			spa->spa_load_guid = spa_generate_guid(NULL);
		}

		if (guid == 0 && ops != &vdev_hole_ops) {
			if (spa->spa_root_vdev == vd) {
				/*
				 * The root vdev's guid will also be the pool guid,
				 * which must be unique among all pools.
				 */
				guid = spa_generate_guid(NULL);
			} else {
				/*
				 * Any other vdev's guid must be unique within the pool.
				 */
				guid = spa_generate_guid(spa);
			}
			ASSERT(!spa_guid_exists(spa_guid(spa), guid));
		}

		vd->vdev_spa = spa;
		vd->vdev_id = id;
		vd->vdev_guid = guid;
		vd->vdev_guid_sum = guid;
		vd->vdev_ops = ops;
		vd->vdev_state = VDEV_STATE_CLOSED;
		vd->vdev_ishole = (ops == &vdev_hole_ops);
		/* UINT64_MAX is the initial value for the previous indirect vdev. */
		vic->vic_prev_indirect_vdev = UINT64_MAX;

		rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
		mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
		vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);

		mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
		for (int t = 0; t < DTL_TYPES; t++) {
			vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
		}
		txg_list_create(&vd->vdev_ms_list, spa,
		    offsetof(struct metaslab, ms_txg_node));
		txg_list_create(&vd->vdev_dtl_list, spa,
		    offsetof(struct vdev, vdev_dtl_node));
		vd->vdev_stat.vs_timestamp = gethrtime();
		vdev_queue_init(vd);
		vdev_cache_init(vd);

		return (vd);
	}

	/*
	* Allocate a new vdev. The 'alloctype' is used to control whether we are
	* creating a new vdev or loading an existing one - the behavior is slightly
	* different for each case.
	*/
	int
	vdev_alloc(spa_t spa, vdev_t vdp, nvlist_t nv, vdev_t *parent, uint_t id,
	int alloctype)
	{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;
	+ vdev_indirect_config_t *vic;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
	return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
	return (SET_ERROR(EINVAL));

	/*
	* If this is a load, get the vdev guid from the nvlist.
	* Otherwise, vdev_alloc_common() will generate one for us.
	*/
	if (alloctype == VDEV_ALLOC_LOAD) {
	uint64_t label_id;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) \|\|
	label_id != id)
	return (SET_ERROR(EINVAL));

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
	return (SET_ERROR(EINVAL));
	}

	/*
	* The first allocated vdev must be of type 'root'.
	*/
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
	return (SET_ERROR(EINVAL));

	/*
	* Determine whether we're a log vdev.
	*/
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
	return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
	return (SET_ERROR(ENOTSUP));

	/*
	* Set the nparity property for RAID-Z vdevs.
	*/
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
	&nparity) == 0) {
	if (nparity == 0 \|\| nparity > VDEV_RAIDZ_MAXPARITY)
	return (SET_ERROR(EINVAL));
	/*
	* Previous versions could only support 1 or 2 parity
	* device.
	*/
	if (nparity > 1 &&
	spa_version(spa) < SPA_VERSION_RAIDZ2)
	return (SET_ERROR(ENOTSUP));
	if (nparity > 2 &&
	spa_version(spa) < SPA_VERSION_RAIDZ3)
	return (SET_ERROR(ENOTSUP));
	} else {
	/*
	* We require the parity to be specified for SPAs that
	* support multiple parity levels.
	*/
	if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
	return (SET_ERROR(EINVAL));
	/*
	* Otherwise, we default to 1 parity device for RAID-Z.
	*/
	nparity = 1;
	}
	} else {
	nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);
	+ vic = &vd->vdev_indirect_config;

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
	vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
	vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	&vd->vdev_physpath) == 0)
	vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
	vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	* Set the whole_disk property. If it's not specified, leave the value
	* as -1.
	*/
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	&vd->vdev_wholedisk) != 0)
	vd->vdev_wholedisk = -1ULL;

	+ ASSERT0(vic->vic_mapping_object);
	+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	+ &vic->vic_mapping_object);
	+ ASSERT0(vic->vic_births_object);
	+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	+ &vic->vic_births_object);
	+ ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	+ &vic->vic_prev_indirect_vdev);
	+
	/*
	* Look for the 'not present' flag. This will only be set if the device
	* was not present at the time of import.
	*/
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	&vd->vdev_not_present);

	/*
	* Get the alignment requirement.
	*/
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	* Retrieve the vdev creation time.
	*/
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	&vd->vdev_crtxg);

	/*
	* If we're a top-level vdev, try to load the allocation parameters.
	*/
	if (parent && !parent->vdev_parent &&
	(alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) {
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
	&vd->vdev_ms_array);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
	&vd->vdev_ms_shift);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
	&vd->vdev_asize);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
	&vd->vdev_removing);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
	&vd->vdev_top_zap);
	} else {
	ASSERT0(vd->vdev_top_zap);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
	ASSERT(alloctype == VDEV_ALLOC_LOAD \|\|
	alloctype == VDEV_ALLOC_ADD \|\|
	alloctype == VDEV_ALLOC_SPLIT \|\|
	alloctype == VDEV_ALLOC_ROOTPOOL);
	vd->vdev_mg = metaslab_group_create(islog ?
	spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	(alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) {
	(void) nvlist_lookup_uint64(nv,
	ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
	ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	* If we're a leaf vdev, try to load the DTL object and other state.
	*/

	if (vd->vdev_ops->vdev_op_leaf &&
	(alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_L2CACHE \|\|
	alloctype == VDEV_ALLOC_ROOTPOOL)) {
	if (alloctype == VDEV_ALLOC_LOAD) {
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
	&vd->vdev_dtl_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
	&vd->vdev_unspare);
	}

	if (alloctype == VDEV_ALLOC_ROOTPOOL) {
	uint64_t spare = 0;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
	&spare) == 0 && spare)
	spa_spare_add(vd);
	}

	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
	&vd->vdev_offline);

	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
	&vd->vdev_resilver_txg);

	/*
	* When importing a pool, we want to ignore the persistent fault
	* state, as the diagnosis made on another system may not be
	* valid in the current context. Local vdevs will
	* remain in the faulted state.
	*/
	if (spa_load_state(spa) == SPA_LOAD_OPEN) {
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
	&vd->vdev_faulted);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
	&vd->vdev_degraded);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
	&vd->vdev_removed);

	if (vd->vdev_faulted \|\| vd->vdev_degraded) {
	char *aux;

	vd->vdev_label_aux =
	VDEV_AUX_ERR_EXCEEDED;
	if (nvlist_lookup_string(nv,
	ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
	strcmp(aux, "external") == 0)
	vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
	}
	}
	}

	/*
	* Add ourselves to the parent's list of children.
	*/
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
	}

	/*
	* Close and free a vdev and, recursively, all of its children.
	*
	* After closing, the vdev's allocation state is discarded, it is unlinked
	* from its parent, and its queue, cache, path strings, DTL range trees,
	* indirect mapping/births, obsolete space map and locks are released.
	* If vd is the pool's root vdev, spa_root_vdev is cleared before the
	* vdev_t itself is freed.
	*/
	void
	vdev_free(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;

	/*
	* vdev_free() implies closing the vdev first. This is simpler than
	* trying to ensure complicated semantics for all callers.
	*/
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	* Free all children.
	*/
	for (int c = 0; c < vd->vdev_children; c++)
	vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	* Discard allocation state.
	*/
	if (vd->vdev_mg != NULL) {
	vdev_metaslab_fini(vd);
	metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	* Remove this vdev from its parent's child list.
	*/
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	* Clean up vdev structure.
	*/
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
	spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
	spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
	spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
	spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
	spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
	spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
	range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
	range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	+ EQUIV(vd->vdev_indirect_births != NULL,
	+ vd->vdev_indirect_mapping != NULL);
	+ if (vd->vdev_indirect_births != NULL) {
	+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	+ vdev_indirect_births_close(vd->vdev_indirect_births);
	+ }
	+
	+ if (vd->vdev_obsolete_sm != NULL) {
	+ ASSERT(vd->vdev_removing ||
	+ vd->vdev_ops == &vdev_indirect_ops);
	+ space_map_close(vd->vdev_obsolete_sm);
	+ vd->vdev_obsolete_sm = NULL;
	+ }
	+ range_tree_destroy(vd->vdev_obsolete_segments);
	+ rw_destroy(&vd->vdev_indirect_rwlock);
	+ mutex_destroy(&vd->vdev_obsolete_lock);
	+
	mutex_destroy(&vd->vdev_queue_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
	spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
	}

	/*
	 * Transfer top-level vdev state from svd to tvd.
	 *
	 * The metaslab array/shift/count, top-level ZAP, metaslab group and
	 * metaslab pointers, space statistics, per-txg dirty lists, config/state
	 * dirty membership, deflate ratio and islog flag are moved to tvd; the
	 * corresponding fields in svd are reset as they are handed over.
	 * tvd must already be its own top-level vdev.
	 */
	static void
	vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
	{
		spa_t *spa = svd->vdev_spa;
		metaslab_t *msp;
		vdev_t *vd;
		int t;

		ASSERT(tvd == tvd->vdev_top);

		tvd->vdev_ms_array = svd->vdev_ms_array;
		tvd->vdev_ms_shift = svd->vdev_ms_shift;
		tvd->vdev_ms_count = svd->vdev_ms_count;
		tvd->vdev_top_zap = svd->vdev_top_zap;

		svd->vdev_ms_array = 0;
		svd->vdev_ms_shift = 0;
		svd->vdev_ms_count = 0;
		svd->vdev_top_zap = 0;

		/* If tvd already has a metaslab group it must be svd's. */
		if (tvd->vdev_mg)
			ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
		tvd->vdev_mg = svd->vdev_mg;
		tvd->vdev_ms = svd->vdev_ms;

		svd->vdev_mg = NULL;
		svd->vdev_ms = NULL;

		/* Point the metaslab group back at its new owner. */
		if (tvd->vdev_mg != NULL)
			tvd->vdev_mg->mg_vd = tvd;

		tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
		tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
		tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

		svd->vdev_stat.vs_alloc = 0;
		svd->vdev_stat.vs_space = 0;
		svd->vdev_stat.vs_dspace = 0;

		/* Re-home every per-txg dirty entry from svd onto tvd. */
		for (t = 0; t < TXG_SIZE; t++) {
			while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
				(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
			while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
				(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
			if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
				(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		}

		if (list_link_active(&svd->vdev_config_dirty_node)) {
			vdev_config_clean(svd);
			vdev_config_dirty(tvd);
		}

		if (list_link_active(&svd->vdev_state_dirty_node)) {
			vdev_state_clean(svd);
			vdev_state_dirty(tvd);
		}

		tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
		svd->vdev_deflate_ratio = 0;

		tvd->vdev_islog = svd->vdev_islog;
		svd->vdev_islog = 0;
	}

	/*
	 * Set vdev_top to tvd for vd and, recursively, for every vdev below it.
	 * A NULL vd is ignored.
	 */
	static void
	vdev_top_update(vdev_t *tvd, vdev_t *vd)
	{
		if (vd == NULL)
			return;

		vd->vdev_top = tvd;

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_top_update(tvd, vd->vdev_child[c]);
	}

	/*
	* Add a mirror/replacing vdev above an existing vdev.
	*
	* A new interior vdev using ops is allocated with cvd's id, inherits
	* cvd's size, ashift, state and creation txg, and takes cvd's place in
	* cvd's former parent; cvd becomes its child. If the new vdev ends up
	* being a top-level vdev, cvd's top-level state is transferred to it.
	* The caller must hold SCL_ALL as writer. Returns the new vdev.
	*/
	vdev_t *
	vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
	{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	+ mvd->vdev_psize = cvd->vdev_psize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
	vdev_top_transfer(cvd, mvd);

	return (mvd);
	}

	/*
	 * Remove a 1-way mirror/replacing vdev from the tree.
	 *
	 * cvd is the only child of the interior vdev being removed. cvd takes
	 * over its parent's id, ashift values and position in the grandparent,
	 * and the now childless interior vdev is freed. The caller must hold
	 * SCL_ALL as writer.
	 */
	void
	vdev_remove_parent(vdev_t *cvd)
	{
		vdev_t *mvd = cvd->vdev_parent;
		vdev_t *pvd = mvd->vdev_parent;

		ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		ASSERT(mvd->vdev_children == 1);
		ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
		    mvd->vdev_ops == &vdev_replacing_ops ||
		    mvd->vdev_ops == &vdev_spare_ops);
		cvd->vdev_ashift = mvd->vdev_ashift;
		cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
		cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

		vdev_remove_child(mvd, cvd);
		vdev_remove_child(pvd, mvd);

		/*
		 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
		 * Otherwise, we could have detached an offline device, and when we
		 * go to import the pool we'll think we have two top-level vdevs,
		 * instead of a different version of the same top-level vdev.
		 */
		if (mvd->vdev_top == mvd) {
			uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
			cvd->vdev_orig_guid = cvd->vdev_guid;
			cvd->vdev_guid += guid_delta;
			cvd->vdev_guid_sum += guid_delta;
		}
		cvd->vdev_id = mvd->vdev_id;
		vdev_add_child(pvd, cvd);
		vdev_top_update(cvd->vdev_top, cvd->vdev_top);

		if (cvd == cvd->vdev_top)
			vdev_top_transfer(mvd, cvd);

		ASSERT(mvd->vdev_children == 0);
		vdev_free(mvd);
	}

	/*
	* Create the metaslabs for vd, growing its metaslab array from the
	* current vdev_ms_count to (vdev_asize >> vdev_ms_shift) entries.
	*
	* When txg is 0 and the vdev has a metaslab array object, the object
	* number for each new metaslab is read from that array in the MOS.
	* Returns 0 on success (including when vdev_ms_shift is 0 and there is
	* nothing to do), or the error from dmu_read() or metaslab_init().
	*/
	int
	vdev_metaslab_init(vdev_t *vd, uint64_t txg)
	{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	* This vdev is not being allocated from yet or is a hole.
	*/
	if (vd->vdev_ms_shift == 0)
	return (0);

	ASSERT(!vd->vdev_ishole);

	- /*
	- * Compute the raidz-deflation ratio. Note, we hard-code
	- * in 128k (1 << 17) because it is the "typical" blocksize.
	- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
	- * otherwise it would inconsistently account for existing bp's.
	- */
	- vd->vdev_deflate_ratio = (1 << 17) /
	- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
	-
	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
	bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
	kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
	uint64_t object = 0;

	- if (txg == 0) {
	+ /*
	+ * vdev_ms_array may be 0 if we are creating the "fake"
	+ * metaslabs for an indirect vdev for zdb's leak detection.
	+ * See zdb_leak_init().
	+ */
	+ if (txg == 0 && vd->vdev_ms_array != 0) {
	error = dmu_read(mos, vd->vdev_ms_array,
	m * sizeof (uint64_t), sizeof (uint64_t), &object,
	DMU_READ_PREFETCH);
	if (error)
	return (error);
	}

	error = metaslab_init(vd->vdev_mg, m, object, txg,
	&(vd->vdev_ms[m]));
	if (error)
	return (error);
	}

	if (txg == 0)
	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	* If the vdev is being removed we don't activate
	* the metaslabs since we want to ensure that no new
	* allocations are performed on this device.
	*/
	if (oldc == 0 && !vd->vdev_removing)
	metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
	spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
	}

	/*
	* Tear down the metaslabs of vd.
	*
	* If the vdev has a metaslab array, its metaslab group is passivated,
	* every non-NULL metaslab is finalized, the array is freed and
	* vdev_ms_count is reset to 0. On return vdev_ms_count is asserted
	* to be zero.
	*/
	void
	vdev_metaslab_fini(vdev_t *vd)
	{
	- uint64_t m;
	- uint64_t count = vd->vdev_ms_count;
	-
	if (vd->vdev_ms != NULL) {
	+ uint64_t count = vd->vdev_ms_count;
	+
	metaslab_group_passivate(vd->vdev_mg);
	- for (m = 0; m < count; m++) {
	+ for (uint64_t m = 0; m < count; m++) {
	metaslab_t *msp = vd->vdev_ms[m];

	if (msp != NULL)
	metaslab_fini(msp);
	}
	kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
	vd->vdev_ms = NULL;
	+
	+ vd->vdev_ms_count = 0;
	}
	+ ASSERT0(vd->vdev_ms_count);
	}

	/*
	* State shared by the I/Os that make up one probe of a vdev. It is
	* allocated in vdev_probe() and freed in vdev_probe_done() when the
	* probe's null zio completes.
	*/
	typedef struct vdev_probe_stats {
	/* set to 1 when a probe read completes without error */
	boolean_t vps_readable;
	/* set to 1 when a probe write completes without error */
	boolean_t vps_writeable;
	/* zio flags applied to the probe's read and write I/Os */
	int vps_flags;
	} vdev_probe_stats_t;

	/*
	* Completion callback shared by every zio of a vdev probe.
	*
	* A successful probe read records readability and, if the pool is
	* writeable, issues a write of the same buffer back to the same offset.
	* A probe write records writeability. When the parent null zio finishes,
	* the results are folded into vdev_cant_read/vdev_cant_write, the probe
	* is marked done (posting an ereport and ENXIO on failure), waiting
	* parent zios that can no longer access the vdev are failed with ENXIO,
	* and the stats structure is freed.
	*/
	static void
	vdev_probe_done(zio_t *zio)
	{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
	if (zio->io_error == 0)
	vps->vps_readable = 1;
	if (zio->io_error == 0 && spa_writeable(spa)) {
	zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
	zio->io_offset, zio->io_size, zio->io_abd,
	ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
	ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
	} else {
	abd_free(zio->io_abd);
	}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
	if (zio->io_error == 0)
	vps->vps_writeable = 1;
	abd_free(zio->io_abd);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
	zio_t *pio;

	vd->vdev_cant_read |= !vps->vps_readable;
	vd->vdev_cant_write |= !vps->vps_writeable;

	if (vdev_readable(vd) &&
	(vdev_writeable(vd) || !spa_writeable(spa))) {
	zio->io_error = 0;
	} else {
	ASSERT(zio->io_error != 0);
	+ zfs_dbgmsg("failed probe on vdev %llu",
	+ (longlong_t)vd->vdev_id);
	zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
	spa, vd, NULL, 0, 0);
	zio->io_error = SET_ERROR(ENXIO);
	}

	mutex_enter(&vd->vdev_probe_lock);
	ASSERT(vd->vdev_probe_zio == zio);
	vd->vdev_probe_zio = NULL;
	mutex_exit(&vd->vdev_probe_lock);

	zio_link_t *zl = NULL;
	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
	if (!vdev_accessible(vd, pio))
	pio->io_error = SET_ERROR(ENXIO);

	kmem_free(vps, sizeof (*vps));
	}
	}

	/*
	 * Determine whether this device is accessible.
	 *
	 * Read and write to several known locations: the pad regions of each
	 * vdev label but the first, which we leave alone in case it contains
	 * a VTOC.
	 *
	 * vd must be a leaf vdev. zio, when non-NULL, is an I/O that wants the
	 * device probed; it is made a parent of the probe zio (a new one, or
	 * the one already in flight). Returns NULL if zio is itself a probe,
	 * if a probe was already in flight, or after a new probe has been
	 * issued asynchronously on behalf of zio. When zio is NULL and a new
	 * probe was created, the probe zio is returned for the caller to wait on.
	 */
	zio_t *
	vdev_probe(vdev_t *vd, zio_t *zio)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_probe_stats_t *vps = NULL;
		zio_t *pio;

		ASSERT(vd->vdev_ops->vdev_op_leaf);

		/*
		 * Don't probe the probe.
		 */
		if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
			return (NULL);

		/*
		 * To prevent 'probe storms' when a device fails, we create
		 * just one probe i/o at a time. All zios that want to probe
		 * this vdev will become parents of the probe io.
		 */
		mutex_enter(&vd->vdev_probe_lock);

		if ((pio = vd->vdev_probe_zio) == NULL) {
			vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

			vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
			    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
			    ZIO_FLAG_TRYHARD;

			if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
				/*
				 * vdev_cant_read and vdev_cant_write can only
				 * transition from TRUE to FALSE when we have the
				 * SCL_ZIO lock as writer; otherwise they can only
				 * transition from FALSE to TRUE. This ensures that
				 * any zio looking at these values can assume that
				 * failures persist for the life of the I/O. That's
				 * important because when a device has intermittent
				 * connectivity problems, we want to ensure that
				 * they're ascribed to the device (ENXIO) and not
				 * the zio (EIO).
				 *
				 * Since we hold SCL_ZIO as writer here, clear both
				 * values so the probe can reevaluate from first
				 * principles.
				 */
				vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
				vd->vdev_cant_read = B_FALSE;
				vd->vdev_cant_write = B_FALSE;
			}

			vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
			    vdev_probe_done, vps,
			    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

			/*
			 * We can't change the vdev state in this context, so we
			 * kick off an async task to do it on our behalf.
			 */
			if (zio != NULL) {
				vd->vdev_probe_wanted = B_TRUE;
				spa_async_request(spa, SPA_ASYNC_PROBE);
			}
		}

		if (zio != NULL)
			zio_add_child(zio, pio);

		mutex_exit(&vd->vdev_probe_lock);

		/* vps is only set when this call created the probe. */
		if (vps == NULL) {
			ASSERT(zio != NULL);
			return (NULL);
		}

		/* Label 0 is skipped; see the function comment. */
		for (int l = 1; l < VDEV_LABELS; l++) {
			zio_nowait(zio_read_phys(pio, vd,
			    vdev_label_offset(vd->vdev_psize, l,
			    offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
			    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
		}

		if (zio == NULL)
			return (pio);

		zio_nowait(pio);
		return (NULL);
	}

	/*
	 * Task callback that opens one child vdev. arg is the vdev to open.
	 * The opening thread is recorded in vdev_open_thread for the duration
	 * of the open and the result is stored in vdev_open_error.
	 */
	static void
	vdev_open_child(void *arg)
	{
		vdev_t *child = arg;

		child->vdev_open_thread = curthread;
		child->vdev_open_error = vdev_open(child);
		child->vdev_open_thread = NULL;
	}

	/*
	 * Report whether vd, or any vdev beneath it, has a path that begins
	 * with ZVOL_DIR.
	 */
	boolean_t
	vdev_uses_zvols(vdev_t *vd)
	{
		const char *path = vd->vdev_path;

		if (path && strncmp(path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0)
			return (B_TRUE);

		for (int i = 0; i < vd->vdev_children; i++) {
			if (vdev_uses_zvols(vd->vdev_child[i]))
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Open every child of vd, storing each result in the child's
	 * vdev_open_error.
	 */
	void
	vdev_open_children(vdev_t *vd)
	{
		taskq_t *tq;
		int children = vd->vdev_children;

		/*
		 * in order to handle pools on top of zvols, do the opens
		 * in a single thread so that the same thread holds the
		 * spa_namespace_lock
		 *
		 * Note that the leading B_TRUE short-circuits this test, so the
		 * serial path is always taken and the taskq code below is
		 * currently unreachable.
		 */
		if (B_TRUE || vdev_uses_zvols(vd)) {
			for (int c = 0; c < children; c++)
				vd->vdev_child[c]->vdev_open_error =
				    vdev_open(vd->vdev_child[c]);
			return;
		}
		tq = taskq_create("vdev_open", children, minclsyspri,
		    children, children, TASKQ_PREPOPULATE);

		for (int c = 0; c < children; c++)
			VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
			    TQ_SLEEP) != 0);

		taskq_destroy(tq);
	}

	/*
	+ * Compute the raidz-deflation ratio. Note, we hard-code
	+ * in 128k (1 << 17) because it is the "typical" blocksize.
	+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
	+ * otherwise it would inconsistently account for existing bp's.
	+ *
	+ * The ratio is only computed for a top-level vdev that is not a hole
	+ * and has a non-zero ashift; for any other vdev vdev_deflate_ratio is
	+ * left untouched.
	+ */
	+static void
	+vdev_set_deflate_ratio(vdev_t *vd)
	+{
	+ if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
	+ vd->vdev_deflate_ratio = (1 << 17) /
	+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
	+ }
	+}
	+
	+/*
	* Prepare a virtual device for access.
	*
	* Returns 0 on success or an errno value. On every failure path below
	* the vdev's state is updated with vdev_set_state() before returning.
	*
	* NOTE(review): the pool-wide min/max ashift is updated by two
	* consecutive, nearly identical blocks near the end of this function
	* (one keyed on !vdev_isl2cache, the other on vdev_aux == NULL) --
	* confirm that both are intended.
	*/
	int
	vdev_open(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t logical_ashift = 0;
	uint64_t physical_ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_notrim = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	* If this vdev is not removed, check its fault status. If it's
	* faulted, bail out of the open.
	*/
	if (!vd->vdev_removed && vd->vdev_faulted) {
	ASSERT(vd->vdev_children == 0);
	ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
	vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
	vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
	vd->vdev_label_aux);
	return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
	ASSERT(vd->vdev_children == 0);
	vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
	return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
	&logical_ashift, &physical_ashift);

	/*
	* Reset the vdev_reopening flag so that we actually close
	* the vdev on error.
	*/
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
	error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
	if (vd->vdev_removed &&
	vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
	vd->vdev_removed = B_FALSE;

	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	vd->vdev_stat.vs_aux);
	return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	* Recheck the faulted flag now that we have confirmed that
	* the vdev is accessible. If we're faulted, bail.
	*/
	if (vd->vdev_faulted) {
	ASSERT(vd->vdev_children == 0);
	ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
	vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
	vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
	vd->vdev_label_aux);
	return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
	ASSERT(vd->vdev_children == 0);
	vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
	VDEV_AUX_ERR_EXCEEDED);
	} else {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	* For hole or missing vdevs we just return success.
	*/
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
	return (0);

	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
	trim_map_create(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
	if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
	VDEV_AUX_NONE);
	break;
	}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
	if (osize < SPA_MINDEVSIZE) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_TOO_SMALL);
	return (SET_ERROR(EOVERFLOW));
	}
	psize = osize;
	asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	max_asize = max_osize - (VDEV_LABEL_START_SIZE +
	VDEV_LABEL_END_SIZE);
	} else {
	if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
	(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_TOO_SMALL);
	return (SET_ERROR(EOVERFLOW));
	}
	psize = 0;
	asize = osize;
	max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	* Make sure the allocatable size hasn't shrunk too much.
	*/
	if (asize < vd->vdev_min_asize) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_BAD_LABEL);
	return (SET_ERROR(EINVAL));
	}

	vd->vdev_physical_ashift =
	MAX(physical_ashift, vd->vdev_physical_ashift);
	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);

	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_ASHIFT_TOO_BIG);
	return (EINVAL);
	}

	if (vd->vdev_asize == 0) {
	/*
	* This is the first-ever open, so use the computed values.
	* For testing purposes, a higher ashift can be requested.
	*/
	vd->vdev_asize = asize;
	vd->vdev_max_asize = max_asize;
	} else {
	/*
	* Make sure the alignment requirement hasn't increased.
	*/
	if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
	vd->vdev_ops->vdev_op_leaf) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_BAD_LABEL);
	return (EINVAL);
	}
	vd->vdev_max_asize = max_asize;
	}

	/*
	* If all children are healthy we update asize if either:
	* The asize has increased, due to a device expansion caused by dynamic
	* LUN growth or vdev replacement, and automatic expansion is enabled;
	* making the additional space available.
	*
	* The asize has decreased, due to a device shrink usually caused by a
	* vdev replace with a smaller device. This ensures that calculations
	* based of max_asize and asize e.g. esize are always valid. It's safe
	* to do this as we've already validated that asize is greater than
	* vdev_min_asize.
	*/
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	((asize > vd->vdev_asize &&
	(vd->vdev_expanding || spa->spa_autoexpand)) ||
	(asize < vd->vdev_asize)))
	vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	* Ensure we can issue some IO before declaring the
	* vdev open for business.
	*/
	if (vd->vdev_ops->vdev_op_leaf &&
	(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
	VDEV_AUX_ERR_EXCEEDED);
	return (error);
	}

	+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	+ !vd->vdev_isl2cache && !vd->vdev_islog) {
	+ if (vd->vdev_ashift > spa->spa_max_ashift)
	+ spa->spa_max_ashift = vd->vdev_ashift;
	+ if (vd->vdev_ashift < spa->spa_min_ashift)
	+ spa->spa_min_ashift = vd->vdev_ashift;
	+ }
	+
	/*
	* Track the min and max ashift values for normal data devices.
	*/
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	!vd->vdev_islog && vd->vdev_aux == NULL) {
	if (vd->vdev_ashift > spa->spa_max_ashift)
	spa->spa_max_ashift = vd->vdev_ashift;
	if (vd->vdev_ashift < spa->spa_min_ashift)
	spa->spa_min_ashift = vd->vdev_ashift;
	}

	/*
	* If a leaf vdev has a DTL, and seems healthy, then kick off a
	* resilver. But don't do this if we are doing a reopen for a scrub,
	* since this would just restart the scrub we are already doing.
	*/
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	vdev_resilver_needed(vd, NULL, NULL))
	spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
	}

	/*
	 * Called once the vdevs are all opened, this routine validates the label
	 * contents. This needs to be done before vdev_load() so that we don't
	 * inadvertently do repair I/Os to the wrong device.
	 *
	 * If 'strict' is false ignore the spa guid check. This is necessary because
	 * if the machine crashed during a re-guid the new guid might have been written
	 * to all of the vdev labels, but not the cached config. The strict check
	 * will be performed when the pool is opened again using the mos config.
	 *
	 * This function will only return failure if one of the vdevs indicates that it
	 * has since been destroyed or exported. This is only possible if
	 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
	 * will be updated but the function will return 0.
	 *
	 * Children are validated first; a failure in any child is reported as
	 * EBADF.
	 */
	int
	vdev_validate(vdev_t *vd, boolean_t strict)
	{
		spa_t *spa = vd->vdev_spa;
		nvlist_t *label;
		uint64_t guid = 0, top_guid;
		uint64_t state;

		for (int c = 0; c < vd->vdev_children; c++)
			if (vdev_validate(vd->vdev_child[c], strict) != 0)
				return (SET_ERROR(EBADF));

		/*
		 * If the device has already failed, or was marked offline, don't do
		 * any further validation. Otherwise, label I/O will fail and we will
		 * overwrite the previous state.
		 */
		if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
			uint64_t aux_guid = 0;
			nvlist_t *nvl;
			uint64_t txg = spa_last_synced_txg(spa) != 0 ?
			    spa_last_synced_txg(spa) : -1ULL;

			if ((label = vdev_label_read_config(vd, txg)) == NULL) {
				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_BAD_LABEL);
				return (0);
			}

			/*
			 * Determine if this vdev has been split off into another
			 * pool. If so, then refuse to open it.
			 */
			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
			    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_SPLIT_POOL);
				nvlist_free(label);
				return (0);
			}

			if (strict && (nvlist_lookup_uint64(label,
			    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
			    guid != spa_guid(spa))) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			/* Without an original guid in the label, use 0. */
			if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
			    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
			    &aux_guid) != 0)
				aux_guid = 0;

			/*
			 * If this vdev just became a top-level vdev because its
			 * sibling was detached, it will have adopted the parent's
			 * vdev guid -- but the label may or may not be on disk yet.
			 * Fortunately, either version of the label will have the
			 * same top guid, so if we're a top-level vdev, we can
			 * safely compare to that instead.
			 *
			 * If we split this vdev off instead, then we also check the
			 * original pool's guid. We don't want to consider the vdev
			 * corrupt if it is partway through a split operation.
			 */
			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
			    &guid) != 0 ||
			    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
			    &top_guid) != 0 ||
			    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
			    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
			    &state) != 0) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			nvlist_free(label);

			/*
			 * If this is a verbatim import, no need to check the
			 * state of the pool.
			 */
			if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
			    spa_load_state(spa) == SPA_LOAD_OPEN &&
			    state != POOL_STATE_ACTIVE)
				return (SET_ERROR(EBADF));

			/*
			 * If we were able to open and validate a vdev that was
			 * previously marked permanently unavailable, clear that state
			 * now.
			 */
			if (vd->vdev_not_present)
				vd->vdev_not_present = 0;
		}

		return (0);
	}

	/*
	 * Close a virtual device.
	 *
	 * The caller must hold SCL_STATE_ALL as writer. The vdev's close op is
	 * invoked, its cache is purged and, for leaves, its trim map destroyed.
	 * The vdev ends up OFFLINE if it is marked offline and CLOSED otherwise.
	 */
	void
	vdev_close(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_t *parent = vd->vdev_parent;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		/*
		 * A reopening parent means we are reopening too, except when
		 * this vdev is on its way offline.
		 */
		if (parent != NULL && parent->vdev_reopening) {
			vd->vdev_reopening =
			    (parent->vdev_reopening && !vd->vdev_offline);
		}

		vd->vdev_ops->vdev_op_close(vd);

		vdev_cache_purge(vd);

		if (vd->vdev_ops->vdev_op_leaf)
			trim_map_destroy(vd);

		/*
		 * Remember the state we had before closing so that a reopen()
		 * does not generate FMA ereports for a vdev that is merely
		 * still faulted.
		 */
		vd->vdev_prevstate = vd->vdev_state;

		vd->vdev_state = vd->vdev_offline ?
		    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	}

	/*
	 * Invoke the hold op on every leaf vdev at or below vd. Nothing is done
	 * while the pool is still in POOL_STATE_UNINITIALIZED. The pool is
	 * asserted to be the root pool.
	 */
	void
	vdev_hold(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_is_root(spa));

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			for (int i = 0; i < vd->vdev_children; i++)
				vdev_hold(vd->vdev_child[i]);

			if (vd->vdev_ops->vdev_op_leaf)
				vd->vdev_ops->vdev_op_hold(vd);
		}
	}

	/*
	 * Invoke the rele op on every leaf vdev at or below vd. The pool is
	 * asserted to be the root pool.
	 */
	void
	vdev_rele(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_is_root(spa));

		for (int i = 0; i < vd->vdev_children; i++) {
			vdev_rele(vd->vdev_child[i]);
		}

		if (!vd->vdev_ops->vdev_op_leaf)
			return;

		vd->vdev_ops->vdev_op_rele(vd);
	}

	/*
	 * Reopen all interior vdevs and any unopened leaves. Leaf vdevs that
	 * were already open are not truly reopened, since doing so could
	 * deadlock on the spa_config_lock; only their physical size is
	 * obtained. A leaf that has never been opened is opened as usual.
	 *
	 * The caller must hold SCL_STATE_ALL as writer.
	 */
	void
	vdev_reopen(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		/* reopening is flagged unless the vdev is being taken offline */
		vd->vdev_reopening = !vd->vdev_offline;
		vdev_close(vd);
		(void) vdev_open(vd);

		/*
		 * Validate after the open so that we know this is still the same
		 * device; without this, vdev_reopen() could successfully open a
		 * device whose label is invalid.
		 */
		if (!vd->vdev_aux) {
			(void) vdev_validate(vd, B_TRUE);
		} else {
			(void) vdev_validate_aux(vd);
			if (vdev_readable(vd) && vdev_writeable(vd) &&
			    vd->vdev_aux == &spa->spa_l2cache &&
			    !l2arc_vdev_present(vd)) {
				l2arc_add_vdev(spa, vd);
			}
		}

		/* Let the parent recompute its health from its children. */
		vdev_propagate_state(vd);
	}

	/*
	 * Open a newly created vdev tree, load its DTLs and initialize its labels.
	 *
	 * txg is passed to vdev_label_init(); isreplacing selects
	 * VDEV_LABEL_REPLACE instead of VDEV_LABEL_CREATE. Returns 0 on
	 * success. If the open fails, or succeeds without the vdev being
	 * HEALTHY, the vdev is closed and the open error (or ENXIO) is
	 * returned; a DTL load or label init failure also closes the vdev and
	 * returns that error.
	 */
	int
	vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
	{
		int error;

		/*
		 * Normally, partial opens (e.g. of a mirror) are allowed.
		 * For a create, however, we want to fail the request if
		 * there are any components we can't open.
		 */
		error = vdev_open(vd);

		if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_close(vd);
			return (error ? error : ENXIO);
		}

		/*
		 * Recursively load DTLs and initialize all labels.
		 */
		if ((error = vdev_dtl_load(vd)) != 0 ||
		    (error = vdev_label_init(vd, txg, isreplacing ?
		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
			vdev_close(vd);
			return (error);
		}

		return (0);
	}

	/*
	 * Choose vdev_ms_shift for vd from its allocatable size.
	 */
	void
	vdev_metaslab_set_size(vdev_t *vd)
	{
		/*
		 * Target about metaslabs_per_vdev (default 200) metaslabs on
		 * the vdev, but never use a shift below SPA_MAXBLOCKSHIFT.
		 */
		uint64_t per_ms = vd->vdev_asize / metaslabs_per_vdev;

		vd->vdev_ms_shift = highbit64(per_ms);
		vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
	}

	/*
	 * For a top-level vdev, raise the configured ashift toward the physical
	 * ashift to maximize performance, while honoring the administrator's
	 * zfs_min_auto_ashift/zfs_max_auto_ashift limits and never going below
	 * the logical ashift. Vdevs that are not top-level are left unchanged.
	 */
	void
	vdev_ashift_optimize(vdev_t *vd)
	{
		if (vd != vd->vdev_top)
			return;

		if (vd->vdev_ashift >= vd->vdev_physical_ashift) {
			/*
			 * Unusual case: the logical ashift exceeds the physical
			 * ashift. Capping at the max ashift here would cause
			 * failures, so only the min ashift is enforced.
			 */
			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
			    vd->vdev_ashift);
			return;
		}

		vd->vdev_ashift = MIN(
		    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
		    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
	}

	void
	vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
	{
	ASSERT(vd == vd->vdev_top);
	- ASSERT(!vd->vdev_ishole);
	+ /* indirect vdevs don't have metaslabs or dtls */
	+ ASSERT(vdev_is_concrete(vd) || flags == 0);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
	(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
	(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
	}

	void
	vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
	{
	for (int c = 0; c < vd->vdev_children; c++)
	vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
	vdev_dirty(vd->vdev_top, flags, vd, txg);
	}

	/*
	* DTLs.
	*
	* A vdev's DTL (dirty time log) is the set of transaction groups for which
	* the vdev has less than perfect replication. There are four kinds of DTL:
	*
	* DTL_MISSING: txgs for which the vdev has no valid copies of the data
	*
	* DTL_PARTIAL: txgs for which data is available, but not fully replicated
	*
	* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
	* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
	* txgs that was scrubbed.
	*
	* DTL_OUTAGE: txgs which cannot currently be read, whether due to
	* persistent errors or just some device being offline.
	* Unlike the other three, the DTL_OUTAGE map is not generally
	* maintained; it's only computed when needed, typically to
	* determine whether a device can be detached.
	*
	* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
	* either has the data or it doesn't.
	*
	* For interior vdevs such as mirror and RAID-Z the picture is more complex.
	* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
	* if any child is less than fully replicated, then so is its parent.
	* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
	* comprising only those txgs which appear in 'maxfaults' or more children;
	* those are the txgs we don't have enough replication to read. For example,
	* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
	* thus, its DTL_MISSING consists of the set of txgs that appear in more than
	* two child DTL_MISSING maps.
	*
	* It should be clear from the above that to compute the DTLs and outage maps
	* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
	* Therefore, that is all we keep on disk. When loading the pool, or after
	* a configuration change, we generate all other DTLs from first principles.
	*/
	void
	vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
	{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	- mutex_enter(rt->rt_lock);
	+ mutex_enter(&vd->vdev_dtl_lock);
	if (!range_tree_contains(rt, txg, size))
	range_tree_add(rt, txg, size);
	- mutex_exit(rt->rt_lock);
	+ mutex_exit(&vd->vdev_dtl_lock);
	}

	boolean_t
	vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
	{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	- mutex_enter(rt->rt_lock);
	+ /*
	+ * While we are loading the pool, the DTLs have not been loaded yet.
	+ * Ignore the DTLs and try all devices. This avoids a recursive
	+ * mutex enter on the vdev_dtl_lock, and also makes us try hard
	+ * when loading the pool (relying on the checksum to ensure that
	+ * we get the right data -- note that we while loading, we are
	+ * only reading the MOS, which is always checksummed).
	+ */
	+ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
	+ return (B_FALSE);
	+
	+ mutex_enter(&vd->vdev_dtl_lock);
	if (range_tree_space(rt) != 0)
	dirty = range_tree_contains(rt, txg, size);
	- mutex_exit(rt->rt_lock);
	+ mutex_exit(&vd->vdev_dtl_lock);

	return (dirty);
	}

	boolean_t
	vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
	{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	- mutex_enter(rt->rt_lock);
	+ mutex_enter(&vd->vdev_dtl_lock);
	empty = (range_tree_space(rt) == 0);
	- mutex_exit(rt->rt_lock);
	+ mutex_exit(&vd->vdev_dtl_lock);

	return (empty);
	}

	/*
	* Returns the lowest txg in the DTL range.
	*/
	static uint64_t
	vdev_dtl_min(vdev_t *vd)
	{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_start - 1);
	}

	/*
	* Returns the highest txg in the DTL.
	*/
	static uint64_t
	vdev_dtl_max(vdev_t *vd)
	{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
	}

	/*
	* Determine if a resilvering vdev should remove any DTL entries from
	* its range. If the vdev was resilvering for the entire duration of the
	* scan then it should excise that range from its DTLs. Otherwise, this
	* vdev is considered partially resilvered and should leave its DTL
	* entries intact. The comment in vdev_dtl_reassess() describes how we
	* excise the DTLs.
	*/
	static boolean_t
	vdev_dtl_should_excise(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	if (vd->vdev_state < VDEV_STATE_DEGRADED)
	return (B_FALSE);

	if (vd->vdev_resilver_txg == 0 ||
	range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
	return (B_TRUE);

	/*
	* When a resilver is initiated the scan will assign the scn_max_txg
	* value to the highest txg value that exists in all DTLs. If this
	* device's max DTL is not part of this scan (i.e. it is not in
	* the range (scn_min_txg, scn_max_txg] then it is not eligible
	* for excision.
	*/
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
	ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
	ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
	ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
	return (B_TRUE);
	}
	return (B_FALSE);
	}

	/*
	* Reassess DTLs after a config change or scrub completion.
	*/
	void
	vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
	{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
	vdev_dtl_reassess(vd->vdev_child[c], txg,
	scrub_txg, scrub_done);

	- if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
	+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
	return;

	if (vd->vdev_ops->vdev_op_leaf) {
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	mutex_enter(&vd->vdev_dtl_lock);

	/*
	* If we've completed a scan cleanly then determine
	* if this vdev should remove any DTLs. We only want to
	* excise regions on vdevs that were available during
	* the entire duration of this scan.
	*/
	if (scrub_txg != 0 &&
	(spa->spa_scrub_started ||
	(scn != NULL && scn->scn_phys.scn_errors == 0)) &&
	vdev_dtl_should_excise(vd)) {
	/*
	* We completed a scrub up to scrub_txg. If we
	* did it without rebooting, then the scrub dtl
	* will be valid, so excise the old region and
	* fold in the scrub dtl. Otherwise, leave the
	* dtl as-is if there was an error.
	*
	* There's little trick here: to excise the beginning
	* of the DTL_MISSING map, we put it into a reference
	* tree and then add a segment with refcnt -1 that
	* covers the range [0, scrub_txg). This means
	* that each txg in that range has refcnt -1 or 0.
	* We then add DTL_SCRUB with a refcnt of 2, so that
	* entries in the range [0, scrub_txg) will have a
	* positive refcnt -- either 1 or 2. We then convert
	* the reference tree into the new DTL_MISSING map.
	*/
	space_reftree_create(&reftree);
	space_reftree_add_map(&reftree,
	vd->vdev_dtl[DTL_MISSING], 1);
	space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
	space_reftree_add_map(&reftree,
	vd->vdev_dtl[DTL_SCRUB], 2);
	space_reftree_generate_map(&reftree,
	vd->vdev_dtl[DTL_MISSING], 1);
	space_reftree_destroy(&reftree);
	}
	range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
	range_tree_walk(vd->vdev_dtl[DTL_MISSING],
	range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
	if (scrub_done)
	range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
	range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
	if (!vdev_readable(vd))
	range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
	else
	range_tree_walk(vd->vdev_dtl[DTL_MISSING],
	range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

	/*
	* If the vdev was resilvering and no longer has any
	* DTLs then reset its resilvering flag and dirty
	* the top level so that we persist the change.
	*/
	if (vd->vdev_resilver_txg != 0 &&
	range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
	range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
	vd->vdev_resilver_txg = 0;
	vdev_config_dirty(vd->vdev_top);
	}

	mutex_exit(&vd->vdev_dtl_lock);

	if (txg != 0)
	vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
	return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
	/* account for child's outage in parent's missing map */
	int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
	if (t == DTL_SCRUB)
	continue; /* leaf vdevs only */
	if (t == DTL_PARTIAL)
	minref = 1; /* i.e. non-zero */
	else if (vd->vdev_nparity != 0)
	minref = vd->vdev_nparity + 1; /* RAID-Z */
	else
	minref = vd->vdev_children; /* any kind of mirror */
	space_reftree_create(&reftree);
	for (int c = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];
	mutex_enter(&cvd->vdev_dtl_lock);
	space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
	mutex_exit(&cvd->vdev_dtl_lock);
	}
	space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
	space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
	}

	int
	vdev_dtl_load(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
	- ASSERT(!vd->vdev_ishole);
	+ ASSERT(vdev_is_concrete(vd));

	error = space_map_open(&vd->vdev_dtl_sm, mos,
	- vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	+ vd->vdev_dtl_object, 0, -1ULL, 0);
	if (error)
	return (error);
	ASSERT(vd->vdev_dtl_sm != NULL);

	mutex_enter(&vd->vdev_dtl_lock);

	/*
	* Now that we've opened the space_map we need to update
	* the in-core DTL.
	*/
	space_map_update(vd->vdev_dtl_sm);

	error = space_map_load(vd->vdev_dtl_sm,
	vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
	error = vdev_dtl_load(vd->vdev_child[c]);
	if (error != 0)
	break;
	}

	return (error);
	}

	void
	vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
	{
	spa_t *spa = vd->vdev_spa;

	VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
	VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	zapobj, tx));
	}

	uint64_t
	vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
	{
	spa_t *spa = vd->vdev_spa;
	uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
	DMU_OT_NONE, 0, tx);

	ASSERT(zap != 0);
	VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	zap, tx));

	return (zap);
	}

	void
	vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
	{
	if (vd->vdev_ops != &vdev_hole_ops &&
	vd->vdev_ops != &vdev_missing_ops &&
	vd->vdev_ops != &vdev_root_ops &&
	!vd->vdev_top->vdev_removing) {
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
	vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
	}
	if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
	vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
	}
	}
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
	vdev_construct_zaps(vd->vdev_child[i], tx);
	}
	}

	void
	vdev_dtl_sync(vdev_t *vd, uint64_t txg)
	{
	spa_t *spa = vd->vdev_spa;
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	range_tree_t *rtsync;
	- kmutex_t rtlock;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	- ASSERT(!vd->vdev_ishole);
	+ ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_free(vd->vdev_dtl_sm, tx);
	space_map_close(vd->vdev_dtl_sm);
	vd->vdev_dtl_sm = NULL;
	mutex_exit(&vd->vdev_dtl_lock);

	/*
	* We only destroy the leaf ZAP for detached leaves or for
	* removed log devices. Removed data devices handle leaf ZAP
	* cleanup later, once cancellation is no longer possible.
	*/
	if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
	vd->vdev_top->vdev_islog)) {
	vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
	vd->vdev_leaf_zap = 0;
	}

	dmu_tx_commit(tx);
	return;
	}

	if (vd->vdev_dtl_sm == NULL) {
	uint64_t new_object;

	new_object = space_map_alloc(mos, tx);
	VERIFY3U(new_object, !=, 0);

	VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
	- 0, -1ULL, 0, &vd->vdev_dtl_lock));
	+ 0, -1ULL, 0));
	ASSERT(vd->vdev_dtl_sm != NULL);
	}

	- bzero(&rtlock, sizeof(rtlock));
	- mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
	+ rtsync = range_tree_create(NULL, NULL);

	- rtsync = range_tree_create(NULL, NULL, &rtlock);
	-
	- mutex_enter(&rtlock);
	-
	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(vd->vdev_dtl_sm, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	- mutex_exit(&rtlock);
	- mutex_destroy(&rtlock);
	-
	/*
	* If the object for the space map has changed then dirty
	* the top level so that we update the config.
	*/
	if (object != space_map_object(vd->vdev_dtl_sm)) {
	zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
	"new object %llu", txg, spa_name(spa), object,
	space_map_object(vd->vdev_dtl_sm));
	vdev_config_dirty(vd->vdev_top);
	}

	dmu_tx_commit(tx);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_update(vd->vdev_dtl_sm);
	mutex_exit(&vd->vdev_dtl_lock);
	}

	/*
	* Determine whether the specified vdev can be offlined/detached/removed
	* without losing data.
	*/
	boolean_t
	vdev_dtl_required(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
	return (B_TRUE);

	/*
	* Temporarily mark the device as unreadable, and then determine
	* whether this results in any DTL outages in the top-level vdev.
	* If not, we can safely offline/detach/remove the device.
	*/
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
	required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
	}

	/*
	* Determine if resilver is needed, and if so the txg range.
	*/
	boolean_t
	vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
	{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
	mutex_enter(&vd->vdev_dtl_lock);
	if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
	vdev_writeable(vd)) {

	thismin = vdev_dtl_min(vd);
	thismax = vdev_dtl_max(vd);
	needed = B_TRUE;
	}
	mutex_exit(&vd->vdev_dtl_lock);
	} else {
	for (int c = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];
	uint64_t cmin, cmax;

	if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
	thismin = MIN(thismin, cmin);
	thismax = MAX(thismax, cmax);
	needed = B_TRUE;
	}
	}
	}

	if (needed && minp) {
	*minp = thismin;
	*maxp = thismax;
	}
	return (needed);
	}

	-void
	+int
	vdev_load(vdev_t *vd)
	{
	+ int error = 0;
	/*
	* Recursively load all children.
	*/
	- for (int c = 0; c < vd->vdev_children; c++)
	- vdev_load(vd->vdev_child[c]);
	+ for (int c = 0; c < vd->vdev_children; c++) {
	+ error = vdev_load(vd->vdev_child[c]);
	+ if (error != 0) {
	+ return (error);
	+ }
	+ }

	+ vdev_set_deflate_ratio(vd);
	+
	/*
	* If this is a top-level vdev, initialize its metaslabs.
	*/
	- if (vd == vd->vdev_top && !vd->vdev_ishole &&
	- (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	- vdev_metaslab_init(vd, 0) != 0))
	- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	- VDEV_AUX_CORRUPT_DATA);
	+ if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
	+ if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	+ VDEV_AUX_CORRUPT_DATA);
	+ return (SET_ERROR(ENXIO));
	+ } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	+ VDEV_AUX_CORRUPT_DATA);
	+ return (error);
	+ }
	+ }

	/*
	* If this is a leaf vdev, load its DTL.
	*/
	- if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
	+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
	vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_CORRUPT_DATA);
	+ return (error);
	+ }
	+
	+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
	+ if (obsolete_sm_object != 0) {
	+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
	+ ASSERT(vd->vdev_asize != 0);
	+ ASSERT(vd->vdev_obsolete_sm == NULL);
	+
	+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
	+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	+ VDEV_AUX_CORRUPT_DATA);
	+ return (error);
	+ }
	+ space_map_update(vd->vdev_obsolete_sm);
	+ }
	+
	+ return (0);
	}

	/*
	* The special vdev case is used for hot spares and l2cache devices. Its
	* sole purpose it to set the vdev state for the associated vdev. To do this,
	* we make sure that we can open the underlying device, then try to read the
	* label, and make sure that the label is sane and that it hasn't been
	* repurposed to another pool.
	*/
	int
	vdev_validate_aux(vdev_t *vd)
	{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
	return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_CORRUPT_DATA);
	return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	!SPA_VERSION_IS_SUPPORTED(version) ||
	nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	guid != vd->vdev_guid ||
	nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_CORRUPT_DATA);
	nvlist_free(label);
	return (-1);
	}

	/*
	* We don't actually check the pool state here. If it's in fact in
	* use by another pool, we update this fact on the fly when requested.
	*/
	nvlist_free(label);
	return (0);
	}

	+/*
	+ * Free the objects used to store this vdev's spacemaps, and the array
	+ * that points to them.
	+ */
	void
	-vdev_remove(vdev_t *vd, uint64_t txg)
	+vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
	{
	+ if (vd->vdev_ms_array == 0)
	+ return;
	+
	+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
	+ uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
	+ size_t array_bytes = array_count * sizeof (uint64_t);
	+ uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
	+ VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
	+ array_bytes, smobj_array, 0));
	+
	+ for (uint64_t i = 0; i < array_count; i++) {
	+ uint64_t smobj = smobj_array[i];
	+ if (smobj == 0)
	+ continue;
	+
	+ space_map_free_obj(mos, smobj, tx);
	+ }
	+
	+ kmem_free(smobj_array, array_bytes);
	+ VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
	+ vd->vdev_ms_array = 0;
	+}
	+
	+static void
	+vdev_remove_empty(vdev_t *vd, uint64_t txg)
	+{
	spa_t *spa = vd->vdev_spa;
	- objset_t *mos = spa->spa_meta_objset;
	dmu_tx_t *tx;

	- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
	ASSERT(vd == vd->vdev_top);
	ASSERT3U(txg, ==, spa_syncing_txg(spa));

	if (vd->vdev_ms != NULL) {
	metaslab_group_t *mg = vd->vdev_mg;

	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
	metaslab_t *msp = vd->vdev_ms[m];

	if (msp == NULL || msp->ms_sm == NULL)
	continue;

	mutex_enter(&msp->ms_lock);
	/*
	* If the metaslab was not loaded when the vdev
	* was removed then the histogram accounting may
	* not be accurate. Update the histogram information
	* here so that we ensure that the metaslab group
	* and metaslab class are up-to-date.
	*/
	metaslab_group_histogram_remove(mg, msp);

	VERIFY0(space_map_allocated(msp->ms_sm));
	- space_map_free(msp->ms_sm, tx);
	space_map_close(msp->ms_sm);
	msp->ms_sm = NULL;
	mutex_exit(&msp->ms_lock);
	}

	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
	ASSERT0(mg->mg_histogram[i]);
	-
	}

	- if (vd->vdev_ms_array) {
	- (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
	- vd->vdev_ms_array = 0;
	- }
	+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
	+ vdev_destroy_spacemaps(vd, tx);

	if (vd->vdev_islog && vd->vdev_top_zap != 0) {
	vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
	vd->vdev_top_zap = 0;
	}
	dmu_tx_commit(tx);
	}

	void
	vdev_sync_done(vdev_t *vd, uint64_t txg)
	{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	- ASSERT(!vd->vdev_ishole);
	+ ASSERT(vdev_is_concrete(vd));

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
	metaslab_sync_done(msp, txg);

	if (reassess)
	metaslab_sync_reassess(vd->vdev_mg);
	}

	void
	vdev_sync(vdev_t *vd, uint64_t txg)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	- ASSERT(!vd->vdev_ishole);
	+ if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
	+ dmu_tx_t *tx;

	- if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
	+ ASSERT(vd->vdev_removing ||
	+ vd->vdev_ops == &vdev_indirect_ops);
	+
	+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	+ vdev_indirect_sync_obsolete(vd, tx);
	+ dmu_tx_commit(tx);
	+
	+ /*
	+ * If the vdev is indirect, it can't have dirty
	+ * metaslabs or DTLs.
	+ */
	+ if (vd->vdev_ops == &vdev_indirect_ops) {
	+ ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
	+ ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
	+ return;
	+ }
	+ }
	+
	+ ASSERT(vdev_is_concrete(vd));
	+
	+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
	+ !vd->vdev_removing) {
	ASSERT(vd == vd->vdev_top);
	+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
	DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
	ASSERT(vd->vdev_ms_array != 0);
	vdev_config_dirty(vd);
	dmu_tx_commit(tx);
	}

	- /*
	- * Remove the metadata associated with this vdev once it's empty.
	- */
	- if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
	- vdev_remove(vd, txg);
	-
	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
	metaslab_sync(msp, txg);
	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
	vdev_dtl_sync(lvd, txg);

	+ /*
	+ * Remove the metadata associated with this vdev once it's empty.
	+ * Note that this is typically used for log/cache device removal;
	+ * we don't empty toplevel vdevs when removing them. But if
	+ * a toplevel happens to be emptied, this is not harmful.
	+ */
	+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
	+ vdev_remove_empty(vd, txg);
	+ }
	+
	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
	}

	uint64_t
	vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
	{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
	}

	/*
	* Mark the given vdev faulted. A faulted vdev behaves as if the device could
	* not be opened, and no I/O is attempted.
	*/
	int
	vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
	{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
	return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;

	/*
	* We don't directly use the aux state here, but if we do a
	* vdev_reopen(), we need this value to be present to remember why we
	* were faulted.
	*/
	vd->vdev_label_aux = aux;

	/*
	* Faulted state takes precedence over degraded.
	*/
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	* If this device has the only valid copy of the data, then
	* back off and simply mark the vdev as degraded instead.
	*/
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
	vd->vdev_degraded = 1ULL;
	vd->vdev_faulted = 0ULL;

	/*
	* If we reopen the device and it's not dead, only then do we
	* mark it degraded.
	*/
	vdev_reopen(tvd);

	if (vdev_readable(vd))
	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	* Mark the given vdev degraded. A degraded vdev is purely an indication to the
	* user that something is wrong. The vdev continues to operate as normal as far
	* as I/O is concerned.
	*/
	int
	vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
	{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
	return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	* If the vdev is already faulted, then don't do anything.
	*/
	if (vd->vdev_faulted || vd->vdev_degraded)
	return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
	aux);

	return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	* Online the given vdev.
	*
	* If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
	* spare device should be detached when the device finishes resilvering.
	* Second, the online should be treated like a 'test' online case, so no FMA
	* events are generated if the device fails to open.
	*/
	int
	vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
	{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
	boolean_t wasoffline;
	vdev_state_t oldstate;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
	return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
	oldstate = vd->vdev_state;

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
	for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
	pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
	for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
	pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
	*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	!vdev_is_dead(vd) && vd->vdev_parent &&
	vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	vd->vdev_parent->vdev_child[0] == vd)
	vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

	/* XXX - L2ARC 1.0 does not support expansion */
	if (vd->vdev_aux)
	return (spa_vdev_state_exit(spa, vd, ENOTSUP));
	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	if (wasoffline ||
	(oldstate < VDEV_STATE_DEGRADED &&
	vd->vdev_state >= VDEV_STATE_DEGRADED))
	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);

	return (spa_vdev_state_exit(spa, vd, 0));
	}

	static int
	vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
	{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

	top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
	return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	* If the device isn't already offline, try to offline it.
	*/
	if (!vd->vdev_offline) {
	/*
	* If this device has the only valid copy of some data,
	* don't allow it to be offlined. Log devices are always
	* expendable.
	*/
	if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
	vdev_dtl_required(vd))
	return (spa_vdev_state_exit(spa, NULL, EBUSY));

	/*
	* If the top-level is a slog and it has had allocations
	* then proceed. We check that the vdev's metaslab group
	* is not NULL since it's possible that we may have just
	* added this vdev but not yet initialized its metaslabs.
	*/
	if (tvd->vdev_islog && mg != NULL) {
	/*
	* Prevent any future allocations.
	*/
	metaslab_group_passivate(mg);
	(void) spa_vdev_state_exit(spa, vd, 0);

	- error = spa_offline_log(spa);
	+ error = spa_reset_logs(spa);

	spa_vdev_state_enter(spa, SCL_ALLOC);

	/*
	* Check to see if the config has changed.
	*/
	if (error || generation != spa->spa_config_generation) {
	metaslab_group_activate(mg);
	if (error)
	return (spa_vdev_state_exit(spa,
	vd, error));
	(void) spa_vdev_state_exit(spa, vd, 0);
	goto top;
	}
	ASSERT0(tvd->vdev_stat.vs_alloc);
	}

	/*
	* Offline this device and reopen its top-level vdev.
	* If the top-level vdev is a log device then just offline
	* it. Otherwise, if this action results in the top-level
	* vdev becoming unusable, undo it and fail the request.
	*/
	vd->vdev_offline = B_TRUE;
	vdev_reopen(tvd);

	if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
	vdev_is_dead(tvd)) {
	vd->vdev_offline = B_FALSE;
	vdev_reopen(tvd);
	return (spa_vdev_state_exit(spa, NULL, EBUSY));
	}

	/*
	* Add the device back into the metaslab rotor so that
	* once we online the device it's open for business.
	*/
	if (tvd->vdev_islog && mg != NULL)
	metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
	}

	int
	vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
	{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
	}

	/*
	* Clear the error counts associated with this vdev. Unlike vdev_online() and
	* vdev_offline(), we assume the spa config is locked. We also clear all
	* children. If 'vd' is NULL, then the user wants to clear all vdevs.
	*/
	void
	vdev_clear(spa_t *spa, vdev_t *vd)
	{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
	vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (int c = 0; c < vd->vdev_children; c++)
	vdev_clear(spa, vd->vdev_child[c]);

	if (vd == rvd) {
	for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
	vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);

	for (int c = 0; c < spa->spa_spares.sav_count; c++)
	vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
	}

	/*
	+ * It makes no sense to "clear" an indirect vdev.
	+ */
	+ if (!vdev_is_concrete(vd))
	+ return;
	+
	+ /*
	* If we're in the FAULTED state or have experienced failed I/O, then
	* clear the persistent state and attempt to reopen the device. We
	* also mark the vdev config dirty, so that the new faulted state is
	* written out to disk.
	*/
	if (vd->vdev_faulted || vd->vdev_degraded ||
	!vdev_readable(vd) || !vdev_writeable(vd)) {

	/*
	* When reopening in response to a clear event, it may be due to
	* a fmadm repair request. In this case, if the device is
	* still broken, we want to still post the ereport again.
	*/
	vd->vdev_forcefault = B_TRUE;

	vd->vdev_faulted = vd->vdev_degraded = 0ULL;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;

	vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

	vd->vdev_forcefault = B_FALSE;

	if (vd != rvd && vdev_writeable(vd->vdev_top))
	vdev_state_dirty(vd->vdev_top);

	if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
	spa_async_request(spa, SPA_ASYNC_RESILVER);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	* When clearing a FMA-diagnosed fault, we always want to
	* unspare the device, as we assume that the original spare was
	* done in response to the FMA fault.
	*/
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	vd->vdev_parent->vdev_child[0] == vd)
	vd->vdev_unspare = B_TRUE;
	}

	boolean_t
	vdev_is_dead(vdev_t *vd)
	{
	/*
	* Holes and missing devices are always considered "dead".
	* This simplifies the code since we don't have to check for
	* these types of devices in the various code paths.
	* Instead we rely on the fact that we skip over dead devices
	* before issuing I/O to them.
	*/
	- return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
	+ vd->vdev_ops == &vdev_hole_ops ||
	vd->vdev_ops == &vdev_missing_ops);
	}

	boolean_t
	vdev_readable(vdev_t *vd)
	{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
	}

	boolean_t
	vdev_writeable(vdev_t *vd)
	{
	- return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
	+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
	+ vdev_is_concrete(vd));
	}

	boolean_t
	vdev_allocatable(vdev_t *vd)
	{
	uint64_t state = vd->vdev_state;

	/*
	* We currently allow allocations from vdevs which may be in the
	* process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	* fails to reopen then we'll catch it later when we're holding
	* the proper locks. Note that we have to get the vdev state
	* in a local variable because although it changes atomically,
	* we're asking two separate questions about it.
	*/
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	- !vd->vdev_cant_write && !vd->vdev_ishole &&
	+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
	vd->vdev_mg->mg_initialized);
	}

	/*
	 * Report whether the I/O 'zio' can currently be serviced by 'vd'.
	 *
	 * Returns B_FALSE when the vdev is dead or has a removal pending.
	 * Otherwise reads are refused while vdev_cant_read is set, writes are
	 * refused while vdev_cant_write is set, and every other I/O type is
	 * accepted.  The caller must pass the vdev the zio is targeted at.
	 */
	boolean_t
	vdev_accessible(vdev_t *vd, zio_t *zio)
	{
		ASSERT(zio->io_vd == vd);

		if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
			return (B_FALSE);

		if (zio->io_type == ZIO_TYPE_READ)
			return (!vd->vdev_cant_read);

		if (zio->io_type == ZIO_TYPE_WRITE)
			return (!vd->vdev_cant_write);

		/* Neither a read nor a write: no per-direction flag applies. */
		return (B_TRUE);
	}

	/*
	* Get statistics for the given vdev.
	*/
	void
	vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd = vd->vdev_top;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
	vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	/*
	* Report expandable space on top-level, non-auxiliary devices only.
	* The expandable space is reported in terms of metaslab sized units
	* since that determines how much space the pool can expand.
	*/
	if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
	vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
	spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
	}
	vs->vs_configured_ashift = vd->vdev_top != NULL
	? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
	vs->vs_logical_ashift = vd->vdev_logical_ashift;
	vs->vs_physical_ashift = vd->vdev_physical_ashift;
	- if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
	+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
	+ vdev_is_concrete(vd)) {
	vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
	}

	/*
	* If we're getting stats on the root vdev, aggregate the I/O counts
	* over all top-level vdevs (i.e. the direct children of the root).
	*/
	if (vd == rvd) {
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_t *cvd = rvd->vdev_child[c];
	vdev_stat_t *cvs = &cvd->vdev_stat;

	for (int t = 0; t < ZIO_TYPES; t++) {
	vs->vs_ops[t] += cvs->vs_ops[t];
	vs->vs_bytes[t] += cvs->vs_bytes[t];
	}
	cvs->vs_scan_removing = cvd->vdev_removing;
	}
	}
	mutex_exit(&vd->vdev_stat_lock);
	}

	void
	vdev_clear_stats(vdev_t *vd)
	{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
	}

	void
	vdev_scan_stat_init(vdev_t *vd)
	{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
	vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
	}

	void
	vdev_stat_update(zio_t *zio, uint64_t psize)
	{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	* If this i/o is a gang leader, it didn't do any actual work.
	*/
	if (zio->io_gang_tree)
	return;

	if (zio->io_error == 0) {
	/*
	* If this is a root i/o, don't count it -- we've already
	* counted the top-level vdevs, and vdev_get_stats() will
	* aggregate them when asked. This reduces contention on
	* the root vdev_stat_lock and implicitly handles blocks
	* that compress away to holes, for which there is no i/o.
	* (Holes never create vdev children, so all the counters
	* remain zero, which is what we want.)
	*
	* Note: this only applies to successful i/o (io_error == 0)
	* because unlike i/o counts, errors are not additive.
	* When reading a ditto block, for example, failure of
	* one top-level vdev does not imply a root-level error.
	*/
	if (vd == rvd)
	return;

	ASSERT(vd == zio->io_vd);

	if (flags & ZIO_FLAG_IO_BYPASS)
	return;

	mutex_enter(&vd->vdev_stat_lock);

	if (flags & ZIO_FLAG_IO_REPAIR) {
	if (flags & ZIO_FLAG_SCAN_THREAD) {
	dsl_scan_phys_t *scn_phys =
	&spa->spa_dsl_pool->dp_scan->scn_phys;
	uint64_t *processed = &scn_phys->scn_processed;

	/* XXX cleanup? */
	if (vd->vdev_ops->vdev_op_leaf)
	atomic_add_64(processed, psize);
	vs->vs_scan_processed += psize;
	}

	if (flags & ZIO_FLAG_SELF_HEAL)
	vs->vs_self_healed += psize;
	}

	vs->vs_ops[type]++;
	vs->vs_bytes[type] += psize;

	mutex_exit(&vd->vdev_stat_lock);
	return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
	return;

	/*
	* If this is an I/O error that is going to be retried, then ignore the
	* error. Otherwise, the user may interpret B_FAILFAST I/O errors as
	* hard errors, when in reality they can happen for any number of
	* innocuous reasons (bus resets, MPxIO link failure, etc).
	*/
	if (zio->io_error == EIO &&
	!(zio->io_flags & ZIO_FLAG_IO_RETRY))
	return;

	/*
	* Intent log writes won't propagate their error to the root
	* I/O so don't mark these types of failures as pool-level
	* errors.
	*/
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
	return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
	if (zio->io_error == ECKSUM)
	vs->vs_checksum_errors++;
	else
	vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
	vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	- if (type == ZIO_TYPE_WRITE && txg != 0 &&
	+ if (spa->spa_load_state == SPA_LOAD_NONE &&
	+ type == ZIO_TYPE_WRITE && txg != 0 &&
	(!(flags & ZIO_FLAG_IO_REPAIR) ||
	(flags & ZIO_FLAG_SCAN_THREAD) ||
	spa->spa_claiming)) {
	/*
	* This is either a normal write (not a repair), or it's
	* a repair induced by the scrub thread, or it's a repair
	* made by zil_claim() during spa_load() in the first txg.
	* In the normal case, we commit the DTL change in the same
	* txg as the block was born. In the scrub-induced repair
	* case, we know that scrubs run in first-pass syncing context,
	* so we commit the DTL change in spa_syncing_txg(spa).
	* In the zil_claim() case, we commit in spa_first_txg(spa).
	*
	* We currently do not make DTL entries for failed spontaneous
	* self-healing writes triggered by normal (non-scrubbing)
	* reads, because we have no transactional context in which to
	* do so -- and it's not clear that it'd be desirable anyway.
	*/
	if (vd->vdev_ops->vdev_op_leaf) {
	uint64_t commit_txg = txg;
	if (flags & ZIO_FLAG_SCAN_THREAD) {
	ASSERT(flags & ZIO_FLAG_IO_REPAIR);
	ASSERT(spa_sync_pass(spa) == 1);
	vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
	commit_txg = spa_syncing_txg(spa);
	} else if (spa->spa_claiming) {
	ASSERT(flags & ZIO_FLAG_IO_REPAIR);
	commit_txg = spa_first_txg(spa);
	}
	ASSERT(commit_txg >= spa_syncing_txg(spa));
	if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
	return;
	for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
	vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
	vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
	}
	if (vd != rvd)
	vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
	}

	/*
	* Update the in-core space usage stats for this vdev, its metaslab class,
	* and the root vdev.
	*/
	void
	vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
	int64_t space_delta)
	{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	* Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	* factor. We must calculate this here and not at the root vdev
	* because the root vdev's psize-to-asize is simply the max of its
	* childrens', thus not accurate enough for us.
	*/
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	if (mc == spa_normal_class(spa)) {
	mutex_enter(&rvd->vdev_stat_lock);
	rvd->vdev_stat.vs_alloc += alloc_delta;
	rvd->vdev_stat.vs_space += space_delta;
	rvd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
	ASSERT(rvd == vd->vdev_parent);
	ASSERT(vd->vdev_ms_count != 0);

	metaslab_class_space_update(mc,
	alloc_delta, defer_delta, space_delta, dspace_delta);
	}
	}

	/*
	* Mark a top-level vdev's config as dirty, placing it on the dirty list
	* so that it will be written out next time the vdev configuration is synced.
	* If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
	*/
	void
	vdev_config_dirty(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	* If this is an aux vdev (as with l2cache and spare devices), then we
	* update the vdev config manually and set the sync flag.
	*/
	if (vd->vdev_aux != NULL) {
	spa_aux_vdev_t *sav = vd->vdev_aux;
	nvlist_t **aux;
	uint_t naux;

	for (c = 0; c < sav->sav_count; c++) {
	if (sav->sav_vdevs[c] == vd)
	break;
	}

	if (c == sav->sav_count) {
	/*
	* We're being removed. There's nothing more to do.
	*/
	ASSERT(sav->sav_sync == B_TRUE);
	return;
	}

	sav->sav_sync = B_TRUE;

	if (nvlist_lookup_nvlist_array(sav->sav_config,
	ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
	ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
	}

	ASSERT(c < naux);

	/*
	* Setting the nvlist in the middle of the array is a little
	* sketchy, but it will work.
	*/
	nvlist_free(aux[c]);
	aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

	return;
	}

	/*
	* The dirty list is protected by the SCL_CONFIG lock. The caller
	* must either hold SCL_CONFIG as writer, or must be the sync thread
	* (which holds SCL_CONFIG as reader). There's only one sync thread,
	* so this is sufficient to ensure mutual exclusion.
	*/
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	(dsl_pool_sync_context(spa_get_dsl(spa)) &&
	spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
	for (c = 0; c < rvd->vdev_children; c++)
	vdev_config_dirty(rvd->vdev_child[c]);
	} else {
	ASSERT(vd == vd->vdev_top);

	if (!list_link_active(&vd->vdev_config_dirty_node) &&
	- !vd->vdev_ishole)
	+ vdev_is_concrete(vd)) {
	list_insert_head(&spa->spa_config_dirty_list, vd);
	+ }
	}
	}

	void
	vdev_config_clean(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	(dsl_pool_sync_context(spa_get_dsl(spa)) &&
	spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
	}

	/*
	* Mark a top-level vdev's state as dirty, so that the next pass of
	* spa_sync() can convert this into vdev_config_dirty(). We distinguish
	* the state changes from larger config changes because they require
	* much less locking, and are often needed for administrative actions.
	*/
	void
	vdev_state_dirty(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	* The state list is protected by the SCL_STATE lock. The caller
	* must either hold SCL_STATE as writer, or must be the sync thread
	* (which holds SCL_STATE as reader). There's only one sync thread,
	* so this is sufficient to ensure mutual exclusion.
	*/
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	(dsl_pool_sync_context(spa_get_dsl(spa)) &&
	spa_config_held(spa, SCL_STATE, RW_READER)));

	- if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
	+ if (!list_link_active(&vd->vdev_state_dirty_node) &&
	+ vdev_is_concrete(vd))
	list_insert_head(&spa->spa_state_dirty_list, vd);
	}

	void
	vdev_state_clean(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	(dsl_pool_sync_context(spa_get_dsl(spa)) &&
	spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
	}

	/*
	* Propagate vdev state up from children to parent.
	*/
	void
	vdev_propagate_state(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
	for (int c = 0; c < vd->vdev_children; c++) {
	child = vd->vdev_child[c];

	/*
	- * Don't factor holes into the decision.
	+ * Don't factor holes or indirect vdevs into the
	+ * decision.
	*/
	- if (child->vdev_ishole)
	+ if (!vdev_is_concrete(child))
	continue;

	if (!vdev_readable(child) ||
	(!vdev_writeable(child) && spa_writeable(spa))) {
	/*
	* Root special: if there is a top-level log
	* device, treat the root vdev as if it were
	* degraded.
	*/
	if (child->vdev_islog && vd == rvd)
	degraded++;
	else
	faulted++;
	} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
	degraded++;
	}

	if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
	corrupted++;
	}

	vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

	/*
	* Root special: if there is a top-level vdev that cannot be
	* opened due to corrupted metadata, then propagate the root
	* vdev's aux state as 'corrupt' rather than 'insufficient
	* replicas'.
	*/
	if (corrupted && vd == rvd &&
	rvd->vdev_state == VDEV_STATE_CANT_OPEN)
	vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
	vdev_propagate_state(vd->vdev_parent);
	}

	/*
	* Set a vdev's state. If this is during an open, we don't update the parent
	* state, because we're in the process of opening children depth-first.
	* Otherwise, we propagate the change to the parent.
	*
	* If this routine places a device in a faulted state, an appropriate ereport is
	* generated.
	*/
	void
	vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
	{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
	vd->vdev_stat.vs_aux = aux;
	return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	* If we are setting the vdev state to anything but an open state, then
	* always close the underlying device unless the device has requested
	* a delayed close (i.e. we're about to remove or fault the device).
	* Otherwise, we keep accessible but invalid devices open forever.
	* We don't call vdev_close() itself, because that implies some extra
	* checks (offline, etc) that we don't want here. This is limited to
	* leaf devices, because otherwise closing the device will affect other
	* children.
	*/
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	vd->vdev_ops->vdev_op_leaf)
	vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	state == VDEV_STATE_CANT_OPEN &&
	(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
	/*
	* If the previous state is set to VDEV_STATE_REMOVED, then this
	* device was previously marked removed and someone attempted to
	* reopen it. If this failed due to a nonexistent device, then
	* keep the device in the REMOVED state. We also let this be if
	* it is one of our special test online cases, which is only
	* attempting to online the device and shouldn't generate an FMA
	* fault.
	*/
	vd->vdev_state = VDEV_STATE_REMOVED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
	vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
	/*
	* If we fail to open a vdev during an import or recovery, we
	* mark it as "not available", which signifies that it was
	* never there to begin with. Failure to open such a device
	* is not considered an error.
	*/
	if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
	spa_load_state(spa) == SPA_LOAD_RECOVER) &&
	vd->vdev_ops->vdev_op_leaf)
	vd->vdev_not_present = 1;

	/*
	* Post the appropriate ereport. If the 'prevstate' field is
	* set to something other than VDEV_STATE_UNKNOWN, it indicates
	* that this is part of a vdev_reopen(). In this case, we don't
	* want to post the ereport if the device was already in the
	* CANT_OPEN state beforehand.
	*
	* If the 'checkremove' flag is set, then this is an attempt to
	* online the device in response to an insertion event. If we
	* hit this case, then we have detected an insertion event for a
	* faulted or offline device that wasn't in the removed state.
	* In this scenario, we don't post an ereport because we are
	* about to replace the device, or attempt an online with
	* vdev_forcefault, which will generate the fault for us.
	*/
	if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
	!vd->vdev_not_present && !vd->vdev_checkremove &&
	vd != spa->spa_root_vdev) {
	const char *class;

	switch (aux) {
	case VDEV_AUX_OPEN_FAILED:
	class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
	break;
	case VDEV_AUX_CORRUPT_DATA:
	class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
	break;
	case VDEV_AUX_NO_REPLICAS:
	class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
	break;
	case VDEV_AUX_BAD_GUID_SUM:
	class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
	break;
	case VDEV_AUX_TOO_SMALL:
	class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
	break;
	case VDEV_AUX_BAD_LABEL:
	class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
	break;
	default:
	class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
	}

	zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
	}

	/* Erase any notion of persistent removed state */
	vd->vdev_removed = B_FALSE;
	} else {
	vd->vdev_removed = B_FALSE;
	}

	/*
	* Notify the fmd of the state change. Be verbose and post
	* notifications even for stuff that's not important; the fmd agent can
	* sort it out. Don't emit state change events for non-leaf vdevs since
	* they can't change state on their own. The FMD can check their state
	* if it wants to when it sees that a leaf vdev had a state change.
	*/
	if (vd->vdev_ops->vdev_op_leaf)
	zfs_post_state_change(spa, vd);

	if (!isopen && vd->vdev_parent)
	vdev_propagate_state(vd->vdev_parent);
	}

	/*
	* Check the vdev configuration to ensure that it's capable of supporting
	* a root pool. We do not support partial configuration.
	* In addition, only a single top-level vdev is allowed.
	*
	* FreeBSD does not have above limitations.
	*/
	boolean_t
	vdev_is_bootable(vdev_t *vd)
	{
	#ifdef illumos
	if (!vd->vdev_ops->vdev_op_leaf) {
	char *vdev_type = vd->vdev_ops->vdev_op_type;

	if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
	vd->vdev_children > 1) {
	return (B_FALSE);
	- } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
	+ } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
	+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
	return (B_FALSE);
	}
	}

	for (int c = 0; c < vd->vdev_children; c++) {
	if (!vdev_is_bootable(vd->vdev_child[c]))
	return (B_FALSE);
	}
	#endif /* illumos */
	return (B_TRUE);
	}

	+boolean_t
	+vdev_is_concrete(vdev_t *vd)
	+{
	+ vdev_ops_t *ops = vd->vdev_ops;
	+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
	+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
	+ return (B_FALSE);
	+ } else {
	+ return (B_TRUE);
	+ }
	+}
	+
	/*
	* Load the state from the original vdev tree (ovd) which
	* we've retrieved from the MOS config object. If the original
	* vdev was offline or faulted then we transfer that state to the
	* device in the current vdev tree (nvd).
	*/
	void
	vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
	{
		spa_t *spa = nvd->vdev_spa;

		/*
		 * Only meaningful for vdevs under a log top-level vdev, with
		 * every SCL_STATE_ALL lock held as writer, and only when both
		 * arguments describe the same device (matching guids).
		 */
		ASSERT(nvd->vdev_top->vdev_islog);
		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
		ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);

		/*
		 * Recurse over the children pairwise: child c of the new tree
		 * is matched with child c of the original tree.
		 */
		for (int c = 0; c < nvd->vdev_children; c++)
			vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);

		if (nvd->vdev_ops->vdev_op_leaf) {
			/*
			 * Restore the persistent vdev state
			 */
			nvd->vdev_offline = ovd->vdev_offline;
			nvd->vdev_faulted = ovd->vdev_faulted;
			nvd->vdev_degraded = ovd->vdev_degraded;
			nvd->vdev_removed = ovd->vdev_removed;
		}
	}

	/*
	* Determine if a log device has valid content. If the vdev was
	* removed or faulted in the MOS config then we know that
	* the content on the log device has already been written to the pool.
	*/
	boolean_t
	vdev_log_state_valid(vdev_t *vd)
	{
	if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
	!vd->vdev_removed)
	return (B_TRUE);

	for (int c = 0; c < vd->vdev_children; c++)
	if (vdev_log_state_valid(vd->vdev_child[c]))
	return (B_TRUE);

	return (B_FALSE);
	}

	/*
	* Expand a vdev if possible.
	*/
	void
	vdev_expand(vdev_t *vd, uint64_t txg)
	{
	ASSERT(vd->vdev_top == vd);
	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
	+ vdev_set_deflate_ratio(vd);
	+
	+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
	+ vdev_is_concrete(vd)) {
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
	vdev_config_dirty(vd);
	}
	}

	/*
	 * Split a vdev.
	 *
	 * Detaches 'vd' from its parent and compacts the parent's child array.
	 * If exactly one child remains, vdev_remove_parent() is called on that
	 * child and it is flagged as splitting.  State is then propagated
	 * upward starting from the parent's first remaining child.
	 */
	void
	vdev_split(vdev_t *vd)
	{
		vdev_t *cvd, *pvd = vd->vdev_parent;

		vdev_remove_child(pvd, vd);
		vdev_compact_children(pvd);

		/* After compaction, slot 0 holds the first surviving sibling. */
		cvd = pvd->vdev_child[0];
		if (pvd->vdev_children == 1) {
			vdev_remove_parent(cvd);
			cvd->vdev_splitting = B_TRUE;
		}
		vdev_propagate_state(cvd);
	}

	void
	vdev_deadman(vdev_t *vd)
	{
	for (int c = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];

	vdev_deadman(cvd);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_enter(&vq->vq_lock);
	if (avl_numnodes(&vq->vq_active_tree) > 0) {
	spa_t *spa = vd->vdev_spa;
	zio_t *fio;
	uint64_t delta;

	/*
	* Look at the head of all the pending queues,
	* if any I/O has been outstanding for longer than
	* the spa_deadman_synctime we panic the system.
	*/
	fio = avl_first(&vq->vq_active_tree);
	delta = gethrtime() - fio->io_timestamp;
	if (delta > spa_deadman_synctime(spa)) {
	zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
	"delta %lluns, last io %lluns",
	fio->io_timestamp, delta,
	vq->vq_io_complete_ts);
	fm_panic("I/O to pool '%s' appears to be "
	"hung on vdev guid %llu at '%s'.",
	spa_name(spa),
	(long long unsigned int) vd->vdev_guid,
	vd->vdev_path);
	}
	}
	mutex_exit(&vq->vq_lock);
	}
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 332525)
	@@ -1,918 +1,919 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2013 Joyent, Inc. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa_impl.h>
	#include <sys/refcount.h>
	#include <sys/vdev_disk.h>
	#include <sys/vdev_impl.h>
	#include <sys/abd.h>
	#include <sys/fs/zfs.h>
	#include <sys/zio.h>
	#include <sys/sunldi.h>
	#include <sys/efi_partition.h>
	#include <sys/fm/fs/zfs.h>

	/*
	* Virtual device vector for disks.
	*/

	extern ldi_ident_t zfs_li;

	static void vdev_disk_close(vdev_t *);

	typedef struct vdev_disk_ldi_cb {
	list_node_t lcb_next;
	ldi_callback_id_t lcb_id;
	} vdev_disk_ldi_cb_t;

	static void
	vdev_disk_alloc(vdev_t *vd)
	{
	vdev_disk_t *dvd;

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	/*
	* Create the LDI event callback list.
	*/
	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
	offsetof(vdev_disk_ldi_cb_t, lcb_next));
	}

	static void
	vdev_disk_free(vdev_t *vd)
	{
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_ldi_cb_t *lcb;

	if (dvd == NULL)
	return;

	/*
	* We have already closed the LDI handle. Clean up the LDI event
	* callbacks and free vd->vdev_tsd.
	*/
	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
	list_remove(&dvd->vd_ldi_cbs, lcb);
	(void) ldi_ev_remove_callbacks(lcb->lcb_id);
	kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
	}
	list_destroy(&dvd->vd_ldi_cbs);
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
	}

	/*
	 * LDI "notify" callback for the disk vdev passed in 'arg'.
	 *
	 * Events other than LDI_EV_OFFLINE are ignored.  For an offline event
	 * the disk is closed, the removal is posted, and an asynchronous
	 * SPA_ASYNC_REMOVE request is queued.  Always returns LDI_EV_SUCCESS.
	 */
	/* ARGSUSED */
	static int
	vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
	    void *ev_data)
	{
		vdev_t *vd = (vdev_t *)arg;
		vdev_disk_t *dvd = vd->vdev_tsd;

		/*
		 * Ignore events other than offline.
		 */
		if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
			return (LDI_EV_SUCCESS);

		/*
		 * All LDI handles must be closed for the state change to succeed, so
		 * call on vdev_disk_close() to do this.
		 *
		 * We inform vdev_disk_close that it is being called from offline
		 * notify context so it will defer cleanup of LDI event callbacks and
		 * freeing of vd->vdev_tsd to the offline finalize or a reopen.
		 */
		dvd->vd_ldi_offline = B_TRUE;
		vdev_disk_close(vd);

		/*
		 * Now that the device is closed, request that the spa_async_thread
		 * mark the device as REMOVED and notify FMA of the removal.
		 */
		zfs_post_remove(vd->vdev_spa, vd);
		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);

		return (LDI_EV_SUCCESS);
	}

	/*
	 * LDI "finalize" callback for the disk vdev passed in 'arg'.
	 *
	 * Events other than LDI_EV_OFFLINE are ignored.  Otherwise the vdev's
	 * LDI callbacks and vdev_tsd are released, and if 'ldi_result' shows
	 * the offline did not succeed, an asynchronous probe of the vdev is
	 * requested.
	 */
	/* ARGSUSED */
	static void
	vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
	    int ldi_result, void *arg, void *ev_data)
	{
		vdev_t *vd = (vdev_t *)arg;

		/*
		 * Ignore events other than offline.
		 */
		if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
			return;

		/*
		 * We have already closed the LDI handle in notify.
		 * Clean up the LDI event callbacks and free vd->vdev_tsd.
		 */
		vdev_disk_free(vd);

		/*
		 * Request that the vdev be reopened if the offline state change was
		 * unsuccessful.
		 */
		if (ldi_result != LDI_EV_SUCCESS) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
		}
	}

	static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
	};

	/*
	 * LDI "finalize" callback for degrade events on the disk vdev passed
	 * in 'arg'.
	 *
	 * Events other than LDI_EV_DEGRADE are ignored.  Otherwise the vdev is
	 * marked degraded via vdev_degrade(); its return value is deliberately
	 * discarded.
	 */
	/* ARGSUSED */
	static void
	vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
	    int ldi_result, void *arg, void *ev_data)
	{
		vdev_t *vd = (vdev_t *)arg;

		/*
		 * Ignore events other than degrade.
		 */
		if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
			return;

		/*
		 * Degrade events always succeed. Mark the vdev as degraded.
		 * This status is purely informative for the user.
		 */
		(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
	}

	static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
	};

	static void
	vdev_disk_hold(vdev_t *vd)
	{
	ddi_devid_t devid;
	char *minor;

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	* We must have a pathname, and it must be absolute.
	*/
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
	return;

	/*
	* Only prefetch path and devid info if the device has
	* never been opened.
	*/
	if (vd->vdev_tsd != NULL)
	return;

	if (vd->vdev_wholedisk == -1ULL) {
	size_t len = strlen(vd->vdev_path) + 3;
	char *buf = kmem_alloc(len, KM_SLEEP);

	(void) snprintf(buf, len, "%ss0", vd->vdev_path);

	(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
	kmem_free(buf, len);
	}

	if (vd->vdev_name_vp == NULL)
	(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);

	if (vd->vdev_devid != NULL &&
	ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
	(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
	ddi_devid_str_free(minor);
	ddi_devid_free(devid);
	}
	}

	static void
	vdev_disk_rele(vdev_t *vd)
	{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	if (vd->vdev_name_vp) {
	VN_RELE_ASYNC(vd->vdev_name_vp,
	dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
	vd->vdev_name_vp = NULL;
	}
	if (vd->vdev_devid_vp) {
	VN_RELE_ASYNC(vd->vdev_devid_vp,
	dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
	vd->vdev_devid_vp = NULL;
	}
	}

	/*
	* We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
	* even a fallback to DKIOCGMEDIAINFO fails.
	*/
	#ifdef DEBUG
	#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
	#else
	#define VDEV_DEBUG(...) /* Nothing... */
	#endif

	/*
	 * Open (or, on a reopen, re-measure) the disk behind a leaf vdev using
	 * LDI.
	 *
	 *	vd		vdev to open; vd->vdev_path must be an absolute path
	 *	psize		out: device size as returned by ldi_get_size()
	 *	max_psize	out: *psize, or capacity * block size when we own
	 *			the whole disk and a media-info ioctl succeeded
	 *	ashift		out: highbit64() - 1 of the larger of the physical
	 *			block size and SPA_MINBLOCKSIZE
	 *
	 * Returns 0 on success.  On failure an errno value is returned and
	 * vd->vdev_stat.vs_aux is set to VDEV_AUX_BAD_LABEL (bad path or
	 * undecodable devid) or VDEV_AUX_OPEN_FAILED (open or size query
	 * failed).
	 */
	static int
	vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
	    uint64_t *ashift)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_disk_t *dvd = vd->vdev_tsd;
		ldi_ev_cookie_t ecookie;
		vdev_disk_ldi_cb_t *lcb;
		union {
			struct dk_minfo_ext ude;
			struct dk_minfo ud;
		} dks;
		struct dk_minfo_ext *dkmext = &dks.ude;
		struct dk_minfo *dkm = &dks.ud;
		int error;
		dev_t dev;
		int otyp;
		boolean_t validate_devid = B_FALSE;
		ddi_devid_t devid;
		uint64_t capacity = 0, blksz = 0, pbsize;

		/*
		 * We must have a pathname, and it must be absolute.
		 */
		if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}

		/*
		 * Reopen the device if it's not currently open. Otherwise,
		 * just update the physical size of the device.
		 */
		if (dvd != NULL) {
			if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
				/*
				 * If we are opening a device in its offline notify
				 * context, the LDI handle was just closed. Clean
				 * up the LDI event callbacks and free vd->vdev_tsd.
				 */
				vdev_disk_free(vd);
			} else {
				ASSERT(vd->vdev_reopening);
				goto skip_open;
			}
		}

		/*
		 * Create vd->vdev_tsd.
		 */
		vdev_disk_alloc(vd);
		dvd = vd->vdev_tsd;

		/*
		 * When opening a disk device, we want to preserve the user's original
		 * intent. We always want to open the device by the path the user gave
		 * us, even if it is one of multiple paths to the same device. But we
		 * also want to be able to survive disks being removed/recabled.
		 * Therefore the sequence of opening devices is:
		 *
		 * 1. Try opening the device by path. For legacy pools without the
		 *    'whole_disk' property, attempt to fix the path by appending 's0'.
		 *
		 * 2. If the devid of the device matches the stored value, return
		 *    success.
		 *
		 * 3. Otherwise, the device may have moved. Try opening the device
		 *    by the devid instead.
		 */
		if (vd->vdev_devid != NULL) {
			if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
			    &dvd->vd_minor) != 0) {
				vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
				return (SET_ERROR(EINVAL));
			}
		}

		error = EINVAL;		/* presume failure */

		if (vd->vdev_path != NULL) {

			if (vd->vdev_wholedisk == -1ULL) {
				/* room for the "s0" suffix and the NUL */
				size_t len = strlen(vd->vdev_path) + 3;
				char *buf = kmem_alloc(len, KM_SLEEP);

				(void) snprintf(buf, len, "%ss0", vd->vdev_path);

				error = ldi_open_by_name(buf, spa_mode(spa), kcred,
				    &dvd->vd_lh, zfs_li);
				if (error == 0) {
					/* buf becomes the vdev's path */
					spa_strfree(vd->vdev_path);
					vd->vdev_path = buf;
					vd->vdev_wholedisk = 1ULL;
				} else {
					kmem_free(buf, len);
				}
			}

			/*
			 * If we have not yet opened the device, try to open it by the
			 * specified path.
			 */
			if (error != 0) {
				error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
				    kcred, &dvd->vd_lh, zfs_li);
			}

			/*
			 * Compare the devid to the stored value.
			 */
			if (error == 0 && vd->vdev_devid != NULL &&
			    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
				if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
					error = SET_ERROR(EINVAL);
					(void) ldi_close(dvd->vd_lh, spa_mode(spa),
					    kcred);
					dvd->vd_lh = NULL;
				}
				ddi_devid_free(devid);
			}

			/*
			 * If we succeeded in opening the device, but 'vdev_wholedisk'
			 * is not yet set, then this must be a slice.
			 */
			if (error == 0 && vd->vdev_wholedisk == -1ULL)
				vd->vdev_wholedisk = 0;
		}

		/*
		 * If we were unable to open by path, or the devid check fails, open by
		 * devid instead.
		 */
		if (error != 0 && vd->vdev_devid != NULL) {
			error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
			    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * If all else fails, then try opening by physical path (if available)
		 * or the logical path (if we failed due to the devid check). While not
		 * as reliable as the devid, this will give us something, and the higher
		 * level vdev validation will prevent us from opening the wrong device.
		 */
		if (error) {
			if (vd->vdev_devid != NULL)
				validate_devid = B_TRUE;

			if (vd->vdev_physpath != NULL &&
			    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
				error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
				    kcred, &dvd->vd_lh, zfs_li);

			/*
			 * Note that we don't support the legacy auto-wholedisk support
			 * as above. This hasn't been used in a very long time and we
			 * don't need to propagate its oddities to this edge condition.
			 */
			if (error && vd->vdev_path != NULL)
				error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
				    kcred, &dvd->vd_lh, zfs_li);
		}

		if (error) {
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			return (error);
		}

		/*
		 * Now that the device has been successfully opened, update the devid
		 * if necessary.
		 */
		if (validate_devid && spa_writeable(spa) &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				char *vd_devid;

				vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
				zfs_dbgmsg("vdev %s: update devid from %s, "
				    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
				spa_strfree(vd->vdev_devid);
				vd->vdev_devid = spa_strdup(vd_devid);
				ddi_devid_str_free(vd_devid);
			}
			ddi_devid_free(devid);
		}

		/*
		 * Once a device is opened, verify that the physical device path (if
		 * available) is up to date.
		 */
		if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
		    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
			char *physpath, *minorname;

			physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
			minorname = NULL;
			if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
			    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
			    (vd->vdev_physpath == NULL ||
			    strcmp(vd->vdev_physpath, physpath) != 0)) {
				if (vd->vdev_physpath)
					spa_strfree(vd->vdev_physpath);
				(void) strlcat(physpath, ":", MAXPATHLEN);
				(void) strlcat(physpath, minorname, MAXPATHLEN);
				vd->vdev_physpath = spa_strdup(physpath);
			}
			if (minorname)
				kmem_free(minorname, strlen(minorname) + 1);
			kmem_free(physpath, MAXPATHLEN);
		}

		/*
		 * Register callbacks for the LDI offline event.
		 */
		if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
		    LDI_EV_SUCCESS) {
			lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
			list_insert_tail(&dvd->vd_ldi_cbs, lcb);
			(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
			    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
		}

		/*
		 * Register callbacks for the LDI degrade event.
		 */
		if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
		    LDI_EV_SUCCESS) {
			lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
			list_insert_tail(&dvd->vd_ldi_cbs, lcb);
			(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
			    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
		}
	skip_open:
		/*
		 * Determine the actual size of the device.
		 */
		if (ldi_get_size(dvd->vd_lh, psize) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			return (SET_ERROR(EINVAL));
		}

		*max_psize = *psize;

		/*
		 * Determine the device's minimum transfer size.
		 * If the ioctl isn't supported, assume DEV_BSIZE.
		 */
		if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
		    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
			capacity = dkmext->dki_capacity - 1;
			blksz = dkmext->dki_lbsize;
			pbsize = dkmext->dki_pbsize;
		} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
		    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
			VDEV_DEBUG(
			    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
			    vd->vdev_path);
			capacity = dkm->dki_capacity - 1;
			blksz = dkm->dki_lbsize;
			pbsize = blksz;
		} else {
			VDEV_DEBUG("vdev_disk_open(\"%s\"): "
			    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
			    vd->vdev_path, error);
			pbsize = DEV_BSIZE;
		}

		*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;

		if (vd->vdev_wholedisk == 1) {
			int wce = 1;

			if (error == 0) {
				/*
				 * If we have the capability to expand, we'd have
				 * found out via success from DKIOCGMEDIAINFO{,EXT}.
				 * Adjust max_psize upward accordingly since we know
				 * we own the whole disk now.
				 */
				*max_psize = capacity * blksz;
			}

			/*
			 * Since we own the whole disk, try to enable disk write
			 * caching. We ignore errors because it's OK if we can't do it.
			 */
			(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
			    FKIOCTL, kcred, NULL);
		}

		/*
		 * Clear the nowritecache bit, so that on a vdev_reopen() we will
		 * try again.
		 */
		vd->vdev_nowritecache = B_FALSE;

		return (0);
	}

	/*
	 * Close a disk vdev: free the decoded devid and minor name, close the
	 * LDI handle and clear vdev_delayed_close.  Does nothing during a
	 * reopen or when there is no vdev_tsd.  The vdev_tsd itself is freed
	 * here unless the close was caused by an LDI offline notification.
	 */
	static void
	vdev_disk_close(vdev_t *vd)
	{
		vdev_disk_t *dvd = vd->vdev_tsd;

		if (vd->vdev_reopening || dvd == NULL)
			return;

		if (dvd->vd_minor != NULL) {
			ddi_devid_str_free(dvd->vd_minor);
			dvd->vd_minor = NULL;
		}

		if (dvd->vd_devid != NULL) {
			ddi_devid_free(dvd->vd_devid);
			dvd->vd_devid = NULL;
		}

		if (dvd->vd_lh != NULL) {
			(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
			dvd->vd_lh = NULL;
		}

		vd->vdev_delayed_close = B_FALSE;
		/*
		 * If we closed the LDI handle due to an offline notify from LDI,
		 * don't free vd->vdev_tsd or unregister the callbacks here;
		 * the offline finalize callback or a reopen will take care of it.
		 */
		if (dvd->vd_ldi_offline)
			return;

		vdev_disk_free(vd);
	}

	/*
	 * Perform synchronous I/O of 'size' bytes at byte 'offset' on the disk
	 * behind 'vd'.  When 'isdump' is set the transfer goes through
	 * ldi_dump(); otherwise it is handed to vdev_disk_ldi_physio() with
	 * 'flags'.  Returns EIO if the vdev has no open LDI state, otherwise
	 * the result of the underlying call.
	 */
	int
	vdev_disk_physio(vdev_t *vd, caddr_t data,
	    size_t size, uint64_t offset, int flags, boolean_t isdump)
	{
		vdev_disk_t *dvd = vd->vdev_tsd;

		/*
		 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
		 * Nothing to be done here but return failure.
		 */
		if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
			return (EIO);

		ASSERT(vd->vdev_ops == &vdev_disk_ops);

		/*
		 * If in the context of an active crash dump, use the ldi_dump(9F)
		 * call instead of ldi_strategy(9F) as usual.
		 */
		if (isdump) {
			ASSERT3P(dvd, !=, NULL);
			return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
			    lbtodb(size)));
		}

		return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
	}

	/*
	 * Issue one synchronous transfer through an LDI handle using a raw
	 * buf obtained from getrbuf(), and wait for it with biowait().
	 *
	 * Returns EINVAL for a NULL handle, the biowait() error if any, EIO if
	 * the transfer completed with a non-zero residual, and 0 otherwise.
	 */
	int
	vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
	    size_t size, uint64_t offset, int flags)
	{
		buf_t *bp;
		int error = 0;

		if (vd_lh == NULL)
			return (SET_ERROR(EINVAL));

		ASSERT(flags & B_READ || flags & B_WRITE);

		bp = getrbuf(KM_SLEEP);
		bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
		bp->b_bcount = size;
		bp->b_un.b_addr = (void *)data;
		bp->b_lblkno = lbtodb(offset);
		bp->b_bufsize = size;

		error = ldi_strategy(vd_lh, bp);
		ASSERT(error == 0);
		/* A short transfer is reported as EIO. */
		if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
			error = SET_ERROR(EIO);
		freerbuf(bp);

		return (error);
	}

	/*
	 * b_iodone handler for bufs issued by vdev_disk_io_start().  Records
	 * the result in the owning zio, hands the borrowed data buffer back to
	 * the zio's abd, frees the vdev_buf_t and passes the zio on via
	 * zio_delay_interrupt().
	 */
	static void
	vdev_disk_io_intr(buf_t *bp)
	{
		/* bp is the buf embedded in a vdev_buf_t by vdev_disk_io_start() */
		vdev_buf_t *vb = (vdev_buf_t *)bp;
		zio_t *zio = vb->vb_io;

		/*
		 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
		 * Rather than teach the rest of the stack about other error
		 * possibilities (EFAULT, etc), we normalize the error value here.
		 */
		zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);

		if (zio->io_error == 0 && bp->b_resid != 0)
			zio->io_error = SET_ERROR(EIO);

		if (zio->io_type == ZIO_TYPE_READ) {
			abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
		} else {
			abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
		}

		kmem_free(vb, sizeof (vdev_buf_t));

		zio_delay_interrupt(zio);
	}

	/*
	 * Free the struct dk_callback that was stored in zio->io_vsd.
	 */
	static void
	vdev_disk_ioctl_free(zio_t *zio)
	{
		struct dk_callback *dkc = zio->io_vsd;

		kmem_free(dkc, sizeof (*dkc));
	}

	/*
	* vsd operations installed on flush-ioctl zios by vdev_disk_io_start().
	* Entries are positional: vdev_disk_ioctl_free() first, then the
	* default checksum-report handler.
	*/
	static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
	};

	static void
	vdev_disk_ioctl_done(void *zio_arg, int error)
	{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
	}

	/*
	 * Start an I/O on a disk vdev.
	 *
	 * IOCTL zios: only DKIOCFLUSHWRITECACHE is supported.  It is issued
	 * asynchronously with vdev_disk_ioctl_done() as the completion
	 * callback; every other outcome finishes the zio via zio_execute().
	 *
	 * READ/WRITE zios: a vdev_buf_t is allocated, its embedded buf is
	 * filled in from the zio and handed to ldi_strategy();
	 * vdev_disk_io_intr() runs on completion.
	 */
	static void
	vdev_disk_io_start(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_disk_t *dvd = vd->vdev_tsd;
		vdev_buf_t *vb;
		struct dk_callback *dkc;
		buf_t *bp;
		int error;

		/*
		 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
		 * Nothing to be done here but return failure.
		 */
		if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		if (zio->io_type == ZIO_TYPE_IOCTL) {
			/* XXPOLICY */
			if (!vdev_readable(vd)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return;
			}

			switch (zio->io_cmd) {

			case DKIOCFLUSHWRITECACHE:

				if (zfs_nocacheflush)
					break;

				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}

				/* released later through vdev_disk_vsd_ops */
				zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
				zio->io_vsd_ops = &vdev_disk_vsd_ops;

				dkc->dkc_callback = vdev_disk_ioctl_done;
				dkc->dkc_flag = FLUSH_VOLATILE;
				dkc->dkc_cookie = zio;

				error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
				    (uintptr_t)dkc, FKIOCTL, kcred, NULL);

				if (error == 0) {
					/*
					 * The ioctl will be done asynchronously,
					 * and will call vdev_disk_ioctl_done()
					 * upon completion.
					 */
					return;
				}

				zio->io_error = error;

				break;

			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}

			zio_execute(zio);
			return;
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
		zio->io_target_timestamp = zio_handle_io_delay(zio);

		vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

		vb->vb_io = zio;
		bp = &vb->vb_buf;

		bioinit(bp);
		bp->b_flags = B_BUSY | B_NOCACHE |
		    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
		/* retried and try-hard I/Os are issued without B_FAILFAST */
		if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
			bp->b_flags |= B_FAILFAST;
		bp->b_bcount = zio->io_size;

		/* writes need the abd's contents copied into the borrowed buffer */
		if (zio->io_type == ZIO_TYPE_READ) {
			bp->b_un.b_addr =
			    abd_borrow_buf(zio->io_abd, zio->io_size);
		} else {
			bp->b_un.b_addr =
			    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
		}

		bp->b_lblkno = lbtodb(zio->io_offset);
		bp->b_bufsize = zio->io_size;
		bp->b_iodone = (int (*)())vdev_disk_io_intr;

		/* ldi_strategy() will return non-zero only on programming errors */
		VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
	}

	/*
	 * Post-I/O hook for disk vdevs.
	 *
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
	 * the device has been removed. If this is the case, then we trigger an
	 * asynchronous removal of the device. Otherwise, flag the vdev for a
	 * delayed close.
	 */
	static void
	vdev_disk_io_done(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_disk_t *dvd;
		int state = DKIO_NONE;

		/* Only EIO on a vdev not already queued for removal matters. */
		if (zio->io_error != EIO || vd->vdev_remove_wanted)
			return;

		dvd = vd->vdev_tsd;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) != 0 || state == DKIO_INSERTED) {
			/* Media state unknown or still inserted. */
			if (!vd->vdev_delayed_close)
				vd->vdev_delayed_close = B_TRUE;
			return;
		}

		/*
		 * We post the resource as soon as possible, instead of
		 * when the async removal actually happens, because the
		 * DE is using this information to discard previous I/O
		 * errors.
		 */
		zfs_post_remove(zio->io_spa, vd);
		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
	}

	/*
	* Operations vector for disk vdevs. Entries are positional.
	* NOTE(review): the meaning of the NULL slots is fixed by the
	* vdev_ops_t definition, which is not visible here; the entry marked
	* "+" is the slot added by this revision.
	*/
	vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	+ NULL,
	VDEV_TYPE_DISK, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};

	/*
	 * Given the root disk device devid or pathname, read the label from
	 * the device, and construct a configuration nvlist.
	 *
	 *	devpath	path used when opening by devid is not possible
	 *	devid	devid string, tried first; may be NULL
	 *	config	out: unpacked label nvlist, or NULL if no label qualified
	 *
	 * Returns 0 on success, the ldi_open_by_name() error if the device
	 * cannot be opened, EIO if its size cannot be determined, and EIDRM if
	 * none of the VDEV_LABELS labels yields a usable configuration.
	 */
	int
	vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
	{
		ldi_handle_t vd_lh;
		vdev_label_t *label;
		uint64_t s, size;
		int l;
		ddi_devid_t tmpdevid;
		int error = -1;		/* non-zero: nothing opened yet */
		char *minor_name;

		/*
		 * Read the device label and build the nvlist.
		 */
		if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
		    &minor_name) == 0) {
			error = ldi_open_by_devid(tmpdevid, minor_name,
			    FREAD, kcred, &vd_lh, zfs_li);
			ddi_devid_free(tmpdevid);
			ddi_devid_str_free(minor_name);
		}

		if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
		    zfs_li)))
			return (error);

		if (ldi_get_size(vd_lh, &s)) {
			(void) ldi_close(vd_lh, FREAD, kcred);
			return (SET_ERROR(EIO));
		}

		size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
		label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

		*config = NULL;
		for (l = 0; l < VDEV_LABELS; l++) {
			uint64_t offset, state, txg = 0;

			/* read vdev label */
			offset = vdev_label_offset(size, l, 0);
			if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
			    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
				continue;

			if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
			    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
				*config = NULL;
				continue;
			}

			/* skip labels whose pool state is missing or out of range */
			if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
			    &state) != 0 || state >= POOL_STATE_DESTROYED) {
				nvlist_free(*config);
				*config = NULL;
				continue;
			}

			/* skip labels with a missing or zero txg */
			if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
			    &txg) != 0 || txg == 0) {
				nvlist_free(*config);
				*config = NULL;
				continue;
			}

			break;
		}

		kmem_free(label, sizeof (vdev_label_t));
		(void) ldi_close(vd_lh, FREAD, kcred);
		if (*config == NULL)
			error = SET_ERROR(EIDRM);

		return (error);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 332525)
	@@ -1,294 +1,296 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/vdev_file.h>
	#include <sys/vdev_impl.h>
	#include <sys/zio.h>
	#include <sys/fs/zfs.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/abd.h>

	/*
	* Virtual device vector for files.
	*/

	static taskq_t *vdev_file_taskq;

	/*
	* Create the "z_vdev_file" taskq, which vdev_file_io_start() uses to
	* dispatch file-vdev reads and writes.
	*/
	void
	vdev_file_init(void)
	{
	vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
	minclsyspri, max_ncpus, INT_MAX, 0);
	}

	/*
	* Destroy the taskq created by vdev_file_init().
	*/
	void
	vdev_file_fini(void)
	{
	taskq_destroy(vdev_file_taskq);
	}

	/*
	* Hold hook for file vdevs: takes no hold, it only asserts that the
	* vdev has a path.
	*/
	static void
	vdev_file_hold(vdev_t *vd)
	{
	ASSERT(vd->vdev_path != NULL);
	}

	/*
	* Release hook for file vdevs: nothing to release, it only asserts
	* that the vdev has a path.
	*/
	static void
	vdev_file_rele(vdev_t *vd)
	{
	ASSERT(vd->vdev_path != NULL);
	}

	/*
	 * Open (or, on a reopen, re-measure) the file backing a file vdev.
	 *
	 *	vd		vdev to open; vd->vdev_path must be an absolute path
	 *	psize		out: file size from VOP_GETATTR()
	 *	max_psize	out: same value as *psize
	 *	logical_ashift	out: SPA_MINBLOCKSHIFT
	 *	physical_ashift	out: SPA_MINBLOCKSHIFT
	 *
	 * Returns 0 on success.  On failure an errno value is returned and
	 * vd->vdev_stat.vs_aux is set to VDEV_AUX_BAD_LABEL (bad path) or
	 * VDEV_AUX_OPEN_FAILED.
	 */
	static int
	vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
	    uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
		vdev_file_t *vf;
		vnode_t *vp;
		vattr_t vattr;
		int error;

		/*
		 * We must have a pathname, and it must be absolute.
		 */
		if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}

		/*
		 * Reopen the device if it's not currently open.  Otherwise,
		 * just update the physical size of the device.
		 */
		if (vd->vdev_tsd != NULL) {
			ASSERT(vd->vdev_reopening);
			vf = vd->vdev_tsd;
			vp = vf->vf_vnode;
			goto skip_open;
		}

		vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);

		/*
		 * We always open the files from the root of the global zone, even if
		 * we're in a local zone.  If the user has gotten to this point, the
		 * administrator has already decided that the pool should be available
		 * to local zone users, so the underlying devices should be as well.
		 */
		ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
		/* "+ 1" skips the leading '/': the lookup is relative to rootdir */
		error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
		    spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);

		if (error) {
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
			vd->vdev_tsd = NULL;
			return (error);
		}

		vf->vf_vnode = vp;

	#ifdef _KERNEL
		/*
		 * Make sure it's a regular file.
		 */
		if (vp->v_type != VREG) {
	#ifdef __FreeBSD__
			(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
	#endif
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
	#ifdef __FreeBSD__
			kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
			vd->vdev_tsd = NULL;
	#endif
			return (SET_ERROR(ENODEV));
		}
	#endif	/* _KERNEL */

	skip_open:
		/*
		 * Determine the physical size of the file.
		 */
		vattr.va_mask = AT_SIZE;
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(vp, &vattr, kcred);
		VOP_UNLOCK(vp, 0);
		if (error) {
			(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
			vd->vdev_tsd = NULL;
			return (error);
		}

		vd->vdev_notrim = B_TRUE;

		*max_psize = *psize = vattr.va_size;
		*logical_ashift = SPA_MINBLOCKSHIFT;
		*physical_ashift = SPA_MINBLOCKSHIFT;

		return (0);
	}

	/*
	 * Close a file vdev: close the vnode if one is open, clear
	 * vdev_delayed_close and free vdev_tsd.  Does nothing during a reopen
	 * or when there is no vdev_tsd.
	 */
	static void
	vdev_file_close(vdev_t *vd)
	{
		vdev_file_t *vf = vd->vdev_tsd;

		if (vd->vdev_reopening || vf == NULL)
			return;

		if (vf->vf_vnode != NULL) {
			(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
			    kcred, NULL);
		}

		vd->vdev_delayed_close = B_FALSE;
		kmem_free(vf, sizeof (vdev_file_t));
		vd->vdev_tsd = NULL;
	}

	/*
	 * Interrupt side of file vdev I/O.  vdev_file_io_strategy() calls this
	 * once the transfer has finished so the zio can be passed on through
	 * zio_delay_interrupt().  It is kept as a separate routine so the code
	 * structure mirrors the disk vdev implementation.
	 */
	static void
	vdev_file_io_intr(zio_t *zio)
	{
		zio_delay_interrupt(zio);
	}

	static void
	vdev_file_io_strategy(void *arg)
	{
	zio_t *zio = arg;
	vdev_t *vd = zio->io_vd;
	vdev_file_t *vf;
	vnode_t *vp;
	void *addr;
	ssize_t resid;

	vf = vd->vdev_tsd;
	vp = vf->vf_vnode;

	ASSERT(zio->io_type == ZIO_TYPE_READ \|\| zio->io_type == ZIO_TYPE_WRITE);
	if (zio->io_type == ZIO_TYPE_READ) {
	addr = abd_borrow_buf(zio->io_abd, zio->io_size);
	} else {
	addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
	}

	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
	UIO_READ : UIO_WRITE, vp, addr, zio->io_size,
	zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (zio->io_type == ZIO_TYPE_READ) {
	abd_return_buf_copy(zio->io_abd, addr, zio->io_size);
	} else {
	abd_return_buf(zio->io_abd, addr, zio->io_size);
	}

	if (resid != 0 && zio->io_error == 0)
	zio->io_error = ENOSPC;

	vdev_file_io_intr(zio);
	}

	/*
	 * Start an I/O on a file vdev.
	 *
	 * IOCTL zios: DKIOCFLUSHWRITECACHE is implemented with VOP_FSYNC();
	 * any other command fails with ENOTSUP.  The zio then continues via
	 * zio_execute().
	 *
	 * READ/WRITE zios are dispatched to vdev_file_taskq, where
	 * vdev_file_io_strategy() performs the transfer.
	 */
	static void
	vdev_file_io_start(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_file_t *vf = vd->vdev_tsd;

		if (zio->io_type == ZIO_TYPE_IOCTL) {
			/* XXPOLICY */
			if (!vdev_readable(vd)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return;
			}

			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
				    kcred, NULL);
				break;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}

			zio_execute(zio);
			return;
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
		zio->io_target_timestamp = zio_handle_io_delay(zio);

		VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
		    TQ_SLEEP), !=, 0);
	}

	/*
	* I/O completion hook for file vdevs; intentionally does nothing.
	*/
	/* ARGSUSED */
	static void
	vdev_file_io_done(zio_t *zio)
	{
	}

	/*
	* Operations vector for file vdevs. Entries are positional.
	* NOTE(review): the meaning of the NULL slots is fixed by the
	* vdev_ops_t definition, which is not visible here; the entry marked
	* "+" is the slot added by this revision.
	*/
	vdev_ops_t vdev_file_ops = {
	vdev_file_open,
	vdev_file_close,
	vdev_default_asize,
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,
	vdev_file_hold,
	vdev_file_rele,
	+ NULL,
	VDEV_TYPE_FILE, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};

	/*
	* From userland we access disks just like files.
	*/
	#ifndef _KERNEL

	/*
	* Userland (!_KERNEL) operations vector for disk vdevs: identical to
	* vdev_file_ops except for the type name, so disks go through the
	* file implementation.
	*/
	vdev_ops_t vdev_disk_ops = {
	vdev_file_open,
	vdev_file_close,
	vdev_default_asize,
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,
	vdev_file_hold,
	vdev_file_rele,
	+ NULL,
	VDEV_TYPE_DISK, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};

	#endif
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 332525)
	@@ -1,1152 +1,1153 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
	* All rights reserved.
	*
	* Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
	*/

	#include <sys/zfs_context.h>
	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/bio.h>
	#include <sys/disk.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/vdev_impl.h>
	#include <sys/fs/zfs.h>
	#include <sys/zio.h>
	#include <geom/geom.h>
	#include <geom/geom_int.h>

	/*
	* Virtual device vector for GEOM.
	*/

	/*
	* GEOM class "ZFS::VDEV". The forward declaration of
	* vdev_geom_attrchanged() lets the class initializer reference the
	* handler before its definition.
	*/
	static g_attrchanged_t vdev_geom_attrchanged;
	struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
	};

	/*
	* One entry in a consumer's list of vdevs: the list linkage plus the
	* vdev the entry refers to.
	*/
	struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem) elems;
	vdev_t *vd;
	};

	/*
	* List head type for consumer_vdev_elem entries. The code in this file
	* overlays it on g_consumer.private (it casts &cp->private), so the
	* assertion checks that the field is the size of a pointer to the head.
	*/
	SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
	_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
	== sizeof(struct consumer_priv_t*),
	"consumer_priv_t* can't be stored in g_consumer.private");

	DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

	SYSCTL_DECL(_vfs_zfs_vdev);
	/* Don't send BIO_FLUSH. */
	/* Tunable bio_flush_disable under _vfs_zfs_vdev: "Disable BIO_FLUSH". */
	static int vdev_geom_bio_flush_disable;
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
	/* Don't send BIO_DELETE. */
	/* Tunable bio_delete_disable under _vfs_zfs_vdev: "Disable BIO_DELETE". */
	static int vdev_geom_bio_delete_disable;
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

	/* Declare local functions */
	static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

	/*
	* Thread local storage used to indicate when a thread is probing geoms
	* for their guids. If NULL, this thread is not tasting geoms. If non NULL,
	* it is looking for a replacement for the vdev_t* that is its value.
	*/
	uint_t zfs_geom_probe_vdev_key;

	/*
	 * Query the consumer's "GEOM::rotation_rate" attribute and store it in
	 * vd->vdev_rotation_rate.  If the attribute cannot be read the rate is
	 * recorded as VDEV_RATE_UNKNOWN.
	 */
	static void
	vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
	{
		int error;
		uint16_t rate;

		error = g_getattr("GEOM::rotation_rate", cp, &rate);
		if (error == 0)
			vd->vdev_rotation_rate = rate;
		else
			vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
	}

	/*
	 * Refresh vd->vdev_physpath from the consumer's "GEOM::physpath"
	 * attribute and request an asynchronous config update when needed.
	 *
	 *	vd		vdev whose physical path is refreshed
	 *	cp		consumer to query
	 *	do_null_update	whether a previously unset physpath counts as
	 *			a change that requires a config update
	 *
	 * If the attribute cannot be read the vdev is left untouched.
	 */
	static void
	vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
	    boolean_t do_null_update)
	{
		boolean_t needs_update = B_FALSE;
		char *physpath;
		int error, physpath_len;

		physpath_len = MAXPATHLEN;
		physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
		error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
		if (error == 0) {
			char *old_physpath;

			/* g_topology lock ensures that vdev has not been closed */
			g_topology_assert();
			old_physpath = vd->vdev_physpath;
			vd->vdev_physpath = spa_strdup(physpath);

			if (old_physpath != NULL) {
				needs_update = (strcmp(old_physpath,
				    vd->vdev_physpath) != 0);
				spa_strfree(old_physpath);
			} else
				needs_update = do_null_update;
		}
		g_free(physpath);

		/*
		 * If the physical path changed, update the config.
		 * Only request an update for previously unset physpaths if
		 * requested by the caller.
		 */
		if (needs_update)
			spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);

	}

	static void
	vdev_geom_attrchanged(struct g_consumer cp, const char attr)
	{
	char *old_physpath;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;
	int error;

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
	return;

	SLIST_FOREACH(elem, priv, elems) {
	vdev_t *vd = elem->vd;
	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
	vdev_geom_set_rotation_rate(vd, cp);
	return;
	}
	if (strcmp(attr, "GEOM::physpath") == 0) {
	vdev_geom_set_physpath(vd, cp, /null_update/B_TRUE);
	return;
	}
	}
	}

	static void
	vdev_geom_orphan(struct g_consumer *cp)
	{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
	/* Vdev close in progress. Ignore the event. */
	return;

	/*
	* Orphan callbacks occur from the GEOM event thread.
	* Concurrent with this call, new I/O requests may be
	* working their way through GEOM about to find out
	* (only once executed by the g_down thread) that we've
	* been orphaned from our disk provider. These I/Os
	* must be retired before we can detach our consumer.
	* This is most easily achieved by acquiring the
	* SPA ZIO configuration lock as a writer, but doing
	* so with the GEOM topology lock held would cause
	* a lock order reversal. Instead, rely on the SPA's
	* async removal support to invoke a close on this
	* vdev once it is safe to do so.
	*/
	SLIST_FOREACH(elem, priv, elems) {
	vdev_t *vd = elem->vd;

	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
	}

	/*
	 * Attach to provider 'pp' and open it with g_access(cp, 1, 0, 1),
	 * reusing the existing "zfs::vdev" geom and any consumer already
	 * attached to the provider where possible.
	 *
	 *	pp	provider to attach to
	 *	vd	vdev whose vdev_tsd is set to the consumer; may be NULL
	 *	sanity	when set, reject providers whose sector size exceeds
	 *		VDEV_PAD_SIZE or is not a power of two, or whose media
	 *		size is below SPA_MINDEVSIZE
	 *
	 * Returns the consumer, or NULL on failure.  Must be called with the
	 * GEOM topology lock held.
	 */
	static struct g_consumer *
	vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
	{
		struct g_geom *gp;
		struct g_consumer *cp;
		int error;

		g_topology_assert();

		ZFS_LOG(1, "Attaching to %s.", pp->name);

		if (sanity) {
			if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
				ZFS_LOG(1, "Failing attach of %s. "
				    "Incompatible sectorsize %d\n",
				    pp->name, pp->sectorsize);
				return (NULL);
			} else if (pp->mediasize < SPA_MINDEVSIZE) {
				ZFS_LOG(1, "Failing attach of %s. "
				    "Incompatible mediasize %ju\n",
				    pp->name, pp->mediasize);
				return (NULL);
			}
		}

		/* Do we have geom already? No? Create one. */
		LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			if (strcmp(gp->name, "zfs::vdev") != 0)
				continue;
			break;
		}
		if (gp == NULL) {
			gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
			gp->orphan = vdev_geom_orphan;
			gp->attrchanged = vdev_geom_attrchanged;
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
				    __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
				    __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
		} else {
			/* Check if we are already connected to this provider. */
			LIST_FOREACH(cp, &gp->consumer, consumer) {
				if (cp->provider == pp) {
					ZFS_LOG(1, "Found consumer for %s.", pp->name);
					break;
				}
			}
			if (cp == NULL) {
				cp = g_new_consumer(gp);
				error = g_attach(cp, pp);
				if (error != 0) {
					ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
					    __func__, __LINE__, error);
					vdev_geom_detach(cp, B_FALSE);
					return (NULL);
				}
				error = g_access(cp, 1, 0, 1);
				if (error != 0) {
					ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
					    __func__, __LINE__, error);
					vdev_geom_detach(cp, B_FALSE);
					return (NULL);
				}
				ZFS_LOG(1, "Created consumer for %s.", pp->name);
			} else {
				/* existing consumer is not detached on failure */
				error = g_access(cp, 1, 0, 1);
				if (error != 0) {
					ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
					    __func__, __LINE__, error);
					return (NULL);
				}
				ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
			}
		}

		if (vd != NULL)
			vd->vdev_tsd = cp;

		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
		return (cp);
	}

	/*
	 * Drop one reference on consumer 'cp'.  When 'open_for_read' is set
	 * the read and exclusive counts taken by the attach path are released
	 * first.  Once no read or exclusive references remain, any write
	 * references are dropped and the consumer is detached and destroyed.
	 * The owning geom is withered when its consumer list becomes empty.
	 * Must be called with the GEOM topology lock held.
	 */
	static void
	vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
	{
		struct g_geom *owner;

		g_topology_assert();

		ZFS_LOG(1, "Detaching from %s.",
		    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

		/* Remember the geom now; cp may be destroyed below. */
		owner = cp->geom;

		if (open_for_read)
			g_access(cp, -1, 0, -1);

		/* Destroy consumer on last close. */
		if (cp->acr == 0 && cp->ace == 0) {
			if (cp->acw > 0)
				g_access(cp, 0, -cp->acw, 0);

			if (cp->provider != NULL) {
				ZFS_LOG(1, "Destroying consumer for %s.",
				    cp->provider->name ? cp->provider->name : "NULL");
				g_detach(cp);
			}

			g_destroy_consumer(cp);
		}

		/* Destroy geom if there are no consumers left. */
		if (LIST_EMPTY(&owner->consumer)) {
			ZFS_LOG(1, "Destroyed geom %s.", owner->name);
			g_wither_geom(owner, ENXIO);
		}
	}

	/*
	 * Close the consumer backing vd, with the GEOM topology lock already held.
	 *
	 * Clears vd->vdev_delayed_close, unlinks vd from the consumer's private
	 * list of attached vdevs, and then detaches from the consumer (dropping the
	 * read/exclusive access).  Does nothing beyond clearing the flag when the
	 * vdev has no consumer.
	 */
	static void
	vdev_geom_close_locked(vdev_t *vd)
	{
		struct g_consumer *cp;
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem, *elem_temp;

		g_topology_assert();

		cp = vd->vdev_tsd;
		vd->vdev_delayed_close = B_FALSE;
		if (cp == NULL)
			return;

		ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
		KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
		/* The list head is stored in place in the cp->private field. */
		priv = (struct consumer_priv_t*)&cp->private;
		vd->vdev_tsd = NULL;
		/* _SAFE variant: elements are freed while iterating. */
		SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
			if (elem->vd == vd) {
				SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
				g_free(elem);
			}
		}

		vdev_geom_detach(cp, B_TRUE);
	}

	/*
	 * Issue one or more bios to the vdev in parallel and wait for all of them.
	 *
	 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
	 * IO operation is described by parallel entries from each array.  A request
	 * larger than the per-bio limit is split, so there may be more bios actually
	 * issued than entries in the arrays.
	 *
	 * On return errors[i] is nonzero if any bio of command i failed or if it
	 * was already nonzero on entry.  It is a 0/1 flag (the result of a logical
	 * OR), not an errno value.
	 */
	static void
	vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
	    off_t *sizes, int *errors, int ncmds)
	{
		struct bio **bios;
		u_char *p;
		off_t off, maxio, s, end;
		int i, n_bios, j;
		size_t bios_size;

		/* Largest multiple of the sector size that does not exceed MAXPHYS. */
		maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
		n_bios = 0;

		/* How many bios are required for all commands ? */
		for (i = 0; i < ncmds; i++)
			n_bios += (sizes[i] + maxio - 1) / maxio;

		/* Allocate memory for the bios */
		bios_size = n_bios * sizeof(struct bio*);
		bios = kmem_zalloc(bios_size, KM_SLEEP);

		/* Prepare and issue all of the bios */
		for (i = j = 0; i < ncmds; i++) {
			off = offsets[i];
			p = datas[i];
			s = sizes[i];
			end = off + s;
			ASSERT((off % cp->provider->sectorsize) == 0);
			ASSERT((s % cp->provider->sectorsize) == 0);

			for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
				bios[j] = g_alloc_bio();
				bios[j]->bio_cmd = cmds[i];
				/* No completion callback: we biowait() below. */
				bios[j]->bio_done = NULL;
				bios[j]->bio_offset = off;
				bios[j]->bio_length = MIN(s, maxio);
				bios[j]->bio_data = p;
				g_io_request(bios[j], cp);
			}
		}
		ASSERT(j == n_bios);

		/*
		 * Wait for all of the bios to complete, and clean them up.  The
		 * loops mirror the issue loops above so bios[j] lines up with
		 * command i.
		 */
		for (i = j = 0; i < ncmds; i++) {
			off = offsets[i];
			s = sizes[i];
			end = off + s;

			for (; off < end; off += maxio, s -= maxio, j++) {
				errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
				g_destroy_bio(bios[j]);
			}
		}
		kmem_free(bios, bios_size);
	}

	/*
	 * Read the vdev config from a device.  Return the number of valid labels
	 * that were found.  The vdev config will be returned in *config if and only
	 * if at least one valid label was found; otherwise *config is NULL.
	 *
	 * When several labels are valid, the config from the last valid label read
	 * is the one returned and the others are freed.  The caller owns the
	 * returned nvlist and must nvlist_free() it.
	 *
	 * Must be called without the GEOM topology lock held.
	 */
	static int
	vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
	{
		struct g_provider *pp;
		nvlist_t *label_cfg;
		vdev_phys_t *vdev_lists[VDEV_LABELS];
		char *buf;
		size_t buflen;
		uint64_t psize, state, txg;
		off_t offsets[VDEV_LABELS];
		off_t size;
		off_t sizes[VDEV_LABELS];
		int cmds[VDEV_LABELS];
		int errors[VDEV_LABELS];
		int l, nlabels;

		g_topology_assert_not();

		pp = cp->provider;
		ZFS_LOG(1, "Reading config from %s...", pp->name);

		psize = pp->mediasize;
		psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

		/* sizeof(vdev_phys_t) rounded up to a whole number of sectors. */
		size = sizeof(*vdev_lists[0]) + pp->sectorsize -
		    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

		buflen = sizeof(vdev_lists[0]->vp_nvlist);

		*config = NULL;
		/* Create all of the IO requests */
		for (l = 0; l < VDEV_LABELS; l++) {
			cmds[l] = BIO_READ;
			vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
			offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
			sizes[l] = size;
			errors[l] = 0;
			ASSERT(offsets[l] % pp->sectorsize == 0);
		}

		/* Issue the IO requests */
		vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
		    VDEV_LABELS);

		/* Parse the labels */
		nlabels = 0;
		for (l = 0; l < VDEV_LABELS; l++) {
			if (errors[l] != 0)
				continue;

			buf = vdev_lists[l]->vp_nvlist;

			/*
			 * Unpack into a scratch pointer so that a bad label can
			 * never clobber (or leak) a config accepted earlier.
			 */
			label_cfg = NULL;
			if (nvlist_unpack(buf, buflen, &label_cfg, 0) != 0)
				continue;

			if (nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_POOL_STATE,
			    &state) != 0 || state > POOL_STATE_L2CACHE) {
				nvlist_free(label_cfg);
				continue;
			}

			/* Spares and L2ARC devices are exempt from the txg check. */
			if (state != POOL_STATE_SPARE &&
			    state != POOL_STATE_L2CACHE &&
			    (nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_POOL_TXG,
			    &txg) != 0 || txg == 0)) {
				nvlist_free(label_cfg);
				continue;
			}

			/* Replace any previously accepted config, freeing it. */
			if (*config != NULL)
				nvlist_free(*config);
			*config = label_cfg;

			nlabels++;
		}

		/* Free the label storage */
		for (l = 0; l < VDEV_LABELS; l++)
			kmem_free(vdev_lists[l], size);

		return (nlabels);
	}

	/*
	 * Grow the array of config pointers so that index id is valid.
	 *
	 * *configs points at an array of *count nvlist pointers (or is NULL when
	 * *count is 0).  If id already fits, nothing happens.  Otherwise a zeroed
	 * array of id + 1 slots is allocated, the existing pointers are copied
	 * over, the old array is freed, and *configs / *count are updated.
	 */
	static void
	resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
	{
		nvlist_t **new_configs;
		uint64_t i;

		if (id < *count)
			return;
		new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
		    KM_SLEEP);
		for (i = 0; i < *count; i++)
			new_configs[i] = (*configs)[i];
		/* Free the old array itself, sized by the old element count. */
		if (*configs != NULL)
			kmem_free(*configs, *count * sizeof(void *));
		*configs = new_configs;
		*count = id + 1;
	}

	/*
	 * Consider one label config for inclusion in the per-top-level-vdev array.
	 *
	 * cfg is consumed: it is either stored in (*configs)[id], where id is the
	 * ZPOOL_CONFIG_ID of its vdev tree, or freed.  A config is ignored when it
	 * lacks any required field, names a pool other than `name`, belongs to a
	 * pool guid different from the first one accepted (*known_pool_guid, 0
	 * meaning "none yet"), or is not newer (by txg) than the config already
	 * stored for the same id.
	 */
	static void
	process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
	    const char *name, uint64_t *known_pool_guid)
	{
		nvlist_t *vdev_tree;
		uint64_t pool_guid;
		uint64_t vdev_guid;
		uint64_t id, txg, known_txg;
		char *pname;

		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
		    strcmp(pname, name) != 0)
			goto ignore;

		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
			goto ignore;

		/* The value is unused; this only requires the field to exist. */
		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
			goto ignore;

		if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
			goto ignore;

		if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
			goto ignore;

		VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

		/* The first accepted config pins the pool guid for the rest. */
		if (*known_pool_guid != 0) {
			if (pool_guid != *known_pool_guid)
				goto ignore;
		} else
			*known_pool_guid = pool_guid;

		resize_configs(configs, count, id);

		/* Keep only the config with the highest txg for each id. */
		if ((*configs)[id] != NULL) {
			VERIFY(nvlist_lookup_uint64((*configs)[id],
			    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
			if (txg <= known_txg)
				goto ignore;
			nvlist_free((*configs)[id]);
		}

		(*configs)[id] = cfg;
		return;

	ignore:
		nvlist_free(cfg);
	}

	/*
	 * Taste every GEOM provider (except those of the ZFS vdev class itself)
	 * looking for labels that belong to the pool called `name`.
	 *
	 * On return *configs is an array of *count nvlist pointers indexed by
	 * top-level vdev id (entries may be NULL), built by process_vdev_config().
	 * Returns 0 if at least one config was collected, ENOENT otherwise.
	 */
	int
	vdev_geom_read_pool_label(const char *name,
	    nvlist_t ***configs, uint64_t *count)
	{
		struct g_class *mp;
		struct g_geom *gp;
		struct g_provider *pp;
		struct g_consumer *zcp;
		nvlist_t *vdev_cfg;
		uint64_t pool_guid;
		int nlabels;

		DROP_GIANT();
		g_topology_lock();

		*configs = NULL;
		*count = 0;
		pool_guid = 0;
		LIST_FOREACH(mp, &g_classes, class) {
			if (mp == &zfs_vdev_class)
				continue;
			LIST_FOREACH(gp, &mp->geom, geom) {
				if (gp->flags & G_GEOM_WITHER)
					continue;
				LIST_FOREACH(pp, &gp->provider, provider) {
					if (pp->flags & G_PF_WITHER)
						continue;
					zcp = vdev_geom_attach(pp, NULL, B_TRUE);
					if (zcp == NULL)
						continue;
					/*
					 * The label read must run without the
					 * topology lock (it asserts as much).
					 */
					g_topology_unlock();
					nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
					g_topology_lock();
					vdev_geom_detach(zcp, B_TRUE);
					if (nlabels == 0)
						continue;
					ZFS_LOG(1, "successfully read vdev config");

					/* Ownership of vdev_cfg passes to the callee. */
					process_vdev_config(configs, count,
					    vdev_cfg, name, &pool_guid);
				}
			}
		}
		g_topology_unlock();
		PICKUP_GIANT();

		return (*count > 0 ? 0 : ENOENT);
	}

	/*
	 * Quality of a provider's labels relative to the vdev being opened.
	 * Larger values are better matches; vdev_attach_ok() returns
	 * ZERO_MATCH + <number of valid labels> when the vdev guid matches, which
	 * is why ZERO_MATCH deliberately shares the value 1 with TOPGUID_MATCH.
	 */
	enum match {
	NO_MATCH = 0, /* No matching labels found */
	TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1, /* Base for the *_MATCH values; should never be returned */
	ONE_MATCH = 2, /* 1 label matching the vdev_guid */
	TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
	THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
	FULL_MATCH = 5 /* all labels match the vdev_guid */
	};

	/*
	 * Taste provider pp and grade how well its labels match vdev vd.
	 *
	 * Returns NO_MATCH if the provider cannot be attached or read, or if its
	 * pool guid or vdev guid disagree with vd; TOPGUID_MATCH if only the top
	 * guid matches and vd is its own top-level vdev; otherwise
	 * ZERO_MATCH + the number of valid labels found.
	 *
	 * Called with the GEOM topology lock held; the lock is dropped while the
	 * labels are read and re-taken afterwards.
	 */
	static enum match
	vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
	{
		nvlist_t *config;
		uint64_t pool_guid, top_guid, vdev_guid;
		struct g_consumer *cp;
		int nlabels;

		cp = vdev_geom_attach(pp, NULL, B_TRUE);
		if (cp == NULL) {
			ZFS_LOG(1, "Unable to attach tasting instance to %s.",
			    pp->name);
			return (NO_MATCH);
		}
		g_topology_unlock();
		nlabels = vdev_geom_read_config(cp, &config);
		g_topology_lock();
		vdev_geom_detach(cp, B_TRUE);
		if (nlabels == 0) {
			ZFS_LOG(1, "Unable to read config from %s.", pp->name);
			return (NO_MATCH);
		}

		/* A missing field leaves the corresponding guid at 0. */
		pool_guid = 0;
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
		top_guid = 0;
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
		vdev_guid = 0;
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
		nvlist_free(config);

		/*
		 * Check that the label's pool guid matches the desired guid.
		 * Inactive spares and L2ARCs do not have any pool guid in the label.
		 */
		if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
			ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
			    pp->name,
			    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
			return (NO_MATCH);
		}

		/*
		 * Check that the label's vdev guid matches the desired guid.
		 * The second condition handles possible race on vdev detach, when
		 * remaining vdev receives GUID of destroyed top level mirror vdev.
		 */
		if (vdev_guid == vd->vdev_guid) {
			ZFS_LOG(1, "guids match for provider %s.", pp->name);
			return (ZERO_MATCH + nlabels);
		} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
			ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
			return (TOPGUID_MATCH);
		}
		ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
		    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
		return (NO_MATCH);
	}

	/*
	 * Search every GEOM provider (outside the ZFS vdev class) for the one whose
	 * labels best match vd, and attach to it.
	 *
	 * The scan stops early on the first FULL_MATCH.  Returns the attached
	 * consumer, or NULL if no provider matched or the attach failed.
	 *
	 * Must be called with the GEOM topology lock held.
	 */
	static struct g_consumer *
	vdev_geom_attach_by_guids(vdev_t *vd)
	{
		struct g_class *mp;
		struct g_geom *gp;
		struct g_provider *pp, *best_pp;
		struct g_consumer *cp;
		enum match match, best_match;

		g_topology_assert();

		cp = NULL;
		best_pp = NULL;
		best_match = NO_MATCH;
		LIST_FOREACH(mp, &g_classes, class) {
			if (mp == &zfs_vdev_class)
				continue;
			LIST_FOREACH(gp, &mp->geom, geom) {
				if (gp->flags & G_GEOM_WITHER)
					continue;
				LIST_FOREACH(pp, &gp->provider, provider) {
					match = vdev_attach_ok(vd, pp);
					/* Strictly better only: ties keep the first hit. */
					if (match > best_match) {
						best_match = match;
						best_pp = pp;
					}
					if (match == FULL_MATCH)
						goto out;
				}
			}
		}

	out:
		if (best_pp) {
			cp = vdev_geom_attach(best_pp, vd, B_TRUE);
			if (cp == NULL) {
				printf("ZFS WARNING: Unable to attach to %s.\n",
				    best_pp->name);
			}
		}
		return (cp);
	}

	/*
	 * Locate and attach the provider for vd by guid rather than by path.  On
	 * success vd->vdev_path is replaced with "/dev/<provider name>" and the
	 * consumer is returned; on failure NULL is returned and the path is left
	 * alone.
	 *
	 * Must be called with the GEOM topology lock held.
	 */
	static struct g_consumer *
	vdev_geom_open_by_guids(vdev_t *vd)
	{
		struct g_consumer *cp;
		char *devpath;
		size_t devpath_len;

		g_topology_assert();

		ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);

		cp = vdev_geom_attach_by_guids(vd);
		if (cp == NULL) {
			ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
			    (uintmax_t)spa_guid(vd->vdev_spa),
			    (uintmax_t)vd->vdev_guid);
			return (cp);
		}

		/* Room for the "/dev/" prefix, the name and the terminating NUL. */
		devpath_len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		devpath = kmem_alloc(devpath_len, KM_SLEEP);
		snprintf(devpath, devpath_len, "/dev/%s", cp->provider->name);

		/* Swap the recorded path for the one we actually found. */
		spa_strfree(vd->vdev_path);
		vd->vdev_path = devpath;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);

		return (cp);
	}

	/*
	 * Attach to the provider named by vd->vdev_path.
	 *
	 * The provider name is taken to be everything after the first
	 * sizeof("/dev/") - 1 characters of the path.
	 * NOTE(review): this assumes the path starts with "/dev/"; the visible
	 * caller only checks that it starts with '/' - confirm.
	 *
	 * When check_guid is nonzero the provider is only accepted if
	 * vdev_attach_ok() reports a FULL_MATCH.  Returns the consumer or NULL.
	 *
	 * Must be called with the GEOM topology lock held.
	 */
	static struct g_consumer *
	vdev_geom_open_by_path(vdev_t *vd, int check_guid)
	{
		struct g_provider *pp;
		struct g_consumer *cp;

		g_topology_assert();

		cp = NULL;
		pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
		if (pp != NULL) {
			ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
			if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
				cp = vdev_geom_attach(pp, vd, B_FALSE);
		}

		return (cp);
	}

	/*
	 * Open the GEOM provider backing a leaf vdev.
	 *
	 * On success returns 0 and fills in:
	 *   *psize, *max_psize  - the provider's media size;
	 *   *logical_ashift     - log2 of max(sector size, SPA_MINBLOCKSIZE);
	 *   *physical_ashift    - log2 of the stripe size when that is a usable
	 *                         power of two with zero stripe offset, else 0.
	 * On failure returns an errno value and sets vd->vdev_stat.vs_aux.
	 *
	 * If the vdev already has a consumer (a reopen), only the size and ashift
	 * values are refreshed.
	 *
	 * NOTE(review): the early EINVAL return and the skip_open path both leave
	 * zfs_geom_probe_vdev_key set to vd; only the full open path clears it.
	 * Confirm whether that is intended.
	 */
	static int
	vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
	    uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
		struct g_provider *pp;
		struct g_consumer *cp;
		int error;

		/* Set the TLS to indicate downstack that we should not access zvols*/
		VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

		/*
		 * We must have a pathname, and it must be absolute.
		 */
		if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Reopen the device if it's not currently open. Otherwise,
		 * just update the physical size of the device.
		 */
		if ((cp = vd->vdev_tsd) != NULL) {
			ASSERT(vd->vdev_reopening);
			goto skip_open;
		}

		DROP_GIANT();
		g_topology_lock();
		error = 0;

		if (vd->vdev_spa->spa_splitting_newspa ||
		    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
		    vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) ||
		    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
			/*
			 * We are dealing with a vdev that hasn't been previously
			 * opened (since boot), and we are not loading an
			 * existing pool configuration. This looks like a
			 * vdev add operation to a new or existing pool.
			 * Assume the user knows what he/she is doing and find
			 * GEOM provider by its name, ignoring GUID mismatches.
			 *
			 * XXPOLICY: It would be safer to only allow a device
			 * that is unlabeled or labeled but missing
			 * GUID information to be opened in this fashion,
			 * unless we are doing a split, in which case we
			 * should allow any guid.
			 */
			cp = vdev_geom_open_by_path(vd, 0);
		} else {
			/*
			 * Try using the recorded path for this device, but only
			 * accept it if its label data contains the expected GUIDs.
			 */
			cp = vdev_geom_open_by_path(vd, 1);
			if (cp == NULL) {
				/*
				 * The device at vd->vdev_path doesn't have the
				 * expected GUIDs. The disks might have merely
				 * moved around so try all other GEOM providers
				 * to find one with the right GUIDs.
				 */
				cp = vdev_geom_open_by_guids(vd);
			}
		}

		/* Clear the TLS now that tasting is done */
		VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

		if (cp == NULL) {
			ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
			error = ENOENT;
		} else {
			struct consumer_priv_t *priv;
			struct consumer_vdev_elem *elem;
			int spamode;

			/*
			 * Record vd on the consumer's list of attached vdevs.
			 * The list head lives in place in cp->private.
			 */
			priv = (struct consumer_priv_t*)&cp->private;
			if (cp->private == NULL)
				SLIST_INIT(priv);
			elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
			elem->vd = vd;
			SLIST_INSERT_HEAD(priv, elem, elems);

			spamode = spa_mode(vd->vdev_spa);
			if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
			    !ISP2(cp->provider->sectorsize)) {
				ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
				    cp->provider->name);

				vdev_geom_close_locked(vd);
				error = EINVAL;
				cp = NULL;
			} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
				int i;

				/*
				 * Try up to five times to get write access,
				 * sleeping hz / 2 ticks with the topology lock
				 * dropped between attempts.
				 */
				for (i = 0; i < 5; i++) {
					error = g_access(cp, 0, 1, 0);
					if (error == 0)
						break;
					g_topology_unlock();
					tsleep(vd, 0, "vdev", hz / 2);
					g_topology_lock();
				}
				if (error != 0) {
					printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
					    cp->provider->name, error);
					vdev_geom_close_locked(vd);
					cp = NULL;
				}
			}
		}

		/* Fetch initial physical path information for this device. */
		if (cp != NULL) {
			vdev_geom_attrchanged(cp, "GEOM::physpath");

			/* Set other GEOM characteristics */
			vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
			vdev_geom_set_rotation_rate(vd, cp);
		}

		g_topology_unlock();
		PICKUP_GIANT();
		if (cp == NULL) {
			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
			return (error);
		}
	skip_open:
		pp = cp->provider;

		/*
		 * Determine the actual size of the device.
		 */
		*max_psize = *psize = pp->mediasize;

		/*
		 * Determine the device's minimum transfer size and preferred
		 * transfer size.
		 */
		*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
		*physical_ashift = 0;
		if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
		    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
			*physical_ashift = highbit(pp->stripesize) - 1;

		/*
		 * Clear the nowritecache settings, so that on a vdev_reopen()
		 * we will try again.
		 */
		vd->vdev_nowritecache = B_FALSE;

		return (0);
	}

	/*
	 * Close a leaf vdev.
	 *
	 * During a reopen the consumer is normally kept; it is only closed if it
	 * has been orphaned or its provider reports an error.  Outside a reopen
	 * the vdev is always closed.
	 */
	static void
	vdev_geom_close(vdev_t *vd)
	{
		struct g_consumer *cp;

		cp = vd->vdev_tsd;

		DROP_GIANT();
		g_topology_lock();

		if (!vd->vdev_reopening ||
		    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
		    (cp->provider != NULL && cp->provider->error != 0))))
			vdev_geom_close_locked(vd);

		g_topology_unlock();
		PICKUP_GIANT();
	}

	/*
	 * bio completion callback installed by vdev_geom_io_start().  Translates
	 * the bio's result into the zio's error, reacts to ENOTSUP and ENXIO, and
	 * hands the zio on via zio_delay_interrupt().
	 */
	static void
	vdev_geom_io_intr(struct bio *bp)
	{
		zio_t *zio = bp->bio_caller1;
		vdev_t *vd = zio->io_vd;

		/* A short transfer with no reported error is still a failure. */
		zio->io_error = bp->bio_error;
		if (zio->io_error == 0 && bp->bio_resid != 0)
			zio->io_error = SET_ERROR(EIO);

		if (zio->io_error == ENOTSUP) {
			/*
			 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
			 * that future attempts will never succeed. In this case
			 * we set a persistent flag so that we don't bother with
			 * requests in the future.
			 */
			if (bp->bio_cmd == BIO_FLUSH) {
				vd->vdev_nowritecache = B_TRUE;
			} else if (bp->bio_cmd == BIO_DELETE) {
				vd->vdev_notrim = B_TRUE;
			}
		} else if (zio->io_error == ENXIO && !vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}

		/*
		 * We have to split bio freeing into two parts, because the ABD code
		 * cannot be called in this context and vdev_op_io_done is not called
		 * for ZIO_TYPE_IOCTL zio-s.
		 */
		if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
			g_destroy_bio(bp);
			zio->io_bio = NULL;
		}
		zio_delay_interrupt(zio);
	}

	/*
	 * Start a zio on a GEOM-backed vdev.
	 *
	 * IOCTL zios: only DKIOCFLUSHWRITECACHE is supported, and it is sent as a
	 * BIO_FLUSH unless flushing is disabled or the vdev is already known not
	 * to support it.  FREE zios are sent as BIO_DELETE unless TRIM is disabled
	 * or known to be unsupported.  READ and WRITE zios are always sent.
	 * Requests that are not sent to the provider complete immediately via
	 * zio_execute() or zio_interrupt(), with io_error set where applicable.
	 *
	 * Completion of issued bios is handled by vdev_geom_io_intr().
	 */
	static void
	vdev_geom_io_start(zio_t *zio)
	{
		vdev_t *vd;
		struct g_consumer *cp;
		struct bio *bp;
		int error;

		vd = zio->io_vd;

		switch (zio->io_type) {
		case ZIO_TYPE_IOCTL:
			/* XXPOLICY */
			if (!vdev_readable(vd)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return;
			} else {
				switch (zio->io_cmd) {
				case DKIOCFLUSHWRITECACHE:
					/* Flushes disabled: succeed as a no-op. */
					if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
						break;
					if (vd->vdev_nowritecache) {
						zio->io_error = SET_ERROR(ENOTSUP);
						break;
					}
					goto sendreq;
				default:
					zio->io_error = SET_ERROR(ENOTSUP);
				}
			}

			zio_execute(zio);
			return;
		case ZIO_TYPE_FREE:
			if (vd->vdev_notrim) {
				zio->io_error = SET_ERROR(ENOTSUP);
			} else if (!vdev_geom_bio_delete_disable) {
				goto sendreq;
			}
			zio_execute(zio);
			return;
		}
	sendreq:
		ASSERT(zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE ||
		    zio->io_type == ZIO_TYPE_FREE ||
		    zio->io_type == ZIO_TYPE_IOCTL);

		cp = vd->vdev_tsd;
		if (cp == NULL) {
			/* The vdev has no consumer (it is not open). */
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}
		bp = g_alloc_bio();
		bp->bio_caller1 = zio;
		switch (zio->io_type) {
		case ZIO_TYPE_READ:
		case ZIO_TYPE_WRITE:
			zio->io_target_timestamp = zio_handle_io_delay(zio);
			bp->bio_offset = zio->io_offset;
			bp->bio_length = zio->io_size;
			/*
			 * Borrow a buffer from the ABD; it is handed back in
			 * vdev_geom_io_done().  Writes need the data copied in.
			 */
			if (zio->io_type == ZIO_TYPE_READ) {
				bp->bio_cmd = BIO_READ;
				bp->bio_data =
				    abd_borrow_buf(zio->io_abd, zio->io_size);
			} else {
				bp->bio_cmd = BIO_WRITE;
				bp->bio_data =
				    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
			}
			break;
		case ZIO_TYPE_FREE:
			bp->bio_cmd = BIO_DELETE;
			bp->bio_data = NULL;
			bp->bio_offset = zio->io_offset;
			bp->bio_length = zio->io_size;
			break;
		case ZIO_TYPE_IOCTL:
			bp->bio_cmd = BIO_FLUSH;
			bp->bio_flags |= BIO_ORDERED;
			bp->bio_data = NULL;
			bp->bio_offset = cp->provider->mediasize;
			bp->bio_length = 0;
			break;
		}
		bp->bio_done = vdev_geom_io_intr;
		zio->io_bio = bp;

		g_io_request(bp, cp);
	}

	/*
	 * Second half of bio cleanup for READ and WRITE zios: give the borrowed
	 * buffer back to the ABD and destroy the bio.  Other zio types had their
	 * bio destroyed in vdev_geom_io_intr() and must arrive here with no bio.
	 */
	static void
	vdev_geom_io_done(zio_t *zio)
	{
		struct bio *bp = zio->io_bio;

		switch (zio->io_type) {
		case ZIO_TYPE_READ:
		case ZIO_TYPE_WRITE:
			break;
		default:
			ASSERT(bp == NULL);
			return;
		}

		/* No bio was ever issued; the only expected reason is ENXIO. */
		if (bp == NULL) {
			ASSERT3S(zio->io_error, ==, ENXIO);
			return;
		}

		/* Reads must copy the data back; writes only return the buffer. */
		if (zio->io_type == ZIO_TYPE_WRITE) {
			abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
		} else {
			abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
		}

		g_destroy_bio(bp);
		zio->io_bio = NULL;
	}

	/*
	 * Hold operation for this vdev type.  The body is empty: no work is done
	 * here.
	 */
	static void
	vdev_geom_hold(vdev_t *vd)
	{
	}

	/*
	 * Release operation for this vdev type.  The body is empty: no work is
	 * done here.
	 */
	static void
	vdev_geom_rele(vdev_t *vd)
	{
	}

	/*
	 * Operations vector for GEOM-backed leaf vdevs.  The initializer is
	 * positional, so entries must stay in the order of the vdev_ops_t fields;
	 * a NULL entry supplies no handler for that slot.
	 * NOTE(review): the line carrying a leading '+' is a diff marker for an
	 * entry added by this revision - presumably a new slot in vdev_ops_t;
	 * confirm against the structure definition.
	 */
	vdev_ops_t vdev_geom_ops = {
	vdev_geom_open, /* open the backing provider */
	vdev_geom_close, /* close the backing provider */
	vdev_default_asize,
	vdev_geom_io_start, /* issue a zio as a bio */
	vdev_geom_io_done, /* finish READ/WRITE bio cleanup */
	NULL,
	vdev_geom_hold, /* no-op */
	vdev_geom_rele, /* no-op */
	+ NULL,
	VDEV_TYPE_DISK, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c (revision 332525)
	@@ -0,0 +1,1037 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
	+ */
	+
	+#include <sys/zfs_context.h>
	+#include <sys/spa.h>
	+#include <sys/spa_impl.h>
	+#include <sys/vdev_impl.h>
	+#include <sys/fs/zfs.h>
	+#include <sys/zio.h>
	+#include <sys/metaslab.h>
	+#include <sys/refcount.h>
	+#include <sys/dmu.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/dmu_tx.h>
	+#include <sys/dsl_synctask.h>
	+#include <sys/zap.h>
	+
	+/*
	+ * An indirect vdev corresponds to a vdev that has been removed. Since
	+ * we cannot rewrite block pointers of snapshots, etc., we keep a
	+ * mapping from old location on the removed device to the new location
	+ * on another device in the pool and use this mapping whenever we need
	+ * to access the DVA. Unfortunately, this mapping did not respect
	+ * logical block boundaries when it was first created, and so a DVA on
	+ * this indirect vdev may be "split" into multiple sections that each
	+ * map to a different location. As a consequence, not all DVAs can be
	+ * translated to an equivalent new DVA. Instead we must provide a
	+ * "vdev_remap" operation that executes a callback on each contiguous
	+ * segment of the new location. This function is used in multiple ways:
	+ *
	+ * - reads and repair writes to this device use the callback to create
	+ * a child io for each mapped segment.
	+ *
	+ * - frees and claims to this device use the callback to free or claim
	+ * each mapped segment. (Note that we don't actually need to claim
	+ * log blocks on indirect vdevs, because we don't allocate to
	+ * removing vdevs. However, zdb uses zio_claim() for its leak
	+ * detection.)
	+ */
	+
	+/*
	+ * "Big theory statement" for how we mark blocks obsolete.
	+ *
	+ * When a block on an indirect vdev is freed or remapped, a section of
	+ * that vdev's mapping may no longer be referenced (aka "obsolete"). We
	+ * keep track of how much of each mapping entry is obsolete. When
	+ * an entry becomes completely obsolete, we can remove it, thus reducing
	+ * the memory used by the mapping. The complete picture of obsolescence
	+ * is given by the following data structures, described below:
	+ * - the entry-specific obsolete count
	+ * - the vdev-specific obsolete spacemap
	+ * - the pool-specific obsolete bpobj
	+ *
	+ * == On disk data structures used ==
	+ *
	+ * We track the obsolete space for the pool using several objects. Each
	+ * of these objects is created on demand and freed when no longer
	+ * needed, and is assumed to be empty if it does not exist.
	+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
	+ *
	+ * - Each vic_mapping_object (associated with an indirect vdev) can
	+ * have a vimp_counts_object. This is an array of uint32_t's
	+ * with the same number of entries as the vic_mapping_object. When
	+ * the mapping is condensed, entries from the vic_obsolete_sm_object
	+ * (see below) are folded into the counts. Therefore, each
	+ * obsolete_counts entry tells us the number of bytes in the
	+ * corresponding mapping entry that were not referenced when the
	+ * mapping was last condensed.
	+ *
	+ * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
	+ * This is a space map containing an alloc entry for every DVA that
	+ * has been obsoleted since the last time this indirect vdev was
	+ * condensed. We use this object in order to improve performance
	+ * when marking a DVA as obsolete. Instead of modifying an arbitrary
	+ * offset of the vimp_counts_object, we only need to append an entry
	+ * to the end of this object. When a DVA becomes obsolete, it is
	+ * added to the obsolete space map. This happens when the DVA is
	+ * freed, remapped and not referenced by a snapshot, or the last
	+ * snapshot referencing it is destroyed.
	+ *
	+ * - Each dataset can have a ds_remap_deadlist object. This is a
	+ * deadlist object containing all blocks that were remapped in this
	+ * dataset but referenced in a previous snapshot. Blocks can only
	+ * appear on this list if they were remapped (dsl_dataset_block_remapped);
	+ * blocks that were killed in a head dataset are put on the normal
	+ * ds_deadlist and marked obsolete when they are freed.
	+ *
	+ * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
	+ * in the pool that need to be marked obsolete. When a snapshot is
	+ * destroyed, we move some of the ds_remap_deadlist to the obsolete
	+ * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
	+ * asynchronously process the obsolete bpobj, moving its entries to
	+ * the specific vdevs' obsolete space maps.
	+ *
	+ * == Summary of how we mark blocks as obsolete ==
	+ *
	+ * - When freeing a block: if any DVA is on an indirect vdev, append to
	+ * vic_obsolete_sm_object.
	+ * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
	+ * references; otherwise append to vic_obsolete_sm_object).
	+ * - When freeing a snapshot: move parts of ds_remap_deadlist to
	+ * dp_obsolete_bpobj (same algorithm as ds_deadlist).
	+ * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
	+ * individual vdev's vic_obsolete_sm_object.
	+ */
	+
	+/*
	+ * "Big theory statement" for how we condense indirect vdevs.
	+ *
	+ * Condensing an indirect vdev's mapping is the process of determining
	+ * the precise counts of obsolete space for each mapping entry (by
	+ * integrating the obsolete spacemap into the obsolete counts) and
	+ * writing out a new mapping that contains only referenced entries.
	+ *
	+ * We condense a vdev when we expect the mapping to shrink (see
	+ * vdev_indirect_should_condense()), but only perform one condense at a
	+ * time to limit the memory usage. In addition, we use a separate
	+ * open-context thread (spa_condense_indirect_thread) to incrementally
	+ * create the new mapping object in a way that minimizes the impact on
	+ * the rest of the system.
	+ *
	+ * == Generating a new mapping ==
	+ *
	+ * To generate a new mapping, we follow these steps:
	+ *
	+ * 1. Save the old obsolete space map and create a new mapping object
	+ * (see spa_condense_indirect_start_sync()). This initializes the
	+ * spa_condensing_indirect_phys with the "previous obsolete space map",
	+ * which is now read only. Newly obsolete DVAs will be added to a
	+ * new (initially empty) obsolete space map, and will not be
	+ * considered as part of this condense operation.
	+ *
	+ * 2. Construct in memory the precise counts of obsolete space for each
	+ * mapping entry, by incorporating the obsolete space map into the
	+ * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
	+ *
	+ * 3. Iterate through each mapping entry, writing to the new mapping any
	+ * entries that are not completely obsolete (i.e. which don't have
	+ * obsolete count == mapping length). (See
	+ * spa_condense_indirect_generate_new_mapping().)
	+ *
	+ * 4. Destroy the old mapping object and switch over to the new one
	+ * (spa_condense_indirect_complete_sync).
	+ *
	+ * == Restarting from failure ==
	+ *
	+ * To restart the condense when we import/open the pool, we must start
	+ * at the 2nd step above: reconstruct the precise counts in memory,
	+ * based on the space map + counts. Then in the 3rd step, we start
	+ * iterating where we left off: at vimp_max_offset of the new mapping
	+ * object.
	+ */
	+
	+/*
	+ * Global switch for condensing indirect vdev mappings.  When this is
	+ * B_FALSE, vdev_indirect_should_condense() returns B_FALSE unconditionally.
	+ */
	+boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
	+
	+/*
	+ * Condense if at least this percent of the bytes in the mapping is
	+ * obsolete. With the default of 25%, the amount of space mapped
	+ * will be reduced to 1% of its original size after at most 16
	+ * condenses. Higher values will condense less often (causing less
	+ * i/o); lower values will reduce the mapping size more quickly.
	+ */
	+int zfs_indirect_condense_obsolete_pct = 25;
	+
	+/*
	+ * Condense if the obsolete space map takes up more than this amount of
	+ * space on disk (logically). This limits the amount of disk space
	+ * consumed by the obsolete space map; the default of 1GB is small enough
	+ * that we typically don't mind "wasting" it.
	+ */
	+uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
	+
	+/*
	+ * Don't bother condensing if the mapping uses less than this amount of
	+ * memory. The default of 128KB is considered a "trivial" amount of
	+ * memory and not worth reducing.
	+ */
	+uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
	+
	+/*
	+ * This is used by the test suite so that it can ensure that certain
	+ * actions happen while in the middle of a condense (which might otherwise
	+ * complete too quickly). If used to reduce the performance impact of
	+ * condensing in production, a maximum value of 1 should be sufficient.
	+ */
	+int zfs_condense_indirect_commit_entry_delay_ticks = 0;
	+
	+/*
	+ * Mark the given offset and size as being obsolete in the given txg.
	+ *
	+ * txg must be the currently syncing txg, and vd must be either an indirect
	+ * vdev or a vdev that is being removed, with a mapping object that has an
	+ * entry for offset.  The range is only recorded (and the vdev dirtied) when
	+ * SPA_FEATURE_OBSOLETE_COUNTS is enabled; otherwise this is a no-op apart
	+ * from the sanity checks.
	+ */
	+void
	+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
	+    uint64_t txg)
	+{
	+	spa_t *spa = vd->vdev_spa;
	+	ASSERT3U(spa_syncing_txg(spa), ==, txg);
	+	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	+	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	+	ASSERT(size > 0);
	+	VERIFY(vdev_indirect_mapping_entry_for_offset(
	+	    vd->vdev_indirect_mapping, offset) != NULL);
	+
	+	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
	+		/* vdev_obsolete_lock guards the in-core segment tree. */
	+		mutex_enter(&vd->vdev_obsolete_lock);
	+		range_tree_add(vd->vdev_obsolete_segments, offset, size);
	+		mutex_exit(&vd->vdev_obsolete_lock);
	+		vdev_dirty(vd, 0, NULL, txg);
	+	}
	+}
	+
	+/*
	+ * DMU-facing wrapper around vdev_indirect_mark_obsolete(): the DMU has no
	+ * knowledge of vdev_t, so it names the range as vdev_id:offset:size and
	+ * passes the syncing tx.  The top-level vdev is looked up here and the tx
	+ * is reduced to its txg.
	+ */
	+void
	+spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
	+    uint64_t size, dmu_tx_t *tx)
	+{
	+	vdev_t *top = vdev_lookup_top(spa, vdev_id);
	+
	+	ASSERT(dmu_tx_is_syncing(tx));
	+
	+	/* The DMU can only remap indirect vdevs. */
	+	ASSERT3P(top->vdev_ops, ==, &vdev_indirect_ops);
	+
	+	uint64_t syncing_txg = dmu_tx_get_txg(tx);
	+	vdev_indirect_mark_obsolete(top, offset, size, syncing_txg);
	+}
	+
	+/*
	+ * Allocate and initialize the in-core state for a condense operation.
	+ *
	+ * Creates one (empty) list of new mapping entries per TXG_SIZE slot and
	+ * opens the mapping object named by scip_next_mapping_object in the pool's
	+ * condensing-indirect phys.  Release the result with
	+ * spa_condensing_indirect_destroy().
	+ */
	+static spa_condensing_indirect_t *
	+spa_condensing_indirect_create(spa_t *spa)
	+{
	+	spa_condensing_indirect_phys_t *scip =
	+	    &spa->spa_condensing_indirect_phys;
	+	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	+	objset_t *mos = spa->spa_meta_objset;
	+
	+	for (int i = 0; i < TXG_SIZE; i++) {
	+		list_create(&sci->sci_new_mapping_entries[i],
	+		    sizeof (vdev_indirect_mapping_entry_t),
	+		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	+	}
	+
	+	sci->sci_new_mapping =
	+	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
	+
	+	return (sci);
	+}
	+
	+/*
	+ * Tear down the in-core condense state: destroy every per-txg entry list,
	+ * close the new mapping if one is open, and free the structure itself.
	+ */
	+static void
	+spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
	+{
	+	int slot = 0;
	+
	+	while (slot < TXG_SIZE) {
	+		list_destroy(&sci->sci_new_mapping_entries[slot]);
	+		slot++;
	+	}
	+
	+	if (sci->sci_new_mapping != NULL) {
	+		vdev_indirect_mapping_close(sci->sci_new_mapping);
	+	}
	+
	+	kmem_free(sci, sizeof (*sci));
	+}
	+
	+/*
	+ * Decide, from syncing context, whether the indirect mapping of the given
	+ * vdev should be condensed now.  Returns B_TRUE when either a large enough
	+ * share of the mapped bytes is obsolete (and the mapping is not already
	+ * small), or the on-disk obsolete space map has grown too long.
	+ */
	+boolean_t
	+vdev_indirect_should_condense(vdev_t *vd)
	+{
	+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+	spa_t *spa = vd->vdev_spa;
	+
	+	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
	+
	+	/*
	+	 * Do not condense when: condensing is disabled by tunable; another
	+	 * indirect vdev is already being condensed (only one at a time);
	+	 * the pool is shutting down; or this vdev is not an indirect vdev.
	+	 * The last check matters because the mapping object size must not
	+	 * change while we are condensing, which rules out vdevs that are
	+	 * still in the middle of being removed.
	+	 */
	+	if (!zfs_condense_indirect_vdevs_enable ||
	+	    spa->spa_condensing_indirect != NULL ||
	+	    spa_shutting_down(spa) ||
	+	    vd->vdev_ops != &vdev_indirect_ops)
	+		return (B_FALSE);
	+
	+	/*
	+	 * Without an obsolete space map nothing new has been marked
	+	 * obsolete, so condensing would gain nothing.
	+	 */
	+	space_map_t *obsolete_sm = vd->vdev_obsolete_sm;
	+	if (obsolete_sm == NULL) {
	+		ASSERT0(vdev_obsolete_sm_object(vd));
	+		return (B_FALSE);
	+	}
	+
	+	ASSERT(vd->vdev_obsolete_sm != NULL);
	+
	+	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	+	    space_map_object(vd->vdev_obsolete_sm));
	+
	+	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	+	uint64_t bytes_obsolete = space_map_allocated(obsolete_sm);
	+	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	+	uint64_t obsolete_sm_size = space_map_length(obsolete_sm);
	+
	+	ASSERT3U(bytes_obsolete, <=, bytes_mapped);
	+
	+	/*
	+	 * If a high percentage of the mapped bytes is obsolete, condense
	+	 * (unless the mapping is already small enough).  This has a good
	+	 * chance of reducing the memory used by the mapping.
	+	 */
	+	uint64_t obsolete_pct = bytes_obsolete * 100 / bytes_mapped;
	+	if (obsolete_pct >= zfs_indirect_condense_obsolete_pct &&
	+	    mapping_size > zfs_condense_min_mapping_bytes) {
	+		zfs_dbgmsg("should condense vdev %llu because obsolete "
	+		    "spacemap covers %d%% of %lluMB mapping",
	+		    (u_longlong_t)vd->vdev_id,
	+		    (int)obsolete_pct,
	+		    (u_longlong_t)bytes_mapped / 1024 / 1024);
	+		return (B_TRUE);
	+	}
	+
	+	/*
	+	 * Otherwise condense only if the obsolete space map takes up too
	+	 * much space on disk, in order to free that space.
	+	 */
	+	if (obsolete_sm_size < zfs_condense_max_obsolete_bytes)
	+		return (B_FALSE);
	+
	+	zfs_dbgmsg("should condense vdev %llu because obsolete sm "
	+	    "length %lluMB >= max size %lluMB",
	+	    (u_longlong_t)vd->vdev_id,
	+	    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
	+	    (u_longlong_t)zfs_condense_max_obsolete_bytes /
	+	    1024 / 1024);
	+	return (B_TRUE);
	+}
	+
	+/*
	+ * This sync task completes (finishes) a condense, deleting the old
	+ * mapping and replacing it with the new one.
	+ *
	+ * arg is the spa_condensing_indirect_t being completed (it must be the
	+ * pool's current spa_condensing_indirect); tx is the syncing tx.  On
	+ * return the on-disk condensing record has been removed from the pool
	+ * directory, the in-core state destroyed, and the vdev config dirtied.
	+ */
	+static void
	+spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
	+{
	+	spa_condensing_indirect_t *sci = arg;
	+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+	spa_condensing_indirect_phys_t *scip =
	+	    &spa->spa_condensing_indirect_phys;
	+	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	+	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+	objset_t *mos = spa->spa_meta_objset;
	+	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	+	/* Capture both counts now; the old mapping is closed below. */
	+	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	+	uint64_t new_count =
	+	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
	+
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	+	for (int i = 0; i < TXG_SIZE; i++) {
	+		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	+	}
	+	ASSERT(vic->vic_mapping_object != 0);
	+	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	+	ASSERT(scip->scip_next_mapping_object != 0);
	+	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	+
	+	/*
	+	 * Reset vdev_indirect_mapping to refer to the new object.
	+	 */
	+	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	+	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	+	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	+	rw_exit(&vd->vdev_indirect_rwlock);
	+
	+	/* The vdev now holds the mapping; detach it from sci. */
	+	sci->sci_new_mapping = NULL;
	+	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	+	vic->vic_mapping_object = scip->scip_next_mapping_object;
	+	scip->scip_next_mapping_object = 0;
	+
	+	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	+	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+	scip->scip_prev_obsolete_sm_object = 0;
	+
	+	scip->scip_vdev = 0;
	+
	+	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	+	    DMU_POOL_CONDENSING_INDIRECT, tx));
	+	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	+	spa->spa_condensing_indirect = NULL;
	+
	+	/* Cast explicitly so every %llu argument matches its format. */
	+	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	+	    "new mapping object %llu has %llu entries "
	+	    "(was %llu entries)",
	+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
	+	    (u_longlong_t)vic->vic_mapping_object,
	+	    (u_longlong_t)new_count, (u_longlong_t)old_count);
	+
	+	vdev_config_dirty(spa->spa_root_vdev);
	+}
	+
	+/*
	+ * This sync task appends entries to the new mapping object.
	+ *
	+ * arg is the pool's spa_condensing_indirect_t.  The entries queued for
	+ * the tx's txg are handed to vdev_indirect_mapping_add_entries(), which
	+ * is expected to leave that list empty (asserted).
	+ */
	+static void
	+spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
	+{
	+	spa_condensing_indirect_t *sci = arg;
	+	uint64_t txg = dmu_tx_get_txg(tx);
	+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	+
	+	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	+	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	+	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
	+}
	+
	+/*
	+ * Open-context function that queues one entry for the new mapping.  The
	+ * entry is copied onto the list belonging to the txg of a freshly
	+ * assigned tx, and is written out later from syncing context by
	+ * spa_condense_indirect_commit_sync().
	+ */
	+static void
	+spa_condense_indirect_commit_entry(spa_t *spa,
	+    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
	+{
	+	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	+
	+	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
	+
	+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	+	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	+
	+	list_t *pending =
	+	    &sci->sci_new_mapping_entries[dmu_tx_get_txg(tx) & TXG_MASK];
	+
	+	/*
	+	 * The first entry committed in a txg is responsible for kicking
	+	 * off the sync task that writes to the MOS on our behalf.
	+	 */
	+	if (list_is_empty(pending)) {
	+		dsl_sync_task_nowait(dmu_tx_pool(tx),
	+		    spa_condense_indirect_commit_sync, sci,
	+		    0, ZFS_SPACE_CHECK_NONE, tx);
	+	}
	+
	+	vdev_indirect_mapping_entry_t *node =
	+	    kmem_alloc(sizeof (*node), KM_SLEEP);
	+	node->vime_obsolete_count = count;
	+	node->vime_mapping = *vimep;
	+	list_insert_tail(pending, node);
	+
	+	dmu_tx_commit(tx);
	+}
	+
	+/*
	+ * Walk the old mapping of vd starting at start_index and commit to the
	+ * new mapping every entry that is not entirely obsolete, i.e. whose
	+ * obsolete count is smaller than its size.  The walk stops early if the
	+ * pool starts shutting down.
	+ */
	+static void
	+spa_condense_indirect_generate_new_mapping(vdev_t *vd,
	+    uint32_t *obsolete_counts, uint64_t start_index)
	+{
	+	spa_t *spa = vd->vdev_spa;
	+	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	+	uint64_t old_num_entries =
	+	    vdev_indirect_mapping_num_entries(old_mapping);
	+	uint64_t mapi;
	+
	+	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
	+
	+	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	+	    (u_longlong_t)vd->vdev_id,
	+	    (u_longlong_t)start_index);
	+
	+	for (mapi = start_index;
	+	    mapi < old_num_entries && !spa_shutting_down(spa); mapi++) {
	+		vdev_indirect_mapping_entry_phys_t *entry =
	+		    &old_mapping->vim_entries[mapi];
	+		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
	+
	+		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
	+
	+		/* A fully obsolete entry is dropped from the new mapping. */
	+		if (obsolete_counts[mapi] >= entry_size)
	+			continue;
	+
	+		spa_condense_indirect_commit_entry(spa, entry,
	+		    obsolete_counts[mapi]);
	+
	+		/*
	+		 * This delay may be requested for testing, debugging,
	+		 * or performance reasons.
	+		 */
	+		delay(zfs_condense_indirect_commit_entry_delay_ticks);
	+	}
	+
	+	if (spa_shutting_down(spa)) {
	+		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
	+		    (u_longlong_t)vd->vdev_id,
	+		    (u_longlong_t)mapi);
	+	}
	+}
	+
	+/*
	+ * Body of the condensing thread; arg is the indirect vdev being condensed.
	+ * Loads the obsolete counts for the old mapping, works out where in the
	+ * old mapping to resume from, generates the new mapping from that index,
	+ * and (unless the pool is shutting down) runs the sync task that completes
	+ * the condense.  Before exiting it clears spa_condense_thread and
	+ * broadcasts on spa_async_cv.
	+ */
	+static void
	+spa_condense_indirect_thread(void *arg)
	+{
	+ vdev_t *vd = arg;
	+ spa_t *spa = vd->vdev_spa;
	+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	+ spa_condensing_indirect_phys_t *scip =
	+ &spa->spa_condensing_indirect_phys;
	+ uint32_t *counts;
	+ uint64_t start_index;
	+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	+ space_map_t *prev_obsolete_sm = NULL;
	+
	+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	+ ASSERT(scip->scip_next_mapping_object != 0);
	+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+
	+ for (int i = 0; i < TXG_SIZE; i++) {
	+ /*
	+ * The list must start out empty in order for the
	+ * _commit_sync() sync task to be properly registered
	+ * on the first call to _commit_entry(); so it's wise
	+ * to double check and ensure we actually are starting
	+ * with empty lists.
	+ */
	+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	+ }
	+
	+ /*
	+ * Open the previous obsolete space map and fold its contents into
	+ * the obsolete counts loaded for the old mapping.
	+ */
	+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	+ space_map_update(prev_obsolete_sm);
	+ counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	+ /*
	+ * NOTE(review): prev_obsolete_sm was already passed to
	+ * space_map_update() above, so this NULL check looks redundant.
	+ */
	+ if (prev_obsolete_sm != NULL) {
	+ vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
	+ counts, prev_obsolete_sm);
	+ }
	+ space_map_close(prev_obsolete_sm);
	+
	+ /*
	+ * Generate new mapping. Determine what index to continue from
	+ * based on the max offset that we've already written in the
	+ * new mapping.
	+ */
	+ uint64_t max_offset =
	+ vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	+ if (max_offset == 0) {
	+ /* We haven't written anything to the new mapping yet. */
	+ start_index = 0;
	+ } else {
	+ /*
	+ * Pick up from where we left off. _entry_for_offset()
	+ * returns a pointer into the vim_entries array. If
	+ * max_offset is greater than any of the mappings
	+ * contained in the table NULL will be returned and
	+ * that indicates we've exhausted our iteration of the
	+ * old_mapping.
	+ */
	+
	+ vdev_indirect_mapping_entry_phys_t *entry =
	+ vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
	+ max_offset);
	+
	+ if (entry == NULL) {
	+ /*
	+ * We've already written the whole new mapping.
	+ * This special value will cause us to skip the
	+ * generate_new_mapping step and just do the sync
	+ * task to complete the condense.
	+ */
	+ start_index = UINT64_MAX;
	+ } else {
	+ /* Convert the entry pointer back into an array index. */
	+ start_index = entry - old_mapping->vim_entries;
	+ ASSERT3U(start_index, <,
	+ vdev_indirect_mapping_num_entries(old_mapping));
	+ }
	+ }
	+
	+ spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
	+
	+ vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
	+
	+ /*
	+ * We may have bailed early from generate_new_mapping(), if
	+ * the spa is shutting down. In this case, do not complete
	+ * the condense.
	+ */
	+ if (!spa_shutting_down(spa)) {
	+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	+ spa_condense_indirect_complete_sync, sci, 0,
	+ ZFS_SPACE_CHECK_NONE));
	+ }
	+
	+ /* Publish that the thread is gone and wake anyone waiting on it. */
	+ mutex_enter(&spa->spa_async_lock);
	+ spa->spa_condense_thread = NULL;
	+ cv_broadcast(&spa->spa_async_cv);
	+ mutex_exit(&spa->spa_async_lock);
	+ thread_exit();
	+}
	+
	+/*
	+ * Sync task to begin the condensing process.
	+ *
	+ * Records the vdev, a newly allocated mapping object and the vdev's
	+ * current obsolete space map object in spa_condensing_indirect_phys,
	+ * detaches the obsolete space map from the vdev, persists the record in
	+ * the pool directory, creates the in-core condensing state, and starts
	+ * the condensing thread.  No other condense may be in progress (asserted).
	+ */
	+void
	+spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
	+{
	+	spa_t *spa = vd->vdev_spa;
	+	spa_condensing_indirect_phys_t *scip =
	+	    &spa->spa_condensing_indirect_phys;
	+
	+	ASSERT0(scip->scip_next_mapping_object);
	+	ASSERT0(scip->scip_prev_obsolete_sm_object);
	+	ASSERT0(scip->scip_vdev);
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	+	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
	+
	+	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	+	ASSERT(obsolete_sm_obj != 0);
	+
	+	scip->scip_vdev = vd->vdev_id;
	+	scip->scip_next_mapping_object =
	+	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
	+
	+	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
	+
	+	/*
	+	 * We don't need to allocate a new space map object, since
	+	 * vdev_indirect_sync_obsolete will allocate one when needed.
	+	 */
	+	space_map_close(vd->vdev_obsolete_sm);
	+	vd->vdev_obsolete_sm = NULL;
	+	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	+	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
	+
	+	/* Persist the condensing record as an array of uint64_t's. */
	+	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	+	    DMU_POOL_DIRECTORY_OBJECT,
	+	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	+	    sizeof (*scip) / sizeof (uint64_t), scip, tx));
	+
	+	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	+	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
	+
	+	/* Cast explicitly so every %llu argument matches its format. */
	+	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	+	    "posm=%llu nm=%llu",
	+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
	+	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	+	    (u_longlong_t)scip->scip_next_mapping_object);
	+
	+	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	+	spa->spa_condense_thread = thread_create(NULL, 0,
	+	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
	+}
	+
	+/*
	+ * Sync to the given vdev's obsolete space map any segments that are no longer
	+ * referenced as of the given txg.
	+ *
	+ * If the obsolete space map doesn't exist yet, create and open it.
	+ *
	+ * The pending segments in vd->vdev_obsolete_segments are written as
	+ * SM_ALLOC records and the range tree is then emptied.
	+ */
	+void
	+vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
	+{
	+	spa_t *spa = vd->vdev_spa;
	+	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+
	+	ASSERT3U(vic->vic_mapping_object, !=, 0);
	+	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	+	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	+	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	+
	+	if (vdev_obsolete_sm_object(vd) == 0) {
	+		uint64_t obsolete_sm_object =
	+		    space_map_alloc(spa->spa_meta_objset, tx);
	+
	+		/* Record the new space map object in the vdev's ZAP. */
	+		ASSERT(vd->vdev_top_zap != 0);
	+		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	+		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
	+		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
	+		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
	+
	+		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
	+		    spa->spa_meta_objset, obsolete_sm_object,
	+		    0, vd->vdev_asize, 0));
	+		space_map_update(vd->vdev_obsolete_sm);
	+	}
	+
	+	ASSERT(vd->vdev_obsolete_sm != NULL);
	+	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	+	    space_map_object(vd->vdev_obsolete_sm));
	+
	+	space_map_write(vd->vdev_obsolete_sm,
	+	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	+	space_map_update(vd->vdev_obsolete_sm);
	+	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
	+}
	+
	+/*
	+ * Load the on-disk condensing record, if any, into
	+ * spa_condensing_indirect_phys.  When a record exists and the pool is
	+ * writeable, the in-core condensing state is created as well.
	+ *
	+ * Returns 0 on success, including when no condense is in progress
	+ * (ENOENT from the lookup); any other lookup error is returned as is.
	+ */
	+int
	+spa_condense_init(spa_t *spa)
	+{
	+	int error = zap_lookup(spa->spa_meta_objset,
	+	    DMU_POOL_DIRECTORY_OBJECT,
	+	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	+	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	+	    &spa->spa_condensing_indirect_phys);
	+
	+	/* No record means no condense was in progress. */
	+	if (error == ENOENT)
	+		return (0);
	+	if (error != 0)
	+		return (error);
	+
	+	if (spa_writeable(spa)) {
	+		spa->spa_condensing_indirect =
	+		    spa_condensing_indirect_create(spa);
	+	}
	+	return (0);
	+}
	+
	+/*
	+ * Release the in-core condensing state, if there is any.
	+ */
	+void
	+spa_condense_fini(spa_t *spa)
	+{
	+	if (spa->spa_condensing_indirect == NULL)
	+		return;
	+
	+	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	+	spa->spa_condensing_indirect = NULL;
	+}
	+
	+/*
	+ * Restart the condense - called when the pool is opened.  Looks up the
	+ * vdev named by the on-disk condensing record (holding SCL_VDEV as
	+ * reader for the lookup) and starts the condensing thread on it.
	+ */
	+void
	+spa_condense_indirect_restart(spa_t *spa)
	+{
	+	ASSERT(spa->spa_condensing_indirect != NULL);
	+
	+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	+	uint64_t vdev_id = spa->spa_condensing_indirect_phys.scip_vdev;
	+	vdev_t *top = vdev_lookup_top(spa, vdev_id);
	+	ASSERT(top != NULL);
	+	spa_config_exit(spa, SCL_VDEV, FTAG);
	+
	+	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	+	spa->spa_condense_thread = thread_create(NULL, 0,
	+	    spa_condense_indirect_thread, top, 0, &p0, TS_RUN,
	+	    minclsyspri);
	+}
	+
	+/*
	+ * Gets the obsolete spacemap object from the vdev's ZAP.
	+ * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
	+ * exist yet.
	+ *
	+ * NOTE(review): the object number is looked up as a uint64_t but returned
	+ * through an int, so a value that does not fit in an int would be
	+ * truncated.  Widening the return type means changing the prototype as
	+ * well - confirm against the header before touching it.
	+ */
	+int
	+vdev_obsolete_sm_object(vdev_t *vd)
	+{
	+	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	+	if (vd->vdev_top_zap == 0) {
	+		return (0);
	+	}
	+
	+	/* Stays 0 when the lookup finds no entry. */
	+	uint64_t sm_obj = 0;
	+	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	+	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
	+
	+	ASSERT(err == 0 || err == ENOENT);
	+
	+	return (sm_obj);
	+}
	+
	+/*
	+ * Returns B_TRUE if the vdev's top-level ZAP holds a nonzero
	+ * VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE entry, and B_FALSE if the
	+ * entry is absent, zero, or the ZAP doesn't exist yet.
	+ */
	+boolean_t
	+vdev_obsolete_counts_are_precise(vdev_t *vd)
	+{
	+	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	+	if (vd->vdev_top_zap == 0) {
	+		return (B_FALSE);
	+	}
	+
	+	/* Stays 0 when the lookup finds no entry. */
	+	uint64_t val = 0;
	+	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	+	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
	+
	+	ASSERT(err == 0 || err == ENOENT);
	+
	+	return (val != 0);
	+}
	+
	+/*
	+ * Close operation for indirect vdevs.  Intentionally empty: it is
	+ * installed in vdev_indirect_ops only to fill the close slot.
	+ */
	+/* ARGSUSED */
	+static void
	+vdev_indirect_close(vdev_t *vd)
	+{
	+}
	+
	+/*
	+ * I/O completion operation for indirect vdevs.  Intentionally empty: it
	+ * is installed in vdev_indirect_ops only to fill the io_done slot.
	+ */
	+/* ARGSUSED */
	+static void
	+vdev_indirect_io_done(zio_t *zio)
	+{
	+}
	+
	+/*
	+ * Open operation for indirect vdevs.  Nothing is actually opened; the
	+ * sizes and ashifts are reported from values already stored in the vdev.
	+ * Both *psize and *max_psize are set to vdev_asize plus the space
	+ * reserved for the labels at the start and end.  Always returns 0.
	+ */
	+/* ARGSUSED */
	+static int
	+vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
	+    uint64_t *logical_ashift, uint64_t *physical_ashift)
	+{
	+	*psize = *max_psize = vd->vdev_asize +
	+	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	+	*logical_ashift = vd->vdev_ashift;
	+	*physical_ashift = vd->vdev_physical_ashift;
	+	return (0);
	+}
	+
	+/*
	+ * One pending piece of work for vdev_indirect_remap(): a contiguous range
	+ * on an indirect vdev that still has to be translated through that vdev's
	+ * mapping.  Segments are kept on a list that is used as a stack.
	+ */
	+typedef struct remap_segment {
	+ vdev_t *rs_vd; /* vdev whose mapping translates this segment */
	+ uint64_t rs_offset; /* current offset on rs_vd */
	+ uint64_t rs_asize; /* bytes still to be remapped */
	+ uint64_t rs_split_offset; /* offset passed to the callback */
	+ list_node_t rs_node; /* linkage on the remap stack */
	+} remap_segment_t;
	+
	+/*
	+ * Allocate a remap_segment_t describing asize bytes at offset on vd,
	+ * tagged with the given split offset.  rs_node is left uninitialized.
	+ * The caller owns the result and frees it with kmem_free().
	+ */
	+remap_segment_t *
	+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
	+{
	+	remap_segment_t *seg = kmem_alloc(sizeof (*seg), KM_SLEEP);
	+
	+	seg->rs_split_offset = split_offset;
	+	seg->rs_asize = asize;
	+	seg->rs_offset = offset;
	+	seg->rs_vd = vd;
	+
	+	return (seg);
	+}
	+
	+/*
	+ * Goes through the relevant indirect mappings until it hits a concrete vdev
	+ * and issues the callback. On the way to the concrete vdev, if any other
	+ * indirect vdevs are encountered, then the callback will also be called on
	+ * each of those indirect vdevs. For example, if the segment is mapped to
	+ * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
	+ * mapped to segment B on concrete vdev 2, then the callback will be called on
	+ * both vdev 1 and vdev 2.
	+ *
	+ * While the callback passed to vdev_indirect_remap() is called on every vdev
	+ * the function encounters, certain callbacks only care about concrete vdevs.
	+ * These types of callbacks should return immediately and explicitly when they
	+ * are called on an indirect vdev.
	+ *
	+ * Because there is a possibility that a DVA section in the indirect device
	+ * has been split into multiple sections in our mapping, we keep track
	+ * of the relevant contiguous segments of the new location (remap_segment_t)
	+ * in a stack. This way we can call the callback for each of the new sections
	+ * created by a single section of the indirect device. Note though, that in
	+ * this scenario the callbacks in each split block won't occur in-order in
	+ * terms of offset, so callers should not make any assumptions about that.
	+ *
	+ * For callbacks that don't handle split blocks and immediately return when
	+ * they encounter them (as is the case for remap_blkptr_cb), the caller can
	+ * assume that its callback will be applied from the first indirect vdev
	+ * encountered to the last one and then the concrete vdev, in that order.
	+ *
	+ * The callback receives, in order: the split offset of the piece within
	+ * the original segment, the destination vdev, the offset on that vdev,
	+ * the size of the piece, and the caller's arg.
	+ */
	+static void
	+vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
	+    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
	+{
	+	list_t stack;
	+	spa_t *spa = vd->vdev_spa;
	+
	+	list_create(&stack, sizeof (remap_segment_t),
	+	    offsetof(remap_segment_t, rs_node));
	+
	+	/*
	+	 * Start with the caller's segment and keep going until the stack
	+	 * of segments that landed on further indirect vdevs is drained.
	+	 */
	+	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	+	    rs != NULL; rs = list_remove_head(&stack)) {
	+		vdev_t *v = rs->rs_vd;
	+
	+		/*
	+		 * Note: this can be called from open context
	+		 * (eg. zio_read()), so we need the rwlock to prevent
	+		 * the mapping from being changed by condensing.
	+		 */
	+		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
	+		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
	+		ASSERT3P(vim, !=, NULL);
	+
	+		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	+		ASSERT(rs->rs_asize > 0);
	+
	+		vdev_indirect_mapping_entry_phys_t *mapping =
	+		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
	+		ASSERT3P(mapping, !=, NULL);
	+
	+		/* Consume the segment one mapping entry at a time. */
	+		while (rs->rs_asize > 0) {
	+			/*
	+			 * Note: the vdev_indirect_mapping can not change
	+			 * while we are running. It only changes while the
	+			 * removal is in progress, and then only from syncing
	+			 * context. While a removal is in progress, this
	+			 * function is only called for frees, which also only
	+			 * happen from syncing context.
	+			 */
	+
	+			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
	+			uint64_t dst_offset =
	+			    DVA_GET_OFFSET(&mapping->vimep_dst);
	+			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
	+
	+			ASSERT3U(rs->rs_offset, >=,
	+			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
	+			ASSERT3U(rs->rs_offset, <,
	+			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
	+			ASSERT3U(dst_vdev, !=, v->vdev_id);
	+
	+			/* Portion of this entry covered by the segment. */
	+			uint64_t inner_offset = rs->rs_offset -
	+			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
	+			uint64_t inner_size =
	+			    MIN(rs->rs_asize, size - inner_offset);
	+
	+			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
	+			ASSERT3P(dst_v, !=, NULL);
	+
	+			/* Landed on another indirect vdev: remap it later. */
	+			if (dst_v->vdev_ops == &vdev_indirect_ops) {
	+				list_insert_head(&stack,
	+				    rs_alloc(dst_v, dst_offset + inner_offset,
	+				    inner_size, rs->rs_split_offset));
	+
	+			}
	+
	+			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
	+			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
	+				/*
	+				 * Note: This clause exists solely for
	+				 * testing purposes. We use it to ensure that
	+				 * split blocks work and that the callbacks
	+				 * using them yield the same result if issued
	+				 * in reverse order.
	+				 */
	+				uint64_t inner_half = inner_size / 2;
	+
	+				func(rs->rs_split_offset + inner_half, dst_v,
	+				    dst_offset + inner_offset + inner_half,
	+				    inner_half, arg);
	+
	+				func(rs->rs_split_offset, dst_v,
	+				    dst_offset + inner_offset,
	+				    inner_half, arg);
	+			} else {
	+				func(rs->rs_split_offset, dst_v,
	+				    dst_offset + inner_offset,
	+				    inner_size, arg);
	+			}
	+
	+			/* Advance to the next mapping entry. */
	+			rs->rs_offset += inner_size;
	+			rs->rs_asize -= inner_size;
	+			rs->rs_split_offset += inner_size;
	+			mapping++;
	+		}
	+
	+		rw_exit(&v->vdev_indirect_rwlock);
	+		kmem_free(rs, sizeof (remap_segment_t));
	+	}
	+	list_destroy(&stack);
	+}
	+
	+/*
	+ * Completion callback for the child zios issued by
	+ * vdev_indirect_io_start_cb().  Folds the child's error into the parent
	+ * zio (found in io_private) under the parent's io_lock, then drops the
	+ * child's abd.
	+ */
	+static void
	+vdev_indirect_child_io_done(zio_t *zio)
	+{
	+	zio_t *parent = zio->io_private;
	+
	+	mutex_enter(&parent->io_lock);
	+	parent->io_error = zio_worst_error(parent->io_error, zio->io_error);
	+	mutex_exit(&parent->io_lock);
	+
	+	abd_put(zio->io_abd);
	+}
	+
	+/*
	+ * vdev_indirect_remap() callback used by vdev_indirect_io_start(); arg is
	+ * the parent zio.  For a concrete destination vdev, issues a child zio
	+ * covering size bytes at offset, backed by the parent's abd starting at
	+ * split_offset.  Calls made for intermediate indirect vdevs are ignored.
	+ */
	+static void
	+vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
	+    uint64_t size, void *arg)
	+{
	+	zio_t *pio = arg;
	+
	+	ASSERT3P(vd, !=, NULL);
	+
	+	if (vd->vdev_ops != &vdev_indirect_ops) {
	+		zio_nowait(zio_vdev_child_io(pio, NULL, vd, offset,
	+		    abd_get_offset(pio->io_abd, split_offset),
	+		    size, pio->io_type, pio->io_priority,
	+		    0, vdev_indirect_child_io_done, pio));
	+	}
	+}
	+
	+/*
	+ * I/O start operation for indirect vdevs.  Remaps the zio's range through
	+ * the indirect mapping, issuing child I/Os via
	+ * vdev_indirect_io_start_cb(), and then continues executing the zio.
	+ * Anything other than a read must be a write flagged as self-heal or
	+ * induced damage (asserted).
	+ */
	+static void
	+vdev_indirect_io_start(zio_t *zio)
	+{
	+	spa_t *spa = zio->io_spa;
	+
	+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	+	if (zio->io_type != ZIO_TYPE_READ) {
	+		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	+		ASSERT((zio->io_flags &
	+		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	+	}
	+
	+	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	+	    vdev_indirect_io_start_cb, zio);
	+
	+	zio_execute(zio);
	+}
	+
	+/*
	+ * Operations vector for indirect vdevs.  The initializer is positional,
	+ * so the order of the entries must match the layout of vdev_ops_t.
	+ * NOTE(review): the purpose of the three NULL slots is not visible from
	+ * this file - confirm against the vdev_ops_t definition.
	+ */
	+vdev_ops_t vdev_indirect_ops = {
	+ vdev_indirect_open,
	+ vdev_indirect_close,
	+ vdev_default_asize,
	+ vdev_indirect_io_start,
	+ vdev_indirect_io_done,
	+ NULL,
	+ NULL,
	+ NULL,
	+ vdev_indirect_remap,
	+ VDEV_TYPE_INDIRECT, /* name of this vdev type */
	+ B_FALSE /* leaf vdev */
	+};

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c (revision 332525)
	@@ -0,0 +1,212 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2015 by Delphix. All rights reserved.
	+ */
	+
	+#include <sys/dmu_tx.h>
	+#include <sys/spa.h>
	+#include <sys/dmu.h>
	+#include <sys/dsl_pool.h>
	+#include <sys/vdev_indirect_births.h>
	+
	+/*
	+ * Sanity-check an open births handle: every field must be populated, and
	+ * the in-core entries array must exist exactly when the on-disk count is
	+ * nonzero.  Always returns B_TRUE so that it can be wrapped in ASSERT().
	+ */
	+static boolean_t
	+vdev_indirect_births_verify(vdev_indirect_births_t *vib)
	+{
	+	ASSERT3P(vib, !=, NULL);
	+
	+	ASSERT3U(vib->vib_object, !=, 0);
	+	ASSERT3P(vib->vib_objset, !=, NULL);
	+	ASSERT3P(vib->vib_phys, !=, NULL);
	+	ASSERT3P(vib->vib_dbuf, !=, NULL);
	+
	+	EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
	+
	+	return (B_TRUE);
	+}
	+
	+/*
	+ * Returns the number of entries recorded in the births object.
	+ */
	+uint64_t
	+vdev_indirect_births_count(vdev_indirect_births_t *vib)
	+{
	+	ASSERT(vdev_indirect_births_verify(vib));
	+
	+	uint64_t count = vib->vib_phys->vib_count;
	+	return (count);
	+}
	+
	+/*
	+ * Returns the object number backing this births handle.
	+ */
	+uint64_t
	+vdev_indirect_births_object(vdev_indirect_births_t *vib)
	+{
	+	ASSERT(vdev_indirect_births_verify(vib));
	+
	+	uint64_t object = vib->vib_object;
	+	return (object);
	+}
	+
	+/*
	+ * Size in bytes of the entries array: the on-disk entry count times the
	+ * size of one entry.  Does not verify the handle.
	+ */
	+static uint64_t
	+vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
	+{
	+	return (vib->vib_phys->vib_count * sizeof (vib->vib_entries[0]));
	+}
	+
	+/*
	+ * Close a births handle: free the in-core entries array (if any), release
	+ * the bonus buffer hold taken with the handle as tag, and free the handle.
	+ */
	+void
	+vdev_indirect_births_close(vdev_indirect_births_t *vib)
	+{
	+	ASSERT(vdev_indirect_births_verify(vib));
	+
	+	/* The entries array only exists when there is at least one entry. */
	+	if (vib->vib_phys->vib_count > 0) {
	+		kmem_free(vib->vib_entries,
	+		    vdev_indirect_births_size_impl(vib));
	+		vib->vib_entries = NULL;
	+	}
	+
	+	dmu_buf_rele(vib->vib_dbuf, vib);
	+
	+	/* Scrub the handle before giving its memory back. */
	+	vib->vib_phys = NULL;
	+	vib->vib_dbuf = NULL;
	+	vib->vib_object = 0;
	+	vib->vib_objset = NULL;
	+
	+	kmem_free(vib, sizeof (*vib));
	+}
	+
	+/*
	+ * Allocate a new births object in the given objset, from syncing context.
	+ * The bonus buffer is sized for a vdev_indirect_birth_phys_t.
	+ * Returns the new object number.
	+ */
	+uint64_t
	+vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
	+{
	+	ASSERT(dmu_tx_is_syncing(tx));
	+
	+	return (dmu_object_alloc(os,
	+	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	+	    DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
	+	    tx));
	+}
	+
	+/*
	+ * Open the births object births_object in objset os.  Holds its bonus
	+ * buffer (tagged with the handle) and, if the object has any entries,
	+ * reads them all into a freshly allocated in-core array.
	+ *
	+ * Returns the new handle; release it with vdev_indirect_births_close().
	+ */
	+vdev_indirect_births_t *
	+vdev_indirect_births_open(objset_t *os, uint64_t births_object)
	+{
	+	/* Size the allocation from the pointee, not the pointer. */
	+	vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
	+
	+	vib->vib_objset = os;
	+	vib->vib_object = births_object;
	+
	+	VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
	+	vib->vib_phys = vib->vib_dbuf->db_data;
	+
	+	if (vib->vib_phys->vib_count > 0) {
	+		uint64_t births_size = vdev_indirect_births_size_impl(vib);
	+		vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
	+		VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
	+		    births_size, vib->vib_entries, DMU_READ_PREFETCH));
	+	}
	+
	+	ASSERT(vdev_indirect_births_verify(vib));
	+
	+	return (vib);
	+}
	+
	+/*
	+ * Free the births object with the given object number from objset os in
	+ * tx.  Failure to free the object is fatal (VERIFY0).
	+ */
	+void
	+vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
	+{
	+	VERIFY0(dmu_object_free(os, object, tx));
	+}
	+
	+/*
	+ * Append one entry (max_offset, txg) to the births object, from syncing
	+ * context.  The entry is written at the end of the on-disk array, the
	+ * on-disk count is bumped, and the in-core array is replaced by a copy
	+ * that is one entry longer.
	+ */
	+void
	+vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
	+    uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
	+{
	+	ASSERT(dmu_tx_is_syncing(tx));
	+	ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
	+	ASSERT(vdev_indirect_births_verify(vib));
	+
	+	/* The count lives in the bonus buffer, which is modified below. */
	+	dmu_buf_will_dirty(vib->vib_dbuf, tx);
	+
	+	vdev_indirect_birth_entry_phys_t vibe;
	+	vibe.vibe_offset = max_offset;
	+	vibe.vibe_phys_birth_txg = txg;
	+
	+	/* The old size doubles as the write offset of the new entry. */
	+	uint64_t old_size = vdev_indirect_births_size_impl(vib);
	+	dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
	+	    &vibe, tx);
	+	vib->vib_phys->vib_count++;
	+	uint64_t new_size = vdev_indirect_births_size_impl(vib);
	+
	+	vdev_indirect_birth_entry_phys_t *grown =
	+	    kmem_alloc(new_size, KM_SLEEP);
	+	if (old_size > 0) {
	+		bcopy(vib->vib_entries, grown, old_size);
	+		kmem_free(vib->vib_entries, old_size);
	+	}
	+	grown[vib->vib_phys->vib_count - 1] = vibe;
	+	vib->vib_entries = grown;
	+}
	+
	+/*
	+ * Returns the physical birth txg stored in the last entry.  The births
	+ * object must contain at least one entry (asserted).
	+ */
	+uint64_t
	+vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
	+{
	+	ASSERT(vdev_indirect_births_verify(vib));
	+	ASSERT(vib->vib_phys->vib_count > 0);
	+
	+	uint64_t last_index = vib->vib_phys->vib_count - 1;
	+	return (vib->vib_entries[last_index].vibe_phys_birth_txg);
	+}
	+
	+/*
	+ * Return the txg in which the given range was copied (i.e. its physical
	+ * birth txg). The specified offset+asize must be contiguously mapped
	+ * (i.e. not a split block).
	+ *
	+ * The entries are sorted by increasing phys_birth, and also by increasing
	+ * offset. We find the specified offset by binary search. Note that we
	+ * can not use bsearch() because looking at each entry independently is
	+ * insufficient to find the correct entry. Each entry implicitly relies
	+ * on the previous entry: an entry indicates that the offsets from the
	+ * end of the previous entry to the end of this entry were written in the
	+ * specified txg.
	+ *
	+ * offset must be below the last entry's offset (asserted); if no entry
	+ * matches, the function asserts and returns (uint64_t)-1.
	+ */
	+uint64_t
	+vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
	+    uint64_t asize)
	+{
	+	vdev_indirect_birth_entry_phys_t *base;
	+	vdev_indirect_birth_entry_phys_t *last;
	+
	+	ASSERT(vdev_indirect_births_verify(vib));
	+	ASSERT(vib->vib_phys->vib_count > 0);
	+
	+	base = vib->vib_entries;
	+	last = base + vib->vib_phys->vib_count - 1;
	+
	+	ASSERT3U(offset, <, last->vibe_offset);
	+
	+	while (last >= base) {
	+		vdev_indirect_birth_entry_phys_t *p =
	+		    base + ((last - base) / 2);
	+		if (offset >= p->vibe_offset) {
	+			/* offset lies beyond this entry's range. */
	+			base = p + 1;
	+		} else if (p == vib->vib_entries ||
	+		    offset >= (p - 1)->vibe_offset) {
	+			/* offset is past the previous entry: found it. */
	+			ASSERT3U(offset + asize, <=, p->vibe_offset);
	+			return (p->vibe_phys_birth_txg);
	+		} else {
	+			last = p - 1;
	+		}
	+	}
	+	ASSERT(!"offset not found");
	+	return (-1);
	+}

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c (revision 332525)
	@@ -0,0 +1,594 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * This file and its contents are supplied under the terms of the
	+ * Common Development and Distribution License ("CDDL"), version 1.0.
	+ * You may only use this file in accordance with the terms of version
	+ * 1.0 of the CDDL.
	+ *
	+ * A full copy of the text of the CDDL should have accompanied this
	+ * source. A copy of the CDDL is also available via the Internet at
	+ * http://www.illumos.org/license/CDDL.
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2015 by Delphix. All rights reserved.
	+ */
	+
	+#include <sys/dmu_tx.h>
	+#include <sys/dsl_pool.h>
	+#include <sys/spa.h>
	+#include <sys/vdev_impl.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/zfeature.h>
	+#include <sys/dmu_objset.h>
	+
	+/*
	+ * Debug-time consistency check for an open indirect mapping handle.
	+ * Asserts that the object number, objset, phys pointer and bonus dbuf are
	+ * set, that the in-core entry array exists exactly when the on-disk entry
	+ * count is nonzero, that vimp_max_offset covers the end of the last entry,
	+ * and that a counts object is recorded whenever vim_havecounts is set.
	+ *
	+ * Always returns B_TRUE so that callers can wrap the call in ASSERT().
	+ */
	+static boolean_t
	+vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vim != NULL);
	+
	+ ASSERT(vim->vim_object != 0);
	+ ASSERT(vim->vim_objset != NULL);
	+ ASSERT(vim->vim_phys != NULL);
	+ ASSERT(vim->vim_dbuf != NULL);
	+
	+ /* The entry array is allocated iff there is at least one entry. */
	+ EQUIV(vim->vim_phys->vimp_num_entries > 0,
	+ vim->vim_entries != NULL);
	+ if (vim->vim_phys->vimp_num_entries > 0) {
	+ vdev_indirect_mapping_entry_phys_t *last_entry =
	+ &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
	+ uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
	+ uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
	+
	+ /* max_offset must reach at least the end of the last entry. */
	+ ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
	+ }
	+ if (vim->vim_havecounts) {
	+ ASSERT(vim->vim_phys->vimp_counts_object != 0);
	+ }
	+
	+ return (B_TRUE);
	+}
	+
	+/*
	+ * Returns the number of entries recorded in the mapping's phys structure.
	+ */
	+uint64_t
	+vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ return (vim->vim_phys->vimp_num_entries);
	+}
	+
	+/*
	+ * Returns vimp_max_offset from the mapping's phys structure.
	+ */
	+uint64_t
	+vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ return (vim->vim_phys->vimp_max_offset);
	+}
	+
	+/*
	+ * Returns the object number that this mapping handle was opened on.
	+ */
	+uint64_t
	+vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ return (vim->vim_object);
	+}
	+
	+/*
	+ * Returns vimp_bytes_mapped from the mapping's phys structure.
	+ */
	+uint64_t
	+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ return (vim->vim_phys->vimp_bytes_mapped);
	+}
	+
	+/*
	+ * The length (in bytes) of the mapping object array in memory and
	+ * (logically) on disk.
	+ *
	+ * Note that unlike most of our accessor functions,
	+ * we don't assert that the struct is consistent; therefore it can be
	+ * called while there may be concurrent changes, if we don't care about
	+ * the value being immediately stale (e.g. from spa_removal_get_stats()).
	+ *
	+ * The result is the on-disk entry count multiplied by the size of one
	+ * element of the in-core entry array.
	+ */
	+uint64_t
	+vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
	+{
	+ return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
	+}
	+
	+/*
	+ * Compare an offset with an indirect mapping entry; there are three
	+ * possible scenarios:
	+ *
	+ * 1. The offset is "less than" the mapping entry; meaning the
	+ * offset is less than the source offset of the mapping entry. In
	+ * this case, there is no overlap between the offset and the
	+ * mapping entry and -1 will be returned.
	+ *
	+ * 2. The offset is "greater than" the mapping entry; meaning the
	+ * offset is greater than the mapping entry's source offset plus
	+ * the entry's size. In this case, there is no overlap between
	+ * the offset and the mapping entry and 1 will be returned.
	+ *
	+ * NOTE: If the offset is actually equal to the entry's offset
	+ * plus size, this is considered to be "greater" than the entry,
	+ * and this case applies (i.e. 1 will be returned). Thus, the
	+ * entry's "range" can be considered to be inclusive at its
	+ * start, but exclusive at its end: e.g. [src, src + size).
	+ *
	+ * 3. The last case to consider is if the offset actually falls
	+ * within the mapping entry's range. If this is the case, the
	+ * offset is considered to be "equal to" the mapping entry and
	+ * 0 will be returned.
	+ *
	+ * NOTE: If the offset is equal to the entry's source offset,
	+ * this case applies and 0 will be returned. If the offset is
	+ * equal to the entry's source plus its size, this case does
	+ * not apply (see "NOTE" above for scenario 2), and 1 will be
	+ * returned.
	+ */
	+static int
	+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
	+{
	+ /* v_key points at the uint64_t offset being looked up. */
	+ const uint64_t *key = v_key;
	+ /* v_array_elem points at the mapping entry to compare against. */
	+ const vdev_indirect_mapping_entry_phys_t *array_elem =
	+ v_array_elem;
	+ uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
	+
	+ if (*key < src_offset) {
	+ /* Offset lies before the entry. */
	+ return (-1);
	+ } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
	+ /* Offset lies within [src, src + size). */
	+ return (0);
	+ } else {
	+ /* Offset lies at or past the (exclusive) end of the entry. */
	+ return (1);
	+ }
	+}
	+
	+/*
	+ * Returns the mapping entry for the given offset.
	+ *
	+ * It's possible that the given offset will not be in the mapping table
	+ * (i.e. no mapping entries contain this offset), in which case, the
	+ * return value depends on the "next_if_missing" parameter.
	+ *
	+ * If the offset is not found in the table and "next_if_missing" is
	+ * B_FALSE, then NULL will always be returned. The behavior is intended
	+ * to allow consumers to get the entry corresponding to the offset
	+ * parameter, iff the offset overlaps with an entry in the table.
	+ *
	+ * If the offset is not found in the table and "next_if_missing" is
	+ * B_TRUE, then the entry nearest to the given offset will be returned,
	+ * such that the entry's source offset is greater than the offset
	+ * passed in (i.e. the "next" mapping entry in the table is returned, if
	+ * the offset is missing from the table). If there are no entries whose
	+ * source offset is greater than the passed in offset, NULL is returned.
	+ */
	+static vdev_indirect_mapping_entry_phys_t *
	+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
	+ uint64_t offset, boolean_t next_if_missing)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+ ASSERT(vim->vim_phys->vimp_num_entries > 0);
	+
	+ /*
	+ * Binary search over the half-open index range [base, limit).
	+ *
	+ * Invariant: every entry below "base" lies entirely before the
	+ * offset, and every entry at or above "limit" starts after it.
	+ *
	+ * A half-open range is used deliberately: the indices are unsigned,
	+ * so narrowing a closed range with "last = mid - 1" wraps around to
	+ * UINT64_MAX when mid is 0 (i.e. when the offset precedes the first
	+ * entry), which would keep the loop running and index far outside
	+ * the entry array.
	+ */
	+ uint64_t base = 0;
	+ uint64_t limit = vim->vim_phys->vimp_num_entries;
	+
	+ while (base < limit) {
	+ uint64_t mid = base + ((limit - base) >> 1);
	+ int result = dva_mapping_overlap_compare(&offset,
	+ &vim->vim_entries[mid]);
	+
	+ if (result == 0) {
	+ /* The offset falls within this entry. */
	+ return (&vim->vim_entries[mid]);
	+ } else if (result < 0) {
	+ limit = mid;
	+ } else {
	+ base = mid + 1;
	+ }
	+ }
	+
	+ /* No entry contains the offset. */
	+ if (!next_if_missing)
	+ return (NULL);
	+
	+ /*
	+ * The search range is now empty, so by the invariant above "base"
	+ * is the index of the first entry whose source offset is greater
	+ * than the offset we were given (or the number of entries, if no
	+ * such entry exists).
	+ */
	+ ASSERT3U(base, ==, limit);
	+ uint64_t index = base;
	+
	+ ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
	+
	+ if (index == vim->vim_phys->vimp_num_entries) {
	+ /*
	+ * If "index" is past the end of the entries
	+ * array, then not only is the offset not in the
	+ * mapping table, but it's actually greater than
	+ * all entries in the table. In this case, we
	+ * can't return a mapping entry greater than the
	+ * offset (since none exist), so we return NULL.
	+ */
	+ ASSERT3S(dva_mapping_overlap_compare(&offset,
	+ &vim->vim_entries[index - 1]), >, 0);
	+
	+ return (NULL);
	+ }
	+
	+ /*
	+ * Just to be safe, we verify the offset falls in between the
	+ * mapping entries at index and one less than index. Since we
	+ * know the offset doesn't overlap an entry, and we're supposed
	+ * to return the entry just greater than the offset, both of the
	+ * following tests must be true.
	+ */
	+ ASSERT3S(dva_mapping_overlap_compare(&offset,
	+ &vim->vim_entries[index]), <, 0);
	+ IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
	+ &vim->vim_entries[index - 1]) > 0);
	+
	+ return (&vim->vim_entries[index]);
	+}
	+
	+/*
	+ * Returns the mapping entry containing "offset", or NULL if no entry
	+ * contains it (next_if_missing is B_FALSE).
	+ */
	+vdev_indirect_mapping_entry_phys_t *
	+vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
	+ uint64_t offset)
	+{
	+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
	+ B_FALSE));
	+}
	+
	+/*
	+ * Returns the mapping entry containing "offset"; if no entry contains it,
	+ * the lookup is done with next_if_missing set to B_TRUE, so the following
	+ * entry (or NULL if there is none) is returned instead.
	+ */
	+vdev_indirect_mapping_entry_phys_t *
	+vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
	+ uint64_t offset)
	+{
	+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
	+ B_TRUE));
	+}
	+
	+
	+/*
	+ * Tear down a mapping handle: free the in-core entry array (if any),
	+ * release the bonus buffer hold, and free the handle itself.
	+ */
	+void
	+vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ /* The entry array only exists when there is at least one entry. */
	+ if (vim->vim_phys->vimp_num_entries > 0) {
	+ uint64_t map_size = vdev_indirect_mapping_size(vim);
	+ kmem_free(vim->vim_entries, map_size);
	+ vim->vim_entries = NULL;
	+ }
	+
	+ /* The hold is released with the handle itself as the tag. */
	+ dmu_buf_rele(vim->vim_dbuf, vim);
	+
	+ vim->vim_objset = NULL;
	+ vim->vim_object = 0;
	+ vim->vim_dbuf = NULL;
	+ vim->vim_phys = NULL;
	+
	+ kmem_free(vim, sizeof (*vim));
	+}
	+
	+/*
	+ * Allocate a new indirect mapping object in "os" and return its object
	+ * number. Must be called with a syncing tx.
	+ *
	+ * If SPA_FEATURE_OBSOLETE_COUNTS is enabled, the bonus buffer is sized for
	+ * the full vdev_indirect_mapping_phys_t, a separate counts object is
	+ * allocated and recorded in vimp_counts_object, and the feature's
	+ * reference count is incremented. Otherwise the bonus buffer uses the
	+ * smaller VDEV_INDIRECT_MAPPING_SIZE_V0 layout.
	+ */
	+uint64_t
	+vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
	+{
	+ uint64_t object;
	+ ASSERT(dmu_tx_is_syncing(tx));
	+ uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
	+
	+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
	+ bonus_size = sizeof (vdev_indirect_mapping_phys_t);
	+ }
	+
	+ object = dmu_object_alloc(os,
	+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	+ DMU_OTN_UINT64_METADATA, bonus_size,
	+ tx);
	+
	+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
	+ dmu_buf_t *dbuf;
	+ vdev_indirect_mapping_phys_t *vimp;
	+
	+ /* Record the counts object in the new mapping's bonus buffer. */
	+ VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
	+ dmu_buf_will_dirty(dbuf, tx);
	+ vimp = dbuf->db_data;
	+ vimp->vimp_counts_object = dmu_object_alloc(os,
	+ DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
	+ DMU_OT_NONE, 0, tx);
	+ spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+ dmu_buf_rele(dbuf, FTAG);
	+ }
	+
	+ return (object);
	+}
	+
	+
	+/*
	+ * Open the mapping stored in object "mapping_object" of "os" and return a
	+ * newly allocated handle for it. The handle holds the object's bonus
	+ * buffer (tagged with the handle itself) and, if the mapping has any
	+ * entries, an in-core copy of the whole entry array read from the object.
	+ * Release it with vdev_indirect_mapping_close().
	+ */
	+vdev_indirect_mapping_t *
	+vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
	+{
	+ vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
	+ dmu_object_info_t doi;
	+ VERIFY0(dmu_object_info(os, mapping_object, &doi));
	+
	+ vim->vim_objset = os;
	+ vim->vim_object = mapping_object;
	+
	+ VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
	+ &vim->vim_dbuf));
	+ vim->vim_phys = vim->vim_dbuf->db_data;
	+
	+ /*
	+ * A bonus buffer larger than the V0 layout means the phys structure
	+ * includes the counts object field.
	+ */
	+ vim->vim_havecounts =
	+ (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
	+
	+ if (vim->vim_phys->vimp_num_entries > 0) {
	+ uint64_t map_size = vdev_indirect_mapping_size(vim);
	+ vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
	+ VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
	+ vim->vim_entries, DMU_READ_PREFETCH));
	+ }
	+
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ return (vim);
	+}
	+
	+/*
	+ * Free the on-disk mapping object "object" in "os". If the mapping has a
	+ * counts object, that object is freed as well and the
	+ * SPA_FEATURE_OBSOLETE_COUNTS reference count is decremented.
	+ */
	+void
	+vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
	+{
	+ /* Open the mapping only to learn whether it has a counts object. */
	+ vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
	+ if (vim->vim_havecounts) {
	+ VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
	+ tx));
	+ spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+ }
	+ vdev_indirect_mapping_close(vim);
	+
	+ VERIFY0(dmu_object_free(os, object, tx));
	+}
	+
	+/*
	+ * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
	+ * mapping object. Also remove the entries from the list and free them.
	+ * This also implicitly extends the max_offset of the mapping (to the end
	+ * of the last entry).
	+ */
	+void
	+vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
	+ list_t *list, dmu_tx_t *tx)
	+{
	+ vdev_indirect_mapping_entry_phys_t *mapbuf;
	+ uint64_t old_size;
	+ uint32_t *countbuf = NULL;
	+ vdev_indirect_mapping_entry_phys_t *old_entries;
	+ uint64_t old_count;
	+ uint64_t entries_written = 0;
	+
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+ ASSERT(dmu_tx_is_syncing(tx));
	+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
	+ ASSERT(!list_is_empty(list));
	+
	+ /* Remember the pre-append state so the in-core array can be rebuilt. */
	+ old_size = vdev_indirect_mapping_size(vim);
	+ old_entries = vim->vim_entries;
	+ old_count = vim->vim_phys->vimp_num_entries;
	+
	+ dmu_buf_will_dirty(vim->vim_dbuf, tx);
	+
	+ mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	+ if (vim->vim_havecounts) {
	+ countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	+ ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
	+ SPA_FEATURE_OBSOLETE_COUNTS));
	+ }
	+ while (!list_is_empty(list)) {
	+ uint64_t i;
	+ /*
	+ * Write entries from the list to the
	+ * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
	+ */
	+ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
	+ vdev_indirect_mapping_entry_t *entry =
	+ list_remove_head(list);
	+ if (entry == NULL)
	+ break;
	+
	+ uint64_t size =
	+ DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
	+ uint64_t src_offset =
	+ DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
	+
	+ /*
	+ * We shouldn't be adding an entry which is fully
	+ * obsolete.
	+ */
	+ ASSERT3U(entry->vime_obsolete_count, <, size);
	+ IMPLY(entry->vime_obsolete_count != 0,
	+ vim->vim_havecounts);
	+
	+ mapbuf[i] = entry->vime_mapping;
	+ if (vim->vim_havecounts)
	+ countbuf[i] = entry->vime_obsolete_count;
	+
	+ vim->vim_phys->vimp_bytes_mapped += size;
	+ /* Entries must arrive in increasing source-offset order. */
	+ ASSERT3U(src_offset, >=,
	+ vim->vim_phys->vimp_max_offset);
	+ vim->vim_phys->vimp_max_offset = src_offset + size;
	+
	+ entries_written++;
	+
	+ kmem_free(entry, sizeof (*entry));
	+ }
	+ /* "i" is the number of entries staged in this batch. */
	+ dmu_write(vim->vim_objset, vim->vim_object,
	+ vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
	+ i * sizeof (*mapbuf),
	+ mapbuf, tx);
	+ if (vim->vim_havecounts) {
	+ dmu_write(vim->vim_objset,
	+ vim->vim_phys->vimp_counts_object,
	+ vim->vim_phys->vimp_num_entries *
	+ sizeof (*countbuf),
	+ i * sizeof (*countbuf), countbuf, tx);
	+ }
	+ vim->vim_phys->vimp_num_entries += i;
	+ }
	+ zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
	+ if (vim->vim_havecounts)
	+ zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
	+
	+ /*
	+ * Update the entry array to reflect the new entries. First, copy
	+ * over any old entries then read back the new entries we just wrote.
	+ */
	+ uint64_t new_size = vdev_indirect_mapping_size(vim);
	+ ASSERT3U(new_size, >, old_size);
	+ ASSERT3U(new_size - old_size, ==,
	+ entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
	+ vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
	+ if (old_size > 0) {
	+ bcopy(old_entries, vim->vim_entries, old_size);
	+ kmem_free(old_entries, old_size);
	+ }
	+ VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
	+ new_size - old_size, &vim->vim_entries[old_count],
	+ DMU_READ_PREFETCH));
	+
	+ zfs_dbgmsg("txg %llu: wrote %llu entries to "
	+ "indirect mapping obj %llu; max offset=0x%llx",
	+ (u_longlong_t)dmu_tx_get_txg(tx),
	+ (u_longlong_t)entries_written,
	+ (u_longlong_t)vim->vim_object,
	+ (u_longlong_t)vim->vim_phys->vimp_max_offset);
	+}
	+
	+/*
	+ * Increment the relevant counts for the specified offset and length.
	+ * The counts array must be obtained from
	+ * vdev_indirect_mapping_load_obsolete_counts().
	+ */
	+void
	+vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
	+ uint64_t offset, uint64_t length, uint32_t *counts)
	+{
	+ vdev_indirect_mapping_entry_phys_t *mapping;
	+ uint64_t index;
	+
	+ /* Find the entry that contains the start of the range. */
	+ mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
	+
	+ ASSERT(length > 0);
	+ ASSERT3P(mapping, !=, NULL);
	+
	+ /* counts[] is indexed in parallel with vim_entries[]. */
	+ index = mapping - vim->vim_entries;
	+
	+ /*
	+ * Walk forward one entry at a time, charging each entry with the
	+ * portion of the range that falls inside it.
	+ */
	+ while (length > 0) {
	+ ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
	+
	+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
	+ uint64_t inner_offset = offset -
	+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
	+ /* The current offset must lie within the current entry. */
	+ VERIFY3U(inner_offset, <, size);
	+ uint64_t inner_size = MIN(length, size - inner_offset);
	+
	+ /* An entry can never be more obsolete than its own size. */
	+ VERIFY3U(counts[index] + inner_size, <=, size);
	+ counts[index] += inner_size;
	+
	+ offset += inner_size;
	+ length -= inner_size;
	+ mapping++;
	+ index++;
	+ }
	+}
	+
	+/*
	+ * Argument bundle passed through space_map_iterate() to
	+ * load_obsolete_sm_callback().
	+ */
	+typedef struct load_obsolete_space_map_arg {
	+ vdev_indirect_mapping_t *losma_vim; /* mapping being updated */
	+ uint32_t *losma_counts; /* per-entry counts to increment */
	+} load_obsolete_space_map_arg_t;
	+
	+/*
	+ * space_map_iterate() callback: adds the segment [offset, offset + size)
	+ * to the obsolete counts carried in "arg" (a
	+ * load_obsolete_space_map_arg_t). Only SM_ALLOC records are expected.
	+ * Always returns 0.
	+ */
	+static int
	+load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size,
	+ void *arg)
	+{
	+ load_obsolete_space_map_arg_t *losma = arg;
	+ ASSERT3S(type, ==, SM_ALLOC);
	+
	+ vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
	+ offset, size, losma->losma_counts);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Modify the counts (increment them) based on the spacemap.
	+ */
	+void
	+vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
	+ uint32_t *counts, space_map_t *obsolete_space_sm)
	+{
	+ /* Bundle the mapping and counts array for the iteration callback. */
	+ load_obsolete_space_map_arg_t losma;
	+ losma.losma_counts = counts;
	+ losma.losma_vim = vim;
	+ VERIFY0(space_map_iterate(obsolete_space_sm,
	+ load_obsolete_sm_callback, &losma));
	+}
	+
	+/*
	+ * Read the obsolete counts from disk, returning them in an array.
	+ */
	+uint32_t *
	+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ /* One uint32_t counter per mapping entry. */
	+ uint64_t counts_size =
	+ vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
	+ uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
	+ if (vim->vim_havecounts) {
	+ VERIFY0(dmu_read(vim->vim_objset,
	+ vim->vim_phys->vimp_counts_object,
	+ 0, counts_size,
	+ counts, DMU_READ_PREFETCH));
	+ } else {
	+ /* No counts object on disk: every count starts at zero. */
	+ bzero(counts, counts_size);
	+ }
	+ return (counts);
	+}
	+
	+/*
	+ * Free a counts array. The size freed is one uint32_t per entry currently
	+ * in the mapping, which is the same size computation used by
	+ * vdev_indirect_mapping_load_obsolete_counts().
	+ */
	+extern void
	+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
	+ uint32_t *counts)
	+{
	+ ASSERT(vdev_indirect_mapping_verify(vim));
	+
	+ kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
	+}

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c (revision 332525)
	@@ -1,1360 +1,1437 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	*/

	/*
	* Virtual Device Labels
	* ---------------------
	*
	* The vdev label serves several distinct purposes:
	*
	* 1. Uniquely identify this device as part of a ZFS pool and confirm its
	* identity within the pool.
	*
	* 2. Verify that all the devices given in a configuration are present
	* within the pool.
	*
	* 3. Determine the uberblock for the pool.
	*
	* 4. In case of an import operation, determine the configuration of the
	* toplevel vdev of which it is a part.
	*
	* 5. If an import operation cannot find all the devices in the pool,
	* provide enough information to the administrator to determine which
	* devices are missing.
	*
	* It is important to note that while the kernel is responsible for writing the
	* label, it only consumes the information in the first three cases. The
	* latter information is only consumed in userland when determining the
	* configuration to import a pool.
	*
	*
	* Label Organization
	* ------------------
	*
	* Before describing the contents of the label, it's important to understand how
	* the labels are written and updated with respect to the uberblock.
	*
	* When the pool configuration is altered, either because it was newly created
	* or a device was added, we want to update all the labels such that we can deal
	* with fatal failure at any point. To this end, each disk has two labels which
	* are updated before and after the uberblock is synced. Assuming we have
	* labels and an uberblock with the following transaction groups:
	*
	*           L1          UB          L2
	*        +------+    +------+    +------+
	*        |      |    |      |    |      |
	*        | t10  |    | t10  |    | t10  |
	*        |      |    |      |    |      |
	*        +------+    +------+    +------+
	*
	* In this stable state, the labels and the uberblock were all updated within
	* the same transaction group (10). Each label is mirrored and checksummed, so
	* that we can detect when we fail partway through writing the label.
	*
	* In order to identify which labels are valid, the labels are written in the
	* following manner:
	*
	* 1. For each vdev, update 'L1' to the new label
	* 2. Update the uberblock
	* 3. For each vdev, update 'L2' to the new label
	*
	* Given arbitrary failure, we can determine the correct label to use based on
	* the transaction group. If we fail after updating L1 but before updating the
	* UB, we will notice that L1's transaction group is greater than the uberblock,
	* so L2 must be valid. If we fail after writing the uberblock but before
	* writing L2, we will notice that L2's transaction group is less than L1, and
	* therefore L1 is valid.
	*
	* Another added complexity is that not every label is updated when the config
	* is synced. If we add a single device, we do not want to have to re-write
	* every label for every device in the pool. This means that both L1 and L2 may
	* be older than the pool uberblock, because the necessary information is stored
	* on another vdev.
	*
	*
	* On-disk Format
	* --------------
	*
	* The vdev label consists of two distinct parts, and is wrapped within the
	* vdev_label_t structure. The label includes 8k of padding to permit legacy
	* VTOC disk labels, but is otherwise ignored.
	*
	* The first half of the label is a packed nvlist which contains pool wide
	* properties, per-vdev properties, and configuration information. It is
	* described in more detail below.
	*
	* The latter half of the label consists of a redundant array of uberblocks.
	* These uberblocks are updated whenever a transaction group is committed,
	* or when the configuration is updated. When a pool is loaded, we scan each
	* vdev for the 'best' uberblock.
	*
	*
	* Configuration Information
	* -------------------------
	*
	* The nvlist describing the pool and vdev contains the following elements:
	*
	* version ZFS on-disk version
	* name Pool name
	* state Pool state
	* txg Transaction group in which this label was written
	* pool_guid Unique identifier for this pool
	* vdev_tree An nvlist describing vdev tree.
	* features_for_read
	* An nvlist of the features necessary for reading the MOS.
	*
	* Each leaf device label also contains the following:
	*
	* top_guid Unique ID for top-level vdev in which this is contained
	* guid Unique ID for the leaf vdev
	*
	* The 'vs' configuration follows the format described in 'spa_config.c'.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/dmu.h>
	#include <sys/zap.h>
	#include <sys/vdev.h>
	#include <sys/vdev_impl.h>
	#include <sys/uberblock_impl.h>
	#include <sys/metaslab.h>
	+#include <sys/metaslab_impl.h>
	#include <sys/zio.h>
	#include <sys/dsl_scan.h>
	#include <sys/abd.h>
	#include <sys/fs/zfs.h>
	#include <sys/trim_map.h>

	static boolean_t vdev_trim_on_init = B_TRUE;
	SYSCTL_DECL(_vfs_zfs_vdev);
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
	&vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");

	/*
	* Basic routines to read and write from a vdev label.
	* Used throughout the rest of this file.
	*/
	uint64_t
	vdev_label_offset(uint64_t psize, int l, uint64_t offset)
	{
	ASSERT(offset < sizeof (vdev_label_t));
	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);

	/*
	* Labels in the first half (l < VDEV_LABELS / 2) are laid out
	* consecutively from the start of the device; the remaining labels
	* are shifted by psize - VDEV_LABELS * sizeof (vdev_label_t), which
	* places them at the end of the device.
	*/
	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
	0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
	}

	/*
	* Returns back the vdev label associated with the passed in offset.
	*/
	int
	vdev_label_number(uint64_t psize, uint64_t offset)
	{
	/*
	* Keep the quotient in 64 bits until it has been range-checked;
	* narrowing it to int first could truncate a very large offset into
	* what looks like a valid (or negative) label number.
	*/
	uint64_t l;

	/* Offsets in the trailing label area map onto the second half. */
	if (offset >= psize - VDEV_LABEL_END_SIZE) {
	offset -= psize - VDEV_LABEL_END_SIZE;
	offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
	}
	l = offset / sizeof (vdev_label_t);
	/* Returns -1 when the offset does not fall within any label. */
	return (l < VDEV_LABELS ? (int)l : -1);
	}

	/*
	* Issue an asynchronous physical read of "size" bytes at "offset" within
	* label "l" of vdev "vd", as a child of "zio". The caller must hold all of
	* SCL_STATE_ALL as writer and pass ZIO_FLAG_CONFIG_WRITER in "flags".
	*/
	static void
	vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
	uint64_t size, zio_done_func_t *done, void *private, int flags)
	{
	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
	SCL_STATE_ALL);
	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);

	zio_nowait(zio_read_phys(zio, vd,
	vdev_label_offset(vd->vdev_psize, l, offset),
	size, buf, ZIO_CHECKSUM_LABEL, done, private,
	ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
	}

	/*
	* Issue an asynchronous physical write of "size" bytes at "offset" within
	* label "l" of vdev "vd", as a child of "zio". The caller must either hold
	* SCL_ALL as writer, or hold SCL_CONFIG | SCL_STATE as reader while in
	* pool sync context, and must pass ZIO_FLAG_CONFIG_WRITER in "flags".
	*/
	static void
	vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
	uint64_t size, zio_done_func_t *done, void *private, int flags)
	{
	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
	(spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
	(SCL_CONFIG | SCL_STATE) &&
	dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);

	zio_nowait(zio_write_phys(zio, vd,
	vdev_label_offset(vd->vdev_psize, l, offset),
	size, buf, ZIO_CHECKSUM_LABEL, done, private,
	ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
	}

	/*
	* Generate the nvlist representing this vdev's config.
	*/
	nvlist_t *
	vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
	vdev_config_flag_t flags)
	{
	nvlist_t *nv = NULL;
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	nv = fnvlist_alloc();

	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);

	if (vd->vdev_path != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);

	if (vd->vdev_devid != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);

	if (vd->vdev_physpath != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	vd->vdev_physpath);

	if (vd->vdev_fru != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);

	if (vd->vdev_nparity != 0) {
	ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
	VDEV_TYPE_RAIDZ) == 0);

	/*
	* Make sure someone hasn't managed to sneak a fancy new vdev
	* into a crufty old storage pool.
	*/
	ASSERT(vd->vdev_nparity == 1 ||
	(vd->vdev_nparity <= 2 &&
	spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
	(vd->vdev_nparity <= 3 &&
	spa_version(spa) >= SPA_VERSION_RAIDZ3));

	/*
	* Note that we'll add the nparity tag even on storage pools
	* that only support a single parity device -- older software
	* will just ignore it.
	*/
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
	}

	if (vd->vdev_wholedisk != -1ULL)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	vd->vdev_wholedisk);

	if (vd->vdev_not_present)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);

	if (vd->vdev_isspare)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);

	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
	vd == vd->vdev_top) {
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
	vd->vdev_ms_array);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
	vd->vdev_ms_shift);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
	vd->vdev_asize);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
	- if (vd->vdev_removing)
	+ if (vd->vdev_removing) {
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
	vd->vdev_removing);
	+ }
	}

	if (vd->vdev_dtl_sm != NULL) {
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
	space_map_object(vd->vdev_dtl_sm));
	}

	+ if (vic->vic_mapping_object != 0) {
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	+ vic->vic_mapping_object);
	+ }
	+
	+ if (vic->vic_births_object != 0) {
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	+ vic->vic_births_object);
	+ }
	+
	+ if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	+ vic->vic_prev_indirect_vdev);
	+ }
	+
	if (vd->vdev_crtxg)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);

	if (flags & VDEV_CONFIG_MOS) {
	if (vd->vdev_leaf_zap != 0) {
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
	vd->vdev_leaf_zap);
	}

	if (vd->vdev_top_zap != 0) {
	ASSERT(vd == vd->vdev_top);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
	vd->vdev_top_zap);
	}
	}

	if (getstats) {
	vdev_stat_t vs;
	- pool_scan_stat_t ps;

	vdev_get_stats(vd, &vs);
	fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
	(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));

	/* provide either current or previous scan information */
	+ pool_scan_stat_t ps;
	if (spa_scan_get_stats(spa, &ps) == 0) {
	fnvlist_add_uint64_array(nv,
	ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
	sizeof (pool_scan_stat_t) / sizeof (uint64_t));
	}
	+
	+ pool_removal_stat_t prs;
	+ if (spa_removal_get_stats(spa, &prs) == 0) {
	+ fnvlist_add_uint64_array(nv,
	+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
	+ sizeof (prs) / sizeof (uint64_t));
	+ }
	+
	+ /*
	+ * Note: this can be called from open context
	+ * (spa_get_stats()), so we need the rwlock to prevent
	+ * the mapping from being changed by condensing.
	+ */
	+ rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
	+ if (vd->vdev_indirect_mapping != NULL) {
	+ ASSERT(vd->vdev_indirect_births != NULL);
	+ vdev_indirect_mapping_t *vim =
	+ vd->vdev_indirect_mapping;
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
	+ vdev_indirect_mapping_size(vim));
	+ }
	+ rw_exit(&vd->vdev_indirect_rwlock);
	+ if (vd->vdev_mg != NULL &&
	+ vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
	+ /*
	+ * Compute approximately how much memory would be used
	+ * for the indirect mapping if this device were to
	+ * be removed.
	+ *
	+ * Note: If the frag metric is invalid, then not
	+ * enough metaslabs have been converted to have
	+ * histograms.
	+ */
	+ uint64_t seg_count = 0;
	+
	+ /*
	+ * There are the same number of allocated segments
	+ * as free segments, so we will have at least one
	+ * entry per free segment.
	+ */
	+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
	+ seg_count += vd->vdev_mg->mg_histogram[i];
	+ }
	+
	+ /*
	+ * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
	+ * so we need at least one entry per SPA_MAXBLOCKSIZE
	+ * of allocated data.
	+ */
	+ seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
	+
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
	+ seg_count *
	+ sizeof (vdev_indirect_mapping_entry_phys_t));
	+ }
	}

	if (!vd->vdev_ops->vdev_op_leaf) {
	nvlist_t **child;
	int c, idx;

	ASSERT(!vd->vdev_ishole);

	child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
	KM_SLEEP);

	for (c = 0, idx = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];

	/*
	* If we're generating an nvlist of removing
	* vdevs then skip over any device which is
	* not being removed.
	*/
	if ((flags & VDEV_CONFIG_REMOVING) &&
	!cvd->vdev_removing)
	continue;

	child[idx++] = vdev_config_generate(spa, cvd,
	getstats, flags);
	}

	if (idx) {
	fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	child, idx);
	}

	/* Each child nvlist generated above is freed here once added. */
	for (c = 0; c < idx; c++)
	nvlist_free(child[c]);

	kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));

	} else {
	const char *aux = NULL;

	if (vd->vdev_offline && !vd->vdev_tmpoffline)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
	if (vd->vdev_resilver_txg != 0)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
	vd->vdev_resilver_txg);
	if (vd->vdev_faulted)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
	if (vd->vdev_degraded)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
	if (vd->vdev_removed)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
	if (vd->vdev_unspare)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
	if (vd->vdev_ishole)
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);

	switch (vd->vdev_stat.vs_aux) {
	case VDEV_AUX_ERR_EXCEEDED:
	aux = "err_exceeded";
	break;

	case VDEV_AUX_EXTERNAL:
	aux = "external";
	break;
	}

	if (aux != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);

	if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
	vd->vdev_orig_guid);
	}
	}

	return (nv);
	}

	/*
	* Generate a view of the top-level vdevs. If we currently have holes
	* in the namespace, then generate an array which contains a list of holey
	* vdevs. Additionally, add the number of top-level children that currently
	* exist.
	*/
	void
	vdev_top_config_generate(spa_t *spa, nvlist_t *config)
	{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *array;
	uint_t c, idx;

	/* Sized for the worst case: every top-level child is a hole. */
	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);

	/* Record the child index of each top-level vdev that is a hole. */
	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
	vdev_t *tvd = rvd->vdev_child[c];

	- if (tvd->vdev_ishole)
	+ if (tvd->vdev_ishole) {
	array[idx++] = c;
	+ }
	}

	/* The hole array is only added when at least one hole was found. */
	if (idx) {
	VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
	array, idx) == 0);
	}

	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	rvd->vdev_children) == 0);

	/* Free with the same size that was allocated, not with idx. */
	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
	}

	/*
	* Returns the configuration from the label of the given vdev. For vdevs
	* which don't have a txg value stored on their label (i.e. spares/cache)
	* or have not been completely initialized (txg = 0) just return
	* the configuration from the first valid label we find. Otherwise,
	* find the most up-to-date label that does not exceed the specified
	* 'txg' value.
	*/
	nvlist_t *
	vdev_label_read_config(vdev_t *vd, uint64_t txg)
	{
		spa_t *spa = vd->vdev_spa;
		nvlist_t *config = NULL;
		vdev_phys_t *vp;
		abd_t *vp_abd;
		zio_t *zio;
		uint64_t best_txg = 0;
		int error = 0;
		int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SPECULATIVE;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		if (!vdev_readable(vd))
			return (NULL);

		/* A single buffer is reused for every label read below. */
		vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
		vp = abd_to_buf(vp_abd);

	retry:
		for (int l = 0; l < VDEV_LABELS; l++) {
			nvlist_t *label = NULL;

			zio = zio_root(spa, NULL, NULL, flags);

			vdev_label_read(zio, vd, l, vp_abd,
			    offsetof(vdev_label_t, vl_vdev_phys),
			    sizeof (vdev_phys_t), NULL, NULL, flags);

			if (zio_wait(zio) == 0 &&
			    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
			    &label, 0) == 0) {
				uint64_t label_txg = 0;

				/*
				 * Auxiliary vdevs won't have txg values in their
				 * labels and newly added vdevs may not have been
				 * completely initialized so just return the
				 * configuration from the first valid label we
				 * encounter.
				 */
				error = nvlist_lookup_uint64(label,
				    ZPOOL_CONFIG_POOL_TXG, &label_txg);
				if ((error || label_txg == 0) && !config) {
					/*
					 * Ownership of 'label' moves to 'config';
					 * the break skips the free below.
					 */
					config = label;
					break;
				} else if (label_txg <= txg && label_txg > best_txg) {
					/* Newer candidate that is still <= txg. */
					best_txg = label_txg;
					nvlist_free(config);
					config = fnvlist_dup(label);
				}
			}

			if (label != NULL) {
				nvlist_free(label);
				label = NULL;
			}
		}

		/* Nothing usable found: scan all labels once more with TRYHARD. */
		if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
			flags |= ZIO_FLAG_TRYHARD;
			goto retry;
		}

		abd_free(vp_abd);

		/* NULL when no label could be read and unpacked; else caller frees. */
		return (config);
	}

	/*
	* Determine if a device is in use. The 'spare_guid' parameter will be filled
	* in with the device guid if this spare is active elsewhere on the system.
	*/
	static boolean_t
	vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
	    uint64_t *spare_guid, uint64_t *l2cache_guid)
	{
		spa_t *spa = vd->vdev_spa;
		uint64_t state, pool_guid, device_guid, txg, spare_pool;
		uint64_t vdtxg = 0;
		nvlist_t *label;

		/*
		 * Both out-parameters are optional and default to 0.  Note that
		 * this function never stores a non-zero value in *l2cache_guid.
		 */
		if (spare_guid)
			*spare_guid = 0ULL;
		if (l2cache_guid)
			*l2cache_guid = 0ULL;

		/*
		 * Read the label, if any, and perform some basic sanity checks.
		 */
		if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
			return (B_FALSE);

		/* The create txg is optional; vdtxg stays 0 when it is absent. */
		(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
		    &vdtxg);

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &device_guid) != 0) {
			nvlist_free(label);
			return (B_FALSE);
		}

		/*
		 * pool_guid and txg are only looked up (and only read later) for
		 * labels that are neither spare nor l2cache labels.
		 */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0)) {
			nvlist_free(label);
			return (B_FALSE);
		}

		nvlist_free(label);

		/*
		 * Check to see if this device indeed belongs to the pool it claims to
		 * be a part of.  The only way this is allowed is if the device is a hot
		 * spare (which we check for later on).
		 */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    !spa_guid_exists(pool_guid, device_guid) &&
		    !spa_spare_exists(device_guid, NULL, NULL) &&
		    !spa_l2cache_exists(device_guid, NULL))
			return (B_FALSE);

		/*
		 * If the transaction group is zero, then this is an initialized (but
		 * unused) label.  This is only an error if the create transaction
		 * on-disk is the same as the one we're using now, in which case the
		 * user has attempted to add the same vdev multiple times in the same
		 * transaction.
		 */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    txg == 0 && vdtxg == crtxg)
			return (B_TRUE);

		/*
		 * Check to see if this is a spare device.  We do an explicit check for
		 * spa_has_spare() here because it may be on our pending list of spares
		 * to add.  We also check if it is an l2cache device.
		 */
		if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
		    spa_has_spare(spa, device_guid)) {
			if (spare_guid)
				*spare_guid = device_guid;

			/* Any other 'reason' falls out of the switch to the checks below. */
			switch (reason) {
			case VDEV_LABEL_CREATE:
			case VDEV_LABEL_L2CACHE:
				return (B_TRUE);

			case VDEV_LABEL_REPLACE:
				return (!spa_has_spare(spa, device_guid) ||
				    spare_pool != 0ULL);

			case VDEV_LABEL_SPARE:
				return (spa_has_spare(spa, device_guid));
			}
		}

		/*
		 * Check to see if this is an l2cache device.
		 */
		if (spa_l2cache_exists(device_guid, NULL))
			return (B_TRUE);

		/*
		 * We can't rely on a pool's state if it's been imported
		 * read-only.  Instead we look to see if the pool is marked
		 * read-only in the namespace and set the state to active.
		 */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
		    spa_mode(spa) == FREAD)
			state = POOL_STATE_ACTIVE;

		/*
		 * If the device is marked ACTIVE, then this device is in use by another
		 * pool on the system.
		 */
		return (state == POOL_STATE_ACTIVE);
	}

	/*
	* Initialize a vdev label. We check to make sure each leaf device is not in
	* use, and writable. We put down an initial label which we will later
	* overwrite with a complete label. Note that it's important to do this
	* sequentially, not in parallel, so that we catch cases of multiple use of the
	* same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
	* itself.
	*/
	int
	vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
	{
		spa_t *spa = vd->vdev_spa;
		nvlist_t *label;
		vdev_phys_t *vp;
		abd_t *vp_abd;
		abd_t *pad2;
		uberblock_t *ub;
		abd_t *ub_abd;
		zio_t *zio;
		char *buf;
		size_t buflen;
		int error;
		/*
		 * Must start at 0: vdev_inuse() is skipped for REMOVE and SPLIT,
		 * yet the SPLIT case still reaches the guid checks below.
		 */
		uint64_t spare_guid = 0, l2cache_guid = 0;
		int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/* Label the children first; the first failure aborts the walk. */
		for (int c = 0; c < vd->vdev_children; c++)
			if ((error = vdev_label_init(vd->vdev_child[c],
			    crtxg, reason)) != 0)
				return (error);

		/* Track the creation time for this vdev */
		vd->vdev_crtxg = crtxg;

		/* Only leaves of a writeable pool have labels written. */
		if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
			return (0);

		/*
		 * Dead vdevs cannot be initialized.
		 */
		if (vdev_is_dead(vd))
			return (SET_ERROR(EIO));

		/*
		 * Determine if the vdev is in use.
		 */
		if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
		    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
			return (SET_ERROR(EBUSY));

		/*
		 * If this is a request to add or replace a spare or l2cache device
		 * that is in use elsewhere on the system, then we must update the
		 * guid (which was initialized to a random value) to reflect the
		 * actual GUID (which is shared between multiple pools).
		 */
		if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
		    spare_guid != 0ULL) {
			uint64_t guid_delta = spare_guid - vd->vdev_guid;

			vd->vdev_guid += guid_delta;

			/* Keep every ancestor's guid sum consistent with the new guid. */
			for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				pvd->vdev_guid_sum += guid_delta;

			/*
			 * If this is a replacement, then we want to fallthrough to the
			 * rest of the code.  If we're adding a spare, then it's already
			 * labeled appropriately and we can just return.
			 */
			if (reason == VDEV_LABEL_SPARE)
				return (0);
			ASSERT(reason == VDEV_LABEL_REPLACE ||
			    reason == VDEV_LABEL_SPLIT);
		}

		if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
		    l2cache_guid != 0ULL) {
			uint64_t guid_delta = l2cache_guid - vd->vdev_guid;

			vd->vdev_guid += guid_delta;

			for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				pvd->vdev_guid_sum += guid_delta;

			/*
			 * If this is a replacement, then we want to fallthrough to the
			 * rest of the code.  If we're adding an l2cache, then it's
			 * already labeled appropriately and we can just return.
			 */
			if (reason == VDEV_LABEL_L2CACHE)
				return (0);
			ASSERT(reason == VDEV_LABEL_REPLACE);
		}

		/*
		 * TRIM the whole thing, excluding the blank space and boot header
		 * as specified by ZFS On-Disk Specification (section 1.3), so that
		 * we start with a clean slate.
		 * It's just an optimization, so we don't care if it fails.
		 * Don't TRIM if removing so that we don't interfere with zpool
		 * disaster recovery.
		 */
		if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
		    (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
		    reason == VDEV_LABEL_L2CACHE))
			(void) zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE,
			    vd->vdev_psize - VDEV_SKIP_SIZE));

		/*
		 * Initialize its label.
		 */
		vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
		abd_zero(vp_abd, sizeof (vdev_phys_t));
		vp = abd_to_buf(vp_abd);

		/*
		 * Generate a label describing the pool and our top-level vdev.
		 * We mark it as being from txg 0 to indicate that it's not
		 * really part of an active pool just yet.  The labels will
		 * be written again with a meaningful txg by spa_sync().
		 */
		if (reason == VDEV_LABEL_SPARE ||
		    (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
			/*
			 * For inactive hot spares, we generate a special label that
			 * identifies as a mutually shared hot spare.  We write the
			 * label if we are adding a hot spare, or if we are removing an
			 * active hot spare (in which case we want to revert the
			 * labels).
			 */
			VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);

			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
			    spa_version(spa)) == 0);
			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
			    POOL_STATE_SPARE) == 0);
			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		} else if (reason == VDEV_LABEL_L2CACHE ||
		    (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
			/*
			 * For level 2 ARC devices, add a special label.
			 */
			VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);

			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
			    spa_version(spa)) == 0);
			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
			    POOL_STATE_L2CACHE) == 0);
			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		} else {
			uint64_t txg = 0ULL;

			/* A split label carries the current uberblock txg, not 0. */
			if (reason == VDEV_LABEL_SPLIT)
				txg = spa->spa_uberblock.ub_txg;
			label = spa_config_generate(spa, vd, txg, B_FALSE);

			/*
			 * Add our creation time.  This allows us to detect multiple
			 * vdev uses as described above, and automatically expires if we
			 * fail.
			 */
			VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
			    crtxg) == 0);
		}

		/* Pack the nvlist straight into the label's vp_nvlist area. */
		buf = vp->vp_nvlist;
		buflen = sizeof (vp->vp_nvlist);

		error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
		if (error != 0) {
			nvlist_free(label);
			abd_free(vp_abd);
			/* EFAULT means nvlist_pack ran out of room */
			return (error == EFAULT ? ENAMETOOLONG : EINVAL);
		}

		/*
		 * Initialize uberblock template.
		 */
		ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
		abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
		abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
		ub = abd_to_buf(ub_abd);
		ub->ub_txg = 0;

		/* Initialize the 2nd padding area. */
		pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
		abd_zero(pad2, VDEV_PAD_SIZE);

		/*
		 * Write everything in parallel.
		 */
	retry:
		zio = zio_root(spa, NULL, NULL, flags);

		for (int l = 0; l < VDEV_LABELS; l++) {

			vdev_label_write(zio, vd, l, vp_abd,
			    offsetof(vdev_label_t, vl_vdev_phys),
			    sizeof (vdev_phys_t), NULL, NULL, flags);

			/*
			 * Skip the 1st padding area.
			 * Zero out the 2nd padding area where it might have
			 * left over data from previous filesystem format.
			 */
			vdev_label_write(zio, vd, l, pad2,
			    offsetof(vdev_label_t, vl_pad2),
			    VDEV_PAD_SIZE, NULL, NULL, flags);

			vdev_label_write(zio, vd, l, ub_abd,
			    offsetof(vdev_label_t, vl_uberblock),
			    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
		}

		error = zio_wait(zio);

		/* On failure, repeat the whole write pass once with TRYHARD set. */
		if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
			flags |= ZIO_FLAG_TRYHARD;
			goto retry;
		}

		nvlist_free(label);
		abd_free(pad2);
		abd_free(ub_abd);
		abd_free(vp_abd);

		/*
		 * If this vdev hasn't been previously identified as a spare, then we
		 * mark it as such only if a) we are labeling it as a spare, or b) it
		 * exists as a spare elsewhere in the system.  Do the same for
		 * level 2 ARC devices.
		 */
		if (error == 0 && !vd->vdev_isspare &&
		    (reason == VDEV_LABEL_SPARE ||
		    spa_spare_exists(vd->vdev_guid, NULL, NULL)))
			spa_spare_add(vd);

		if (error == 0 && !vd->vdev_isl2cache &&
		    (reason == VDEV_LABEL_L2CACHE ||
		    spa_l2cache_exists(vd->vdev_guid, NULL)))
			spa_l2cache_add(vd);

		return (error);
	}

	/*
	 * Write 'size' bytes from 'buf', zero-padded to VDEV_PAD_SIZE, into the
	 * vl_pad2 area of label 0 on the leaf vdev 'vd'.
	 *
	 * Returns EINVAL if 'size' exceeds VDEV_PAD_SIZE, ENODEV if 'vd' is not
	 * a leaf, ENXIO if it is dead, otherwise the result of the write.
	 */
	int
	vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
	{
		spa_t *spa = vd->vdev_spa;
		zio_t *zio;
		abd_t *pad2;
		int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
		int error;

		if (size > VDEV_PAD_SIZE)
			return (EINVAL);

		if (!vd->vdev_ops->vdev_op_leaf)
			return (ENODEV);
		if (vdev_is_dead(vd))
			return (ENXIO);

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/* Zero the whole pad first so bytes past 'size' are defined. */
		pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
		abd_zero(pad2, VDEV_PAD_SIZE);
		abd_copy_from_buf(pad2, buf, size);

	retry:
		zio = zio_root(spa, NULL, NULL, flags);
		/* Only label 0 is written here. */
		vdev_label_write(zio, vd, 0, pad2,
		    offsetof(vdev_label_t, vl_pad2),
		    VDEV_PAD_SIZE, NULL, NULL, flags);
		error = zio_wait(zio);
		/* On failure, retry exactly once with TRYHARD set. */
		if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
			flags |= ZIO_FLAG_TRYHARD;
			goto retry;
		}

		abd_free(pad2);
		return (error);
	}

	/*
	* ==========================================================================
	* uberblock load/sync
	* ==========================================================================
	*/

	/*
	* Consider the following situation: txg is safely synced to disk. We've
	* written the first uberblock for txg + 1, and then we lose power. When we
	* come back up, we fail to see the uberblock for txg + 1 because, say,
	* it was on a mirrored device and the replica to which we wrote txg + 1
	* is now offline. If we then make some changes and sync txg + 1, and then
	* the missing replica comes back, then for a few seconds we'll have two
	* conflicting uberblocks on disk with the same txg. The solution is simple:
	* among uberblocks with equal txg, choose the one with the latest timestamp.
	*/
	static int
	vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
	{
		/* Primary key: transaction group. */
		if (ub1->ub_txg < ub2->ub_txg)
			return (-1);
		if (ub1->ub_txg > ub2->ub_txg)
			return (1);

		/* Tie-break on timestamp; the later one compares greater. */
		if (ub1->ub_timestamp < ub2->ub_timestamp)
			return (-1);
		if (ub1->ub_timestamp > ub2->ub_timestamp)
			return (1);

		return (0);
	}

	struct ubl_cbdata {
	uberblock_t ubl_ubbest; / Best uberblock */
	vdev_t ubl_vd; / vdev associated with the above */
	};

	/*
	 * Completion callback for one uberblock read.  If the uberblock verifies,
	 * is within spa_load_max_txg, and compares greater than the best seen so
	 * far, copy it into the shared callback data.  Always frees the read
	 * buffer.
	 */
	static void
	vdev_uberblock_load_done(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		spa_t *spa = zio->io_spa;
		zio_t *rio = zio->io_private;
		uberblock_t *ub = abd_to_buf(zio->io_abd);
		struct ubl_cbdata *cbp = rio->io_private;

		ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));

		if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
			/* The root zio's lock serializes updates to *cbp. */
			mutex_enter(&rio->io_lock);
			if (ub->ub_txg <= spa->spa_load_max_txg &&
			    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
				/*
				 * Keep track of the vdev in which this uberblock
				 * was found.  We will use this information later
				 * to obtain the config nvlist associated with
				 * this uberblock.
				 *
				 * Copy by value: 'ub' points into io_abd, which
				 * is freed below.
				 */
				*cbp->ubl_ubbest = *ub;
				cbp->ubl_vd = vd;
			}
			mutex_exit(&rio->io_lock);
		}

		abd_free(zio->io_abd);
	}

	/*
	 * Recursively issue a read of every uberblock slot in every label of
	 * every readable leaf under 'vd'.  Each read gets its own buffer, which
	 * vdev_uberblock_load_done() frees.
	 */
	static void
	vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
	    struct ubl_cbdata *cbp)
	{
		for (int c = 0; c < vd->vdev_children; c++)
			vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);

		if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
			for (int l = 0; l < VDEV_LABELS; l++) {
				for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
					/* The root zio is passed as the callback's private data. */
					vdev_label_read(zio, vd, l,
					    abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
					    B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
					    VDEV_UBERBLOCK_SIZE(vd),
					    vdev_uberblock_load_done, zio, flags);
				}
			}
		}
	}

	/*
	* Reads the 'best' uberblock from disk along with its associated
	* configuration. First, we read the uberblock array of each label of each
	* vdev, keeping track of the uberblock with the highest txg in each array.
	* Then, we read the configuration from the same vdev as the best uberblock.
	*/
	void
	vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
	{
		zio_t *zio;
		spa_t *spa = rvd->vdev_spa;
		struct ubl_cbdata cb;
		int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;

		ASSERT(ub);
		ASSERT(config);

		/* Defaults if nothing is found: zeroed uberblock, NULL config. */
		bzero(ub, sizeof (uberblock_t));
		*config = NULL;

		/* The callbacks update the caller's 'ub' through this pointer. */
		cb.ubl_ubbest = ub;
		cb.ubl_vd = NULL;

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		zio = zio_root(spa, NULL, &cb, flags);
		vdev_uberblock_load_impl(zio, rvd, flags, &cb);
		(void) zio_wait(zio);

		/*
		 * It's possible that the best uberblock was discovered on a label
		 * that has a configuration which was written in a future txg.
		 * Search all labels on this vdev to find the configuration that
		 * matches the txg for our uberblock.
		 */
		if (cb.ubl_vd != NULL)
			*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	* On success, increment root zio's count of good writes.
	* We only get credit for writes to known-visible vdevs; see spa_vdev_add().
	*/
	static void
	vdev_uberblock_sync_done(zio_t *zio)
	{
		uint64_t *counter = zio->io_private;

		/* A failed write earns no credit. */
		if (zio->io_error != 0)
			return;

		/* Neither does a top-level vdev whose vdev_ms_array is still 0. */
		if (zio->io_vd->vdev_top->vdev_ms_array == 0)
			return;

		atomic_inc_64(counter);
	}

	/*
	* Write the uberblock to all labels of all leaves of the specified vdev.
	*/
	static void
	vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
	{
		for (int c = 0; c < vd->vdev_children; c++)
			vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);

		if (!vd->vdev_ops->vdev_op_leaf)
			return;

		if (!vdev_writeable(vd))
			return;

		/* The slot index is the txg masked by (slot count - 1). */
		int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);

		/* Copy the uberblock_t into the ABD */
		abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
		abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
		abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

		/* The same slot is written in every label. */
		for (int l = 0; l < VDEV_LABELS; l++)
			vdev_label_write(zio, vd, l, ub_abd,
			    VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
			    vdev_uberblock_sync_done, zio->io_private,
			    flags | ZIO_FLAG_DONT_PROPAGATE);

		abd_free(ub_abd);
	}

	/* Sync the uberblocks to all vdevs in svd[] */
	int
	vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
	{
	spa_t *spa = svd[0]->vdev_spa;
	zio_t *zio;
	/* Incremented by vdev_uberblock_sync_done() through io_private. */
	uint64_t good_writes = 0;

	zio = zio_root(spa, NULL, &good_writes, flags);

	for (int v = 0; v < svdcount; v++)
	vdev_uberblock_sync(zio, ub, svd[v], flags);

	(void) zio_wait(zio);

	/*
	* Flush the uberblocks to disk. This ensures that the odd labels
	* are no longer needed (because the new uberblocks and the even
	* labels are safely on disk), so it is safe to overwrite them.
	*/
	zio = zio_root(spa, NULL, NULL, flags);

	- for (int v = 0; v < svdcount; v++)
	- zio_flush(zio, svd[v]);
	+ for (int v = 0; v < svdcount; v++) {
	+ if (vdev_writeable(svd[v])) {
	+ zio_flush(zio, svd[v]);
	+ }
	+ }

	(void) zio_wait(zio);

	/* One credited uberblock write is enough for success. */
	return (good_writes >= 1 ? 0 : EIO);
	}

	/*
	* On success, increment the count of good writes for our top-level vdev.
	*/
	static void
	vdev_label_sync_done(zio_t *zio)
	{
		uint64_t *counter = zio->io_private;

		/* Only error-free label writes are counted. */
		if (zio->io_error != 0)
			return;

		atomic_inc_64(counter);
	}

	/*
	* If there weren't enough good writes, indicate failure to the parent.
	*/
	static void
	vdev_label_sync_top_done(zio_t *zio)
	{
		uint64_t *counter = zio->io_private;

		/* Not a single label write succeeded: report EIO on this zio. */
		if (*counter == 0) {
			zio->io_error = SET_ERROR(EIO);
		}

		/* The counter is released here. */
		kmem_free(counter, sizeof (uint64_t));
	}

	/*
	* We ignore errors for log and cache devices, simply free the private data.
	*/
	static void
	vdev_label_sync_ignore_done(zio_t *zio)
	{
		uint64_t *counter = zio->io_private;

		/* The count is never inspected; just release it. */
		kmem_free(counter, sizeof (uint64_t));
	}

	/*
	* Write all even or odd labels to all leaves of the specified vdev.
	*/
	static void
	vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
	{
		nvlist_t *label;
		vdev_phys_t *vp;
		abd_t *vp_abd;
		char *buf;
		size_t buflen;

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);

		if (!vd->vdev_ops->vdev_op_leaf)
			return;

		if (!vdev_writeable(vd))
			return;

		/*
		 * Generate a label describing the top-level config to which we belong.
		 */
		label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);

		vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
		abd_zero(vp_abd, sizeof (vdev_phys_t));
		vp = abd_to_buf(vp_abd);

		buf = vp->vp_nvlist;
		buflen = sizeof (vp->vp_nvlist);

		/*
		 * If packing fails nothing is written, so no good write is
		 * counted for this leaf.
		 */
		if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
			/* Stepping by 2 from 'l' covers the even or the odd labels. */
			for (; l < VDEV_LABELS; l += 2) {
				vdev_label_write(zio, vd, l, vp_abd,
				    offsetof(vdev_label_t, vl_vdev_phys),
				    sizeof (vdev_phys_t),
				    vdev_label_sync_done, zio->io_private,
				    flags | ZIO_FLAG_DONT_PROPAGATE);
			}
		}

		abd_free(vp_abd);
		nvlist_free(label);
	}

	/*
	 * Write labels 'l', 'l' + 2, ... for every vdev on the pool's
	 * config-dirty list, then flush those vdevs.  Returns the result of
	 * waiting on the label writes; the flush result is ignored.
	 */
	int
	vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
	{
		list_t *dl = &spa->spa_config_dirty_list;
		vdev_t *vd;
		zio_t *zio;
		int error;

		/*
		 * Write the new labels to disk.
		 */
		zio = zio_root(spa, NULL, NULL, flags);

		for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
			/* One counter per vdev; the done callback frees it. */
			uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
			    KM_SLEEP);

			ASSERT(!vd->vdev_ishole);

			/* Log and aux vdevs use the callback that ignores failures. */
			zio_t *vio = zio_null(zio, spa, NULL,
			    (vd->vdev_islog || vd->vdev_aux != NULL) ?
			    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
			    good_writes, flags);
			vdev_label_sync(vio, vd, l, txg, flags);
			zio_nowait(vio);
		}

		error = zio_wait(zio);

		/*
		 * Flush the new labels to disk.
		 */
		zio = zio_root(spa, NULL, NULL, flags);

		for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
			zio_flush(zio, vd);

		(void) zio_wait(zio);

		return (error);
	}

	/*
	* Sync the uberblock and any changes to the vdev configuration.
	*
	* The order of operations is carefully crafted to ensure that
	* if the system panics or loses power at any time, the state on disk
	* is still transactionally consistent. The in-line comments below
	* describe the failure semantics at each stage.
	*
	* Moreover, vdev_config_sync() is designed to be idempotent: if it fails
	* at any time, you can just call it again, and it will resume its work.
	*/
	int
	vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
	{
		spa_t *spa = svd[0]->vdev_spa;
		uberblock_t *ub = &spa->spa_uberblock;
		vdev_t *vd;
		zio_t *zio;
		int error = 0;
		int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;

	retry:
		/*
		 * Normally, we don't want to try too hard to write every label and
		 * uberblock.  If there is a flaky disk, we don't want the rest of the
		 * sync process to block while we retry.  But if we can't write a
		 * single label out, we should retry with ZIO_FLAG_TRYHARD before
		 * bailing out and declaring the pool faulted.
		 */
		if (error != 0) {
			/* A second failure, with TRYHARD already set, is final. */
			if ((flags & ZIO_FLAG_TRYHARD) != 0)
				return (error);
			flags |= ZIO_FLAG_TRYHARD;
		}

		ASSERT(ub->ub_txg <= txg);

		/*
		 * If this isn't a resync due to I/O errors,
		 * and nothing changed in this transaction group,
		 * and the vdev configuration hasn't changed,
		 * then there's nothing to do.
		 */
		if (ub->ub_txg < txg &&
		    uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE &&
		    list_is_empty(&spa->spa_config_dirty_list))
			return (0);

		if (txg > spa_freeze_txg(spa))
			return (0);

		ASSERT(txg <= spa->spa_final_txg);

		/*
		 * Flush the write cache of every disk that's been written to
		 * in this transaction group.  This ensures that all blocks
		 * written in this txg will be committed to stable storage
		 * before any uberblock that references them.
		 */
		zio = zio_root(spa, NULL, NULL, flags);

		for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
		    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
			zio_flush(zio, vd);

		(void) zio_wait(zio);

		/*
		 * Sync out the even labels (L0, L2) for every dirty vdev.  If the
		 * system dies in the middle of this process, that's OK: all of the
		 * even labels that made it to disk will be newer than any uberblock,
		 * and will therefore be considered invalid.  The odd labels (L1, L3),
		 * which have not yet been touched, will still be valid.  We flush
		 * the new labels to disk to ensure that all even-label updates
		 * are committed to stable storage before the uberblock update.
		 */
		if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
			goto retry;

		/*
		 * Sync the uberblocks to all vdevs in svd[].
		 * If the system dies in the middle of this step, there are two cases
		 * to consider, and the on-disk state is consistent either way:
		 *
		 * (1)	If none of the new uberblocks made it to disk, then the
		 *	previous uberblock will be the newest, and the odd labels
		 *	(which had not yet been touched) will be valid with respect
		 *	to that uberblock.
		 *
		 * (2)	If one or more new uberblocks made it to disk, then they
		 *	will be the newest, and the even labels (which had all
		 *	been successfully committed) will be valid with respect
		 *	to the new uberblocks.
		 */
		if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
			goto retry;

		/*
		 * Sync out odd labels for every dirty vdev.  If the system dies
		 * in the middle of this process, the even labels and the new
		 * uberblocks will suffice to open the pool.  The next time
		 * the pool is opened, the first thing we'll do -- before any
		 * user data is modified -- is mark every vdev dirty so that
		 * all labels will be brought up to date.  We flush the new labels
		 * to disk to ensure that all odd-label updates are committed to
		 * stable storage before the next transaction group begins.
		 */
		if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
			goto retry;

		trim_thread_wakeup(spa);

		return (0);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 332525)
	@@ -1,708 +1,711 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_scan.h>
	#include <sys/vdev_impl.h>
	#include <sys/zio.h>
	#include <sys/abd.h>
	#include <sys/fs/zfs.h>

	/*
	* Virtual device vector for mirroring.
	*/

	/*
	* Per-child state of one mirrored I/O; a mirror_map_t holds one of
	* these for each child.
	*/
	typedef struct mirror_child {
	vdev_t *mc_vd; /* vdev this child I/O is directed at */
	uint64_t mc_offset; /* offset of the I/O on mc_vd */
	int mc_error; /* NOTE(review): presumably the child's I/O error -- confirm */
	int mc_load; /* NOTE(review): presumably from vdev_mirror_load() -- confirm */
	uint8_t mc_tried;
	uint8_t mc_skipped;
	uint8_t mc_speculative;
	} mirror_child_t;

	/*
	* Map for one mirrored I/O. It is allocated as a single chunk: this
	* header, then mm_children mirror_child_t entries, then an array of
	* mm_children ints that mm_preferred points at.
	*/
	typedef struct mirror_map {
	int *mm_preferred; /* int array stored right after mm_child[] */
	int mm_preferred_cnt;
	int mm_children; /* number of entries in mm_child[] */
	boolean_t mm_resilvering;
	boolean_t mm_root; /* set: vdev_mirror_load() returns INT_MAX */
	mirror_child_t mm_child[]; /* flexible array, must stay last */
	} mirror_map_t;

	static int vdev_mirror_shift = 21;

	#ifdef _KERNEL
	SYSCTL_DECL(_vfs_zfs_vdev);
	static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
	"ZFS VDEV Mirror");
	#endif

	/*
	* The load configuration settings below are tuned by default for
	* the case where all devices are of the same rotational type.
	*
	* If there is a mixture of rotating and non-rotating media, setting
	* non_rotating_seek_inc to 0 may well provide better results as it
	* will direct more reads to the non-rotating vdevs which are more
	* likely to have a higher performance.
	*/

	/* Rotating media load calculation configuration. */
	static int rotating_inc = 0;
	#ifdef _KERNEL
	SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN,
	&rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
	#endif

	static int rotating_seek_inc = 5;
	#ifdef _KERNEL
	SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN,
	&rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
	#endif

	static int rotating_seek_offset = 1 * 1024 * 1024;
	#ifdef _KERNEL
	SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN,
	&rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
	"triggers a reduced rotating media seek increment");
	#endif

	/* Non-rotating media load calculation configuration. */
	static int non_rotating_inc = 0;
	#ifdef _KERNEL
	SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
	&non_rotating_inc, 0,
	"Non-rotating media load increment for non-seeking I/O's");
	#endif

	static int non_rotating_seek_inc = 1;
	#ifdef _KERNEL
	SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
	&non_rotating_seek_inc, 0,
	"Non-rotating media load increment for seeking I/O's");
	#endif


	/*
	 * Bytes needed for a mirror map with 'children' children: the header
	 * plus mm_child[children], followed by one int per child.
	 */
	static inline size_t
	vdev_mirror_map_size(int children)
	{
		size_t child_bytes = offsetof(mirror_map_t, mm_child[children]);
		size_t preferred_bytes = sizeof (int) * children;

		return (child_bytes + preferred_bytes);
	}

	/*
	 * Allocate a zeroed mirror map for 'children' children and record the
	 * given flags.  mm_preferred is pointed at the int array that follows
	 * mm_child[] inside the same allocation.
	 */
	static inline mirror_map_t *
	vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
	{
		size_t preferred_off = offsetof(mirror_map_t, mm_child[children]);
		mirror_map_t *map;

		map = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);

		map->mm_preferred = (int *)((uintptr_t)map + preferred_off);
		map->mm_root = root;
		map->mm_resilvering = resilvering;
		map->mm_children = children;

		return (map);
	}

	/*
	 * Free the mirror map stored in zio->io_vsd.  The size is recomputed
	 * from mm_children.
	 */
	static void
	vdev_mirror_map_free(zio_t *zio)
	{
		mirror_map_t *map = zio->io_vsd;
		size_t map_bytes = vdev_mirror_map_size(map->mm_children);

		kmem_free(map, map_bytes);
	}

	/*
	* Callbacks for the mirror map kept in zio->io_vsd: our own free routine
	* and the default checksum-report handler. The initializers are
	* positional, so their order must follow zio_vsd_ops_t.
	*/
	static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	vdev_mirror_map_free,
	zio_vsd_default_cksum_report
	};

	/*
	 * Compute the load value of child vdev 'vd' for an I/O at 'zio_offset'.
	 * The base is the vdev's queue length; an increment is added depending
	 * on media type and on how far the I/O is from the vdev's last offset.
	 * Returns INT_MAX for every child of a root map.
	 */
	static int
	vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
	{
		uint64_t lastoffset;
		int64_t offset_diff;
		int load;

		/* All DVAs have equal weight at the root. */
		if (mm->mm_root)
			return (INT_MAX);

		/*
		 * We don't return INT_MAX if the device is resilvering i.e.
		 * vdev_resilver_txg != 0 as when tested performance was slightly
		 * worse overall when resilvering with compared to without.
		 */

		/* Standard load based on pending queue length. */
		load = vdev_queue_length(vd);
		lastoffset = vdev_queue_lastoffset(vd);

		if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
			/* Non-rotating media. */
			if (lastoffset == zio_offset)
				return (load + non_rotating_inc);

			/*
			 * Apply a seek penalty even for non-rotating devices as
			 * sequential I/O's can be aggregated into fewer operations
			 * on the device, thus avoiding unnecessary per-command
			 * overhead and boosting performance.
			 */
			return (load + non_rotating_seek_inc);
		}

		/* Rotating media I/O's which directly follow the last I/O. */
		if (lastoffset == zio_offset)
			return (load + rotating_inc);

		/*
		 * Apply half the seek increment to I/O's within seek offset
		 * of the last I/O queued to this vdev as they should incur less
		 * of a seek increment.
		 *
		 * Both offsets are unsigned, so take the difference as a signed
		 * value first; otherwise ABS() is a no-op and a short seek to a
		 * higher offset looks like a huge distance.
		 */
		offset_diff = (int64_t)(lastoffset - zio_offset);
		if (ABS(offset_diff) < rotating_seek_offset)
			return (load + (rotating_seek_inc / 2));

		/* Apply the full seek increment to all other I/O's. */
		return (load + rotating_seek_inc);
	}


	/*
	 * Build the mirror map for `zio' and attach it as the zio's
	 * vdev-specific data (io_vsd / io_vsd_ops).
	 *
	 * With no vdev on the zio (io_vd == NULL) the map has one child per DVA
	 * of the block pointer, each resolved to its top-level vdev and DVA
	 * offset. Otherwise it has one child per child vdev, all at the zio's
	 * own offset. Returns the new map.
	 */
	static mirror_map_t *
	vdev_mirror_map_init(zio_t *zio)
	{
		mirror_map_t *mm = NULL;
		mirror_child_t *mc;
		vdev_t *vd = zio->io_vd;
		int c;

		if (vd == NULL) {
			dva_t *dva = zio->io_bp->blk_dva;
			spa_t *spa = zio->io_spa;

			mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
			    B_TRUE);
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];
				mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
				mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
			}
		} else {
			/*
			 * If we are resilvering, then we should handle scrub reads
			 * differently; we shouldn't issue them to the resilvering
			 * device because it might not have those blocks.
			 *
			 * We are resilvering iff:
			 * 1) We are a replacing vdev (ie our name is "replacing-1" or
			 *    "spare-1" or something like that), and
			 * 2) The pool is currently being resilvered.
			 *
			 * We cannot simply check vd->vdev_resilver_txg, because it's
			 * not set in this path.
			 *
			 * Nor can we just check our vdev_ops; there are cases (such as
			 * when a user types "zpool replace pool odev spare_dev" and
			 * spare_dev is in the spare list, or when a spare device is
			 * automatically used to replace a DEGRADED device) when
			 * resilvering is complete but both the original vdev and the
			 * spare vdev remain in the pool. That behavior is intentional.
			 * It helps implement the policy that a spare should be
			 * automatically removed from the pool after the user replaces
			 * the device that originally failed.
			 *
			 * If a spa load is in progress, then spa_dsl_pool may be
			 * uninitialized. But we shouldn't be resilvering during a spa
			 * load anyway.
			 */
			boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
			    vd->vdev_ops == &vdev_spare_ops) &&
			    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
			    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
			mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
			    B_FALSE);
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];
				mc->mc_vd = vd->vdev_child[c];
				mc->mc_offset = zio->io_offset;
			}
		}

		zio->io_vsd = mm;
		zio->io_vsd_ops = &vdev_mirror_vsd_ops;
		return (mm);
	}

	/*
	 * Open a mirror vdev by opening all of its children and folding their
	 * geometry into the caller's out-parameters.
	 *
	 * On return *asize and *max_asize hold the minimum over the children
	 * that opened, and *logical_ashift and *physical_ashift the maximum.
	 *
	 * Returns 0 if at least one child opened, EINVAL if there are no
	 * children, or the last child's open error if every child failed.
	 */
	static int
	vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
	    uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
		int numerrors = 0;
		int lasterror = 0;

		if (vd->vdev_children == 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}

		vdev_open_children(vd);

		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];

			if (cvd->vdev_open_error) {
				lasterror = cvd->vdev_open_error;
				numerrors++;
				continue;
			}

			/*
			 * The "- 1 ... + 1" form relies on unsigned wraparound:
			 * an incoming value of 0 becomes the largest uint64_t
			 * inside MIN(), so the first child's size is taken.
			 */
			*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
			*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
			*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
			*physical_ashift = MAX(*physical_ashift,
			    cvd->vdev_physical_ashift);
		}

		if (numerrors == vd->vdev_children) {
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
			return (lasterror);
		}

		return (0);
	}

	/* Close a mirror vdev by closing each of its children in index order. */
	static void
	vdev_mirror_close(vdev_t *vd)
	{
		int child = 0;

		while (child < vd->vdev_children) {
			vdev_close(vd->vdev_child[child]);
			child++;
		}
	}

	/*
	 * Completion callback for a child I/O: record its result in the
	 * mirror_child_t passed as io_private and mark the child as tried and
	 * not skipped.
	 */
	static void
	vdev_mirror_child_done(zio_t *zio)
	{
		mirror_child_t *child = zio->io_private;

		child->mc_skipped = 0;
		child->mc_tried = 1;
		child->mc_error = zio->io_error;
	}

	/*
	 * Completion callback for a per-child scrub read. On success the data is
	 * copied into every parent zio's buffer. The child's private buffer is
	 * then freed and the result is recorded in the mirror_child_t passed as
	 * io_private.
	 */
	static void
	vdev_mirror_scrub_done(zio_t *zio)
	{
		mirror_child_t *child = zio->io_private;

		if (zio->io_error == 0) {
			zio_link_t *link = NULL;
			zio_t *parent;

			mutex_enter(&zio->io_lock);
			for (parent = zio_walk_parents(zio, &link); parent != NULL;
			    parent = zio_walk_parents(zio, &link)) {
				mutex_enter(&parent->io_lock);
				ASSERT3U(zio->io_size, >=, parent->io_size);
				abd_copy(parent->io_abd, zio->io_abd, parent->io_size);
				mutex_exit(&parent->io_lock);
			}
			mutex_exit(&zio->io_lock);
		}
		abd_free(zio->io_abd);

		child->mc_skipped = 0;
		child->mc_tried = 1;
		child->mc_error = zio->io_error;
	}

	/*
	 * Starting from preferred slot `p', walk the lower-index preferred
	 * slots and switch to any DVA that lives on the same vdev as the current
	 * pick. Lower-index DVAs are likely to have been allocated from the
	 * primary metaslab in use at the time, and so are more likely to have
	 * locality with single-copy data.
	 *
	 * Returns the chosen child index.
	 */
	static int
	vdev_mirror_dva_select(zio_t *zio, int p)
	{
		mirror_map_t *map = zio->io_vsd;
		dva_t *dvas = zio->io_bp->blk_dva;
		int best = map->mm_preferred[p];

		while (--p >= 0) {
			int candidate = map->mm_preferred[p];

			if (DVA_GET_VDEV(&dvas[candidate]) ==
			    DVA_GET_VDEV(&dvas[best]))
				best = candidate;
		}
		return (best);
	}

	/*
	 * Pick one child out of the mm_preferred_cnt equally loaded candidates
	 * recorded in mm_preferred[] and return its child index.
	 */
	static int
	vdev_mirror_preferred_child_randomize(zio_t *zio)
	{
		mirror_map_t *map = zio->io_vsd;
		int slot;

		if (!map->mm_root) {
			/*
			 * To ensure we don't always favour the first matching
			 * vdev, which could lead to wear leveling issues on
			 * SSD's, we use the I/O offset as a pseudo random seed
			 * into the vdevs which have the lowest load.
			 */
			slot = (zio->io_offset >> vdev_mirror_shift) %
			    map->mm_preferred_cnt;
			return (map->mm_preferred[slot]);
		}

		/* Root: choose a random slot, then refine it by DVA locality. */
		slot = spa_get_random(map->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, slot));
	}

	/*
	 * Try to find a vdev whose DTL doesn't contain the block we want to read,
	 * preferring vdevs based on determined load.
	 *
	 * If we can't, try the read on any vdev we haven't already tried.
	 *
	 * Returns the index of the chosen child, or -1 when every child has
	 * already been tried. The chosen child's vdev has the zio registered as
	 * its last offset before returning.
	 */
	static int
	vdev_mirror_child_select(zio_t *zio)
	{
		mirror_map_t *mm = zio->io_vsd;
		uint64_t txg = zio->io_txg;
		int c, lowest_load;

		ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

		lowest_load = INT_MAX;
		mm->mm_preferred_cnt = 0;
		for (c = 0; c < mm->mm_children; c++) {
			mirror_child_t *mc;

			mc = &mm->mm_child[c];
			if (mc->mc_tried || mc->mc_skipped)
				continue;

			if (!vdev_readable(mc->mc_vd)) {
				mc->mc_error = SET_ERROR(ENXIO);
				mc->mc_tried = 1;	/* don't even try */
				mc->mc_skipped = 1;
				continue;
			}

			if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
				mc->mc_error = SET_ERROR(ESTALE);
				mc->mc_skipped = 1;
				mc->mc_speculative = 1;
				continue;
			}

			mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
			if (mc->mc_load > lowest_load)
				continue;

			/* A strictly lower load restarts the preferred list. */
			if (mc->mc_load < lowest_load) {
				lowest_load = mc->mc_load;
				mm->mm_preferred_cnt = 0;
			}
			mm->mm_preferred[mm->mm_preferred_cnt] = c;
			mm->mm_preferred_cnt++;
		}

		if (mm->mm_preferred_cnt == 1) {
			vdev_queue_register_lastoffset(
			    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
			return (mm->mm_preferred[0]);
		}

		if (mm->mm_preferred_cnt > 1) {
			c = vdev_mirror_preferred_child_randomize(zio);

			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
			return (c);
		}

		/*
		 * Every device is either missing or has this txg in its DTL.
		 * Look for any child we haven't already tried before giving up.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			if (!mm->mm_child[c].mc_tried) {
				vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
				    zio);
				return (c);
			}
		}

		/*
		 * Every child failed. There's no place left to look.
		 */
		return (-1);
	}

	/*
	 * Start an I/O on a mirror: build the mirror map, issue the child
	 * I/O(s), then continue executing the parent zio.
	 *
	 * Scrub reads on a non-resilvering map with more than one child go to
	 * every child, each with its own buffer. Other reads go to the single
	 * child picked by vdev_mirror_child_select(), or to none if it returns
	 * -1. Writes and frees go to all children.
	 */
	static void
	vdev_mirror_io_start(zio_t *zio)
	{
		mirror_map_t *mm;
		mirror_child_t *mc;
		int c, children;

		mm = vdev_mirror_map_init(zio);

		if (zio->io_type == ZIO_TYPE_READ) {
			if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
			    mm->mm_children > 1) {
				/*
				 * For scrubbing reads we need to allocate a read
				 * buffer for each child and issue reads to all
				 * children. If any child succeeds, it will copy its
				 * data into the parent's io_abd in
				 * vdev_mirror_scrub_done.
				 */
				for (c = 0; c < mm->mm_children; c++) {
					mc = &mm->mm_child[c];
					zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
					    mc->mc_vd, mc->mc_offset,
					    abd_alloc_sametype(zio->io_abd,
					    zio->io_size), zio->io_size,
					    zio->io_type, zio->io_priority, 0,
					    vdev_mirror_scrub_done, mc));
				}
				zio_execute(zio);
				return;
			}
			/*
			 * For normal reads just pick one child.
			 */
			c = vdev_mirror_child_select(zio);
			children = (c >= 0);
		} else {
			ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
			    zio->io_type == ZIO_TYPE_FREE);

			/*
			 * Writes and frees go to all children.
			 */
			c = 0;
			children = mm->mm_children;
		}

		while (children--) {
			mc = &mm->mm_child[c];
			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_mirror_child_done, mc));
			c++;
		}

		zio_execute(zio);
	}

	/*
	 * Fold the children's errors with zio_worst_error(), keeping errors from
	 * speculative children (mc_speculative set) separate from the rest.
	 * Returns the non-speculative result when it is non-zero, otherwise the
	 * speculative one.
	 */
	static int
	vdev_mirror_worst_error(mirror_map_t *mm)
	{
		int worst[2] = { 0, 0 };
		int c = 0;

		while (c < mm->mm_children) {
			mirror_child_t *mc = &mm->mm_child[c];
			int which = mc->mc_speculative;

			worst[which] = zio_worst_error(worst[which], mc->mc_error);
			c++;
		}

		if (worst[0] != 0)
			return (worst[0]);
		return (worst[1]);
	}

	/*
	 * Completion stage for a mirror zio.
	 *
	 * Counts good copies and unexpected errors across the children, then:
	 * - writes: set io_error only when no copy succeeded or, for ditto
	 *   blocks, when any copy failed;
	 * - frees: nothing further;
	 * - reads: retry on another child while no good copy exists, fail with
	 *   the worst child error once none is left, and otherwise issue repair
	 *   writes to damaged children when the pool is writeable and a repair
	 *   is warranted.
	 */
	static void
	vdev_mirror_io_done(zio_t *zio)
	{
		mirror_map_t *mm = zio->io_vsd;
		mirror_child_t *mc;
		int c;
		int good_copies = 0;
		int unexpected_errors = 0;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			if (mc->mc_error) {
				if (!mc->mc_skipped)
					unexpected_errors++;
			} else if (mc->mc_tried) {
				good_copies++;
			}
		}

		if (zio->io_type == ZIO_TYPE_WRITE) {
			/*
			 * XXX -- for now, treat partial writes as success.
			 *
			 * Now that we support write reallocation, it would be better
			 * to treat partial failure as real failure unless there are
			 * no non-degraded top-level vdevs left, and not update DTLs
			 * if we intend to reallocate.
			 */
			/* XXPOLICY */
			if (good_copies != mm->mm_children) {
				/*
				 * Always require at least one good copy.
				 *
				 * For ditto blocks (io_vd == NULL), require
				 * all copies to be good.
				 *
				 * XXX -- for replacing vdevs, there's no great answer.
				 * If the old device is really dead, we may not even
				 * be able to access it -- so we only want to
				 * require good writes to the new device. But if
				 * the new device turns out to be flaky, we want
				 * to be able to detach it -- which requires all
				 * writes to the old device to have succeeded.
				 */
				if (good_copies == 0 || zio->io_vd == NULL)
					zio->io_error = vdev_mirror_worst_error(mm);
			}
			return;
		} else if (zio->io_type == ZIO_TYPE_FREE) {
			return;
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ);

		/*
		 * If we don't have a good copy yet, keep trying other children.
		 */
		/* XXPOLICY */
		if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
			ASSERT(c >= 0 && c < mm->mm_children);
			mc = &mm->mm_child[c];
			zio_vdev_io_redone(zio);
			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
			    ZIO_TYPE_READ, zio->io_priority, 0,
			    vdev_mirror_child_done, mc));
			return;
		}

		/* XXPOLICY */
		if (good_copies == 0) {
			zio->io_error = vdev_mirror_worst_error(mm);
			ASSERT(zio->io_error != 0);
		}

		if (good_copies && spa_writeable(zio->io_spa) &&
		    (unexpected_errors ||
		    (zio->io_flags & ZIO_FLAG_RESILVER) ||
		    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
			/*
			 * Use the good data we have in hand to repair damaged children.
			 */
			for (c = 0; c < mm->mm_children; c++) {
				/*
				 * Don't rewrite known good children.
				 * Not only is it unnecessary, it could
				 * actually be harmful: if the system lost
				 * power while rewriting the only good copy,
				 * there would be no good copies left!
				 */
				mc = &mm->mm_child[c];

				if (mc->mc_error == 0) {
					if (mc->mc_tried)
						continue;
					if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
					    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
					    zio->io_txg, 1))
						continue;
					mc->mc_error = SET_ERROR(ESTALE);
				}

				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    zio->io_abd, zio->io_size,
				    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
				    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
				    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
			}
		}
	}

	/*
	 * Derive the mirror's state from its children's counts: CANT_OPEN (no
	 * replicas) when every child is faulted, DEGRADED when any child is
	 * faulted or degraded, HEALTHY otherwise.
	 */
	static void
	vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
	{
		if (faulted == vd->vdev_children) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
			return;
		}

		if (degraded + faulted != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			return;
		}

		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}

	/*
	* Operations vector for mirror vdevs. Entries are positional; the NULL
	* entries are slots for which this vdev type supplies no callback.
	* NOTE(review): slot meanings come from vdev_ops_t, which is declared
	* outside this view -- confirm the order against that definition. The
	* line marked '+' is the slot added by this revision.
	*/
	vdev_ops_t vdev_mirror_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	+ NULL,
	VDEV_TYPE_MIRROR, /* name of this vdev type */
	B_FALSE /* not a leaf vdev */
	};

	/*
	* Operations vector for replacing vdevs. It uses the same callbacks as
	* the mirror vdev and differs only in the type name.
	* NOTE(review): slot meanings come from vdev_ops_t, which is declared
	* outside this view. The line marked '+' is the slot added by this
	* revision.
	*/
	vdev_ops_t vdev_replacing_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	+ NULL,
	VDEV_TYPE_REPLACING, /* name of this vdev type */
	B_FALSE /* not a leaf vdev */
	};

	/*
	* Operations vector for spare vdevs. It uses the same callbacks as the
	* mirror vdev and differs only in the type name.
	* NOTE(review): slot meanings come from vdev_ops_t, which is declared
	* outside this view. The line marked '+' is the slot added by this
	* revision.
	*/
	vdev_ops_t vdev_spare_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	+ NULL,
	NULL,
	NULL,
	VDEV_TYPE_SPARE, /* name of this vdev type */
	B_FALSE /* not a leaf vdev */
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c (revision 332525)
	@@ -1,107 +1,109 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	*/

	/*
	* The 'missing' vdev is a special vdev type used only during import. It
	* signifies a placeholder in the root vdev for some vdev that we know is
	* missing. We pass it down to the kernel to allow the rest of the
	* configuration to be parsed and an attempt made to open all available devices.
	* Because its GUID is always 0, we know that the guid sum will mismatch and we
	* won't be able to open the pool anyway.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/vdev_impl.h>
	#include <sys/fs/zfs.h>
	#include <sys/zio.h>

	/*
	 * Open callback for 'missing' and 'hole' vdevs. Always reports success
	 * and zeroes every size / ashift out-parameter; `vd' is unused.
	 */
	/* ARGSUSED */
	static int
	vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
	    uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
		/*
		 * Really this should just fail. But then the root vdev will be in the
		 * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
		 * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
		 * will fail the GUID sum check before ever trying to open the pool.
		 */
		*psize = 0;
		*max_psize = 0;
		*logical_ashift = 0;
		*physical_ashift = 0;
		return (0);
	}

	/* Close callback for 'missing' and 'hole' vdevs: intentionally a no-op. */
	/* ARGSUSED */
	static void
	vdev_missing_close(vdev_t *vd)
	{
	}

	/*
	* I/O start callback for 'missing' and 'hole' vdevs: record ENOTSUP as
	* the zio's error and hand the zio back to zio_execute().
	*/
	/* ARGSUSED */
	static void
	vdev_missing_io_start(zio_t *zio)
	{
	zio->io_error = SET_ERROR(ENOTSUP);
	zio_execute(zio);
	}

	/* I/O done callback for 'missing' and 'hole' vdevs: intentionally a no-op. */
	/* ARGSUSED */
	static void
	vdev_missing_io_done(zio_t *zio)
	{
	}

	/*
	* Operations vector for the 'missing' placeholder vdev. Entries are
	* positional; the NULL entries are slots with no callback.
	* NOTE(review): slot meanings come from vdev_ops_t, which is declared
	* outside this view. The line marked '+' is the slot added by this
	* revision.
	*/
	vdev_ops_t vdev_missing_ops = {
	vdev_missing_open,
	vdev_missing_close,
	vdev_default_asize,
	vdev_missing_io_start,
	vdev_missing_io_done,
	NULL,
	NULL,
	NULL,
	+ NULL,
	VDEV_TYPE_MISSING, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};

	/*
	* Operations vector for 'hole' vdevs. It uses the same callbacks as the
	* 'missing' vdev and differs only in the type name.
	* NOTE(review): slot meanings come from vdev_ops_t, which is declared
	* outside this view. The line marked '+' is the slot added by this
	* revision.
	*/
	vdev_ops_t vdev_hole_ops = {
	vdev_missing_open,
	vdev_missing_close,
	vdev_default_asize,
	vdev_missing_io_start,
	vdev_missing_io_done,
	+ NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_HOLE, /* name of this vdev type */
	B_TRUE /* leaf vdev */
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 332525)
	@@ -1,952 +1,962 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/vdev_impl.h>
	#include <sys/spa_impl.h>
	#include <sys/zio.h>
	#include <sys/avl.h>
	#include <sys/dsl_pool.h>
	#include <sys/metaslab_impl.h>
	#include <sys/abd.h>

	/*
	* ZFS I/O Scheduler
	* ---------------
	*
	* ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
	* I/O scheduler determines when and in what order those operations are
	* issued. The I/O scheduler divides operations into six I/O classes
	* prioritized in the following order: sync read, sync write, async read,
	* async write, scrub/resilver and trim. Each queue defines the minimum and
	* maximum number of concurrent operations that may be issued to the device.
	* In addition, the device has an aggregate maximum. Note that the sum of the
	* per-queue minimums must not exceed the aggregate maximum, and if the
	* aggregate maximum is equal to or greater than the sum of the per-queue
	* maximums, the per-queue minimum has no effect.
	*
	* For many physical devices, throughput increases with the number of
	* concurrent operations, but latency typically suffers. Further, physical
	* devices typically have a limit at which more concurrent operations have no
	* effect on throughput or can actually cause it to decrease.
	*
	* The scheduler selects the next operation to issue by first looking for an
	* I/O class whose minimum has not been satisfied. Once all are satisfied and
	* the aggregate maximum has not been hit, the scheduler looks for classes
	* whose maximum has not been satisfied. Iteration through the I/O classes is
	* done in the order specified above. No further operations are issued if the
	* aggregate maximum number of concurrent operations has been hit or if there
	* are no operations queued for an I/O class that has not hit its maximum.
	* Every time an I/O is queued or an operation completes, the I/O scheduler
	* looks for new operations to issue.
	*
	* All I/O classes have a fixed maximum number of outstanding operations
	* except for the async write class. Asynchronous writes represent the data
	* that is committed to stable storage during the syncing stage for
	* transaction groups (see txg.c). Transaction groups enter the syncing state
	* periodically so the number of queued async writes will quickly burst up and
	* then bleed down to zero. Rather than servicing them as quickly as possible,
	* the I/O scheduler changes the maximum number of active async write I/Os
	* according to the amount of dirty data in the pool (see dsl_pool.c). Since
	* both throughput and latency typically increase with the number of
	* concurrent operations issued to physical devices, reducing the burstiness
	* in the number of concurrent operations also stabilizes the response time of
	* operations from other -- and in particular synchronous -- queues. In broad
	* strokes, the I/O scheduler will issue more concurrent operations from the
	* async write queue as there's more dirty data in the pool.
	*
	* Async Writes
	*
	* The number of concurrent operations issued for the async write I/O class
	* follows a piece-wise linear function defined by a few adjustable points.
	*
	*        |                   o---------| <-- zfs_vdev_async_write_max_active
	*   ^    |                  /^         |
	*   |    |                 / |         |
	* active |                /  |         |
	*  I/O   |               /   |         |
	* count  |              /    |         |
	*        |             /     |         |
	*        |------------o      |         | <-- zfs_vdev_async_write_min_active
	*       0|____________^______|_________|
	*        0%           |      |       100% of zfs_dirty_data_max
	*                     |      |
	*                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
	*                     `--------- zfs_vdev_async_write_active_min_dirty_percent
	*
	* Until the amount of dirty data exceeds a minimum percentage of the dirty
	* data allowed in the pool, the I/O scheduler will limit the number of
	* concurrent operations to the minimum. As that threshold is crossed, the
	* number of concurrent operations issued increases linearly to the maximum at
	* the specified maximum percentage of the dirty data allowed in the pool.
	*
	* Ideally, the amount of dirty data on a busy pool will stay in the sloped
	* part of the function between zfs_vdev_async_write_active_min_dirty_percent
	* and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
	* maximum percentage, this indicates that the rate of incoming data is
	* greater than the rate that the backend storage can handle. In this case, we
	* must further throttle incoming writes (see dmu_tx_delay() for details).
	*/

	/*
	* The maximum number of I/Os active to each device. Ideally, this will be >=
	* the sum of each queue's max_active. It must be at least the sum of each
	* queue's min_active.
	*/
	uint32_t zfs_vdev_max_active = 1000;

	/*
	* Per-queue limits on the number of I/Os active to each device. If the
	* sum of the queue's max_active is < zfs_vdev_max_active, then the
	* min_active comes into play. We will send min_active from each queue,
	* and then select from queues in the order defined by zio_priority_t.
	*
	* In general, smaller max_active's will lead to lower latency of synchronous
	* operations. Larger max_active's may lead to higher overall throughput,
	* depending on underlying storage.
	*
	* The ratio of the queues' max_actives determines the balance of performance
	* between reads, writes, and scrubs. E.g., increasing
	* zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
	* more quickly, but reads and writes to have higher latency and lower
	* throughput.
	*/
	/* Synchronous read and write classes: min equals max by default. */
	uint32_t zfs_vdev_sync_read_min_active = 10;
	uint32_t zfs_vdev_sync_read_max_active = 10;
	uint32_t zfs_vdev_sync_write_min_active = 10;
	uint32_t zfs_vdev_sync_write_max_active = 10;
	/* Asynchronous read and write classes. */
	uint32_t zfs_vdev_async_read_min_active = 1;
	uint32_t zfs_vdev_async_read_max_active = 3;
	uint32_t zfs_vdev_async_write_min_active = 1;
	uint32_t zfs_vdev_async_write_max_active = 10;
	/* Scrub/resilver class. */
	uint32_t zfs_vdev_scrub_min_active = 1;
	uint32_t zfs_vdev_scrub_max_active = 2;
	uint32_t zfs_vdev_trim_min_active = 1;
	/*
	* TRIM max active is large in comparison to the other values due to the fact
	* that TRIM IOs are coalesced at the device layer. This value is set such
	* that a typical SSD can process the queued IOs in a single request.
	*/
	uint32_t zfs_vdev_trim_max_active = 64;
	/* NOTE(review): presumably the device-removal I/O class; added by this revision. */
	+uint32_t zfs_vdev_removal_min_active = 1;
	+uint32_t zfs_vdev_removal_max_active = 2;


	/*
	* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
	* dirty data, use zfs_vdev_async_write_min_active. When it has more than
	* zfs_vdev_async_write_active_max_dirty_percent, use
	* zfs_vdev_async_write_max_active. The value is linearly interpolated
	* between min and max.
	*/
	int zfs_vdev_async_write_active_min_dirty_percent = 30;
	int zfs_vdev_async_write_active_max_dirty_percent = 60;

	/*
	* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
	* For read I/Os, we also aggregate across small adjacency gaps; for writes
	* we include spans of optional I/Os to aid aggregation at the disk even when
	* they aren't able to help us aggregate at this level.
	*/
	int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
	int zfs_vdev_read_gap_limit = 32 << 10;
	int zfs_vdev_write_gap_limit = 4 << 10;

	/*
	* Define the queue depth percentage for each top-level. This percentage is
	* used in conjunction with zfs_vdev_async_write_max_active to determine how many
	* allocations a specific top-level vdev should handle. Once the queue depth
	* reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
	* then allocator will stop allocating blocks on that top-level device.
	* The default kernel setting is 1000% which will yield 100 allocations per
	* device. For userland testing, the default setting is 300% which equates
	* to 30 allocations per device.
	*/
	#ifdef _KERNEL
	int zfs_vdev_queue_depth_pct = 1000;
	#else
	int zfs_vdev_queue_depth_pct = 300;
	#endif


	#ifdef __FreeBSD__
	#ifdef _KERNEL
	/*
	 * FreeBSD sysctl knobs under vfs.zfs.vdev for the queue tunables defined
	 * in this file.
	 */
	SYSCTL_DECL(_vfs_zfs_vdev);

	/*
	 * The two dirty-percent knobs use handler procedures so that new values
	 * can be range-checked against each other before being stored.
	 */
	static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
	SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
	    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
	    sysctl_zfs_async_write_active_min_dirty_percent, "I",
	    "Percentage of async write dirty data below which "
	    "async_write_min_active is used.");

	static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
	SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
	    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
	    sysctl_zfs_async_write_active_max_dirty_percent, "I",
	    "Percentage of async write dirty data above which "
	    "async_write_max_active is used.");

	SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
	    &zfs_vdev_max_active, 0,
	    "The maximum number of I/Os of all types active for each device.");

	/* Declare vfs.zfs.vdev.<name>_min_active backed by zfs_vdev_<name>_min_active. */
	#define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
	SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
	    &zfs_vdev_ ## name ## _min_active, 0,				\
	    "Initial number of I/O requests of type " #name			\
	    " active for each device");

	/* Declare vfs.zfs.vdev.<name>_max_active backed by zfs_vdev_<name>_max_active. */
	#define ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
	SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
	    &zfs_vdev_ ## name ## _max_active, 0,				\
	    "Maximum number of I/O requests of type " #name			\
	    " active for each device");

	ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
	ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
	ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
	ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
	ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
	ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
	ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
	ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
	ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
	ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
	ZFS_VDEV_QUEUE_KNOB_MIN(trim);
	ZFS_VDEV_QUEUE_KNOB_MAX(trim);

	/* Undefine the helper macros that were actually defined above. */
	#undef ZFS_VDEV_QUEUE_KNOB_MIN
	#undef ZFS_VDEV_QUEUE_KNOB_MAX

	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
	    &zfs_vdev_aggregation_limit, 0,
	    "I/O requests are aggregated up to this size");
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
	    &zfs_vdev_read_gap_limit, 0,
	    "Acceptable gap between two reads being aggregated");
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
	    &zfs_vdev_write_gap_limit, 0,
	    "Acceptable gap between two writes being aggregated");
	SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
	    &zfs_vdev_queue_depth_pct, 0,
	    "Queue depth percentage for each top-level");

	static int
	sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
	{
	int val, err;

	val = zfs_vdev_async_write_active_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val < 0 \|\| val > 100 \|\|
	val >= zfs_vdev_async_write_active_max_dirty_percent)
	return (EINVAL);

	zfs_vdev_async_write_active_min_dirty_percent = val;

	return (0);
	}

	static int
	sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
	{
	int val, err;

	val = zfs_vdev_async_write_active_max_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 \|\| req->newptr == NULL)
	return (err);

	if (val < 0 \|\| val > 100 \|\|
	val <= zfs_vdev_async_write_active_min_dirty_percent)
	return (EINVAL);

	zfs_vdev_async_write_active_max_dirty_percent = val;

	return (0);
	}
	#endif
	#endif

	/*
	 * AVL comparator ordering zios by io_offset, with the zio's address as
	 * the tie-breaker so that two distinct zios never compare equal.
	 * Returns -1, 0 or 1.
	 */
	int
	vdev_queue_offset_compare(const void *x1, const void *x2)
	{
		const zio_t *z1 = x1;
		const zio_t *z2 = x2;

		if (z1->io_offset < z2->io_offset)
			return (-1);
		if (z1->io_offset > z2->io_offset)
			return (1);

		if (z1 < z2)
			return (-1);
		if (z1 > z2)
			return (1);

		return (0);
	}

	/* Return the queued-I/O tree of priority class `p' within queue `vq'. */
	static inline avl_tree_t *
	vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
	{
		avl_tree_t *tree = &vq->vq_class[p].vqc_queued_tree;

		return (tree);
	}

	/*
	 * Return the offset-sorted tree for I/O type `t': the read tree for
	 * reads, the write tree for writes, and NULL for any other type.
	 */
	static inline avl_tree_t *
	vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
	{
		if (t == ZIO_TYPE_READ)
			return (&vq->vq_read_offset_tree);

		if (t == ZIO_TYPE_WRITE)
			return (&vq->vq_write_offset_tree);

		return (NULL);
	}

	/*
	 * AVL comparator ordering zios by io_timestamp, then io_offset, then the
	 * zio's address, so that two distinct zios never compare equal.
	 * Returns -1, 0 or 1.
	 */
	int
	vdev_queue_timestamp_compare(const void *x1, const void *x2)
	{
		const zio_t *z1 = x1;
		const zio_t *z2 = x2;

		if (z1->io_timestamp < z2->io_timestamp)
			return (-1);
		if (z1->io_timestamp > z2->io_timestamp)
			return (1);

		if (z1->io_offset < z2->io_offset)
			return (-1);
		if (z1->io_offset > z2->io_offset)
			return (1);

		if (z1 < z2)
			return (-1);
		if (z1 > z2)
			return (1);

		return (0);
	}

	/*
	 * Initialise the I/O queue embedded in `vd': its lock, the active tree,
	 * the read and write offset trees, and one queued tree per queueable
	 * priority class. vq_lastoffset starts at 0.
	 */
	void
	vdev_queue_init(vdev_t *vd)
	{
		vdev_queue_t *vq = &vd->vdev_queue;

		mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
		vq->vq_vdev = vd;

		avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
		avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
		    vdev_queue_offset_compare, sizeof (zio_t),
		    offsetof(struct zio, io_offset_node));
		avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
		    vdev_queue_offset_compare, sizeof (zio_t),
		    offsetof(struct zio, io_offset_node));

		for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
			int (*compfn)(const void *, const void *);

			/*
			 * The synchronous i/o queues are dispatched in FIFO rather
			 * than LBA order. This provides more consistent latency for
			 * these i/os.
			 */
			if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
				compfn = vdev_queue_timestamp_compare;
			else
				compfn = vdev_queue_offset_compare;

			avl_create(vdev_queue_class_tree(vq, p), compfn,
			    sizeof (zio_t), offsetof(struct zio, io_queue_node));
		}

		vq->vq_lastoffset = 0;
	}

	/*
	 * Tear down the I/O queue embedded in `vd': destroy every per-class
	 * queued tree, the active tree, the read and write offset trees, and
	 * finally the queue lock.
	 */
	void
	vdev_queue_fini(vdev_t *vd)
	{
		vdev_queue_t *queue = &vd->vdev_queue;
		zio_priority_t prio;

		for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
			avl_destroy(vdev_queue_class_tree(queue, prio));
		}

		avl_destroy(&queue->vq_active_tree);
		avl_destroy(vdev_queue_type_tree(queue, ZIO_TYPE_READ));
		avl_destroy(vdev_queue_type_tree(queue, ZIO_TYPE_WRITE));

		mutex_destroy(&queue->vq_lock);
	}

	/*
	 * Queue zio on vq: insert it into the tree of its priority class and,
	 * when its type has one (reads and writes), into the per-type offset
	 * tree.  The caller must hold vq->vq_lock.
	 */
	static void
	vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		avl_tree_t *qtt;

		ASSERT(MUTEX_HELD(&vq->vq_lock));
		ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
		qtt = vdev_queue_type_tree(vq, zio->io_type);
		if (qtt)
			avl_add(qtt, zio);

	#ifdef illumos
		/* Account for the newly queued i/o in the pool's kstats. */
		mutex_enter(&spa->spa_iokstat_lock);
		spa->spa_queue_stats[zio->io_priority].spa_queued++;
		if (spa->spa_iokstat != NULL)
			kstat_waitq_enter(spa->spa_iokstat->ks_data);
		mutex_exit(&spa->spa_iokstat_lock);
	#endif
	}

	/*
	 * Undo vdev_queue_io_add(): remove zio from the tree of its priority
	 * class and, when its type has one, from the per-type offset tree.
	 * The caller must hold vq->vq_lock.
	 */
	static void
	vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		avl_tree_t *qtt;

		ASSERT(MUTEX_HELD(&vq->vq_lock));
		ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
		qtt = vdev_queue_type_tree(vq, zio->io_type);
		if (qtt)
			avl_remove(qtt, zio);

	#ifdef illumos
		/* The i/o is no longer waiting; update the pool's kstats. */
		mutex_enter(&spa->spa_iokstat_lock);
		ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
		spa->spa_queue_stats[zio->io_priority].spa_queued--;
		if (spa->spa_iokstat != NULL)
			kstat_waitq_exit(spa->spa_iokstat->ks_data);
		mutex_exit(&spa->spa_iokstat_lock);
	#endif
	}

	/*
	 * Mark zio as issued: bump the active count of its priority class and
	 * insert it into the active tree.  The caller must hold vq->vq_lock.
	 */
	static void
	vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
	{
		spa_t *spa = zio->io_spa;

		ASSERT(MUTEX_HELD(&vq->vq_lock));
		ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
		vq->vq_class[zio->io_priority].vqc_active++;
		avl_add(&vq->vq_active_tree, zio);

	#ifdef illumos
		/* Account for the newly active i/o in the pool's kstats. */
		mutex_enter(&spa->spa_iokstat_lock);
		spa->spa_queue_stats[zio->io_priority].spa_active++;
		if (spa->spa_iokstat != NULL)
			kstat_runq_enter(spa->spa_iokstat->ks_data);
		mutex_exit(&spa->spa_iokstat_lock);
	#endif
	}

	/*
	 * Undo vdev_queue_pending_add(): drop the active count of the zio's
	 * priority class and remove it from the active tree.  The caller must
	 * hold vq->vq_lock.
	 */
	static void
	vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
	{
		spa_t *spa = zio->io_spa;

		ASSERT(MUTEX_HELD(&vq->vq_lock));
		ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
		vq->vq_class[zio->io_priority].vqc_active--;
		avl_remove(&vq->vq_active_tree, zio);

	#ifdef illumos
		mutex_enter(&spa->spa_iokstat_lock);
		ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
		spa->spa_queue_stats[zio->io_priority].spa_active--;
		if (spa->spa_iokstat != NULL) {
			kstat_io_t *ksio = spa->spa_iokstat->ks_data;

			kstat_runq_exit(spa->spa_iokstat->ks_data);
			/* Only reads and writes count towards the byte totals. */
			if (zio->io_type == ZIO_TYPE_READ) {
				ksio->reads++;
				ksio->nread += zio->io_size;
			} else if (zio->io_type == ZIO_TYPE_WRITE) {
				ksio->writes++;
				ksio->nwritten += zio->io_size;
			}
		}
		mutex_exit(&spa->spa_iokstat_lock);
	#endif
	}

	/*
	 * Completion callback for an aggregate i/o.  For a read, each parent
	 * receives its slice of the aggregate buffer, located by the parent's
	 * offset relative to the aggregate.  The aggregate buffer is then freed
	 * regardless of i/o type.
	 */
	static void
	vdev_queue_agg_io_done(zio_t *aio)
	{
		zio_link_t *link = NULL;
		zio_t *parent;

		if (aio->io_type == ZIO_TYPE_READ) {
			for (parent = zio_walk_parents(aio, &link); parent != NULL;
			    parent = zio_walk_parents(aio, &link)) {
				abd_copy_off(parent->io_abd, aio->io_abd, 0,
				    parent->io_offset - aio->io_offset,
				    parent->io_size);
			}
		}

		abd_free(aio->io_abd);
	}

	/*
	* Return the "min active" tunable that corresponds to priority class p.
	* An unrecognized priority panics.
	*
	* NOTE(review): this text is a rendered diff; the lines prefixed with
	* '+' are additions made by the revision being shown.
	*/
	static int
	vdev_queue_class_min_active(zio_priority_t p)
	{
	switch (p) {
	case ZIO_PRIORITY_SYNC_READ:
	return (zfs_vdev_sync_read_min_active);
	case ZIO_PRIORITY_SYNC_WRITE:
	return (zfs_vdev_sync_write_min_active);
	case ZIO_PRIORITY_ASYNC_READ:
	return (zfs_vdev_async_read_min_active);
	case ZIO_PRIORITY_ASYNC_WRITE:
	return (zfs_vdev_async_write_min_active);
	case ZIO_PRIORITY_SCRUB:
	return (zfs_vdev_scrub_min_active);
	case ZIO_PRIORITY_TRIM:
	return (zfs_vdev_trim_min_active);
	+ case ZIO_PRIORITY_REMOVAL:
	+ return (zfs_vdev_removal_min_active);
	default:
	panic("invalid priority %u", p);
	/* presumably not reached; panic() is not expected to return */
	return (0);
	}
	}

	/*
	 * Compute how many async writes may be active at once, scaling linearly
	 * between zfs_vdev_async_write_min_active and
	 * zfs_vdev_async_write_max_active as the pool's dirty data moves between
	 * the configured minimum and maximum dirty percentages of
	 * zfs_dirty_data_max.
	 */
	static __noinline int
	vdev_queue_max_async_writes(spa_t *spa)
	{
		int writes;
		uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
		uint64_t min_bytes = zfs_dirty_data_max *
		    zfs_vdev_async_write_active_min_dirty_percent / 100;
		uint64_t max_bytes = zfs_dirty_data_max *
		    zfs_vdev_async_write_active_max_dirty_percent / 100;

		/*
		 * Sync tasks correspond to interactive user actions.  To reduce the
		 * execution time of those actions we push data out as fast as
		 * possible.
		 */
		if (spa_has_pending_synctask(spa)) {
			return (zfs_vdev_async_write_max_active);
		}

		if (dirty < min_bytes)
			return (zfs_vdev_async_write_min_active);
		/*
		 * Use >= rather than >: at dirty == max_bytes the interpolation
		 * below evaluates to the max anyway, and returning here keeps the
		 * divisor (max_bytes - min_bytes) from being zero when both
		 * thresholds are tuned to the same value.
		 */
		if (dirty >= max_bytes)
			return (zfs_vdev_async_write_max_active);

		/*
		 * linear interpolation:
		 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
		 * move right by min_bytes
		 * move up by min_writes
		 */
		writes = (dirty - min_bytes) *
		    (zfs_vdev_async_write_max_active -
		    zfs_vdev_async_write_min_active) /
		    (max_bytes - min_bytes) +
		    zfs_vdev_async_write_min_active;
		ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
		ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
		return (writes);
	}

	/*
	* Return the maximum number of concurrently active i/os allowed for
	* priority class p.  Every class maps to a fixed tunable except async
	* writes, whose limit is computed by vdev_queue_max_async_writes(spa).
	* An unrecognized priority panics.
	*
	* NOTE(review): this text is a rendered diff; the lines prefixed with
	* '+' are additions made by the revision being shown.
	*/
	static int
	vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
	{
	switch (p) {
	case ZIO_PRIORITY_SYNC_READ:
	return (zfs_vdev_sync_read_max_active);
	case ZIO_PRIORITY_SYNC_WRITE:
	return (zfs_vdev_sync_write_max_active);
	case ZIO_PRIORITY_ASYNC_READ:
	return (zfs_vdev_async_read_max_active);
	case ZIO_PRIORITY_ASYNC_WRITE:
	return (vdev_queue_max_async_writes(spa));
	case ZIO_PRIORITY_SCRUB:
	return (zfs_vdev_scrub_max_active);
	case ZIO_PRIORITY_TRIM:
	return (zfs_vdev_trim_max_active);
	+ case ZIO_PRIORITY_REMOVAL:
	+ return (zfs_vdev_removal_max_active);
	default:
	panic("invalid priority %u", p);
	/* presumably not reached; panic() is not expected to return */
	return (0);
	}
	}

	/*
	 * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
	 * there is no eligible class.  The caller must hold vq->vq_lock.
	 */
	static zio_priority_t
	vdev_queue_class_to_issue(vdev_queue_t *vq)
	{
		spa_t *pool = vq->vq_vdev->vdev_spa;
		zio_priority_t prio;

		ASSERT(MUTEX_HELD(&vq->vq_lock));

		/* The vdev as a whole already has its fill of active i/os. */
		if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
			return (ZIO_PRIORITY_NUM_QUEUEABLE);

		/*
		 * First pass: a non-empty class that is still below its minimum
		 * number of outstanding i/os wins.
		 */
		for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
			if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
				continue;
			if (vq->vq_class[prio].vqc_active <
			    vdev_queue_class_min_active(prio))
				return (prio);
		}

		/*
		 * Second pass: otherwise settle for a non-empty class that is
		 * still below its maximum number of outstanding i/os.
		 */
		for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
			if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
				continue;
			if (vq->vq_class[prio].vqc_active <
			    vdev_queue_class_max_active(pool, prio))
				return (prio);
		}

		/* Nothing queued is eligible to be issued. */
		return (ZIO_PRIORITY_NUM_QUEUEABLE);
	}

	/*
	* Compute the range spanned by two i/os, which is the endpoint of the last
	* (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
	* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
	* thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
	*
	* Both macros evaluate their arguments more than once (IO_GAP through
	* IO_SPAN), so the arguments should be free of side effects.
	*/
	#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
	#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

	/*
	* Try to build a single aggregate i/o around zio from its neighbours in
	* the per-type offset tree.  Returns the new aggregate zio, or NULL when
	* zio carries ZIO_FLAG_DONT_AGGREGATE or when no neighbour qualifies
	* (first == last).  On success every member of the range has been removed
	* from the queue, made a parent of the aggregate, and executed via the
	* bypass path.  The caller must hold vq->vq_lock.
	*
	* NOTE(review): this text is a rendered diff.  Lines prefixed with '-' and
	* '+' are the old and new versions of a changed line, and the '*' of
	* several pointer declarators (vq, zio, first, last, ...) appears to have
	* been lost in rendering -- they are all used as pointers below.
	*/
	static zio_t *
	vdev_queue_aggregate(vdev_queue_t vq, zio_t zio)
	{
	zio_t first, last, aio, dio, mandatory, nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch;
	avl_tree_t *t;
	enum zio_flag flags;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
	return (NULL);

	first = last = zio;

	/* Only reads tolerate a gap between members; maxgap stays 0 otherwise. */
	if (zio->io_type == ZIO_TYPE_READ)
	maxgap = zfs_vdev_read_gap_limit;

	/*
	* We can aggregate I/Os that are sufficiently adjacent and of
	* the same flavor, as expressed by the AGG_INHERIT flags.
	* The latter requirement is necessary so that certain
	* attributes of the I/O, such as whether it's a normal I/O
	* or a scrub/resilver, can be preserved in the aggregate.
	* We can include optional I/Os, but don't allow them
	* to begin a range as they add no benefit in that situation.
	*/

	/*
	* We keep track of the last non-optional I/O.
	*/
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	* Walk backwards through sufficiently contiguous I/Os
	* recording the last non-optional I/O.
	*
	* NOTE(review): only this loop guards against t == NULL; the later
	* AVL_NEXT/AVL_PREV calls use t unguarded.  Presumably zios whose type
	* has no offset tree never get this far -- confirm.
	*/
	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
	t = vdev_queue_type_tree(vq, zio->io_type);
	while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
	(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	- IO_GAP(dio, first) <= maxgap) {
	+ IO_GAP(dio, first) <= maxgap &&
	+ dio->io_type == zio->io_type) {
	first = dio;
	if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
	mandatory = first;
	}

	/*
	* Skip any initial optional I/Os.
	*/
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
	first = AVL_NEXT(t, first);
	ASSERT(first != NULL);
	}

	/*
	* Walk forward through sufficiently contiguous I/Os.
	* The aggregation limit does not apply to optional i/os, so that
	* we can issue contiguous writes even if they are larger than the
	* aggregation limit.
	*/
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	(IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit \|\|
	(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
	- IO_GAP(last, dio) <= maxgap) {
	+ IO_GAP(last, dio) <= maxgap &&
	+ dio->io_type == zio->io_type) {
	last = dio;
	if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
	mandatory = last;
	}

	/*
	* Now that we've established the range of the I/O aggregation
	* we must decide what to do with trailing optional I/Os.
	* For reads, there's nothing to do. While we are unable to
	* aggregate further, it's possible that a trailing optional
	* I/O would allow the underlying device to aggregate with
	* subsequent I/Os. We must therefore determine if the next
	* non-optional I/O is close enough to make aggregation
	* worthwhile.
	*/
	stretch = B_FALSE;
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
	/* This local nio shadows the function-scope nio declared above. */
	zio_t *nio = last;
	while ((dio = AVL_NEXT(t, nio)) != NULL &&
	IO_GAP(nio, dio) == 0 &&
	IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
	nio = dio;
	if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
	stretch = B_TRUE;
	break;
	}
	}
	}

	if (stretch) {
	/*
	* We are going to include an optional io in our aggregated
	* span, thus closing the write gap. Only mandatory i/os can
	* start aggregated spans, so make sure that the next i/o
	* after our span is mandatory.
	*/
	dio = AVL_NEXT(t, last);
	dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
	/* do not include the optional i/o */
	while (last != mandatory && last != first) {
	ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
	last = AVL_PREV(t, last);
	ASSERT(last != NULL);
	}
	}

	/* A range of one i/o is not worth aggregating. */
	if (first == last)
	return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	* The aggregate covers [first->io_offset, first->io_offset + size) and
	* is flagged DONT_QUEUE, so vdev_queue_io() hands it straight back
	* instead of queueing it again.
	*/
	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	abd_alloc_for_io(size, B_TRUE), size, first->io_type,
	zio->io_priority, flags \| ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_QUEUE,
	vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	/*
	* Hand every member of [first, last] over to the aggregate.  The
	* successor is fetched before dio is removed from the tree.
	*/
	nio = first;
	do {
	dio = nio;
	nio = AVL_NEXT(t, dio);
	ASSERT3U(dio->io_type, ==, aio->io_type);

	/* Writes fill the aggregate buffer now: zeros for NODATA members. */
	if (dio->io_flags & ZIO_FLAG_NODATA) {
	ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
	abd_zero_off(aio->io_abd,
	dio->io_offset - aio->io_offset, dio->io_size);
	} else if (dio->io_type == ZIO_TYPE_WRITE) {
	abd_copy_off(aio->io_abd, dio->io_abd,
	dio->io_offset - aio->io_offset, 0, dio->io_size);
	}

	zio_add_child(dio, aio);
	vdev_queue_io_remove(vq, dio);
	zio_vdev_io_bypass(dio);
	zio_execute(dio);
	} while (dio != last);

	return (aio);
	}

	/*
	 * Pick the next i/o to issue from vq, aggregating it with its
	 * neighbours where possible, and move it to the active set.  Returns
	 * the zio to issue (possibly an aggregate), or NULL when no class is
	 * eligible.  The caller must hold vq->vq_lock; the lock is dropped and
	 * re-taken internally when a NODATA i/o has to be discarded.
	 */
	static zio_t *
	vdev_queue_io_to_issue(vdev_queue_t *vq)
	{
		zio_t *zio, *aio;
		zio_priority_t p;
		avl_index_t idx;
		avl_tree_t *tree;
		zio_t search;

	again:
		ASSERT(MUTEX_HELD(&vq->vq_lock));

		p = vdev_queue_class_to_issue(vq);

		if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
			/* No eligible queued i/os */
			return (NULL);
		}

		/*
		 * For LBA-ordered queues (async / scrub), issue the i/o which
		 * follows the most recently issued i/o in LBA (offset) order.
		 *
		 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
		 */
		tree = vdev_queue_class_tree(vq, p);
		/*
		 * "search" is only a lookup key: just the fields set here are
		 * initialized, and it must never match a queued zio.
		 */
		search.io_timestamp = 0;
		search.io_offset = vq->vq_last_offset + 1;
		VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
		zio = avl_nearest(tree, idx, AVL_AFTER);
		/* Nothing after the search key: wrap around to the first entry. */
		if (zio == NULL)
			zio = avl_first(tree);
		ASSERT3U(zio->io_priority, ==, p);

		/* vdev_queue_aggregate() dequeues its members itself. */
		aio = vdev_queue_aggregate(vq, zio);
		if (aio != NULL)
			zio = aio;
		else
			vdev_queue_io_remove(vq, zio);

		/*
		 * If the I/O is or was optional and therefore has no data, we need
		 * to simply discard it.  We need to drop the vdev queue's lock to
		 * avoid a deadlock that we could encounter since this I/O will
		 * complete immediately.
		 */
		if (zio->io_flags & ZIO_FLAG_NODATA) {
			mutex_exit(&vq->vq_lock);
			zio_vdev_io_bypass(zio);
			zio_execute(zio);
			mutex_enter(&vq->vq_lock);
			goto again;
		}

		vdev_queue_pending_add(vq, zio);
		vq->vq_last_offset = zio->io_offset;

		return (zio);
	}

	/*
	* Entry point for submitting zio to its vdev's queue.
	*
	* Returns zio itself when it is flagged ZIO_FLAG_DONT_QUEUE.  Otherwise the
	* zio is queued and the queue is asked for the next i/o to issue: that i/o
	* is returned to the caller, unless there is none or it is an aggregate
	* (which is started here with zio_nowait()), in which case NULL is
	* returned.
	*
	* NOTE(review): this text is a rendered diff; lines prefixed with '-' and
	* '+' are the old and new versions of a changed line.
	*/
	zio_t *
	vdev_queue_io(zio_t *zio)
	{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
	return (zio);

	/*
	* Children i/os inherit their parent's priority, which might
	* not match the child's i/o type. Fix it up here.
	*/
	if (zio->io_type == ZIO_TYPE_READ) {
	if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
	zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
	- zio->io_priority != ZIO_PRIORITY_SCRUB)
	+ zio->io_priority != ZIO_PRIORITY_SCRUB &&
	+ zio->io_priority != ZIO_PRIORITY_REMOVAL)
	zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
	if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
	- zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
	+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
	+ zio->io_priority != ZIO_PRIORITY_REMOVAL)
	zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
	} else {
	/* Anything that is neither a read nor a write must be a free. */
	ASSERT(zio->io_type == ZIO_TYPE_FREE);
	zio->io_priority = ZIO_PRIORITY_TRIM;
	}

	/* DONT_QUEUE makes a later pass through this function return early. */
	zio->io_flags \|= ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_QUEUE;

	mutex_enter(&vq->vq_lock);
	zio->io_timestamp = gethrtime();
	vdev_queue_io_add(vq, zio);
	nio = vdev_queue_io_to_issue(vq);
	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
	return (NULL);

	/* Aggregates are recognized by their completion callback. */
	if (nio->io_done == vdev_queue_agg_io_done) {
	zio_nowait(nio);
	return (NULL);
	}

	return (nio);
	}

	/*
	 * Called when zio has completed: retire it from the active set, record
	 * the completion time, then keep issuing queued i/os for as long as the
	 * queue has one to offer.  The queue lock is dropped around each issue.
	 */
	void
	vdev_queue_io_done(zio_t *zio)
	{
		vdev_queue_t *queue = &zio->io_vd->vdev_queue;
		zio_t *next;

		mutex_enter(&queue->vq_lock);

		vdev_queue_pending_remove(queue, zio);
		queue->vq_io_complete_ts = gethrtime();

		for (;;) {
			next = vdev_queue_io_to_issue(queue);
			if (next == NULL)
				break;

			mutex_exit(&queue->vq_lock);
			if (next->io_done != vdev_queue_agg_io_done) {
				/* An ordinary queued zio. */
				zio_vdev_io_reissue(next);
				zio_execute(next);
			} else {
				/* An aggregate, recognized by its callback. */
				zio_nowait(next);
			}
			mutex_enter(&queue->vq_lock);
		}

		mutex_exit(&queue->vq_lock);
	}

	/*
	* As these three methods are only used for load calculations we're not concerned
	* if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
	* use here, instead we prefer to keep it lock free for performance.
	*/
	/* Number of i/os currently in vd's active tree; read without vq_lock. */
	int
	vdev_queue_length(vdev_t *vd)
	{
		vdev_queue_t *queue = &vd->vdev_queue;

		return (avl_numnodes(&queue->vq_active_tree));
	}

	/* Last offset registered for vd's queue; read without vq_lock. */
	uint64_t
	vdev_queue_lastoffset(vdev_t *vd)
	{
		vdev_queue_t *queue = &vd->vdev_queue;

		return (queue->vq_lastoffset);
	}

	/*
	 * Record the end offset of zio (io_offset + io_size) as vd's last
	 * offset.  Written without taking vq_lock.
	 */
	void
	vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
	{
		vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 332525)
	@@ -1,2598 +1,2599 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/vdev_impl.h>
	#ifdef illumos
	#include <sys/vdev_disk.h>
	#endif
	#include <sys/vdev_file.h>
	#include <sys/vdev_raidz.h>
	#include <sys/zio.h>
	#include <sys/zio_checksum.h>
	#include <sys/abd.h>
	#include <sys/fs/zfs.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/bio.h>

	/*
	* Virtual device vector for RAID-Z.
	*
	* This vdev supports single, double, and triple parity. For single parity,
	* we use a simple XOR of all the data columns. For double or triple parity,
	* we use a special case of Reed-Solomon coding. This extends the
	* technique described in "The mathematics of RAID-6" by H. Peter Anvin by
	* drawing on the system described in "A Tutorial on Reed-Solomon Coding for
	* Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
	* former is also based. The latter is designed to provide higher performance
	* for writes.
	*
	* Note that the Plank paper claimed to support arbitrary N+M, but was then
	* amended six years later identifying a critical flaw that invalidates its
	* claims. Nevertheless, the technique can be adapted to work for up to
	* triple parity. For additional parity, the amendment "Note: Correction to
	* the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
	* is viable, but the additional complexity means that write performance will
	* suffer.
	*
	* All of the methods above operate on a Galois field, defined over the
	* integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
	* can be expressed with a single byte. Briefly, the operations on the
	* field are defined as follows:
	*
	* o addition (+) is represented by a bitwise XOR
	* o subtraction (-) is therefore identical to addition: A + B = A - B
	* o multiplication of A by 2 is defined by the following bitwise expression:
	*
	* (A * 2)_7 = A_6
	* (A * 2)_6 = A_5
	* (A * 2)_5 = A_4
	* (A * 2)_4 = A_3 + A_7
	* (A * 2)_3 = A_2 + A_7
	* (A * 2)_2 = A_1 + A_7
	* (A * 2)_1 = A_0
	* (A * 2)_0 = A_7
	*
	* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
	* As an aside, this multiplication is derived from the error correcting
	* primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
	*
	* Observe that any number in the field (except for 0) can be expressed as a
	* power of 2 -- a generator for the field. We store a table of the powers of
	* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
	* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
	* than field addition). The inverse of a field element A (A^-1) is therefore
	* A ^ (255 - 1) = A^254.
	*
	* The up-to-three parity columns, P, Q, R over several data columns,
	* D_0, ... D_n-1, can be expressed by field operations:
	*
	* P = D_0 + D_1 + ... + D_n-2 + D_n-1
	* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
	* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
	* R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
	* = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
	*
	* We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
	* XOR operation, and 2 and 4 can be computed quickly and generate linearly-
	* independent coefficients. (There are no additional coefficients that have
	* this property which is why the uncorrected Plank method breaks down.)
	*
	* See the reconstruction code below for how P, Q and R can be used individually
	* or in concert to recover missing data columns.
	*/

	/* Per-column state of a RAID-Z I/O map. */
	typedef struct raidz_col {
		uint64_t rc_devidx;		/* child device index for I/O */
		uint64_t rc_offset;		/* device offset */
		uint64_t rc_size;		/* I/O size */
		abd_t *rc_abd;			/* I/O data */
		void *rc_gdata;			/* used to store the "good" version */
		int rc_error;			/* I/O error for this device */
		uint8_t rc_tried;		/* Did we attempt this I/O column? */
		uint8_t rc_skipped;		/* Did we skip this I/O column? */
	} raidz_col_t;

	typedef struct raidz_map {
	uint64_t rm_cols; /* Regular column count */
	uint64_t rm_scols; /* Count including skipped columns */
	uint64_t rm_bigcols; /* Number of oversized columns */
	uint64_t rm_asize; /* Actual total I/O size */
	uint64_t rm_missingdata; /* Count of missing data devices */
	uint64_t rm_missingparity; /* Count of missing parity devices */
	uint64_t rm_firstdatacol; /* First data column/parity count */
	uint64_t rm_nskip; /* Skipped sectors for padding */
	uint64_t rm_skipstart; /* Column index of padding start */
	abd_t rm_abd_copy; / rm_asize-buffer of copied data */
	uintptr_t rm_reports; /* # of referencing checksum reports */
	uint8_t rm_freed; /* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected; /* checksum error was injected */
	raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
	} raidz_map_t;

	/*
	* Identifiers for the P, Q and R parity described in the comment above
	* (presumably their column indices -- confirm against the users).
	*/
	#define VDEV_RAIDZ_P 0
	#define VDEV_RAIDZ_Q 1
	#define VDEV_RAIDZ_R 2

	/*
	* Field multiplication of a single byte by 2 and by 4: shift left and
	* XOR in 0x1d when bit 7 of the input was set.  The argument is evaluated
	* more than once, and the result is not masked to 8 bits, so callers
	* presumably rely on storing it into a uint8_t.
	*/
	#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
	#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

	/*
	* We provide a mechanism to perform the field multiplication operation on a
	* 64-bit value all at once rather than a byte at a time. This works by
	* creating a mask from the top bit in each byte and using that to
	* conditionally apply the XOR of 0x1d.
	*
	* Both macros modify x and mask in place and expand to a brace-enclosed
	* block rather than an expression.
	*/
	#define VDEV_RAIDZ_64MUL_2(x, mask) \
	{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	((mask) & 0x1d1d1d1d1d1d1d1d); \
	}

	#define VDEV_RAIDZ_64MUL_4(x, mask) \
	{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	}

	/*
	 * Add VDEV_LABEL_START_SIZE to offset x.  The argument is parenthesized
	 * so that an expression built from operators binding less tightly than
	 * '+' still has the constant added to its whole value.
	 */
	#define VDEV_LABEL_OFFSET(x) ((x) + VDEV_LABEL_START_SIZE)

	/*
	* Force reconstruction to use the general purpose method.
	*/
	int vdev_raidz_default_to_general;

	/* Powers of 2 in the Galois field defined above. */
	static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
	};
	/* Logs base 2 in the Galois field defined above. */
	static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
	};

	static void vdev_raidz_generate_parity(raidz_map_t *rm);

	/*
	 * Multiply a given number by 2 raised to the given power.
	 *
	 * Both a and 2^exp are turned into exponents via the log table, added,
	 * and looked up again in the power table.  Zero has no logarithm and is
	 * handled up front.
	 */
	static uint8_t
	vdev_raidz_exp2(uint_t a, int exp)
	{
		if (a == 0)
			return (0);

		ASSERT(exp >= 0);
		/* Only a == 1 legitimately has a log of 0. */
		ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

		exp += vdev_raidz_log2[a];
		/* A single wrap keeps the index in range for exp <= 255 on entry. */
		if (exp > 255)
			exp -= 255;

		return (vdev_raidz_pow2[exp]);
	}

	/*
	 * Release a raidz map and everything hanging off it: the parity
	 * columns' buffers and any saved "good" parity, the data columns'
	 * buffer references, the copied-aside data buffer, and the map itself.
	 */
	static void
	vdev_raidz_map_free(raidz_map_t *rm)
	{
		int c;

		/* Parity columns: free the buffer and any "good" copy. */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			if (rm->rm_col[c].rc_abd != NULL)
				abd_free(rm->rm_col[c].rc_abd);

			if (rm->rm_col[c].rc_gdata != NULL)
				zio_buf_free(rm->rm_col[c].rc_gdata,
				    rm->rm_col[c].rc_size);
		}

		/* Data columns are released with abd_put() rather than freed. */
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			if (rm->rm_col[c].rc_abd != NULL)
				abd_put(rm->rm_col[c].rc_abd);
		}

		if (rm->rm_abd_copy != NULL)
			abd_free(rm->rm_abd_copy);

		/* The map was sized for rm_scols columns. */
		kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
	}

	/*
	 * The zio is letting go of its raidz map.  Mark the map as freed; the
	 * memory itself is released here only when no checksum report still
	 * refers to it.
	 */
	static void
	vdev_raidz_map_free_vsd(zio_t *zio)
	{
		raidz_map_t *map = zio->io_vsd;

		ASSERT0(map->rm_freed);
		map->rm_freed = 1;

		/* Outstanding reports keep the map alive for now. */
		if (map->rm_reports != 0)
			return;

		vdev_raidz_map_free(map);
	}

	/ARGSUSED/
	static void
	vdev_raidz_cksum_free(void *arg, size_t ignored)
	{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
	vdev_raidz_map_free(rm);
	}

	/*
	 * Finish a checksum report for column zcr->zcr_cbinfo of the raidz map
	 * stored in zcr->zcr_cbdata.
	 *
	 * zcr:       the report being completed.
	 * good_data: the known-good logical data, or NULL when none is
	 *            available, in which case the report is finished without a
	 *            good/bad comparison.
	 *
	 * For a data column the good bytes are located inside good_data.  For a
	 * parity column the good parity is regenerated from good_data on first
	 * use and cached in rc_gdata.
	 */
	static void
	vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
	{
		raidz_map_t *rm = zcr->zcr_cbdata;
		size_t c = zcr->zcr_cbinfo;
		size_t x;

		const char *good = NULL;
		char *bad;

		if (good_data == NULL) {
			zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
			return;
		}

		if (c < rm->rm_firstdatacol) {
			/*
			 * The first time through, calculate the parity blocks for
			 * the good data (this relies on the fact that the good
			 * data never changes for a given logical ZIO)
			 */
			if (rm->rm_col[0].rc_gdata == NULL) {
				abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
				char *buf;
				int offset;

				/*
				 * Set up the rm_col[]s to generate the parity for
				 * good_data, first saving the parity bufs and
				 * replacing them with buffers to hold the result.
				 */
				for (x = 0; x < rm->rm_firstdatacol; x++) {
					bad_parity[x] = rm->rm_col[x].rc_abd;
					rm->rm_col[x].rc_gdata =
					    zio_buf_alloc(rm->rm_col[x].rc_size);
					rm->rm_col[x].rc_abd =
					    abd_get_from_buf(rm->rm_col[x].rc_gdata,
					    rm->rm_col[x].rc_size);
				}

				/* fill in the data columns from good_data */
				buf = (char *)good_data;
				for (; x < rm->rm_cols; x++) {
					abd_put(rm->rm_col[x].rc_abd);
					rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
					    rm->rm_col[x].rc_size);
					buf += rm->rm_col[x].rc_size;
				}

				/*
				 * Construct the parity from the good data.
				 */
				vdev_raidz_generate_parity(rm);

				/* restore everything back to its original state */
				for (x = 0; x < rm->rm_firstdatacol; x++) {
					abd_put(rm->rm_col[x].rc_abd);
					rm->rm_col[x].rc_abd = bad_parity[x];
				}

				/* Point the data columns back into rm_abd_copy. */
				offset = 0;
				for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
					abd_put(rm->rm_col[x].rc_abd);
					rm->rm_col[x].rc_abd = abd_get_offset(
					    rm->rm_abd_copy, offset);
					offset += rm->rm_col[x].rc_size;
				}
			}

			ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
			good = rm->rm_col[c].rc_gdata;
		} else {
			/* adjust good_data to point at the start of our column */
			good = good_data;

			for (x = rm->rm_firstdatacol; x < c; x++)
				good += rm->rm_col[x].rc_size;
		}

		bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
		/* we drop the ereport if it ends up that the data was good */
		zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
		abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
	}

	/*
	 * Invoked indirectly by zfs_ereport_start_checksum(), called
	 * below when our read operation fails completely.  The main point
	 * is to keep a copy of everything we read from disk, so that at
	 * vdev_raidz_cksum_finish() time we can compare it with the good data.
	 *
	 * zio: the zio whose io_vsd is the raidz map being reported on.
	 * zcr: the report to fill in; it takes a reference on the map.
	 * arg: the column index, passed as a pointer-sized integer.
	 */
	static void
	vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
	{
		size_t c = (size_t)(uintptr_t)arg;
		size_t offset;

		raidz_map_t *rm = zio->io_vsd;
		size_t size;

		/* set up the report and bump the refcount */
		zcr->zcr_cbdata = rm;
		zcr->zcr_cbinfo = c;
		zcr->zcr_finish = vdev_raidz_cksum_finish;
		zcr->zcr_free = vdev_raidz_cksum_free;

		rm->rm_reports++;
		ASSERT3U(rm->rm_reports, >, 0);

		/* The data was already copied aside by an earlier report. */
		if (rm->rm_abd_copy != NULL)
			return;

		/*
		 * It's the first time we're called for this raidz_map_t, so we need
		 * to copy the data aside; there's no guarantee that our zio's buffer
		 * won't be re-used for something else.
		 *
		 * Our parity data is already in separate buffers, so there's no need
		 * to copy them.
		 */

		size = 0;
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
			size += rm->rm_col[c].rc_size;

		rm->rm_abd_copy =
		    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

		/* Re-point every data column at its slice of the private copy. */
		for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];
			abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

			abd_copy(tmp, col->rc_abd, col->rc_size);
			abd_put(col->rc_abd);
			col->rc_abd = tmp;

			offset += col->rc_size;
		}
		ASSERT3U(offset, ==, size);
	}

	/*
	* RAID-Z callbacks for a zio's vdev-specific data, initialized
	* positionally: first the routine that releases the raidz map, then the
	* routine that prepares a checksum report.
	*/
	static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
	};

	/*
	* Divides the IO evenly across all child vdevs; usually, dcols is
	* the number of children in the target vdev.
	*
	* Allocates (kmem_alloc) and returns a raidz_map_t with room for scols
	* columns and fills in the per-column device index, offset and size.
	*
	* abd        - buffer the data columns are taken from (ignored if dofree)
	* size       - I/O size in bytes
	* offset     - byte offset of the I/O on the RAID-Z vdev
	* dofree     - when set, every column's rc_abd is left NULL
	* unit_shift - shift used to convert between bytes and sectors
	* dcols      - number of columns the I/O is spread over
	* nparity    - number of parity columns
	*
	* dcols must be greater than nparity: the quotient below divides by
	* (dcols - nparity).
	*/
	static raidz_map_t *
	vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
	uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
	{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	* "Quotient": The number of data sectors for this stripe on all but
	* the "big column" child vdevs that also contain "remainder" data.
	*/
	q = s / (dcols - nparity);

	/*
	* "Remainder": The number of partial stripe data sectors in this I/O.
	* This will add a sector to some, but not all, child vdevs.
	*/
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	* The total number of data and parity sectors associated with
	* this I/O.
	*/
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
	/* Our I/O request doesn't span all child vdevs. */
	acols = bc;
	scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
	acols = dcols;
	scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* The map ends in a variable-length column array; size it for scols. */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	* Lay out each column starting at child f. A column index that runs
	* past the last child wraps back to child 0 and lands one sector
	* further into that child.
	*/
	for (c = 0; c < scols; c++) {
	col = f + c;
	coff = o;
	if (col >= dcols) {
	col -= dcols;
	coff += 1ULL << unit_shift;
	}
	rm->rm_col[c].rc_devidx = col;
	rm->rm_col[c].rc_offset = coff;
	rm->rm_col[c].rc_abd = NULL;
	rm->rm_col[c].rc_gdata = NULL;
	rm->rm_col[c].rc_error = 0;
	rm->rm_col[c].rc_tried = 0;
	rm->rm_col[c].rc_skipped = 0;

	/*
	* Skipped-only columns carry no bytes, "big" columns carry one
	* sector more than the rest.
	*/
	if (c >= acols)
	rm->rm_col[c].rc_size = 0;
	else if (c < bc)
	rm->rm_col[c].rc_size = (q + 1) << unit_shift;
	else
	rm->rm_col[c].rc_size = q << unit_shift;

	asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	* Attach buffers unless this map is for a free: parity columns get
	* their own linear buffers, data columns are obtained from the
	* caller's abd via abd_get_offset() at consecutive offsets.
	*/
	if (!dofree) {
	for (c = 0; c < rm->rm_firstdatacol; c++) {
	rm->rm_col[c].rc_abd =
	abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
	}

	/* c now indexes the first data column. */
	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
	rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
	off += rm->rm_col[c].rc_size;
	}
	}

	/*
	* If all data stored spans all columns, there's a danger that parity
	* will always be on the same device and, since parity isn't read
	* during normal operation, that that device's I/O bandwidth won't be
	* used effectively. We therefore switch the parity every 1MB.
	*
	* ... at least that was, ostensibly, the theory. As a practical
	* matter unless we juggle the parity between all devices evenly, we
	* won't see any benefit. Further, occasional writes that aren't a
	* multiple of the LCM of the number of children and the minimum
	* stripe width are sufficient to avoid pessimal behavior.
	* Unfortunately, this decision created an implicit on-disk format
	* requirement that we need to support for all eternity, but only
	* for single-parity RAID-Z.
	*
	* If we intend to skip a sector in the zeroth column for padding
	* we must make sure to note this swap. We will never intend to
	* skip the first column since at least one data and one parity
	* column must appear in each row.
	*/
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	/* Swap the device/offset of columns 0 and 1 when bit 20 of offset is set. */
	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
	devidx = rm->rm_col[0].rc_devidx;
	o = rm->rm_col[0].rc_offset;
	rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
	rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
	rm->rm_col[1].rc_devidx = devidx;
	rm->rm_col[1].rc_offset = o;

	if (rm->rm_skipstart == 0)
	rm->rm_skipstart = 1;
	}

	return (rm);
	}

	/*
	* Cursor state passed as the "private" argument to the
	* vdev_raidz_p_func / vdev_raidz_pq_func / vdev_raidz_pqr_func callbacks.
	* Each member is the current position in the corresponding parity buffer;
	* the callbacks assert that members they do not use are NULL.
	*/
	struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
	};

	/*
	 * abd_iterate_func() callback used by vdev_raidz_generate_parity_p():
	 * XORs one chunk of a data column into the P parity buffer.
	 *
	 * buf     - chunk of source data, read as 64-bit words
	 * size    - length of the chunk in bytes
	 * private - struct pqr_struct cursor; only p may be set. p is advanced
	 *           past the words written so the next chunk continues from there.
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_p_func(void *buf, size_t size, void *private)
	{
		struct pqr_struct *pqr = private;
		const uint64_t *src = buf;
		int i, cnt = size / sizeof (src[0]);

		ASSERT(pqr->p && !pqr->q && !pqr->r);

		for (i = 0; i < cnt; i++, src++, pqr->p++)
			*pqr->p ^= *src;

		return (0);
	}

	/*
	 * abd_iterate_func() callback used by vdev_raidz_generate_parity_pq():
	 * folds one chunk of a data column into the P and Q parity buffers.
	 *
	 * For every 64-bit word: P ^= data; Q is first run through
	 * VDEV_RAIDZ_64MUL_2 (macro defined elsewhere) and then XORed with data.
	 *
	 * buf     - chunk of source data, read as 64-bit words
	 * size    - length of the chunk in bytes
	 * private - struct pqr_struct cursor with p and q set and r NULL; both
	 *           cursors are advanced past the words processed.
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_pq_func(void *buf, size_t size, void *private)
	{
		struct pqr_struct *pqr = private;
		const uint64_t *src = buf;
		uint64_t mask;
		int i, cnt = size / sizeof (src[0]);

		ASSERT(pqr->p && pqr->q && !pqr->r);

		for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
			*pqr->p ^= *src;
			VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
			*pqr->q ^= *src;
		}

		return (0);
	}

	/*
	 * abd_iterate_func() callback used by vdev_raidz_generate_parity_pqr():
	 * folds one chunk of a data column into the P, Q and R parity buffers.
	 *
	 * For every 64-bit word: P ^= data; Q is run through VDEV_RAIDZ_64MUL_2
	 * and R through VDEV_RAIDZ_64MUL_4 (macros defined elsewhere) before
	 * each is XORed with data.
	 *
	 * buf     - chunk of source data, read as 64-bit words
	 * size    - length of the chunk in bytes
	 * private - struct pqr_struct cursor with p, q and r all set; all three
	 *           cursors are advanced past the words processed.
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_pqr_func(void *buf, size_t size, void *private)
	{
		struct pqr_struct *pqr = private;
		const uint64_t *src = buf;
		uint64_t mask;
		int i, cnt = size / sizeof (src[0]);

		ASSERT(pqr->p && pqr->q && pqr->r);

		for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
			*pqr->p ^= *src;
			VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
			*pqr->q ^= *src;
			VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
			*pqr->r ^= *src;
		}

		return (0);
	}

	/*
	 * Compute single (P) parity for the map: the first data column is copied
	 * into the P buffer and every later data column is XORed into it through
	 * vdev_raidz_p_func.
	 */
	static void
	vdev_raidz_generate_parity_p(raidz_map_t *rm)
	{
		int col;

		for (col = rm->rm_firstdatacol; col < rm->rm_cols; col++) {
			abd_t *data = rm->rm_col[col].rc_abd;
			uint64_t *parity =
			    abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

			if (col != rm->rm_firstdatacol) {
				/* Later columns accumulate into P. */
				struct pqr_struct cursor = { parity, NULL, NULL };

				(void) abd_iterate_func(data, 0,
				    rm->rm_col[col].rc_size, vdev_raidz_p_func,
				    &cursor);
				continue;
			}

			/* The first data column seeds the parity buffer. */
			abd_copy_to_buf(parity, data, rm->rm_col[col].rc_size);
		}
	}

	/*
	 * Compute double (P and Q) parity for the map.
	 *
	 * The first data column seeds both parity buffers; each later column is
	 * folded in through vdev_raidz_pq_func. When a data column is shorter
	 * than the parity columns, the missing tail is handled as if the column
	 * were padded with zeros.
	 */
	static void
	vdev_raidz_generate_parity_pq(raidz_map_t *rm)
	{
		uint64_t *p, *q, pcnt, ccnt, mask, i;
		int c;
		abd_t *src;

		/* Parity column length in 64-bit words. */
		pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
		ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
		    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			src = rm->rm_col[c].rc_abd;
			p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
			q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

			/* This data column's length in 64-bit words. */
			ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

			if (c == rm->rm_firstdatacol) {
				abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
				(void) memcpy(q, p, rm->rm_col[c].rc_size);
			} else {
				struct pqr_struct pqr = { p, q, NULL };
				(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
				    vdev_raidz_pq_func, &pqr);
			}

			if (c == rm->rm_firstdatacol) {
				/* Zero whatever the first column did not cover. */
				for (i = ccnt; i < pcnt; i++) {
					p[i] = 0;
					q[i] = 0;
				}
			} else {
				/*
				 * Treat short columns as though they are full of 0s.
				 * Note that there's therefore nothing needed for P.
				 */
				for (i = ccnt; i < pcnt; i++) {
					VDEV_RAIDZ_64MUL_2(q[i], mask);
				}
			}
		}
	}

	/*
	 * Compute triple (P, Q and R) parity for the map.
	 *
	 * The first data column seeds all three parity buffers; each later
	 * column is folded in through vdev_raidz_pqr_func. When a data column is
	 * shorter than the parity columns, the missing tail is handled as if the
	 * column were padded with zeros.
	 */
	static void
	vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
	{
		uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
		int c;
		abd_t *src;

		/* Parity column length in 64-bit words. */
		pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
		ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
		    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
		ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
		    rm->rm_col[VDEV_RAIDZ_R].rc_size);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			src = rm->rm_col[c].rc_abd;
			p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
			q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
			r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

			/* This data column's length in 64-bit words. */
			ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

			if (c == rm->rm_firstdatacol) {
				abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
				(void) memcpy(q, p, rm->rm_col[c].rc_size);
				(void) memcpy(r, p, rm->rm_col[c].rc_size);
			} else {
				struct pqr_struct pqr = { p, q, r };
				(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
				    vdev_raidz_pqr_func, &pqr);
			}

			if (c == rm->rm_firstdatacol) {
				/* Zero whatever the first column did not cover. */
				for (i = ccnt; i < pcnt; i++) {
					p[i] = 0;
					q[i] = 0;
					r[i] = 0;
				}
			} else {
				/*
				 * Treat short columns as though they are full of 0s.
				 * Note that there's therefore nothing needed for P.
				 */
				for (i = ccnt; i < pcnt; i++) {
					VDEV_RAIDZ_64MUL_2(q[i], mask);
					VDEV_RAIDZ_64MUL_4(r[i], mask);
				}
			}
		}
	}

	/*
	 * Fill in the parity columns of the map. The number of parity columns
	 * (rm_firstdatacol) selects the P, PQ or PQR generator; any other value
	 * is reported through cmn_err(CE_PANIC, ...).
	 */
	static void
	vdev_raidz_generate_parity(raidz_map_t *rm)
	{
		if (rm->rm_firstdatacol == 1) {
			vdev_raidz_generate_parity_p(rm);
		} else if (rm->rm_firstdatacol == 2) {
			vdev_raidz_generate_parity_pq(rm);
		} else if (rm->rm_firstdatacol == 3) {
			vdev_raidz_generate_parity_pqr(rm);
		} else {
			cmn_err(CE_PANIC, "invalid RAID-Z configuration");
		}
	}

	/*
	 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_p():
	 * XORs the source chunk into the destination chunk, one 64-bit word at a
	 * time.
	 *
	 * dbuf    - destination chunk, updated in place
	 * sbuf    - source chunk, only read
	 * size    - length of both chunks in bytes; a trailing partial word is
	 *           ignored
	 * private - unused
	 *
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
	{
		uint64_t *dst = dbuf;
		uint64_t *src = sbuf;
		int cnt = size / sizeof (src[0]);

		for (int i = 0; i < cnt; i++) {
			dst[i] ^= src[i];
		}

		return (0);
	}

	/*
	 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_q(): for
	 * each 64-bit word, runs the destination word through VDEV_RAIDZ_64MUL_2
	 * (macro defined elsewhere) and then XORs in the source word.
	 *
	 * dbuf    - destination chunk, updated in place
	 * sbuf    - source chunk, only read
	 * size    - length of both chunks in bytes
	 * private - unused
	 *
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
	    void *private)
	{
		uint64_t *dst = dbuf;
		uint64_t *src = sbuf;
		uint64_t mask;
		int cnt = size / sizeof (dst[0]);

		for (int i = 0; i < cnt; i++, dst++, src++) {
			VDEV_RAIDZ_64MUL_2(*dst, mask);
			*dst ^= *src;
		}

		return (0);
	}

	/*
	 * abd_iterate_func() callback used by vdev_raidz_reconstruct_q() for the
	 * part of the target column that extends past the end of a shorter
	 * source column: applies VDEV_RAIDZ_64MUL_2 to each 64-bit word with no
	 * source data to XOR in.
	 *
	 * buf     - destination chunk, updated in place
	 * size    - length of the chunk in bytes
	 * private - unused
	 *
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
	{
		uint64_t *dst = buf;
		uint64_t mask;
		int cnt = size / sizeof (dst[0]);

		for (int i = 0; i < cnt; i++, dst++) {
			/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
			VDEV_RAIDZ_64MUL_2(*dst, mask);
		}

		return (0);
	}

	/*
	* State passed as "private" to vdev_raidz_reconst_q_post_func():
	* q   - read cursor into the Q parity buffer, advanced by the callback
	* exp - exponent handed to vdev_raidz_exp2() for every byte
	*/
	struct reconst_q_struct {
	uint64_t *q;
	int exp;
	};

	/*
	 * abd_iterate_func() callback that finishes Q-based reconstruction of a
	 * single column: each 64-bit destination word is XORed with the matching
	 * Q parity word, then each of its 8 bytes is replaced by
	 * vdev_raidz_exp2(byte, rq->exp).
	 *
	 * buf     - chunk of the column being reconstructed, updated in place
	 * size    - length of the chunk in bytes
	 * private - struct reconst_q_struct; its q cursor is advanced past the
	 *           words consumed
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
	{
		struct reconst_q_struct *rq = private;
		uint64_t *dst = buf;
		int cnt = size / sizeof (dst[0]);

		for (int i = 0; i < cnt; i++, dst++, rq->q++) {
			*dst ^= *rq->q;

			int j;
			uint8_t *b;
			for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
				*b = vdev_raidz_exp2(*b, rq->exp);
			}
		}

		return (0);
	}

	/*
	* State passed as "private" to vdev_raidz_reconst_pq_func() and
	* vdev_raidz_reconst_pq_tail_func(). All four pointers are byte cursors
	* that the callbacks advance:
	* p, q     - the saved (on-disk) P and Q parity
	* pxy, qxy - P and Q regenerated with the two target columns treated as
	*            zero length
	* aexp, bexp - exponents handed to vdev_raidz_exp2()
	*/
	struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
	};

	/*
	 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_pq() to
	 * rebuild two data columns byte by byte:
	 *   x = exp2(P ^ Pxy, aexp) ^ exp2(Q ^ Qxy, bexp)
	 *   y = P ^ Pxy ^ x
	 *
	 * xbuf    - chunk of the first missing column, written
	 * ybuf    - chunk of the second missing column, written
	 * size    - length of both chunks in bytes
	 * private - struct reconst_pq_struct; its four cursors are advanced by
	 *           size bytes
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
	{
		struct reconst_pq_struct *rpq = private;
		uint8_t *xd = xbuf;
		uint8_t *yd = ybuf;

		for (int i = 0; i < size;
		    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
			*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
			    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
			*yd = *rpq->p ^ *rpq->pxy ^ *xd;
		}

		return (0);
	}

	/*
	 * abd_iterate_func() callback used by vdev_raidz_reconstruct_pq() for the
	 * part of column x that extends past the end of the shorter column y:
	 * computes the x byte exactly as vdev_raidz_reconst_pq_func() does, with
	 * no y byte to write.
	 *
	 * xbuf    - chunk of the first missing column, written
	 * size    - length of the chunk in bytes
	 * private - struct reconst_pq_struct; its four cursors are advanced by
	 *           size bytes
	 *
	 * Always returns 0.
	 */
	static int
	vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
	{
		struct reconst_pq_struct *rpq = private;
		uint8_t *xd = xbuf;

		for (int i = 0; i < size;
		    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
			/* same operation as vdev_raidz_reconst_pq_func() on xd */
			*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
			    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		}

		return (0);
	}

	/*
	 * Rebuild one data column from P parity.
	 *
	 * The target column is first overwritten with P, then every other data
	 * column is XORed into it (limited to the shorter of the two lengths).
	 *
	 * rm    - the RAID-Z map
	 * tgts  - tgts[0] is the index of the data column to rebuild
	 * ntgts - must be 1
	 *
	 * Returns a bitmask naming the parity used: 1 << VDEV_RAIDZ_P.
	 */
	static int
	vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
	{
		int x = tgts[0];
		int c;
		abd_t *dst, *src;

		ASSERT(ntgts == 1);
		ASSERT(x >= rm->rm_firstdatacol);
		ASSERT(x < rm->rm_cols);

		ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
		ASSERT(rm->rm_col[x].rc_size > 0);

		src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		abd_copy(dst, src, rm->rm_col[x].rc_size);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			uint64_t size = MIN(rm->rm_col[x].rc_size,
			    rm->rm_col[c].rc_size);

			src = rm->rm_col[c].rc_abd;
			dst = rm->rm_col[x].rc_abd;

			/* The target itself contributes nothing. */
			if (c == x)
				continue;

			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_p_func, NULL);
		}

		return (1 << VDEV_RAIDZ_P);
	}

	/*
	 * Rebuild one data column from Q parity.
	 *
	 * The target column is used as scratch space: Q is recomputed into it
	 * over all data columns with the target treated as zero length, then
	 * vdev_raidz_reconst_q_post_func combines that with the stored Q and
	 * applies the per-byte exponent derived from the target's position.
	 *
	 * rm    - the RAID-Z map
	 * tgts  - tgts[0] is the index of the column to rebuild
	 * ntgts - must be 1
	 *
	 * Returns a bitmask naming the parity used: 1 << VDEV_RAIDZ_Q.
	 */
	static int
	vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
	{
		int x = tgts[0];
		int c, exp;
		abd_t *dst, *src;

		ASSERT(ntgts == 1);

		ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			/* The target column counts as empty while rebuilding Q. */
			uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
			    rm->rm_col[c].rc_size);

			src = rm->rm_col[c].rc_abd;
			dst = rm->rm_col[x].rc_abd;

			if (c == rm->rm_firstdatacol) {
				abd_copy(dst, src, size);
				if (rm->rm_col[x].rc_size > size)
					abd_zero_off(dst, size,
					    rm->rm_col[x].rc_size - size);
			} else {
				ASSERT3U(size, <=, rm->rm_col[x].rc_size);
				(void) abd_iterate_func2(dst, src, 0, 0, size,
				    vdev_raidz_reconst_q_pre_func, NULL);
				(void) abd_iterate_func(dst,
				    size, rm->rm_col[x].rc_size - size,
				    vdev_raidz_reconst_q_pre_tail_func, NULL);
			}
		}

		src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
		dst = rm->rm_col[x].rc_abd;
		exp = 255 - (rm->rm_cols - 1 - x);

		struct reconst_q_struct rq = { abd_to_buf(src), exp };
		(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
		    vdev_raidz_reconst_q_post_func, &rq);

		return (1 << VDEV_RAIDZ_Q);
	}

	/*
	 * Rebuild two data columns from P and Q parity.
	 *
	 * rm    - the RAID-Z map
	 * tgts  - tgts[0] and tgts[1] are the data columns to rebuild, in
	 *         ascending order
	 * ntgts - must be 2
	 *
	 * The stored parity buffers are swapped out for temporary ones while
	 * parity is regenerated, and swapped back before returning.
	 *
	 * Returns a bitmask naming the parity used:
	 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q).
	 */
	static int
	vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
	{
		uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
		abd_t *pdata, *qdata;
		uint64_t xsize, ysize;
		int x = tgts[0];
		int y = tgts[1];
		abd_t *xd, *yd;

		ASSERT(ntgts == 2);
		ASSERT(x < y);
		ASSERT(x >= rm->rm_firstdatacol);
		ASSERT(y < rm->rm_cols);

		ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

		/*
		 * Move the parity data aside -- we're going to compute parity as
		 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
		 * reuse the parity generation mechanism without trashing the actual
		 * parity so we make those columns appear to be full of zeros by
		 * setting their lengths to zero.
		 */
		pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
		qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
		xsize = rm->rm_col[x].rc_size;
		ysize = rm->rm_col[y].rc_size;

		rm->rm_col[VDEV_RAIDZ_P].rc_abd =
		    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
		rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
		    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
		rm->rm_col[x].rc_size = 0;
		rm->rm_col[y].rc_size = 0;

		vdev_raidz_generate_parity_pq(rm);

		rm->rm_col[x].rc_size = xsize;
		rm->rm_col[y].rc_size = ysize;

		p = abd_to_buf(pdata);
		q = abd_to_buf(qdata);
		pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		xd = rm->rm_col[x].rc_abd;
		yd = rm->rm_col[y].rc_abd;

		/*
		 * We now have:
		 *	Pxy = P + D_x + D_y
		 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
		 *
		 * We can then solve for D_x:
		 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
		 * where
		 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
		 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
		 *
		 * With D_x in hand, we can easily solve for D_y:
		 *	D_y = P + Pxy + D_x
		 */

		a = vdev_raidz_pow2[255 + x - y];
		b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
		tmp = 255 - vdev_raidz_log2[a ^ 1];

		aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
		bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

		ASSERT3U(xsize, >=, ysize);
		struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
		/* Both columns over their common length, then the rest of x. */
		(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
		    vdev_raidz_reconst_pq_func, &rpq);
		(void) abd_iterate_func(xd, ysize, xsize - ysize,
		    vdev_raidz_reconst_pq_tail_func, &rpq);

		abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		/*
		 * Restore the saved parity data.
		 */
		rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
		rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

		return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
	}

	/* BEGIN CSTYLED */
	/*
	* In the general case of reconstruction, we must solve the system of linear
	* equations defined by the coefficients used to generate parity as well as
	* the contents of the data and parity disks. This can be expressed with
	* vectors for the original data (D) and the actual data (d) and parity (p)
	* and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
	*
	*            __   __                     __     __
	*            |     |         __     __   |  p_0  |
	*            |  V  |         |  D_0  |   | p_m-1 |
	*            |     |    x    |   :   | = |  d_0  |
	*            |  I  |         | D_n-1 |   |   :   |
	*            |     |         ~~     ~~   | d_n-1 |
	*            ~~   ~~                     ~~     ~~
	*
	* I is simply a square identity matrix of size n, and V is a vandermonde
	* matrix defined by the coefficients we chose for the various parity columns
	* (1, 2, 4). Note that these values were chosen both for simplicity, speedy
	* computation as well as linear separability.
	*
	*     __                     __                   __     __
	*     |   1   ..  1   1   1   |                   |  p_0  |
	*     | 2^n-1 ..  4   2   1   |     __     __     |   :   |
	*     | 4^n-1 ..  16  4   1   |     |  D_0  |     | p_m-1 |
	*     |   1   ..  0   0   0   |     |  D_1  |     |  d_0  |
	*     |   0   ..  0   0   0   |  x  |  D_2  |  =  |  d_1  |
	*     |   :       :   :   :   |     |   :   |     |  d_2  |
	*     |   0   ..  1   0   0   |     | D_n-1 |     |   :   |
	*     |   0   ..  0   1   0   |     ~~     ~~     |   :   |
	*     |   0   ..  0   0   1   |                   | d_n-1 |
	*     ~~                     ~~                   ~~     ~~
	*
	* Note that I, V, d, and p are known. To compute D, we must invert the
	* matrix and use the known data and parity values to reconstruct the unknown
	* data values. We begin by removing the rows in V|I and d|p that correspond
	* to failed or missing columns; we then make V|I square (n x n) and d|p
	* sized n by removing rows corresponding to unused parity from the bottom up
	* to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
	* using Gauss-Jordan elimination. In the example below we use m=3 parity
	* columns, n=8 data columns, with errors in d_1, d_2, and p_1:
	* __ __
	* \| 1 1 1 1 1 1 1 1 \|
	* \| 128 64 32 16 8 4 2 1 \| <-----+-+-- missing disks
	* \| 19 205 116 29 64 16 4 1 \| / /
	* \| 1 0 0 0 0 0 0 0 \| / /
	* \| 0 1 0 0 0 0 0 0 \| <--' /
	* (V\|I) = \| 0 0 1 0 0 0 0 0 \| <---'
	* \| 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 1 1 1 1 1 1 1 \|
	* \| 19 205 116 29 64 16 4 1 \|
	* \| 1 0 0 0 0 0 0 0 \|
	* (V\|I)' = \| 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	*
	* Here we employ Gauss-Jordan elimination to find the inverse of (V\|I)'. We
	* have carefully chosen the seed values 1, 2, and 4 to ensure that this
	* matrix is not singular.
	* __ __
	* \| 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 \|
	* \| 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 \|
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 \|
	* \| 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 \|
	* \| 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 \|
	* \| 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 \|
	* \| 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 \|
	* \| 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 \|
	* \| 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 \|
	* \| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	* __ __
	* \| 0 0 1 0 0 0 0 0 \|
	* \| 167 100 5 41 159 169 217 208 \|
	* \| 166 100 4 40 158 168 216 209 \|
	* (V\|I)'^-1 = \| 0 0 0 1 0 0 0 0 \|
	* \| 0 0 0 0 1 0 0 0 \|
	* \| 0 0 0 0 0 1 0 0 \|
	* \| 0 0 0 0 0 0 1 0 \|
	* \| 0 0 0 0 0 0 0 1 \|
	* ~~ ~~
	*
	* We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
	* of the missing data.
	*
	* As is apparent from the example above, the only non-trivial rows in the
	* inverse matrix correspond to the data disks that we're trying to
	* reconstruct. Indeed, those are the only rows we need as the others would
	* only be useful for reconstructing data known or assumed to be valid. For
	* that reason, we only build the coefficients in the rows that correspond to
	* targeted columns.
	*/
	/* END CSTYLED */

	/*
	 * Fill in the rows of the dispersal matrix that correspond to the parity
	 * columns chosen for reconstruction.
	 *
	 * rm   - the RAID-Z map (only used to sanity-check n)
	 * n    - number of data columns
	 * nmap - number of rows to fill
	 * map  - map[i] is the parity column (0, 1 or 2) that row i represents
	 * rows - nmap rows of n bytes each, written by this function
	 *
	 * Row i receives vdev_raidz_pow2[] entries whose exponent starts at
	 * map[i] * (n - 1) and decreases by map[i] per column, kept within
	 * 0..255 by wrapping at 255.
	 */
	static void
	vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
	    uint8_t **rows)
	{
		int i, j;
		int pow;

		ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

		/*
		 * Fill in the missing rows of interest.
		 */
		for (i = 0; i < nmap; i++) {
			ASSERT3S(0, <=, map[i]);
			ASSERT3S(map[i], <=, 2);

			pow = map[i] * n;
			if (pow > 255)
				pow -= 255;
			ASSERT(pow <= 255);

			for (j = 0; j < n; j++) {
				pow -= map[i];
				if (pow < 0)
					pow += 255;
				rows[i][j] = vdev_raidz_pow2[pow];
			}
		}
	}

	/*
	 * Compute the rows of the inverse matrix needed to rebuild the missing
	 * data columns, using Gauss-Jordan elimination.
	 *
	 * rm       - the RAID-Z map
	 * n        - number of data columns
	 * nmissing - number of missing data columns (rows being solved)
	 * missing  - missing[i] is the data-relative index of the i-th missing
	 *            column
	 * rows     - matrix rows from vdev_raidz_matrix_init(); modified in
	 *            place and reduced to identity rows on return
	 * invrows  - output: nmissing rows of n coefficients
	 * used     - the n columns that will supply data: first nmissing parity
	 *            columns, then the surviving data columns
	 */
	static void
	vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
	    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
	{
		int i, j, ii, jj;
		uint8_t log;

		/*
		 * Assert that the first nmissing entries from the array of used
		 * columns correspond to parity columns and that subsequent entries
		 * correspond to data columns.
		 */
		for (i = 0; i < nmissing; i++) {
			ASSERT3S(used[i], <, rm->rm_firstdatacol);
		}
		for (; i < n; i++) {
			ASSERT3S(used[i], >=, rm->rm_firstdatacol);
		}

		/*
		 * First initialize the storage where we'll compute the inverse rows.
		 */
		for (i = 0; i < nmissing; i++) {
			for (j = 0; j < n; j++) {
				invrows[i][j] = (i == j) ? 1 : 0;
			}
		}

		/*
		 * Subtract all trivial rows from the rows of consequence.
		 */
		for (i = 0; i < nmissing; i++) {
			for (j = nmissing; j < n; j++) {
				ASSERT3U(used[j], >=, rm->rm_firstdatacol);
				jj = used[j] - rm->rm_firstdatacol;
				ASSERT3S(jj, <, n);
				invrows[i][j] = rows[i][jj];
				rows[i][jj] = 0;
			}
		}

		/*
		 * For each of the rows of interest, we must normalize it and subtract
		 * a multiple of it from the other rows.
		 */
		for (i = 0; i < nmissing; i++) {
			for (j = 0; j < missing[i]; j++) {
				ASSERT0(rows[i][j]);
			}
			ASSERT3U(rows[i][missing[i]], !=, 0);

			/*
			 * Compute the inverse of the first element and multiply each
			 * element in the row by that value.
			 */
			log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
				invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
			}

			for (ii = 0; ii < nmissing; ii++) {
				if (i == ii)
					continue;

				ASSERT3U(rows[ii][missing[i]], !=, 0);

				log = vdev_raidz_log2[rows[ii][missing[i]]];

				for (j = 0; j < n; j++) {
					rows[ii][j] ^=
					    vdev_raidz_exp2(rows[i][j], log);
					invrows[ii][j] ^=
					    vdev_raidz_exp2(invrows[i][j], log);
				}
			}
		}

		/*
		 * Verify that the data that is left in the rows are properly part of
		 * an identity matrix.
		 */
		for (i = 0; i < nmissing; i++) {
			for (j = 0; j < n; j++) {
				if (j == missing[i]) {
					ASSERT3U(rows[i][j], ==, 1);
				} else {
					ASSERT0(rows[i][j]);
				}
			}
		}
	}

	/*
	 * Rebuild the missing data columns by multiplying the inverse-matrix
	 * rows with the contents of the columns in "used".
	 *
	 * rm       - the RAID-Z map; column buffers are accessed through
	 *            abd_to_buf()
	 * n        - number of data columns
	 * nmissing - number of columns to rebuild
	 * missing  - data-relative indices of the columns to rebuild
	 * invrows  - coefficient rows from vdev_raidz_matrix_invert()
	 * used     - the n source columns, in the same order as the coefficients
	 *
	 * The coefficients are converted to logarithms up front so each product
	 * is one addition (mod 255) plus a vdev_raidz_pow2[] lookup.
	 */
	static void
	vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
	    int *missing, uint8_t **invrows, const uint8_t *used)
	{
		int i, j, x, cc, c;
		uint8_t *src;
		uint64_t ccount;
		uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
		uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
		uint8_t log = 0;
		uint8_t val;
		int ll;
		uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
		uint8_t *p, *pp;
		size_t psize;

		/* One allocation, carved into nmissing rows of n log values. */
		psize = sizeof (invlog[0][0]) * n * nmissing;
		p = kmem_alloc(psize, KM_SLEEP);

		for (pp = p, i = 0; i < nmissing; i++) {
			invlog[i] = pp;
			pp += n;
		}

		for (i = 0; i < nmissing; i++) {
			for (j = 0; j < n; j++) {
				ASSERT3U(invrows[i][j], !=, 0);
				invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
			}
		}

		for (i = 0; i < n; i++) {
			c = used[i];
			ASSERT3U(c, <, rm->rm_cols);

			src = abd_to_buf(rm->rm_col[c].rc_abd);
			ccount = rm->rm_col[c].rc_size;
			for (j = 0; j < nmissing; j++) {
				cc = missing[j] + rm->rm_firstdatacol;
				ASSERT3U(cc, >=, rm->rm_firstdatacol);
				ASSERT3U(cc, <, rm->rm_cols);
				ASSERT3U(cc, !=, c);

				dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
				dcount[j] = rm->rm_col[cc].rc_size;
			}

			ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

			for (x = 0; x < ccount; x++, src++) {
				if (*src != 0)
					log = vdev_raidz_log2[*src];

				for (cc = 0; cc < nmissing; cc++) {
					if (x >= dcount[cc])
						continue;

					/* Zero has no logarithm; its product is zero. */
					if (*src == 0) {
						val = 0;
					} else {
						if ((ll = log + invlog[cc][i]) >= 255)
							ll -= 255;
						val = vdev_raidz_pow2[ll];
					}

					/* First source initializes, the rest accumulate. */
					if (i == 0)
						dst[cc][x] = val;
					else
						dst[cc][x] ^= val;
				}
			}
		}

		kmem_free(p, psize);
	}

	/*
	 * General-case reconstruction by matrix inversion.
	 *
	 * rm    - the RAID-Z map
	 * tgts  - sorted indices of all columns (parity and data) considered bad
	 * ntgts - number of entries in tgts
	 *
	 * Picks as many non-targeted parity columns as there are missing data
	 * columns, builds and inverts the matching matrix rows, and rebuilds the
	 * missing data columns in place. If the data columns are not linear
	 * ABDs they are temporarily replaced by linear copies and copied back
	 * afterwards.
	 *
	 * Returns a bitmask with bit c set for every parity column c used.
	 */
	static int
	vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
	{
		int n, i, c, t, tt;
		int nmissing_rows;
		int missing_rows[VDEV_RAIDZ_MAXPARITY];
		int parity_map[VDEV_RAIDZ_MAXPARITY];

		uint8_t *p, *pp;
		size_t psize;

		uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
		uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
		uint8_t *used;

		abd_t **bufs = NULL;

		int code = 0;

		/*
		 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
		 * temporary linear ABDs.
		 */
		if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
			bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				raidz_col_t *col = &rm->rm_col[c];

				bufs[c] = col->rc_abd;
				col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
				abd_copy(col->rc_abd, bufs[c], col->rc_size);
			}
		}

		n = rm->rm_cols - rm->rm_firstdatacol;

		/*
		 * Figure out which data columns are missing.
		 */
		nmissing_rows = 0;
		for (t = 0; t < ntgts; t++) {
			if (tgts[t] >= rm->rm_firstdatacol) {
				missing_rows[nmissing_rows++] =
				    tgts[t] - rm->rm_firstdatacol;
			}
		}

		/*
		 * Figure out which parity columns to use to help generate the missing
		 * data columns.
		 */
		for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
			ASSERT(tt < ntgts);
			ASSERT(c < rm->rm_firstdatacol);

			/*
			 * Skip any targeted parity columns.
			 */
			if (c == tgts[tt]) {
				tt++;
				continue;
			}

			code |= 1 << c;

			parity_map[i] = c;
			i++;
		}

		ASSERT(code != 0);
		ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

		/*
		 * One allocation holds the rows, the inverse rows and the "used"
		 * array; it is carved up below.
		 */
		psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
		    nmissing_rows * n + sizeof (used[0]) * n;
		p = kmem_alloc(psize, KM_SLEEP);

		for (pp = p, i = 0; i < nmissing_rows; i++) {
			rows[i] = pp;
			pp += n;
			invrows[i] = pp;
			pp += n;
		}
		used = pp;

		/* used[] = chosen parity columns, then the surviving data columns. */
		for (i = 0; i < nmissing_rows; i++) {
			used[i] = parity_map[i];
		}

		for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			if (tt < nmissing_rows &&
			    c == missing_rows[tt] + rm->rm_firstdatacol) {
				tt++;
				continue;
			}

			ASSERT3S(i, <, n);
			used[i] = c;
			i++;
		}

		/*
		 * Initialize the interesting rows of the matrix.
		 */
		vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

		/*
		 * Invert the matrix.
		 */
		vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
		    invrows, used);

		/*
		 * Reconstruct the missing data using the generated matrix.
		 */
		vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
		    invrows, used);

		kmem_free(p, psize);

		/*
		 * copy back from temporary linear abds and free them
		 */
		if (bufs) {
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				raidz_col_t *col = &rm->rm_col[c];

				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
				col->rc_abd = bufs[c];
			}
			kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
		}

		return (code);
	}

	/*
	 * Reconstruct the given target columns, plus any column whose rc_error
	 * is non-zero.
	 *
	 * rm - the RAID-Z map
	 * t  - strictly ascending list of column indices to treat as bad
	 * nt - number of entries in t
	 *
	 * Unless vdev_raidz_default_to_general is set, the specialised P, Q and
	 * PQ routines are used when one or two data columns are bad and the
	 * parity they need is valid; everything else goes through
	 * vdev_raidz_reconstruct_general().
	 *
	 * Returns the bitmask of parity columns used, as reported by the routine
	 * that did the work.
	 */
	static int
	vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
	{
		int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
		int ntgts;
		int i, c;
		int code;
		int nbadparity, nbaddata;
		int parity_valid[VDEV_RAIDZ_MAXPARITY];

		/*
		 * The tgts list must already be sorted.
		 */
		for (i = 1; i < nt; i++) {
			ASSERT(t[i] > t[i - 1]);
		}

		/* Start by assuming everything is bad and count down. */
		nbadparity = rm->rm_firstdatacol;
		nbaddata = rm->rm_cols - nbadparity;
		ntgts = 0;
		for (i = 0, c = 0; c < rm->rm_cols; c++) {
			if (c < rm->rm_firstdatacol)
				parity_valid[c] = B_FALSE;

			if (i < nt && c == t[i]) {
				tgts[ntgts++] = c;
				i++;
			} else if (rm->rm_col[c].rc_error != 0) {
				tgts[ntgts++] = c;
			} else if (c >= rm->rm_firstdatacol) {
				nbaddata--;
			} else {
				parity_valid[c] = B_TRUE;
				nbadparity--;
			}
		}

		ASSERT(ntgts >= nt);
		ASSERT(nbaddata >= 0);
		ASSERT(nbaddata + nbadparity == ntgts);

		/* Bad parity columns sort first, so the data targets follow them. */
		dt = &tgts[nbadparity];

		/*
		 * See if we can use any of our optimized reconstruction routines.
		 */
		if (!vdev_raidz_default_to_general) {
			switch (nbaddata) {
			case 1:
				if (parity_valid[VDEV_RAIDZ_P])
					return (vdev_raidz_reconstruct_p(rm, dt, 1));

				ASSERT(rm->rm_firstdatacol > 1);

				if (parity_valid[VDEV_RAIDZ_Q])
					return (vdev_raidz_reconstruct_q(rm, dt, 1));

				ASSERT(rm->rm_firstdatacol > 2);
				break;

			case 2:
				ASSERT(rm->rm_firstdatacol > 1);

				if (parity_valid[VDEV_RAIDZ_P] &&
				    parity_valid[VDEV_RAIDZ_Q])
					return (vdev_raidz_reconstruct_pq(rm, dt, 2));

				ASSERT(rm->rm_firstdatacol > 2);

				break;
			}
		}

		code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
		ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
		ASSERT(code > 0);
		return (code);
	}

	/*
	 * Open a RAID-Z vdev and report its geometry.
	 *
	 * vd              - the RAID-Z vdev
	 * asize           - in/out: combined with the smallest child vdev_asize,
	 *                   then multiplied by the number of children
	 * max_asize       - in/out: same treatment using vdev_max_asize
	 * logical_ashift  - in/out: raised to the largest child vdev_ashift
	 * physical_ashift - in/out: raised to the largest child
	 *                   vdev_physical_ashift
	 *
	 * Children that failed to open are skipped when computing the values
	 * above.
	 *
	 * Returns 0 on success; EINVAL (with vs_aux = VDEV_AUX_BAD_LABEL) when
	 * nparity exceeds VDEV_RAIDZ_MAXPARITY or there are fewer than
	 * nparity + 1 children; or the last child open error (with
	 * vs_aux = VDEV_AUX_NO_REPLICAS) when more than nparity children failed
	 * to open.
	 */
	static int
	vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
	    uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
		vdev_t *cvd;
		uint64_t nparity = vd->vdev_nparity;
		int c;
		int lasterror = 0;
		int numerrors = 0;

		ASSERT(nparity > 0);

		if (nparity > VDEV_RAIDZ_MAXPARITY ||
		    vd->vdev_children < nparity + 1) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}

		vdev_open_children(vd);

		for (c = 0; c < vd->vdev_children; c++) {
			cvd = vd->vdev_child[c];

			if (cvd->vdev_open_error != 0) {
				lasterror = cvd->vdev_open_error;
				numerrors++;
				continue;
			}

			/*
			 * The "- 1 ... + 1" makes a value of 0 wrap to the unsigned
			 * maximum inside MIN().
			 * NOTE(review): presumably so an initial 0 means "not set
			 * yet" -- confirm against the callers.
			 */
			*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
			*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
			*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
			*physical_ashift = MAX(*physical_ashift,
			    cvd->vdev_physical_ashift);
		}

		*asize *= vd->vdev_children;
		*max_asize *= vd->vdev_children;

		if (numerrors > nparity) {
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
			return (lasterror);
		}

		return (0);
	}

	/*
	 * Close a RAID-Z vdev by calling vdev_close() on each child, in index
	 * order.
	 */
	static void
	vdev_raidz_close(vdev_t *vd)
	{
		int child = 0;

		while (child < vd->vdev_children) {
			vdev_close(vd->vdev_child[child]);
			child++;
		}
	}

	#ifdef illumos
	/*
	* Handle a read or write I/O to a RAID-Z dump device.
	*
	* The dump device is in a unique situation compared to other ZFS datasets:
	* writing to this device should be as simple and fast as possible. In
	* addition, durability matters much less since the dump will be extracted
	* once the machine reboots. For that reason, this function eschews parity for
	* performance and simplicity. The dump device uses the checksum setting
	* ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
	* dataset.
	*
	* Blocks of size 128 KB have been preallocated for this volume. I/Os less than
	* 128 KB will not fill an entire block; in addition, they may not be properly
	* aligned. In that case, this function uses the preallocated 128 KB block and
	* omits reading or writing any "empty" portions of that block, as opposed to
	* allocating a fresh appropriately-sized block.
	*
	* Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
	*
	* vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
	*
	* If this were a standard RAID-Z dataset, a block of at least 40 KB would be
	* allocated which spans all five child vdevs. 8 KB of data would be written to
	* each of four vdevs, with the fifth containing the parity bits.
	*
	* parity data data data data
	* \| PP \| XX \| XX \| XX \| XX \|
	* ^ ^ ^ ^ ^
	* \| \| \| \| \|
	* 8 KB parity ------8 KB data blocks------
	*
	* However, when writing to the dump device, the behavior is different:
	*
	* vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
	*
	* Unlike the normal RAID-Z case in which the block is allocated based on the
	* I/O size, reads and writes here always use a 128 KB logical I/O size. If the
	* I/O size is less than 128 KB, only the actual portions of data are written.
	* In this example the data is written to the third data vdev since that vdev
	* contains the offset [64 KB, 96 KB).
	*
	* parity data data data data
	* \| \| \| \| XX \| \|
	* ^
	* \|
	* 32 KB data block
	*
	* As a result, an individual I/O may not span all child vdevs; moreover, a
	* small I/O may only operate on a single child vdev.
	*
	* Note that since there are no parity bits calculated or written, this format
	* remains the same no matter how many parity bits are used in a normal RAID-Z
	* stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
	* would look like:
	*
	* parity parity parity data data data data
	* \| \| \| \| \| \| XX \| \|
	* ^
	* \|
	* 32 KB data block
	*/
	int
	vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
	uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
	{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

	/*
	* Everything below is compiled only when _KERNEL is defined; otherwise
	* this function issues no I/O and returns the initial err value (0).
	*/
	#ifdef _KERNEL

	/*
	* Don't write past the end of the block
	*/
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	* Allocate a RAID-Z map for this block. Note that this block starts
	* from the "original" offset, that is, the offset of the extent which
	* contains the requisite offset of the data being read or written.
	*
	* Even if this I/O operation doesn't span the full block size, let's
	* treat the on-disk format as if the only blocks are the complete 128
	* KB size.
	*
	* The buffer pointer is moved back by (offset - origoffset) so that
	* the first byte of the ABD corresponds to origoffset. The column
	* clipping in the loop below restricts the I/O to [start, end).
	*/
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	vd->vdev_children, vd->vdev_nparity);

	/*
	* coloffset is the logical offset at which the current data column
	* begins; it advances by rc_size after every column visited. Parity
	* columns (those before rm_firstdatacol) are never touched.
	*/
	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	c++, coloffset += rc->rc_size) {
	rc = &rm->rm_col[c];
	cvd = vd->vdev_child[rc->rc_devidx];

	/*
	* Find the start and end of this column in the RAID-Z map,
	* keeping in mind that the stated size and offset of the
	* operation may not fill the entire column for this vdev.
	*
	* If any portion of the data spans this column, issue the
	* appropriate operation to the vdev.
	*/
	if (coloffset + rc->rc_size <= start)
	continue;
	if (coloffset >= end)
	continue;

	/*
	* Clip the request to this column: colskip is the number of
	* bytes to skip at the front of the column, colsize the number
	* of bytes to transfer.
	*/
	colstart = MAX(coloffset, start);
	colend = MIN(end, coloffset + rc->rc_size);
	colsize = colend - colstart;
	colskip = colstart - coloffset;

	VERIFY3U(colsize, <=, rc->rc_size);
	VERIFY3U(colskip, <=, rc->rc_size);

	/*
	* Note that the child vdev will have a vdev label at the start
	* of its range of offsets, hence the need for
	* VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
	* example of why this calculation is needed.
	*
	* The first failing child I/O ends the loop; its error is
	* returned after the map and ABD are released.
	*/
	if ((err = vdev_disk_physio(cvd,
	((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
	VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
	flags, isdump)) != 0)
	break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
	#endif /* _KERNEL */

	return (err);
	}
	#endif

	/*
	 * Convert a physical size (psize, in bytes) into the size allocated on
	 * this RAID-Z vdev, in bytes, including parity and padding.
	 */
	static uint64_t
	vdev_raidz_asize(vdev_t *vd, uint64_t psize)
	{
		const uint64_t shift = vd->vdev_top->vdev_ashift;
		const uint64_t parity = vd->vdev_nparity;
		const uint64_t datacols = vd->vdev_children - parity;
		uint64_t sectors;

		/* Number of 2^ashift sectors needed to hold psize bytes. */
		sectors = ((psize - 1) >> shift) + 1;

		/* Add 'parity' sectors for every (possibly partial) data row. */
		sectors += parity * ((sectors + datacols - 1) / datacols);

		/* Pad to a multiple of (nparity + 1) sectors; back to bytes. */
		return (roundup(sectors, parity + 1) << shift);
	}

	/*
	 * Completion callback for a child zio: record the child's result in the
	 * raidz column that was passed as the zio's private data.
	 */
	static void
	vdev_raidz_child_done(zio_t *zio)
	{
		raidz_col_t *col = zio->io_private;

		/* The column was really attempted, so it no longer counts as skipped. */
		col->rc_tried = 1;
		col->rc_skipped = 0;
		col->rc_error = zio->io_error;
	}

	/*
	 * Start an IO operation on a RAIDZ VDev
	 *
	 * Outline:
	 * - For free operations:
	 *   1. Create a child zio for every column and return.
	 * - For write operations:
	 *   1. Generate the parity data
	 *   2. Create child zio write operations to each column's vdev, for both
	 *      data and parity.
	 *   3. If the column skips any sectors for padding, create optional dummy
	 *      write zio children for those areas to improve aggregation
	 *      contiguity.
	 * - For read operations:
	 *   1. Create child zio read operations to each data column's vdev to read
	 *      the range of data required for zio.
	 *   2. If this is a scrub or resilver operation, or if any of the data
	 *      vdevs have had errors, then create zio read operations to the parity
	 *      columns' VDevs as well.
	 *
	 * The raidz map allocated here is stored in zio->io_vsd.
	 */
	static void
	vdev_raidz_io_start(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_t *tvd = vd->vdev_top;
		vdev_t *cvd;
		raidz_map_t *rm;
		raidz_col_t *rc;
		int c, i;

		rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
		    zio->io_type == ZIO_TYPE_FREE,
		    tvd->vdev_ashift, vd->vdev_children,
		    vd->vdev_nparity);

		zio->io_vsd = rm;
		zio->io_vsd_ops = &vdev_raidz_vsd_ops;

		ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

		if (zio->io_type == ZIO_TYPE_FREE) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				cvd = vd->vdev_child[rc->rc_devidx];
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, rc->rc_abd, rc->rc_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_raidz_child_done, rc));
			}

			zio_execute(zio);
			return;
		}

		if (zio->io_type == ZIO_TYPE_WRITE) {
			vdev_raidz_generate_parity(rm);

			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				cvd = vd->vdev_child[rc->rc_devidx];
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, rc->rc_abd, rc->rc_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_raidz_child_done, rc));
			}

			/*
			 * Generate optional I/Os for any skipped sectors to improve
			 * aggregation contiguity. The column index wraps back to 0
			 * once it reaches rm_scols.
			 */
			for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
				ASSERT(c <= rm->rm_scols);
				if (c == rm->rm_scols)
					c = 0;
				rc = &rm->rm_col[c];
				cvd = vd->vdev_child[rc->rc_devidx];
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset + rc->rc_size, NULL,
				    1 << tvd->vdev_ashift,
				    zio->io_type, zio->io_priority,
				    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
			}

			zio_execute(zio);
			return;
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ);

		/*
		 * Iterate over the columns in reverse order so that we hit the parity
		 * last -- any errors along the way will force us to read the parity.
		 */
		for (c = rm->rm_cols - 1; c >= 0; c--) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			if (!vdev_readable(cvd)) {
				if (c >= rm->rm_firstdatacol)
					rm->rm_missingdata++;
				else
					rm->rm_missingparity++;
				rc->rc_error = SET_ERROR(ENXIO);
				rc->rc_tried = 1;	/* don't even try */
				rc->rc_skipped = 1;
				continue;
			}
			if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
				if (c >= rm->rm_firstdatacol)
					rm->rm_missingdata++;
				else
					rm->rm_missingparity++;
				/* rc_tried stays clear here, unlike the case above. */
				rc->rc_error = SET_ERROR(ESTALE);
				rc->rc_skipped = 1;
				continue;
			}
			/*
			 * Data columns are always read; parity columns only when
			 * data is already known to be missing or when scrubbing or
			 * resilvering.
			 */
			if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
			    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, rc->rc_abd, rc->rc_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_raidz_child_done, rc));
			}
		}

		zio_execute(zio);
	}


	/*
	 * Report a checksum error for a child of a RAID-Z device.
	 *
	 * zio is the RAID-Z level zio, rc the column whose contents were found to
	 * be bad, and bad_data the data that was originally read for that column.
	 * Nothing is counted or posted for speculative I/O.
	 */
	static void
	raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
	{
		void *buf;
		vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zio_bad_cksum_t zbc;
			raidz_map_t *rm = zio->io_vsd;

			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&vd->vdev_stat_lock);

			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			/*
			 * Borrow a linear copy of the column's current contents for
			 * the ereport and hand it back afterwards.
			 */
			buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
			zfs_ereport_post_checksum(zio->io_spa, vd, zio,
			    rc->rc_offset, rc->rc_size, buf, bad_data,
			    &zbc);
			abd_return_buf(rc->rc_abd, buf, rc->rc_size);
		}
	}

	/*
	 * Verify the checksum of the zio's data and return the result of
	 * zio_checksum_error(). When verification fails and the failure was
	 * injected, remember that in the raidz map so that any ereports we
	 * generate later can note it.
	 */
	static int
	raidz_checksum_verify(zio_t *zio)
	{
		raidz_map_t *map = zio->io_vsd;
		zio_bad_cksum_t info;
		int err;

		err = zio_checksum_error(zio, &info);
		if (err == 0)
			return (err);

		if (info.zbc_injected != 0)
			map->rm_ecksuminjected = 1;

		return (err);
	}

	/*
	 * Generate the parity from the data columns. If we tried and were able to
	 * read the parity without error, verify that the generated parity matches the
	 * data we read. If it doesn't, we fire off a checksum error. Return the
	 * number of such failures.
	 *
	 * Nothing is verified (and 0 is returned) when the block uses
	 * ZIO_CHECKSUM_NOPARITY.
	 */
	static int
	raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
	{
		void *orig[VDEV_RAIDZ_MAXPARITY];
		int c, ret = 0;
		raidz_col_t *rc;

		blkptr_t *bp = zio->io_bp;
		enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
		    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

		if (checksum == ZIO_CHECKSUM_NOPARITY)
			return (ret);

		/*
		 * Save a copy of every parity column that was read successfully;
		 * regenerating parity below overwrites the columns in place.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rc = &rm->rm_col[c];
			if (!rc->rc_tried || rc->rc_error != 0)
				continue;
			orig[c] = zio_buf_alloc(rc->rc_size);
			abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
		}

		vdev_raidz_generate_parity(rm);

		/*
		 * Compare regenerated parity with what was read. The skip condition
		 * must match the loop above so that only allocated orig[] slots are
		 * examined and freed.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rc = &rm->rm_col[c];
			if (!rc->rc_tried || rc->rc_error != 0)
				continue;
			if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
				raidz_checksum_error(zio, rc, orig[c]);
				rc->rc_error = SET_ERROR(ECKSUM);
				ret++;
			}
			zio_buf_free(orig[c], rc->rc_size);
		}

		return (ret);
	}

	/*
	* Keep statistics on all the ways that we used parity to correct data.
	* The array is indexed by the code returned from vdev_raidz_reconstruct()
	* and is bumped with atomic_inc_64() after a successful reconstruction.
	*/
	static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

	/*
	 * Fold the per-column errors of a raidz map into one value using
	 * zio_worst_error(), starting from 0.
	 */
	static int
	vdev_raidz_worst_error(raidz_map_t *rm)
	{
		int worst = 0;
		int col = 0;

		while (col < rm->rm_cols) {
			worst = zio_worst_error(worst, rm->rm_col[col].rc_error);
			col++;
		}

		return (worst);
	}

	/*
	* Iterate over all combinations of bad data and attempt a reconstruction.
	* Note that the algorithm below is non-optimal because it doesn't take into
	* account how reconstruction is actually performed. For example, with
	* triple-parity RAID-Z the reconstruction procedure is the same if column 4
	* is targeted as invalid as if columns 1 and 4 are targeted since in both
	* cases we'd only use parity information in column 0.
	*
	* Returns the code from vdev_raidz_reconstruct() for the first combination
	* whose result passes raidz_checksum_verify(), or 0 if none does. On
	* success the targeted columns are marked with ECKSUM.
	*/
	static int
	vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
	{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	* tgts points one slot into tstore so that both tgts[-1] and tgts[n]
	* are valid sentinel positions (see below).
	*/
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	* This simplifies one edge condition.
	*/
	tgts[-1] = -1;

	/*
	* n is the number of columns targeted for reconstruction in each
	* attempt; it grows until the known errors plus the targets would
	* exceed the number of parity columns.
	*/
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
	/*
	* Initialize the targets array by finding the first n columns
	* that contain no error.
	*
	* If there were no data errors, we need to ensure that we're
	* always explicitly attempting to reconstruct at least one
	* data column. To do this, we simply push the highest target
	* up into the data columns.
	*/
	for (c = 0, i = 0; i < n; i++) {
	if (i == n - 1 && data_errors == 0 &&
	c < rm->rm_firstdatacol) {
	c = rm->rm_firstdatacol;
	}

	while (rm->rm_col[c].rc_error != 0) {
	c++;
	ASSERT3S(c, <, rm->rm_cols);
	}

	tgts[i] = c++;
	}

	/*
	* Setting tgts[n] simplifies the other edge condition.
	*/
	tgts[n] = rm->rm_cols;

	/*
	* These buffers were allocated in previous iterations.
	*/
	for (i = 0; i < n - 1; i++) {
	ASSERT(orig[i] != NULL);
	}

	/*
	* One more save buffer per outer iteration; all are sized from
	* column 0.
	*/
	orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

	current = 0;
	next = tgts[current];

	while (current != n) {
	tgts[current] = next;
	current = 0;

	/*
	* Save off the original data that we're going to
	* attempt to reconstruct.
	*/
	for (i = 0; i < n; i++) {
	ASSERT(orig[i] != NULL);
	c = tgts[i];
	ASSERT3S(c, >=, 0);
	ASSERT3S(c, <, rm->rm_cols);
	rc = &rm->rm_col[c];
	abd_copy_to_buf(orig[i], rc->rc_abd,
	rc->rc_size);
	}

	/*
	* Attempt a reconstruction and exit the outer loop on
	* success.
	*/
	code = vdev_raidz_reconstruct(rm, tgts, n);
	if (raidz_checksum_verify(zio) == 0) {
	atomic_inc_64(&raidz_corrected[code]);

	/*
	* Every targeted column held bad data. Report a
	* checksum error only for columns that were
	* actually read.
	*/
	for (i = 0; i < n; i++) {
	c = tgts[i];
	rc = &rm->rm_col[c];
	ASSERT(rc->rc_error == 0);
	if (rc->rc_tried)
	raidz_checksum_error(zio, rc,
	orig[i]);
	rc->rc_error = SET_ERROR(ECKSUM);
	}

	ret = code;
	goto done;
	}

	/*
	* Restore the original data.
	*/
	for (i = 0; i < n; i++) {
	c = tgts[i];
	rc = &rm->rm_col[c];
	abd_copy_from_buf(rc->rc_abd, orig[i],
	rc->rc_size);
	}

	/*
	* Advance to the next combination of targets. The loop
	* ends with current == n once every combination of this
	* size has been tried.
	*/
	do {
	/*
	* Find the next valid column after the current
	* position.
	*/
	for (next = tgts[current] + 1;
	next < rm->rm_cols &&
	rm->rm_col[next].rc_error != 0; next++)
	continue;

	ASSERT(next <= tgts[current + 1]);

	/*
	* If that spot is available, we're done here.
	*/
	if (next != tgts[current + 1])
	break;

	/*
	* Otherwise, find the next valid column after
	* the previous position.
	*/
	for (c = tgts[current - 1] + 1;
	rm->rm_col[c].rc_error != 0; c++)
	continue;

	tgts[current] = c;
	current++;

	} while (current != n);
	}
	}
	/*
	* Falling out of the outer loop leaves n one greater than the number
	* of orig[] buffers allocated, so step it back. On the success path
	* (goto done) exactly n buffers are live and n is used as is.
	*/
	n--;
	done:
	for (i = 0; i < n; i++) {
	zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
	}

	/*
	 * Complete an IO operation on a RAIDZ VDev
	 *
	 * Outline:
	 * - For write operations:
	 *   1. Check for errors on the child IOs.
	 *   2. Return, setting an error code if too few child VDevs were written
	 *      to reconstruct the data later. Note that partial writes are
	 *      considered successful if they can be reconstructed at all.
	 * - For free operations:
	 *   1. Return without further processing.
	 * - For read operations:
	 *   1. Check for errors on the child IOs.
	 *   2. If data errors occurred:
	 *      a. Try to reassemble the data from the parity available.
	 *      b. If we haven't yet read the parity drives, read them now.
	 *      c. If all parity drives have been read but the data still doesn't
	 *         reassemble with a correct checksum, then try combinatorial
	 *         reconstruction.
	 *      d. If that doesn't work, return an error.
	 *   3. If there were unexpected errors or this is a resilver operation,
	 *      rewrite the vdevs that had errors.
	 */
	static void
	vdev_raidz_io_done(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_t *cvd;
		raidz_map_t *rm = zio->io_vsd;
		raidz_col_t *rc;
		int unexpected_errors = 0;
		int parity_errors = 0;
		int parity_untried = 0;
		int data_errors = 0;
		int total_errors = 0;
		int n, c;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		int code;

		ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

		ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
		ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

		/*
		 * Tally the per-column results: errors split into parity and data,
		 * errors on columns that were not deliberately skipped, and parity
		 * columns that were never read.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];

			if (rc->rc_error) {
				ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

				if (c < rm->rm_firstdatacol)
					parity_errors++;
				else
					data_errors++;

				if (!rc->rc_skipped)
					unexpected_errors++;

				total_errors++;
			} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
				parity_untried++;
			}
		}

		if (zio->io_type == ZIO_TYPE_WRITE) {
			/*
			 * XXX -- for now, treat partial writes as a success.
			 * (If we couldn't write enough columns to reconstruct
			 * the data, the I/O failed. Otherwise, good enough.)
			 *
			 * Now that we support write reallocation, it would be better
			 * to treat partial failure as real failure unless there are
			 * no non-degraded top-level vdevs left, and not update DTLs
			 * if we intend to reallocate.
			 */
			/* XXPOLICY */
			if (total_errors > rm->rm_firstdatacol)
				zio->io_error = vdev_raidz_worst_error(rm);

			return;
		} else if (zio->io_type == ZIO_TYPE_FREE) {
			return;
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ);
		/*
		 * There are three potential phases for a read:
		 *	1. produce valid data from the columns read
		 *	2. read all disks and try again
		 *	3. perform combinatorial reconstruction
		 *
		 * Each phase is progressively both more expensive and less likely to
		 * occur. If we encounter more errors than we can repair or all phases
		 * fail, we have no choice but to return an error.
		 */

		/*
		 * If the number of errors we saw was correctable -- less than or equal
		 * to the number of parity disks read -- attempt to produce data that
		 * has a valid checksum. Naturally, this case applies in the absence of
		 * any errors.
		 */
		if (total_errors <= rm->rm_firstdatacol - parity_untried) {
			if (data_errors == 0) {
				if (raidz_checksum_verify(zio) == 0) {
					/*
					 * If we read parity information (unnecessarily
					 * as it happens since no reconstruction was
					 * needed) regenerate and verify the parity.
					 * We also regenerate parity when resilvering
					 * so we can write it out to the failed device
					 * later.
					 */
					if (parity_errors + parity_untried <
					    rm->rm_firstdatacol ||
					    (zio->io_flags & ZIO_FLAG_RESILVER)) {
						n = raidz_parity_verify(zio, rm);
						unexpected_errors += n;
						ASSERT(parity_errors + n <=
						    rm->rm_firstdatacol);
					}
					goto done;
				}
			} else {
				/*
				 * We either attempt to read all the parity columns or
				 * none of them. If we didn't try to read parity, we
				 * wouldn't be here in the correctable case. There must
				 * also have been fewer parity errors than parity
				 * columns or, again, we wouldn't be in this code path.
				 */
				ASSERT(parity_untried == 0);
				ASSERT(parity_errors < rm->rm_firstdatacol);

				/*
				 * Identify the data columns that reported an error.
				 */
				n = 0;
				for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
					rc = &rm->rm_col[c];
					if (rc->rc_error != 0) {
						ASSERT(n < VDEV_RAIDZ_MAXPARITY);
						tgts[n++] = c;
					}
				}

				ASSERT(rm->rm_firstdatacol >= n);

				code = vdev_raidz_reconstruct(rm, tgts, n);

				if (raidz_checksum_verify(zio) == 0) {
					atomic_inc_64(&raidz_corrected[code]);

					/*
					 * If we read more parity disks than were used
					 * for reconstruction, confirm that the other
					 * parity disks produced correct data. This
					 * routine is suboptimal in that it regenerates
					 * the parity that we already used in addition
					 * to the parity that we're attempting to
					 * verify, but this should be a relatively
					 * uncommon case, and can be optimized if it
					 * becomes a problem. Note that we regenerate
					 * parity when resilvering so we can write it
					 * out to failed devices later.
					 */
					if (parity_errors < rm->rm_firstdatacol - n ||
					    (zio->io_flags & ZIO_FLAG_RESILVER)) {
						n = raidz_parity_verify(zio, rm);
						unexpected_errors += n;
						ASSERT(parity_errors + n <=
						    rm->rm_firstdatacol);
					}

					goto done;
				}
			}
		}

		/*
		 * This isn't a typical situation -- either we got a read error or
		 * a child silently returned bad data. Read every block so we can
		 * try again with as much data and parity as we can track down. If
		 * we've already been through once before, all children will be marked
		 * as tried so we'll proceed to combinatorial reconstruction.
		 */
		unexpected_errors = 1;
		rm->rm_missingdata = 0;
		rm->rm_missingparity = 0;

		for (c = 0; c < rm->rm_cols; c++) {
			if (rm->rm_col[c].rc_tried)
				continue;

			/*
			 * Found the first untried column: re-arm the zio, issue
			 * reads for this and every later untried column, and return.
			 */
			zio_vdev_io_redone(zio);
			do {
				rc = &rm->rm_col[c];
				if (rc->rc_tried)
					continue;
				zio_nowait(zio_vdev_child_io(zio, NULL,
				    vd->vdev_child[rc->rc_devidx],
				    rc->rc_offset, rc->rc_abd, rc->rc_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_raidz_child_done, rc));
			} while (++c < rm->rm_cols);

			return;
		}

		/*
		 * At this point we've attempted to reconstruct the data given the
		 * errors we detected, and we've attempted to read all columns. There
		 * must, therefore, be one or more additional problems -- silent errors
		 * resulting in invalid data rather than explicit I/O errors resulting
		 * in absent data. We check if there is enough additional data to
		 * possibly reconstruct the data and then perform combinatorial
		 * reconstruction over all possible combinations. If that fails,
		 * we're cooked.
		 */
		if (total_errors > rm->rm_firstdatacol) {
			zio->io_error = vdev_raidz_worst_error(rm);

		} else if (total_errors < rm->rm_firstdatacol &&
		    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
			/*
			 * If we didn't use all the available parity for the
			 * combinatorial reconstruction, verify that the remaining
			 * parity is correct.
			 */
			if (code != (1 << rm->rm_firstdatacol) - 1)
				(void) raidz_parity_verify(zio, rm);
		} else {
			/*
			 * We're here because either:
			 *
			 *	total_errors == rm_firstdatacol, or
			 *	vdev_raidz_combrec() failed
			 *
			 * In either case, there is enough bad data to prevent
			 * reconstruction.
			 *
			 * Start checksum ereports for all children which haven't
			 * failed, and the IO wasn't speculative.
			 */
			zio->io_error = SET_ERROR(ECKSUM);

			if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				for (c = 0; c < rm->rm_cols; c++) {
					rc = &rm->rm_col[c];
					if (rc->rc_error == 0) {
						zio_bad_cksum_t zbc;
						zbc.zbc_has_cksum = 0;
						zbc.zbc_injected =
						    rm->rm_ecksuminjected;

						zfs_ereport_start_checksum(
						    zio->io_spa,
						    vd->vdev_child[rc->rc_devidx],
						    zio, rc->rc_offset, rc->rc_size,
						    (void *)(uintptr_t)c, &zbc);
					}
				}
			}
		}

	done:
		zio_checksum_verified(zio);

		if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
		    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
			/*
			 * Use the good data we have in hand to repair damaged children.
			 */
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				cvd = vd->vdev_child[rc->rc_devidx];

				if (rc->rc_error == 0)
					continue;

				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, rc->rc_abd, rc->rc_size,
				    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
				    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
				    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
			}
		}
	}

	/*
	 * Update the state of a RAID-Z vdev from the number of faulted and
	 * degraded children: more faults than parity means it cannot be opened,
	 * any fault or degradation means degraded, otherwise healthy.
	 */
	static void
	vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
	{
		if (faulted > vd->vdev_nparity) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
			return;
		}

		if (degraded + faulted != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			return;
		}

		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}

	/*
	* Operations vector for RAID-Z vdevs. The initializer is positional, so
	* the order of the entries must match the field order of vdev_ops_t.
	* NOTE(review): vdev_ops_t is declared outside this chunk; which
	* operations the NULL slots correspond to should be confirmed there.
	*/
	vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	+ NULL,
	VDEV_TYPE_RAIDZ, /* name of this vdev type */
	B_FALSE /* not a leaf vdev */
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c (nonexistent)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c (revision 332525)
	@@ -0,0 +1,1919 @@
	+/*
	+ * CDDL HEADER START
	+ *
	+ * The contents of this file are subject to the terms of the
	+ * Common Development and Distribution License (the "License").
	+ * You may not use this file except in compliance with the License.
	+ *
	+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	+ * or http://www.opensolaris.org/os/licensing.
	+ * See the License for the specific language governing permissions
	+ * and limitations under the License.
	+ *
	+ * When distributing Covered Code, include this CDDL HEADER in each
	+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	+ * If applicable, add the following below this CDDL HEADER, with the
	+ * fields enclosed by brackets "[]" replaced with your own identifying
	+ * information: Portions Copyright [yyyy] [name of copyright owner]
	+ *
	+ * CDDL HEADER END
	+ */
	+
	+/*
	+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
	+ */
	+
	+#include <sys/zfs_context.h>
	+#include <sys/spa_impl.h>
	+#include <sys/dmu.h>
	+#include <sys/dmu_tx.h>
	+#include <sys/zap.h>
	+#include <sys/vdev_impl.h>
	+#include <sys/metaslab.h>
	+#include <sys/metaslab_impl.h>
	+#include <sys/uberblock_impl.h>
	+#include <sys/txg.h>
	+#include <sys/avl.h>
	+#include <sys/bpobj.h>
	+#include <sys/dsl_pool.h>
	+#include <sys/dsl_synctask.h>
	+#include <sys/dsl_dir.h>
	+#include <sys/arc.h>
	+#include <sys/zfeature.h>
	+#include <sys/vdev_indirect_births.h>
	+#include <sys/vdev_indirect_mapping.h>
	+#include <sys/abd.h>
	+
	+/*
	+ * This file contains the necessary logic to remove vdevs from a
	+ * storage pool. Currently, the only devices that can be removed
	+ * are log, cache, and spare devices; and top level vdevs from a pool
	+ * w/o raidz. (Note that members of a mirror can also be removed
	+ * by the detach operation.)
	+ *
	+ * Log vdevs are removed by evacuating them and then turning the vdev
	+ * into a hole vdev while holding spa config locks.
	+ *
	+ * Top level vdevs are removed and converted into an indirect vdev via
	+ * a multi-step process:
	+ *
	+ * - Disable allocations from this device (spa_vdev_remove_top).
	+ *
	+ * - From a new thread (spa_vdev_remove_thread), copy data from
	+ * the removing vdev to a different vdev. The copy happens in open
	+ * context (spa_vdev_copy_impl) and issues a sync task
	+ * (vdev_mapping_sync) so the sync thread can update the partial
	+ * indirect mappings in core and on disk.
	+ *
	+ * - If a free happens during a removal, it is freed from the
	+ * removing vdev, and if it has already been copied, from the new
	+ * location as well (free_from_removing_vdev).
	+ *
	+ * - After the removal is completed, the copy thread converts the vdev
	+ * into an indirect vdev (vdev_remove_complete) before instructing
	+ * the sync thread to destroy the space maps and finish the removal
	+ * (spa_finish_removal).
	+ */
	+
	+/*
	+ * Argument bundle for the vdev copy path.
	+ * NOTE(review): the code that fills and consumes these fields is not
	+ * visible in this chunk. Judging by names and types only: a metaslab
	+ * pointer, a byte counter, and a condvar/mutex pair that presumably
	+ * protects and signals that counter -- confirm against the users.
	+ */
	+typedef struct vdev_copy_arg {
	+ metaslab_t *vca_msp;
	+ uint64_t vca_outstanding_bytes;
	+ kcondvar_t vca_cv;
	+ kmutex_t vca_lock;
	+} vdev_copy_arg_t;
	+
	+/*
	+ * Per-segment argument for the vdev copy path; refers back to the
	+ * vdev_copy_arg_t it belongs to.
	+ * NOTE(review): the users of this structure are not visible in this
	+ * chunk. The remaining fields look like the txg of the copy and the
	+ * destination DVA / block pointer -- confirm against the users.
	+ */
	+typedef struct vdev_copy_seg_arg {
	+ vdev_copy_arg_t *vcsa_copy_arg;
	+ uint64_t vcsa_txg;
	+ dva_t *vcsa_dest_dva;
	+ blkptr_t *vcsa_dest_bp;
	+} vdev_copy_seg_arg_t;
	+
	+/*
	+ * The maximum amount of data we're allowed to copy from a device
	+ * at a time when removing it.
	+ */
	+int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
	+
	+/*
	+ * The largest contiguous segment that we will attempt to allocate when
	+ * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
	+ * there is a performance problem with attempting to allocate large blocks,
	+ * consider decreasing this.
	+ *
	+ * Note: we will issue I/Os of up to this size. The mpt driver does not
	+ * respond well to I/Os larger than 1MB, so we set this to 1MB. (When
	+ * mpt processes an I/O larger than 1MB, it needs to do an allocation of
	+ * 2 physically contiguous pages; if this allocation fails, mpt will drop
	+ * the I/O and hang the device.)
	+ */
	+int zfs_remove_max_segment = 1024 * 1024;
	+
	+#define VDEV_REMOVAL_ZAP_OBJS "lzap"
	+
	+static void spa_vdev_remove_thread(void *arg);
	+
	+/*
	+ * Write the in-core spa_removing_phys to the DMU_POOL_REMOVING entry of
	+ * the pool directory object, as an array of uint64_t, in transaction tx.
	+ * A failing zap_update() trips VERIFY0.
	+ */
	+static void
	+spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
	+{
	+	VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
	+	    DMU_POOL_DIRECTORY_OBJECT,
	+	    DMU_POOL_REMOVING, sizeof (uint64_t),
	+	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	+	    &spa->spa_removing_phys, tx));
	+}
	+
	+/*
	+ * Return the first nvlist among the 'count' entries of nvpp whose
	+ * ZPOOL_CONFIG_GUID value equals target_guid, or NULL if none matches.
	+ */
	+static nvlist_t *
	+spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
	+{
	+	int idx = 0;
	+
	+	while (idx < count) {
	+		nvlist_t *candidate = nvpp[idx];
	+
	+		if (fnvlist_lookup_uint64(candidate, ZPOOL_CONFIG_GUID) ==
	+		    target_guid)
	+			return (candidate);
	+		idx++;
	+	}
	+
	+	return (NULL);
	+}
	+
	+/*
	+ * Replace the nvlist array stored under 'name' in 'config' with a copy
	+ * that omits dev_to_remove.
	+ *
	+ * dev/count describe the current array. Every entry other than
	+ * dev_to_remove (compared by pointer) is duplicated into a temporary
	+ * array of count - 1 slots, which is added to config and then freed.
	+ * When count is 1 or less no temporary array is allocated and a NULL
	+ * array is passed to nvlist_add_nvlist_array().
	+ */
	+static void
	+spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
	+    nvlist_t *dev_to_remove)
	+{
	+	nvlist_t **newdev = NULL;
	+
	+	if (count > 1)
	+		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
	+
	+	for (int i = 0, j = 0; i < count; i++) {
	+		if (dev[i] == dev_to_remove)
	+			continue;
	+		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	+	}
	+
	+	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	+	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
	+
	+	/* Release the duplicates and the temporary array. */
	+	for (int i = 0; i < count - 1; i++)
	+		nvlist_free(newdev[i]);
	+
	+	if (count > 1)
	+		kmem_free(newdev, (count - 1) * sizeof (void *));
	+}
	+
	+/*
	+ * Allocate and initialize the in-core removal state for vdev vd: a
	+ * zeroed spa_vdev_removal_t with its lock, condvar and allocated-segments
	+ * range tree, plus one frees range tree and one new-segments list per
	+ * txg slot. Release it with spa_vdev_removal_destroy().
	+ */
	+static spa_vdev_removal_t *
	+spa_vdev_removal_create(vdev_t *vd)
	+{
	+	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
	+	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
	+	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
	+	svr->svr_allocd_segs = range_tree_create(NULL, NULL);
	+	svr->svr_vdev = vd;
	+
	+	for (int i = 0; i < TXG_SIZE; i++) {
	+		svr->svr_frees[i] = range_tree_create(NULL, NULL);
	+		list_create(&svr->svr_new_segments[i],
	+		    sizeof (vdev_indirect_mapping_entry_t),
	+		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	+	}
	+
	+	return (svr);
	+}
	+
	+/*
	+ * Tear down a spa_vdev_removal_t: the per-txg frees trees and
	+ * new-segment lists first, then the allocated-segments tree, the lock
	+ * and condvar, and finally the structure itself. The per-txg byte and
	+ * offset counters are asserted to be zero.
	+ */
	+void
	+spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
	+{
	+	int slot = 0;
	+
	+	while (slot < TXG_SIZE) {
	+		ASSERT0(svr->svr_bytes_done[slot]);
	+		ASSERT0(svr->svr_max_offset_to_sync[slot]);
	+		range_tree_destroy(svr->svr_frees[slot]);
	+		list_destroy(&svr->svr_new_segments[slot]);
	+		slot++;
	+	}
	+
	+	range_tree_destroy(svr->svr_allocd_segs);
	+	mutex_destroy(&svr->svr_lock);
	+	cv_destroy(&svr->svr_cv);
	+	kmem_free(svr, sizeof (*svr));
	+}
	+
	+/*
	+ * This is called as a synctask in the txg in which we will mark this vdev
	+ * as removing (in the config stored in the MOS).
	+ *
	+ * It begins the evacuation of a toplevel vdev by:
	+ * - initializing the spa_removing_phys which tracks this removal
	+ * - computing the amount of space to remove for accounting purposes
	+ * - dirtying all dbufs in the spa_config_object
	+ * - creating the spa_vdev_removal
	+ * - starting the spa_vdev_remove_thread
	+ *
	+ * arg is the vdev_t being removed; tx is the sync task's transaction.
	+ */
	+static void
	+vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
	+{
	+	vdev_t *vd = arg;
	+	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+	spa_t *spa = vd->vdev_spa;
	+	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
	+	spa_vdev_removal_t *svr = NULL;
	+	uint64_t txg = dmu_tx_get_txg(tx);
	+
	+	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
	+	svr = spa_vdev_removal_create(vd);
	+
	+	ASSERT(vd->vdev_removing);
	+	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
	+
	+	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	+	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
	+		/*
	+		 * By activating the OBSOLETE_COUNTS feature, we prevent
	+		 * the pool from being downgraded and ensure that the
	+		 * refcounts are precise.
	+		 */
	+		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+		uint64_t one = 1;
	+		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
	+		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
	+		    &one, tx));
	+		ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
	+	}
	+
	+	/*
	+	 * Allocate and open the indirect mapping and births objects, then
	+	 * reset the on-disk removal state for this vdev.
	+	 */
	+	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
	+	vd->vdev_indirect_mapping =
	+	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
	+	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
	+	vd->vdev_indirect_births =
	+	    vdev_indirect_births_open(mos, vic->vic_births_object);
	+	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
	+	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
	+	spa->spa_removing_phys.sr_end_time = 0;
	+	spa->spa_removing_phys.sr_state = DSS_SCANNING;
	+	spa->spa_removing_phys.sr_to_copy = 0;
	+	spa->spa_removing_phys.sr_copied = 0;
	+
	+	/*
	+	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
	+	 * there may be space in the defer tree, which is free, but still
	+	 * counted in vs_alloc.
	+	 */
	+	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
	+		metaslab_t *ms = vd->vdev_ms[i];
	+		if (ms->ms_sm == NULL)
	+			continue;
	+
	+		/*
	+		 * Sync tasks happen before metaslab_sync(), therefore
	+		 * smp_alloc and sm_alloc must be the same.
	+		 */
	+		ASSERT3U(space_map_allocated(ms->ms_sm), ==,
	+		    ms->ms_sm->sm_phys->smp_alloc);
	+
	+		spa->spa_removing_phys.sr_to_copy +=
	+		    space_map_allocated(ms->ms_sm);
	+
	+		/*
	+		 * Space which we are freeing this txg does not need to
	+		 * be copied.
	+		 */
	+		spa->spa_removing_phys.sr_to_copy -=
	+		    range_tree_space(ms->ms_freeingtree);
	+
	+		ASSERT0(range_tree_space(ms->ms_freedtree));
	+		for (int t = 0; t < TXG_SIZE; t++)
	+			ASSERT0(range_tree_space(ms->ms_alloctree[t]));
	+	}
	+
	+	/*
	+	 * Sync tasks are called before metaslab_sync(), so there should
	+	 * be no already-synced metaslabs in the TXG_CLEAN list.
	+	 */
	+	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
	+
	+	spa_sync_removing_state(spa, tx);
	+
	+	/*
	+	 * All blocks that we need to read the most recent mapping must be
	+	 * stored on concrete vdevs. Therefore, we must dirty anything that
	+	 * is read before spa_remove_init(). Specifically, the
	+	 * spa_config_object. (Note that although we already modified the
	+	 * spa_config_object in spa_sync_removing_state, that may not have
	+	 * modified all blocks of the object.)
	+	 */
	+	dmu_object_info_t doi;
	+	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
	+	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
	+		dmu_buf_t *dbuf;
	+		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
	+		    offset, FTAG, &dbuf, 0));
	+		dmu_buf_will_dirty(dbuf, tx);
	+		/* Step by the size of the buffer just dirtied. */
	+		offset += dbuf->db_size;
	+		dmu_buf_rele(dbuf, FTAG);
	+	}
	+
	+	/*
	+	 * Now that we've allocated the im_object, dirty the vdev to ensure
	+	 * that the object gets written to the config on disk.
	+	 */
	+	vdev_config_dirty(vd);
	+
	+	zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
	+	    "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
	+	    vic->vic_mapping_object);
	+
	+	spa_history_log_internal(spa, "vdev remove started", tx,
	+	    "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
	+	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	+	/*
	+	 * Setting spa_vdev_removal causes subsequent frees to call
	+	 * free_from_removing_vdev(). Note that we don't need any locking
	+	 * because we are the sync thread, and metaslab_free_impl() is only
	+	 * called from syncing context (potentially from a zio taskq thread,
	+	 * but in any case only when there are outstanding free i/os, which
	+	 * there are not).
	+	 */
	+	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
	+	spa->spa_vdev_removal = svr;
	+	svr->svr_thread = thread_create(NULL, 0,
	+	    spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
	+}
	+
	+/*
	+ * When we are opening a pool, we must read the mapping for each
	+ * indirect vdev in order from most recently removed to least
	+ * recently removed. We do this because the blocks for the mapping
	+ * of older indirect vdevs may be stored on more recently removed vdevs.
	+ * In order to read each indirect mapping object, we must have
	+ * initialized all more recently removed vdevs.
	+ *
	+ * Returns 0 on success (including the case where the pool directory has
	+ * no DMU_POOL_REMOVING entry), the zap_lookup() error if the removal
	+ * state cannot be read, or EINVAL if the vdev recorded as being removed
	+ * cannot be found.
	+ */
	+int
	+spa_remove_init(spa_t *spa)
	+{
	+ int error;
	+
	+ error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
	+ DMU_POOL_DIRECTORY_OBJECT,
	+ DMU_POOL_REMOVING, sizeof (uint64_t),
	+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	+ &spa->spa_removing_phys);
	+
	+ /*
	+ * No removal state is stored in the pool: initialize the in-core
	+ * state to "no removal" with no removing or previous indirect vdev.
	+ */
	+ if (error == ENOENT) {
	+ spa->spa_removing_phys.sr_state = DSS_NONE;
	+ spa->spa_removing_phys.sr_removing_vdev = -1;
	+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
	+ return (0);
	+ } else if (error != 0) {
	+ return (error);
	+ }
	+
	+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
	+ /*
	+ * We are currently removing a vdev. Create and
	+ * initialize a spa_vdev_removal_t from the bonus
	+ * buffer of the removing vdevs vdev_im_object, and
	+ * initialize its partial mapping.
	+ */
	+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	+ vdev_t *vd = vdev_lookup_top(spa,
	+ spa->spa_removing_phys.sr_removing_vdev);
	+ spa_config_exit(spa, SCL_STATE, FTAG);
	+
	+ if (vd == NULL)
	+ return (EINVAL);
	+
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+
	+ ASSERT(vdev_is_concrete(vd));
	+ spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
	+ ASSERT(svr->svr_vdev->vdev_removing);
	+
	+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
	+ spa->spa_meta_objset, vic->vic_mapping_object);
	+ vd->vdev_indirect_births = vdev_indirect_births_open(
	+ spa->spa_meta_objset, vic->vic_births_object);
	+
	+ spa->spa_vdev_removal = svr;
	+ }
	+
	+ /*
	+ * Walk the chain of previously removed vdevs, starting at
	+ * sr_prev_indirect_vdev and following each vdev's
	+ * vic_prev_indirect_vdev until UINT64_MAX terminates the chain,
	+ * opening the mapping and births objects of each one.
	+ */
	+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	+ uint64_t indirect_vdev_id =
	+ spa->spa_removing_phys.sr_prev_indirect_vdev;
	+ while (indirect_vdev_id != UINT64_MAX) {
	+ vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
	+ spa->spa_meta_objset, vic->vic_mapping_object);
	+ vd->vdev_indirect_births = vdev_indirect_births_open(
	+ spa->spa_meta_objset, vic->vic_births_object);
	+
	+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
	+ }
	+ spa_config_exit(spa, SCL_STATE, FTAG);
	+
	+ /*
	+ * Now that we've loaded all the indirect mappings, we can allow
	+ * reads from other blocks (e.g. via predictive prefetch).
	+ */
	+ spa->spa_indirect_vdevs_loaded = B_TRUE;
	+ return (0);
	+}
	+
	+/*
	+ * Start a removal thread for the removal recorded in
	+ * spa->spa_vdev_removal. Does nothing if there is no removal in
	+ * progress, if a removal thread already exists, or if the pool is not
	+ * writeable.
	+ */
	+void
	+spa_restart_removal(spa_t *spa)
	+{
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+
	+ if (svr == NULL)
	+ return;
	+
	+ /*
	+ * In general when this function is called there is no
	+ * removal thread running. The only scenario where this
	+ * is not true is during spa_import() where this function
	+ * is called twice [once from spa_import_impl() and
	+ * spa_async_resume()]. Thus, in the scenario where we
	+ * import a pool that has an ongoing removal we don't
	+ * want to spawn a second thread.
	+ */
	+ if (svr->svr_thread != NULL)
	+ return;
	+
	+ if (!spa_writeable(spa))
	+ return;
	+
	+ vdev_t *vd = svr->svr_vdev;
	+
	+ /*
	+ * Validate vd before it is dereferenced to fetch the mapping;
	+ * asserting afterwards could never catch a NULL vdev.
	+ */
	+ ASSERT3P(vd, !=, NULL);
	+ ASSERT(vd->vdev_removing);
	+
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+
	+ zfs_dbgmsg("restarting removal of %llu at count=%llu",
	+ vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
	+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
	+ 0, &p0, TS_RUN, minclsyspri);
	+}
	+
	+/*
	+ * Process freeing from a device which is in the middle of being removed.
	+ * We must handle this carefully so that we attempt to copy freed data,
	+ * and we correctly free already-copied data.
	+ *
	+ * vd is the vdev being removed (asserted to be svr->svr_vdev), offset and
	+ * size describe the segment being freed on it, and txg must be the
	+ * currently syncing txg (asserted). The segment is split into up to three
	+ * parts, handled in order: the part whose mapping is already synced, the
	+ * parts whose mappings are in flight, and the part not yet visited by the
	+ * removal thread.
	+ */
	+void
	+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
	+ uint64_t txg)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+ uint64_t max_offset_yet = 0;
	+
	+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
	+ vdev_indirect_mapping_object(vim));
	+ ASSERT3P(vd, ==, svr->svr_vdev);
	+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
	+
	+ mutex_enter(&svr->svr_lock);
	+
	+ /*
	+ * Remove the segment from the removing vdev's spacemap. This
	+ * ensures that we will not attempt to copy this space (if the
	+ * removal thread has not yet visited it), and also ensures
	+ * that we know what is actually allocated on the new vdevs
	+ * (needed if we cancel the removal).
	+ *
	+ * Note: we must do the metaslab_free_concrete() with the svr_lock
	+ * held, so that the remove_thread can not load this metaslab and then
	+ * visit this offset between the time that we metaslab_free_concrete()
	+ * and when we check to see if it has been visited.
	+ */
	+ metaslab_free_concrete(vd, offset, size, txg);
	+
	+ uint64_t synced_size = 0;
	+ uint64_t synced_offset = 0;
	+ uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
	+ if (offset < max_offset_synced) {
	+ /*
	+ * The mapping for this offset is already on disk.
	+ * Free from the new location.
	+ *
	+ * Note that we use svr_max_synced_offset because it is
	+ * updated atomically with respect to the in-core mapping.
	+ * By contrast, vim_max_offset is not.
	+ *
	+ * NOTE(review): this function reads the limit via
	+ * vdev_indirect_mapping_max_offset(vim) and never references
	+ * svr_max_synced_offset -- confirm whether the note above
	+ * is stale.
	+ *
	+ * This block may be split between a synced entry and an
	+ * in-flight or unvisited entry. Only process the synced
	+ * portion of it here.
	+ */
	+ synced_size = MIN(size, max_offset_synced - offset);
	+ synced_offset = offset;
	+
	+ ASSERT3U(max_offset_yet, <=, max_offset_synced);
	+ max_offset_yet = max_offset_synced;
	+
	+ DTRACE_PROBE3(remove__free__synced,
	+ spa_t *, spa,
	+ uint64_t, offset,
	+ uint64_t, synced_size);
	+
	+ size -= synced_size;
	+ offset += synced_size;
	+ }
	+
	+ /*
	+ * Look at all in-flight txgs starting from the currently syncing one
	+ * and see if a section of this free is being copied. By starting from
	+ * this txg and iterating forward, we might find that this region
	+ * was copied in two different txgs and handle it appropriately.
	+ */
	+ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
	+ int txgoff = (txg + i) & TXG_MASK;
	+ if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
	+ /*
	+ * The mapping for this offset is in flight, and
	+ * will be synced in txg+i.
	+ */
	+ uint64_t inflight_size = MIN(size,
	+ svr->svr_max_offset_to_sync[txgoff] - offset);
	+
	+ DTRACE_PROBE4(remove__free__inflight,
	+ spa_t *, spa,
	+ uint64_t, offset,
	+ uint64_t, inflight_size,
	+ uint64_t, txg + i);
	+
	+ /*
	+ * We copy data in order of increasing offset.
	+ * Therefore the max_offset_to_sync[] must increase
	+ * (or be zero, indicating that nothing is being
	+ * copied in that txg).
	+ */
	+ if (svr->svr_max_offset_to_sync[txgoff] != 0) {
	+ ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
	+ >=, max_offset_yet);
	+ max_offset_yet =
	+ svr->svr_max_offset_to_sync[txgoff];
	+ }
	+
	+ /*
	+ * We've already committed to copying this segment:
	+ * we have allocated space elsewhere in the pool for
	+ * it and have an IO outstanding to copy the data. We
	+ * cannot free the space before the copy has
	+ * completed, or else the copy IO might overwrite any
	+ * new data. To free that space, we record the
	+ * segment in the appropriate svr_frees tree and free
	+ * the mapped space later, in the txg where we have
	+ * completed the copy and synced the mapping (see
	+ * vdev_mapping_sync).
	+ */
	+ range_tree_add(svr->svr_frees[txgoff],
	+ offset, inflight_size);
	+ size -= inflight_size;
	+ offset += inflight_size;
	+
	+ /*
	+ * This space is already accounted for as being
	+ * done, because it is being copied in txg+i.
	+ * However, if i!=0, then it is being copied in
	+ * a future txg. If we crash after this txg
	+ * syncs but before txg+i syncs, then the space
	+ * will be free. Therefore we must account
	+ * for the space being done in this txg
	+ * (when it is freed) rather than the future txg
	+ * (when it will be copied).
	+ */
	+ ASSERT3U(svr->svr_bytes_done[txgoff], >=,
	+ inflight_size);
	+ svr->svr_bytes_done[txgoff] -= inflight_size;
	+ svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
	+ }
	+ }
	+ ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
	+
	+ if (size > 0) {
	+ /*
	+ * The copy thread has not yet visited this offset. Ensure
	+ * that it doesn't.
	+ */
	+
	+ DTRACE_PROBE3(remove__free__unvisited,
	+ spa_t *, spa,
	+ uint64_t, offset,
	+ uint64_t, size);
	+
	+ if (svr->svr_allocd_segs != NULL)
	+ range_tree_clear(svr->svr_allocd_segs, offset, size);
	+
	+ /*
	+ * Since we now do not need to copy this data, for
	+ * accounting purposes we have done our job and can count
	+ * it as completed.
	+ */
	+ svr->svr_bytes_done[txg & TXG_MASK] += size;
	+ }
	+ mutex_exit(&svr->svr_lock);
	+
	+ /*
	+ * Now that we have dropped svr_lock, process the synced portion
	+ * of this free.
	+ */
	+ if (synced_size > 0) {
	+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
	+ txg);
	+ /*
	+ * Note: this can only be called from syncing context,
	+ * and the vdev_indirect_mapping is only changed from the
	+ * sync thread, so we don't need svr_lock while doing
	+ * metaslab_free_impl_cb.
	+ */
	+ vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
	+ metaslab_free_impl_cb, &txg);
	+ }
	+}
	+
	+/*
	+ * Stop an active removal and update the spa_removing phys.
	+ *
	+ * state must be DSS_FINISHED or DSS_CANCELED (asserted), and tx must
	+ * belong to the currently syncing txg (asserted). When the removal
	+ * finished, the removed vdev is linked at the head of the chain of
	+ * indirect vdevs (sr_prev_indirect_vdev / vic_prev_indirect_vdev).
	+ * In both cases the in-core spa_vdev_removal_t is destroyed and the
	+ * removal state is synced.
	+ */
	+static void
	+spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
	+{
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
	+
	+ /* Ensure the removal thread has completed before we free the svr. */
	+ spa_vdev_remove_suspend(spa);
	+
	+ ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
	+
	+ if (state == DSS_FINISHED) {
	+ spa_removing_phys_t *srp = &spa->spa_removing_phys;
	+ vdev_t *vd = svr->svr_vdev;
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+
	+ if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
	+ vdev_t *pvd = vdev_lookup_top(spa,
	+ srp->sr_prev_indirect_vdev);
	+ ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
	+ }
	+
	+ /* Push this vdev onto the head of the indirect-vdev chain. */
	+ vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
	+ srp->sr_prev_indirect_vdev = vd->vdev_id;
	+ }
	+ spa->spa_removing_phys.sr_state = state;
	+ spa->spa_removing_phys.sr_end_time = gethrestime_sec();
	+
	+ spa->spa_vdev_removal = NULL;
	+ spa_vdev_removal_destroy(svr);
	+
	+ spa_sync_removing_state(spa, tx);
	+
	+ vdev_config_dirty(spa->spa_root_vdev);
	+}
	+
	+/*
	+ * Segment callback (used with range_tree_vacate()): arg is the vdev the
	+ * segment belongs to. Marks [offset, offset + size) obsolete in the
	+ * pool's syncing txg and then remaps the range through the indirect
	+ * vdev ops with metaslab_free_impl_cb as the per-piece callback.
	+ */
	+static void
	+free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
	+{
	+ vdev_t *removing_vd = arg;
	+ spa_t *spa = removing_vd->vdev_spa;
	+
	+ vdev_indirect_mark_obsolete(removing_vd, offset, size,
	+ spa->spa_syncing_txg);
	+ vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
	+ metaslab_free_impl_cb, &spa->spa_syncing_txg);
	+}
	+
	+/*
	+ * On behalf of the removal thread, syncs an incremental bit more of
	+ * the indirect mapping to disk and updates the in-memory mapping.
	+ * Called as a sync task in every txg that the removal thread makes progress.
	+ *
	+ * arg is the spa_vdev_removal_t of the removal in progress; tx is the
	+ * sync task's transaction, which must belong to the syncing txg
	+ * (asserted).
	+ */
	+static void
	+vdev_mapping_sync(void *arg, dmu_tx_t *tx)
	+{
	+ spa_vdev_removal_t *svr = arg;
	+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+ vdev_t *vd = svr->svr_vdev;
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+ uint64_t txg = dmu_tx_get_txg(tx);
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+
	+ ASSERT(vic->vic_mapping_object != 0);
	+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
	+
	+ /* Append this txg's new mapping entries and record the birth. */
	+ vdev_indirect_mapping_add_entries(vim,
	+ &svr->svr_new_segments[txg & TXG_MASK], tx);
	+ vdev_indirect_births_add_entry(vd->vdev_indirect_births,
	+ vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
	+
	+ /*
	+ * Free the copied data for anything that was freed while the
	+ * mapping entries were in flight.
	+ */
	+ mutex_enter(&svr->svr_lock);
	+ range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
	+ free_mapped_segment_cb, vd);
	+ ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
	+ vdev_indirect_mapping_max_offset(vim));
	+ /* Nothing remains in flight for this txg slot. */
	+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
	+ mutex_exit(&svr->svr_lock);
	+
	+ spa_sync_removing_state(spa, tx);
	+}
	+
	+/*
	+ * Done callback for the write half of a segment copy. Drops the
	+ * SCL_STATE config hold, frees the data buffer, subtracts the zio's size
	+ * from the outstanding byte count (signalling vca_cv), and frees the
	+ * per-segment argument together with its destination block pointer.
	+ */
	+static void
	+spa_vdev_copy_segment_write_done(zio_t *zio)
	+{
	+ vdev_copy_seg_arg_t *seg_arg = zio->io_private;
	+ vdev_copy_arg_t *copy_arg = seg_arg->vcsa_copy_arg;
	+
	+ spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
	+ abd_free(zio->io_abd);
	+
	+ /* Account for the completed bytes and wake a waiter on vca_cv. */
	+ mutex_enter(&copy_arg->vca_lock);
	+ copy_arg->vca_outstanding_bytes -= zio->io_size;
	+ cv_signal(&copy_arg->vca_cv);
	+ mutex_exit(&copy_arg->vca_lock);
	+
	+ ASSERT0(zio->io_error);
	+
	+ kmem_free(seg_arg->vcsa_dest_bp, sizeof (blkptr_t));
	+ kmem_free(seg_arg, sizeof (vdev_copy_seg_arg_t));
	+}
	+
	+/*
	+ * Done callback for the read half of a segment copy. Builds a block
	+ * pointer whose first DVA is the segment's destination DVA (stored in
	+ * vcsa_dest_bp so the write callback can free it) and issues a
	+ * zio_rewrite() of the data just read, as a child of the txg's root zio.
	+ */
	+static void
	+spa_vdev_copy_segment_read_done(zio_t *zio)
	+{
	+ vdev_copy_seg_arg_t *seg_arg = zio->io_private;
	+ spa_t *spa = zio->io_spa;
	+ dva_t *dest_dva = seg_arg->vcsa_dest_dva;
	+ uint64_t txg = seg_arg->vcsa_txg;
	+ uint64_t size = zio->io_size;
	+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva));
	+
	+ ASSERT3P(dest_vd, !=, NULL);
	+ ASSERT0(zio->io_error);
	+
	+ blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	+ seg_arg->vcsa_dest_bp = bp;
	+
	+ BP_ZERO(bp);
	+
	+ /* the first DVA of the new bp is the destination DVA */
	+ bcopy(dest_dva, bp->blk_dva, sizeof (dva_t));
	+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
	+
	+ BP_SET_LSIZE(bp, size);
	+ BP_SET_PSIZE(bp, size);
	+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	+ BP_SET_TYPE(bp, DMU_OT_NONE);
	+ BP_SET_LEVEL(bp, 0);
	+ BP_SET_DEDUP(bp, 0);
	+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	+
	+ zio_t *parent = spa->spa_txg_zio[txg & TXG_MASK];
	+ zio_nowait(zio_rewrite(parent, spa,
	+ txg, bp, zio->io_abd, size,
	+ spa_vdev_copy_segment_write_done, seg_arg,
	+ ZIO_PRIORITY_REMOVAL, 0, NULL));
	+}
	+
	+/*
	+ * Start copying the segment [start, start + size) off the removing vdev
	+ * vd in the given txg. Allocates a destination DVA of exactly size bytes
	+ * from vd's metaslab class, queues a mapping entry for it on
	+ * svr_new_segments[txg], and issues a read zio whose done callback
	+ * (spa_vdev_copy_segment_read_done) issues the write. vca tracks the
	+ * outstanding bytes; zal is the allocation trace list handed to
	+ * metaslab_alloc_dva().
	+ *
	+ * Returns 0 on success, or the metaslab_alloc_dva() error, in which case
	+ * nothing has been queued.
	+ */
	+static int
	+spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
	+ vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
	+{
	+ metaslab_group_t *mg = vd->vdev_mg;
	+ spa_t *spa = vd->vdev_spa;
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ vdev_indirect_mapping_entry_t *entry;
	+ vdev_copy_seg_arg_t *private;
	+ dva_t dst = { 0 };
	+ blkptr_t blk, *bp = &blk;
	+ dva_t *dva = bp->blk_dva;
	+
	+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	+
	+ int error = metaslab_alloc_dva(spa, mg->mg_class, size,
	+ &dst, 0, NULL, txg, 0, zal);
	+ if (error != 0)
	+ return (error);
	+
	+ /*
	+ * We can't have any padding of the allocated size, otherwise we will
	+ * misunderstand what's allocated, and the size of the mapping.
	+ * The caller ensures this will be true by passing in a size that is
	+ * aligned to the worst (highest) ashift in the pool.
	+ */
	+ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
	+
	+ mutex_enter(&vca->vca_lock);
	+ vca->vca_outstanding_bytes += size;
	+ mutex_exit(&vca->vca_lock);
	+
	+ entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
	+ DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
	+ entry->vime_mapping.vimep_dst = dst;
	+
	+ private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
	+ private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
	+ private->vcsa_txg = txg;
	+ private->vcsa_copy_arg = vca;
	+
	+ /*
	+ * This lock is eventually released by the donefunc for the
	+ * zio_write_phys that finishes copying the data.
	+ */
	+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	+
	+ /*
	+ * Do logical I/O, letting the redundancy vdevs (like mirror)
	+ * handle their own I/O instead of duplicating that code here.
	+ */
	+ BP_ZERO(bp);
	+
	+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
	+ DVA_SET_OFFSET(&dva[0], start);
	+ DVA_SET_GANG(&dva[0], 0);
	+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
	+
	+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
	+
	+ BP_SET_LSIZE(bp, size);
	+ BP_SET_PSIZE(bp, size);
	+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	+ BP_SET_TYPE(bp, DMU_OT_NONE);
	+ BP_SET_LEVEL(bp, 0);
	+ BP_SET_DEDUP(bp, 0);
	+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	+
	+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
	+ bp, abd_alloc_for_io(size, B_FALSE), size,
	+ spa_vdev_copy_segment_read_done, private,
	+ ZIO_PRIORITY_REMOVAL, 0, NULL));
	+
	+ list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
	+ ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
	+ vdev_dirty(vd, 0, NULL, txg);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Complete the removal of a toplevel vdev. This is called as a
	+ * synctask in the same txg that we will sync out the new config (to the
	+ * MOS object) which indicates that this vdev is indirect.
	+ *
	+ * arg is the spa_vdev_removal_t of the removal; its svr_vdev must already
	+ * be the indirect vdev (asserted) and its svr_zaplist must be set
	+ * (asserted). tx is the sync task's transaction.
	+ */
	+static void
	+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
	+{
	+ spa_vdev_removal_t *svr = arg;
	+ vdev_t *vd = svr->svr_vdev;
	+ spa_t *spa = vd->vdev_spa;
	+
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+
	+ for (int i = 0; i < TXG_SIZE; i++) {
	+ ASSERT0(svr->svr_bytes_done[i]);
	+ }
	+
	+ ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
	+ spa->spa_removing_phys.sr_to_copy);
	+
	+ vdev_destroy_spacemaps(vd, tx);
	+
	+ /* destroy leaf zaps, if any */
	+ ASSERT3P(svr->svr_zaplist, !=, NULL);
	+ for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
	+ pair != NULL;
	+ pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
	+ vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
	+ }
	+ fnvlist_free(svr->svr_zaplist);
	+
	+ /* svr is destroyed by spa_finish_removal(); do not use it below. */
	+ spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
	+ /* vd->vdev_path is not available here */
	+ spa_history_log_internal(spa, "vdev remove completed", tx,
	+ "%s vdev %llu", spa_name(spa), vd->vdev_id);
	+}
	+
	+/*
	+ * Move the indirect state of vd to ivd: the indirect config is copied,
	+ * and ownership of the indirect mapping, the indirect births and (if
	+ * present) the obsolete space map is handed over, with vd's pointers
	+ * cleared. ivd must not already have any of these (asserted), and both
	+ * vdevs must have empty obsolete-segment trees (asserted).
	+ */
	+static void
	+vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
	+{
	+ ivd->vdev_indirect_config = vd->vdev_indirect_config;
	+
	+ ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
	+ ASSERT(vd->vdev_indirect_mapping != NULL);
	+ ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
	+ vd->vdev_indirect_mapping = NULL;
	+
	+ ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
	+ ASSERT(vd->vdev_indirect_births != NULL);
	+ ivd->vdev_indirect_births = vd->vdev_indirect_births;
	+ vd->vdev_indirect_births = NULL;
	+
	+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
	+ ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
	+
	+ if (vd->vdev_obsolete_sm != NULL) {
	+ ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
	+
	+ /*
	+ * We cannot use space_map_{open,close} because we hold all
	+ * the config locks as writer.
	+ */
	+ ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
	+ ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
	+ vd->vdev_obsolete_sm = NULL;
	+ }
	+}
	+
	+/*
	+ * Recursively add the leaf ZAP object number of vd and of every vdev
	+ * below it to zlist. Each entry is keyed by VDEV_REMOVAL_ZAP_OBJS
	+ * followed by "-" and the object number; vdevs whose vdev_leaf_zap is 0
	+ * contribute no entry. zlist must not be NULL and vd must not be a
	+ * raidz vdev (both asserted).
	+ */
	+static void
	+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
	+{
	+ ASSERT3P(zlist, !=, NULL);
	+ ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
	+
	+ if (vd->vdev_leaf_zap != 0) {
	+ char zkey[32];
	+ (void) snprintf(zkey, sizeof (zkey), "%s-%ju",
	+ VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap);
	+ fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
	+ }
	+
	+ for (uint64_t id = 0; id < vd->vdev_children; id++) {
	+ vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
	+ }
	+}
	+
	+/*
	+ * Replace the toplevel vdev vd with a new indirect vdev. Collects vd's
	+ * leaf zaps into svr_zaplist, creates the indirect vdev as vd's parent
	+ * and detaches vd from it, transfers vd's indirect state to the new
	+ * vdev, points svr_vdev at it, schedules vdev_remove_complete_sync() as
	+ * a sync task in txg, and finally clears svr_thread and broadcasts
	+ * svr_cv to indicate that the removal thread has exited.
	+ */
	+static void
	+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
	+{
	+ vdev_t *ivd;
	+ dmu_tx_t *tx;
	+ spa_t *spa = vd->vdev_spa;
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+
	+ /*
	+ * First, build a list of leaf zaps to be destroyed.
	+ * This is passed to the sync context thread,
	+ * which does the actual unlinking.
	+ */
	+ svr->svr_zaplist = fnvlist_alloc();
	+ vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
	+
	+ ivd = vdev_add_parent(vd, &vdev_indirect_ops);
	+
	+ vd->vdev_leaf_zap = 0;
	+
	+ vdev_remove_child(ivd, vd);
	+ vdev_compact_children(ivd);
	+
	+ vdev_indirect_state_transfer(ivd, vd);
	+
	+ /* From here on the removal refers to the indirect vdev. */
	+ svr->svr_vdev = ivd;
	+
	+ ASSERT(!ivd->vdev_removing);
	+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
	+
	+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
	+ 0, ZFS_SPACE_CHECK_NONE, tx);
	+ dmu_tx_commit(tx);
	+
	+ /*
	+ * Indicate that this thread has exited.
	+ * After this, we can not use svr.
	+ */
	+ mutex_enter(&svr->svr_lock);
	+ svr->svr_thread = NULL;
	+ cv_broadcast(&svr->svr_cv);
	+ mutex_exit(&svr->svr_lock);
	+}
	+
	+/*
	+ * Complete the removal of a toplevel vdev. This is called in open
	+ * context by the removal thread after we have copied all vdev's data.
	+ *
	+ * The work is done in two spa_vdev_enter()/spa_vdev_exit() sections:
	+ * the first discards vd's allocation state and replaces it with an
	+ * indirect vdev; the second calls vdev_label_init() with
	+ * VDEV_LABEL_REMOVE on vd and dirties the root vdev's config.
	+ */
	+static void
	+vdev_remove_complete(vdev_t *vd)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+ uint64_t txg;
	+
	+ /*
	+ * Wait for any deferred frees to be synced before we call
	+ * vdev_metaslab_fini()
	+ */
	+ txg_wait_synced(spa->spa_dsl_pool, 0);
	+
	+ txg = spa_vdev_enter(spa);
	+ zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
	+ vd->vdev_id, txg);
	+
	+ /*
	+ * Discard allocation state.
	+ */
	+ if (vd->vdev_mg != NULL) {
	+ vdev_metaslab_fini(vd);
	+ metaslab_group_destroy(vd->vdev_mg);
	+ vd->vdev_mg = NULL;
	+ }
	+ ASSERT0(vd->vdev_stat.vs_space);
	+ ASSERT0(vd->vdev_stat.vs_dspace);
	+
	+ vdev_remove_replace_with_indirect(vd, txg);
	+
	+ /*
	+ * We now release the locks, allowing spa_sync to run and finish the
	+ * removal via vdev_remove_complete_sync in syncing context.
	+ */
	+ (void) spa_vdev_exit(spa, NULL, txg, 0);
	+
	+ /*
	+ * Top ZAP should have been transferred to the indirect vdev in
	+ * vdev_remove_replace_with_indirect.
	+ */
	+ ASSERT0(vd->vdev_top_zap);
	+
	+ /*
	+ * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
	+ */
	+ ASSERT0(vd->vdev_leaf_zap);
	+
	+ txg = spa_vdev_enter(spa);
	+ /* The result of vdev_label_init() is explicitly discarded. */
	+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	+ /*
	+ * Request to update the config and the config cachefile.
	+ */
	+ vdev_config_dirty(spa->spa_root_vdev);
	+ /*
	+ * NOTE(review): vd is handed to spa_vdev_exit() here, unlike the
	+ * NULL passed above; presumably this disposes of the old vdev --
	+ * confirm against spa_vdev_exit().
	+ */
	+ (void) spa_vdev_exit(spa, vd, txg, 0);
	+}
	+
	+/*
	+ * Evacuates a segment of size at most max_alloc from the vdev
	+ * via repeated calls to spa_vdev_copy_segment. If an allocation
	+ * fails, the pool is probably too fragmented to handle such a
	+ * large size, so decrease max_alloc so that the caller will not try
	+ * this size again this txg.
	+ *
	+ * svr is the removal in progress, vca tracks outstanding copy bytes,
	+ * max_alloc is an in/out limit on the segment size (lowered here on
	+ * ENOSPC), and tx is an assigned transaction identifying the txg in
	+ * which the copy is performed. Returns without doing anything if
	+ * svr_allocd_segs is empty.
	+ */
	+static void
	+spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
	+ uint64_t *max_alloc, dmu_tx_t *tx)
	+{
	+ uint64_t txg = dmu_tx_get_txg(tx);
	+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+
	+ mutex_enter(&svr->svr_lock);
	+
	+ range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
	+ if (rs == NULL) {
	+ mutex_exit(&svr->svr_lock);
	+ return;
	+ }
	+ uint64_t offset = rs->rs_start;
	+ uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
	+
	+ range_tree_remove(svr->svr_allocd_segs, offset, length);
	+
	+ /*
	+ * Schedule the mapping sync task only for the first segment
	+ * copied in this txg.
	+ */
	+ if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
	+ dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
	+ svr, 0, ZFS_SPACE_CHECK_NONE, tx);
	+ }
	+
	+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
	+
	+ /*
	+ * Note: this is the amount of allocated space
	+ * that we are taking care of each txg.
	+ */
	+ svr->svr_bytes_done[txg & TXG_MASK] += length;
	+
	+ mutex_exit(&svr->svr_lock);
	+
	+ zio_alloc_list_t zal;
	+ metaslab_trace_init(&zal);
	+ uint64_t thismax = *max_alloc;
	+ while (length > 0) {
	+ uint64_t mylen = MIN(length, thismax);
	+
	+ int error = spa_vdev_copy_segment(svr->svr_vdev,
	+ offset, mylen, txg, vca, &zal);
	+
	+ if (error == ENOSPC) {
	+ /*
	+ * Cut our segment in half, and don't try this
	+ * segment size again this txg. Note that the
	+ * allocation size must be aligned to the highest
	+ * ashift in the pool, so that the allocation will
	+ * not be padded out to a multiple of the ashift,
	+ * which could cause us to think that this mapping
	+ * is larger than we intended.
	+ */
	+ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
	+ ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
	+ thismax = P2ROUNDUP(mylen / 2,
	+ 1 << spa->spa_max_ashift);
	+ ASSERT3U(thismax, <, mylen);
	+ /*
	+ * The minimum-size allocation can not fail.
	+ */
	+ ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
	+ *max_alloc = mylen - (1 << spa->spa_max_ashift);
	+ } else {
	+ ASSERT0(error);
	+ length -= mylen;
	+ offset += mylen;
	+
	+ /*
	+ * We've performed an allocation, so reset the
	+ * alloc trace list.
	+ */
	+ metaslab_trace_fini(&zal);
	+ metaslab_trace_init(&zal);
	+ }
	+ }
	+ metaslab_trace_fini(&zal);
	+}
	+
	+/*
	+ * The removal thread operates in open context. It iterates over all
	+ * allocated space in the vdev, by loading each metaslab's spacemap.
	+ * For each contiguous segment of allocated space (capping the segment
	+ * size at SPA_MAXBLOCKSIZE), we:
	+ * - Allocate space for it on another vdev.
	+ * - Create a new mapping from the old location to the new location
	+ * (as a record in svr_new_segments).
	+ * - Initiate a logical read zio to get the data off the removing disk.
	+ * - In the read zio's done callback, initiate a logical write zio to
	+ * write it to the new vdev.
	+ * Note that all of this will take effect when a particular TXG syncs.
	+ * The sync thread ensures that all the phys reads and writes for the syncing
	+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
	+ * (see vdev_mapping_sync()).
	+ *
	+ * arg is the vdev being removed. The thread stops early when
	+ * svr_thread_exit is set; otherwise it finishes by calling
	+ * vdev_remove_complete().
	+ */
	+static void
	+spa_vdev_remove_thread(void *arg)
	+{
	+ vdev_t *vd = arg;
	+ spa_t *spa = vd->vdev_spa;
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ vdev_copy_arg_t vca;
	+ uint64_t max_alloc = zfs_remove_max_segment;
	+ uint64_t last_txg = 0;
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+ uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
	+
	+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
	+ ASSERT(vdev_is_concrete(vd));
	+ ASSERT(vd->vdev_removing);
	+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	+ ASSERT3P(svr->svr_vdev, ==, vd);
	+ ASSERT(vim != NULL);
	+
	+ mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
	+ cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
	+ vca.vca_outstanding_bytes = 0;
	+
	+ mutex_enter(&svr->svr_lock);
	+
	+ /*
	+ * Start from vim_max_offset so we pick up where we left off
	+ * if we are restarting the removal after opening the pool.
	+ */
	+ uint64_t msi;
	+ for (msi = start_offset >> vd->vdev_ms_shift;
	+ msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
	+ metaslab_t *msp = vd->vdev_ms[msi];
	+ ASSERT3U(msi, <=, vd->vdev_ms_count);
	+
	+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
	+
	+ mutex_enter(&msp->ms_sync_lock);
	+ mutex_enter(&msp->ms_lock);
	+
	+ /*
	+ * Assert nothing in flight -- ms_*tree is empty.
	+ */
	+ for (int i = 0; i < TXG_SIZE; i++) {
	+ ASSERT0(range_tree_space(msp->ms_alloctree[i]));
	+ }
	+
	+ /*
	+ * If the metaslab has ever been allocated from (ms_sm!=NULL),
	+ * read the allocated segments from the space map object
	+ * into svr_allocd_segs. Since we do this while holding
	+ * svr_lock and ms_sync_lock, concurrent frees (which
	+ * would have modified the space map) will wait for us
	+ * to finish loading the spacemap, and then take the
	+ * appropriate action (see free_from_removing_vdev()).
	+ */
	+ if (msp->ms_sm != NULL) {
	+ space_map_t *sm = NULL;
	+
	+ /*
	+ * We have to open a new space map here, because
	+ * ms_sm's sm_length and sm_alloc may not reflect
	+ * what's in the object contents, if we are in between
	+ * metaslab_sync() and metaslab_sync_done().
	+ */
	+ VERIFY0(space_map_open(&sm,
	+ spa->spa_dsl_pool->dp_meta_objset,
	+ msp->ms_sm->sm_object, msp->ms_sm->sm_start,
	+ msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
	+ space_map_update(sm);
	+ VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
	+ SM_ALLOC));
	+ space_map_close(sm);
	+
	+ /*
	+ * Drop the segments in ms_freeingtree from the set of
	+ * segments to copy.
	+ */
	+ range_tree_walk(msp->ms_freeingtree,
	+ range_tree_remove, svr->svr_allocd_segs);
	+
	+ /*
	+ * When we are resuming from a paused removal (i.e.
	+ * when importing a pool with a removal in progress),
	+ * discard any state that we have already processed.
	+ */
	+ range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
	+ }
	+ mutex_exit(&msp->ms_lock);
	+ mutex_exit(&msp->ms_sync_lock);
	+
	+ vca.vca_msp = msp;
	+ zfs_dbgmsg("copying %llu segments for metaslab %llu",
	+ avl_numnodes(&svr->svr_allocd_segs->rt_root),
	+ msp->ms_id);
	+
	+ /*
	+ * Copy segments until this metaslab's list is empty or we
	+ * are asked to exit. svr_lock is held when the loop condition
	+ * is evaluated and dropped while each segment is processed.
	+ */
	+ while (!svr->svr_thread_exit &&
	+ range_tree_space(svr->svr_allocd_segs) != 0) {
	+
	+ mutex_exit(&svr->svr_lock);
	+
	+ /*
	+ * Throttle: wait while more than
	+ * zfs_remove_max_copy_bytes are outstanding.
	+ */
	+ mutex_enter(&vca.vca_lock);
	+ while (vca.vca_outstanding_bytes >
	+ zfs_remove_max_copy_bytes) {
	+ cv_wait(&vca.vca_cv, &vca.vca_lock);
	+ }
	+ mutex_exit(&vca.vca_lock);
	+
	+ dmu_tx_t *tx =
	+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	+
	+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	+ uint64_t txg = dmu_tx_get_txg(tx);
	+
	+ /*
	+ * max_alloc may have been lowered by
	+ * spa_vdev_copy_impl(); reset it when the txg changes.
	+ */
	+ if (txg != last_txg)
	+ max_alloc = zfs_remove_max_segment;
	+ last_txg = txg;
	+
	+ spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
	+
	+ dmu_tx_commit(tx);
	+ mutex_enter(&svr->svr_lock);
	+ }
	+ }
	+
	+ mutex_exit(&svr->svr_lock);
	+ /*
	+ * Wait for all copies to finish before cleaning up the vca.
	+ */
	+ txg_wait_synced(spa->spa_dsl_pool, 0);
	+ ASSERT0(vca.vca_outstanding_bytes);
	+
	+ mutex_destroy(&vca.vca_lock);
	+ cv_destroy(&vca.vca_cv);
	+
	+ /*
	+ * If we were asked to exit, discard the remaining segments and
	+ * announce that the thread is gone; otherwise everything has been
	+ * copied and the removal can be completed.
	+ */
	+ if (svr->svr_thread_exit) {
	+ mutex_enter(&svr->svr_lock);
	+ range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
	+ svr->svr_thread = NULL;
	+ cv_broadcast(&svr->svr_cv);
	+ mutex_exit(&svr->svr_lock);
	+ } else {
	+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
	+ vdev_remove_complete(vd);
	+ }
	+ thread_exit();
	+}
	+
	+/*
	+ * Ask the removal thread (if any) to exit and wait until it has done so.
	+ * A no-op when the pool has no removal in progress. svr_thread_exit is
	+ * cleared again before returning.
	+ */
	+void
	+spa_vdev_remove_suspend(spa_t *spa)
	+{
	+ spa_vdev_removal_t *removal = spa->spa_vdev_removal;
	+
	+ if (removal != NULL) {
	+ mutex_enter(&removal->svr_lock);
	+
	+ /* Request the exit, then sleep until svr_thread is cleared. */
	+ removal->svr_thread_exit = B_TRUE;
	+ while (removal->svr_thread != NULL) {
	+ cv_wait(&removal->svr_cv, &removal->svr_lock);
	+ }
	+ removal->svr_thread_exit = B_FALSE;
	+
	+ mutex_exit(&removal->svr_lock);
	+ }
	+}
	+
	+/*
	+ * Check whether there is a removal to cancel. arg is unused; the pool is
	+ * taken from tx. Returns ESRCH if the pool has no removal in progress
	+ * (spa_vdev_removal is NULL), otherwise 0.
	+ */
	+/* ARGSUSED */
	+static int
	+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
	+{
	+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+
	+ if (spa->spa_vdev_removal == NULL)
	+ return (ESRCH);
	+ return (0);
	+}
	+
	+/*
	+ * Cancel a removal by freeing all entries from the partial mapping
	+ * and marking the vdev as no longer being removing.
	+ */
	+/* ARGSUSED */
	+static void
	+spa_vdev_remove_cancel_sync(void arg, dmu_tx_t tx)
	+{
	+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ vdev_t *vd = svr->svr_vdev;
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+ objset_t *mos = spa->spa_meta_objset;
	+
	+ ASSERT3P(svr->svr_thread, ==, NULL);
	+
	+ spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	+ if (vdev_obsolete_counts_are_precise(vd)) {
	+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
	+ }
	+
	+ if (vdev_obsolete_sm_object(vd) != 0) {
	+ ASSERT(vd->vdev_obsolete_sm != NULL);
	+ ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	+ space_map_object(vd->vdev_obsolete_sm));
	+
	+ space_map_free(vd->vdev_obsolete_sm, tx);
	+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
	+ space_map_close(vd->vdev_obsolete_sm);
	+ vd->vdev_obsolete_sm = NULL;
	+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	+ }
	+ for (int i = 0; i < TXG_SIZE; i++) {
	+ ASSERT(list_is_empty(&svr->svr_new_segments[i]));
	+ ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
	+ vdev_indirect_mapping_max_offset(vim));
	+ }
	+
	+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
	+ metaslab_t *msp = vd->vdev_ms[msi];
	+
	+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
	+ break;
	+
	+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
	+
	+ mutex_enter(&msp->ms_lock);
	+
	+ /*
	+ * Assert nothing in flight -- ms_*tree is empty.
	+ */
	+ for (int i = 0; i < TXG_SIZE; i++)
	+ ASSERT0(range_tree_space(msp->ms_alloctree[i]));
	+ for (int i = 0; i < TXG_DEFER_SIZE; i++)
	+ ASSERT0(range_tree_space(msp->ms_defertree[i]));
	+ ASSERT0(range_tree_space(msp->ms_freedtree));
	+
	+ if (msp->ms_sm != NULL) {
	+ /*
	+ * Assert that the in-core spacemap has the same
	+ * length as the on-disk one, so we can use the
	+ * existing in-core spacemap to load it from disk.
	+ */
	+ ASSERT3U(msp->ms_sm->sm_alloc, ==,
	+ msp->ms_sm->sm_phys->smp_alloc);
	+ ASSERT3U(msp->ms_sm->sm_length, ==,
	+ msp->ms_sm->sm_phys->smp_objsize);
	+
	+ mutex_enter(&svr->svr_lock);
	+ VERIFY0(space_map_load(msp->ms_sm,
	+ svr->svr_allocd_segs, SM_ALLOC));
	+ range_tree_walk(msp->ms_freeingtree,
	+ range_tree_remove, svr->svr_allocd_segs);
	+
	+ /*
	+ * Clear everything past what has been synced,
	+ * because we have not allocated mappings for it yet.
	+ */
	+ uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
	+ range_tree_clear(svr->svr_allocd_segs, syncd,
	+ msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
	+
	+ mutex_exit(&svr->svr_lock);
	+ }
	+ mutex_exit(&msp->ms_lock);
	+
	+ mutex_enter(&svr->svr_lock);
	+ range_tree_vacate(svr->svr_allocd_segs,
	+ free_mapped_segment_cb, vd);
	+ mutex_exit(&svr->svr_lock);
	+ }
	+
	+ /*
	+ * Note: this must happen after we invoke free_mapped_segment_cb,
	+ * because it adds to the obsolete_segments.
	+ */
	+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
	+
	+ ASSERT3U(vic->vic_mapping_object, ==,
	+ vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
	+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	+ vd->vdev_indirect_mapping = NULL;
	+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	+ vic->vic_mapping_object = 0;
	+
	+ ASSERT3U(vic->vic_births_object, ==,
	+ vdev_indirect_births_object(vd->vdev_indirect_births));
	+ vdev_indirect_births_close(vd->vdev_indirect_births);
	+ vd->vdev_indirect_births = NULL;
	+ vdev_indirect_births_free(mos, vic->vic_births_object, tx);
	+ vic->vic_births_object = 0;
	+
	+ /*
	+ * We may have processed some frees from the removing vdev in this
	+ * txg, thus increasing svr_bytes_done; discard that here to
	+ * satisfy the assertions in spa_vdev_removal_destroy().
	+ * Note that future txg's can not have any bytes_done, because
	+ * future TXG's are only modified from open context, and we have
	+ * already shut down the copying thread.
	+ */
	+ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
	+ spa_finish_removal(spa, DSS_CANCELED, tx);
	+
	+ vd->vdev_removing = B_FALSE;
	+ vdev_config_dirty(vd);
	+
	+ zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
	+ vd->vdev_id, dmu_tx_get_txg(tx));
	+ spa_history_log_internal(spa, "vdev remove canceled", tx,
	+ "%s vdev %llu %s", spa_name(spa),
	+ vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	+}
	+
	+int
	+spa_vdev_remove_cancel(spa_t *spa)
	+{
	+ spa_vdev_remove_suspend(spa);
	+
	+ if (spa->spa_vdev_removal == NULL)
	+ return (ESRCH);
	+
	+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
	+
	+ int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
	+ spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
	+
	+ if (error == 0) {
	+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
	+ vdev_t *vd = vdev_lookup_top(spa, vdid);
	+ metaslab_group_activate(vd->vdev_mg);
	+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
	+ }
	+
	+ return (error);
	+}
	+
	+/*
	+ * Called every sync pass of every txg if there's a svr.
	+ */
	+void
	+ svr_sync(spa_t *spa, dmu_tx_t *tx)
	+{
	+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	+
	+ /*
	+ * This check is necessary so that we do not dirty the
	+ * DIRECTORY_OBJECT via spa_sync_removing_state() when there
	+ * is nothing to do. Dirtying it every time would prevent us
	+ * from syncing-to-convergence.
	+ */
	+ if (svr->svr_bytes_done[txgoff] == 0)
	+ return;
	+
	+ /*
	+ * Update progress accounting.
	+ */
	+ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
	+ svr->svr_bytes_done[txgoff] = 0;
	+
	+ spa_sync_removing_state(spa, tx);
	+}
	+
	+static void
	+vdev_remove_make_hole_and_free(vdev_t *vd)
	+{
	+ uint64_t id = vd->vdev_id;
	+ spa_t *spa = vd->vdev_spa;
	+ vdev_t *rvd = spa->spa_root_vdev;
	+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
	+
	+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
	+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	+
	+ vdev_free(vd);
	+
	+ if (last_vdev) {
	+ vdev_compact_children(rvd);
	+ } else {
	+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
	+ vdev_add_child(rvd, vd);
	+ }
	+ vdev_config_dirty(rvd);
	+
	+ /*
	+ * Reassess the health of our root vdev.
	+ */
	+ vdev_reopen(rvd);
	+}
	+
	+/*
	+ * Remove a log device. The config lock is held for the specified TXG.
	+ */
	+static int
	+ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
	+{
	+ metaslab_group_t *mg = vd->vdev_mg;
	+ spa_t *spa = vd->vdev_spa;
	+ int error = 0;
	+
	+ ASSERT(vd->vdev_islog);
	+ ASSERT(vd == vd->vdev_top);
	+
	+ /*
	+ * Stop allocating from this vdev.
	+ */
	+ metaslab_group_passivate(mg);
	+
	+ /*
	+ * Wait for the youngest allocations and frees to sync,
	+ * and then wait for the deferral of those frees to finish.
	+ */
	+ spa_vdev_config_exit(spa, NULL,
	+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
	+
	+ /*
	+ * Evacuate the device. We don't hold the config lock as writer
	+ * since we need to do I/O but we do keep the
	+ * spa_namespace_lock held. Once this completes the device
	+ * should no longer have any blocks allocated on it.
	+ */
	+ if (vd->vdev_islog) {
	+ if (vd->vdev_stat.vs_alloc != 0)
	+ error = spa_reset_logs(spa);
	+ }
	+
	+ *txg = spa_vdev_config_enter(spa);
	+
	+ if (error != 0) {
	+ metaslab_group_activate(mg);
	+ return (error);
	+ }
	+ ASSERT0(vd->vdev_stat.vs_alloc);
	+
	+ /*
	+ * The evacuation succeeded. Remove any remaining MOS metadata
	+ * associated with this vdev, and wait for these changes to sync.
	+ */
	+ vd->vdev_removing = B_TRUE;
	+
	+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
	+ vdev_config_dirty(vd);
	+
	+ spa_history_log_internal(spa, "vdev remove", NULL,
	+ "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
	+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	+
	+ /* Make sure these changes are sync'ed */
	+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
	+
	+ *txg = spa_vdev_config_enter(spa);
	+
	+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
	+ ESC_ZFS_VDEV_REMOVE_DEV);
	+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
	+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	+
	+ /* The top ZAP should have been destroyed by vdev_remove_empty. */
	+ ASSERT0(vd->vdev_top_zap);
	+ /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
	+ ASSERT0(vd->vdev_leaf_zap);
	+
	+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	+
	+ if (list_link_active(&vd->vdev_state_dirty_node))
	+ vdev_state_clean(vd);
	+ if (list_link_active(&vd->vdev_config_dirty_node))
	+ vdev_config_clean(vd);
	+
	+ /*
	+ * Clean up the vdev namespace.
	+ */
	+ vdev_remove_make_hole_and_free(vd);
	+
	+ if (ev != NULL)
	+ spa_event_post(ev);
	+
	+ return (0);
	+}
	+
	+static int
	+spa_vdev_remove_top_check(vdev_t *vd)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+
	+ if (vd != vd->vdev_top)
	+ return (SET_ERROR(ENOTSUP));
	+
	+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
	+ return (SET_ERROR(ENOTSUP));
	+
	+ /*
	+ * There has to be enough free space to remove the
	+ * device and leave double the "slop" space (i.e. we
	+ * must leave at least 3% of the pool free, in addition to
	+ * the normal slop space).
	+ */
	+ if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
	+ NULL, 0, B_TRUE) <
	+ vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
	+ return (SET_ERROR(ENOSPC));
	+ }
	+
	+ /*
	+ * There can not be a removal in progress.
	+ */
	+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
	+ return (SET_ERROR(EBUSY));
	+
	+ /*
	+ * The device must have all its data.
	+ */
	+ if (!vdev_dtl_empty(vd, DTL_MISSING) ||
	+ !vdev_dtl_empty(vd, DTL_OUTAGE))
	+ return (SET_ERROR(EBUSY));
	+
	+ /*
	+ * The device must be healthy.
	+ */
	+ if (!vdev_readable(vd))
	+ return (SET_ERROR(EIO));
	+
	+ /*
	+ * All vdevs in normal class must have the same ashift.
	+ */
	+ if (spa->spa_max_ashift != spa->spa_min_ashift) {
	+ return (SET_ERROR(EINVAL));
	+ }
	+
	+ /*
	+ * All vdevs in normal class must have the same ashift
	+ * and not be raidz.
	+ */
	+ vdev_t *rvd = spa->spa_root_vdev;
	+ int num_indirect = 0;
	+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
	+ vdev_t *cvd = rvd->vdev_child[id];
	+ if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
	+ ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
	+ if (cvd->vdev_ops == &vdev_indirect_ops)
	+ num_indirect++;
	+ if (!vdev_is_concrete(cvd))
	+ continue;
	+ if (cvd->vdev_ops == &vdev_raidz_ops)
	+ return (SET_ERROR(EINVAL));
	+ /*
	+ * Need the mirror to be mirror of leaf vdevs only
	+ */
	+ if (cvd->vdev_ops == &vdev_mirror_ops) {
	+ for (uint64_t cid = 0;
	+ cid < cvd->vdev_children; cid++) {
	+ vdev_t *tmp = cvd->vdev_child[cid];
	+ if (!tmp->vdev_ops->vdev_op_leaf)
	+ return (SET_ERROR(EINVAL));
	+ }
	+ }
	+ }
	+
	+ return (0);
	+}
	+
	+/*
	+ * Initiate removal of a top-level vdev, reducing the total space in the pool.
	+ * The config lock is held for the specified TXG. Once initiated,
	+ * evacuation of all allocated space (copying it to other vdevs) happens
	+ * in the background (see spa_vdev_remove_thread()), and can be canceled
	+ * (see spa_vdev_remove_cancel()). If successful, the vdev will
	+ * be transformed to an indirect vdev (see vdev_remove_complete()).
	+ */
	+static int
	+ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+ int error;
	+
	+ /*
	+ * Check for errors up-front, so that we don't waste time
	+ * passivating the metaslab group and clearing the ZIL if there
	+ * are errors.
	+ */
	+ error = spa_vdev_remove_top_check(vd);
	+ if (error != 0)
	+ return (error);
	+
	+ /*
	+ * Stop allocating from this vdev. Note that we must check
	+ * that this is not the only device in the pool before
	+ * passivating, otherwise we will not be able to make
	+ * progress because we can't allocate from any vdevs.
	+ * The above check for sufficient free space serves this
	+ * purpose.
	+ */
	+ metaslab_group_t *mg = vd->vdev_mg;
	+ metaslab_group_passivate(mg);
	+
	+ /*
	+ * Wait for the youngest allocations and frees to sync,
	+ * and then wait for the deferral of those frees to finish.
	+ */
	+ spa_vdev_config_exit(spa, NULL,
	+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
	+
	+ /*
	+ * We must ensure that no "stubby" log blocks are allocated
	+ * on the device to be removed. These blocks could be
	+ * written at any time, including while we are in the middle
	+ * of copying them.
	+ */
	+ error = spa_reset_logs(spa);
	+
	+ *txg = spa_vdev_config_enter(spa);
	+
	+ /*
	+ * Things might have changed while the config lock was dropped
	+ * (e.g. space usage). Check for errors again.
	+ */
	+ if (error == 0)
	+ error = spa_vdev_remove_top_check(vd);
	+
	+ if (error != 0) {
	+ metaslab_group_activate(mg);
	+ return (error);
	+ }
	+
	+ vd->vdev_removing = B_TRUE;
	+
	+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
	+ vdev_config_dirty(vd);
	+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
	+ dsl_sync_task_nowait(spa->spa_dsl_pool,
	+ vdev_remove_initiate_sync,
	+ vd, 0, ZFS_SPACE_CHECK_NONE, tx);
	+ dmu_tx_commit(tx);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Remove a device from the pool.
	+ *
	+ * Removing a device from the vdev namespace requires several steps
	+ * and can take a significant amount of time. As a result we use
	+ * the spa_vdev_config_[enter/exit] functions which allow us to
	+ * grab and release the spa_config_lock while still holding the namespace
	+ * lock. During each step the configuration is synced out.
	+ */
	+int
	+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
	+{
	+ vdev_t *vd;
	+ nvlist_t **spares, **l2cache, *nv;
	+ uint64_t txg = 0;
	+ uint_t nspares, nl2cache;
	+ int error = 0;
	+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	+ sysevent_t *ev = NULL;
	+
	+ ASSERT(spa_writeable(spa));
	+
	+ if (!locked)
	+ txg = spa_vdev_enter(spa);
	+
	+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
	+
	+ if (spa->spa_spares.sav_vdevs != NULL &&
	+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	+ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
	+ /*
	+ * Only remove the hot spare if it's not currently in use
	+ * in this pool.
	+ */
	+ if (vd == NULL || unspare) {
	+ char *nvstr = fnvlist_lookup_string(nv,
	+ ZPOOL_CONFIG_PATH);
	+ spa_history_log_internal(spa, "vdev remove", NULL,
	+ "%s vdev (%s) %s", spa_name(spa),
	+ VDEV_TYPE_SPARE, nvstr);
	+ if (vd == NULL)
	+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	+ ev = spa_event_create(spa, vd, NULL,
	+ ESC_ZFS_VDEV_REMOVE_AUX);
	+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
	+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
	+ spa_load_spares(spa);
	+ spa->spa_spares.sav_sync = B_TRUE;
	+ } else {
	+ error = SET_ERROR(EBUSY);
	+ }
	+ } else if (spa->spa_l2cache.sav_vdevs != NULL &&
	+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	+ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
	+ char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
	+ spa_history_log_internal(spa, "vdev remove", NULL,
	+ "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
	+ /*
	+ * Cache devices can always be removed.
	+ */
	+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	+ ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
	+ spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
	+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
	+ spa_load_l2cache(spa);
	+ spa->spa_l2cache.sav_sync = B_TRUE;
	+ } else if (vd != NULL && vd->vdev_islog) {
	+ ASSERT(!locked);
	+ error = spa_vdev_remove_log(vd, &txg);
	+ } else if (vd != NULL) {
	+ ASSERT(!locked);
	+ error = spa_vdev_remove_top(vd, &txg);
	+ } else {
	+ /*
	+ * There is no vdev of any kind with the specified guid.
	+ */
	+ error = SET_ERROR(ENOENT);
	+ }
	+
	+ if (!locked)
	+ error = spa_vdev_exit(spa, NULL, txg, error);
	+
	+ if (ev != NULL) {
	+ if (error != 0) {
	+ spa_event_discard(ev);
	+ } else {
	+ spa_event_post(ev);
	+ }
	+ }
	+
	+ return (error);
	+}
	+
	+int
	+ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
	+{
	+ prs->prs_state = spa->spa_removing_phys.sr_state;
	+
	+ if (prs->prs_state == DSS_NONE)
	+ return (SET_ERROR(ENOENT));
	+
	+ prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
	+ prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
	+ prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
	+ prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
	+ prs->prs_copied = spa->spa_removing_phys.sr_copied;
	+
	+ if (spa->spa_vdev_removal != NULL) {
	+ for (int i = 0; i < TXG_SIZE; i++) {
	+ prs->prs_copied +=
	+ spa->spa_vdev_removal->svr_bytes_done[i];
	+ }
	+ }
	+
	+ prs->prs_mapping_memory = 0;
	+ uint64_t indirect_vdev_id =
	+ spa->spa_removing_phys.sr_prev_indirect_vdev;
	+ while (indirect_vdev_id != -1) {
	+ vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
	+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	+
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	+ prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
	+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
	+ }
	+
	+ return (0);
	+}

	Property changes on: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c (revision 332525)
	@@ -1,123 +1,124 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
	* Use is subject to license terms.
	*/

	/*
	- * Copyright (c) 2013 by Delphix. All rights reserved.
	+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/vdev_impl.h>
	#include <sys/zio.h>
	#include <sys/fs/zfs.h>

	/*
	* Virtual device vector for the pool's root vdev.
	*/

	/*
	* We should be able to tolerate one failure with absolutely no damage
	* to our metadata. Two failures will take out space maps, a bunch of
	* indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
	* place to live. When we get smarter, we can liberalize this policy.
	* e.g. If we haven't lost two consecutive top-level vdevs, then we are
	* probably fine. Adding bean counters during alloc/free can make this
	* future guesswork more accurate.
	*/
	static int
	too_many_errors(vdev_t *vd, int numerrors)
	{
	ASSERT3U(numerrors, <=, vd->vdev_children);
	return (numerrors > 0);
	}

	static int
	vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
	uint64_t *logical_ashift, uint64_t *physical_ashift)
	{
	int lasterror = 0;
	int numerrors = 0;

	if (vd->vdev_children == 0) {
	vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
	return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
	vdev_t *cvd = vd->vdev_child[c];

	if (cvd->vdev_open_error && !cvd->vdev_islog) {
	lasterror = cvd->vdev_open_error;
	numerrors++;
	}
	}

	if (too_many_errors(vd, numerrors)) {
	vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
	return (lasterror);
	}

	*asize = 0;
	*max_asize = 0;
	*logical_ashift = 0;
	*physical_ashift = 0;

	return (0);
	}

	static void
	vdev_root_close(vdev_t *vd)
	{
	for (int c = 0; c < vd->vdev_children; c++)
	vdev_close(vd->vdev_child[c]);
	}

	static void
	vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
	{
	if (too_many_errors(vd, faulted)) {
	vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	VDEV_AUX_NO_REPLICAS);
	} else if (degraded) {
	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	} else {
	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}
	}

	vdev_ops_t vdev_root_ops = {
	vdev_root_open,
	vdev_root_close,
	vdev_default_asize,
	NULL, /* io_start - not applicable to the root */
	NULL, /* io_done - not applicable to the root */
	vdev_root_state_change,
	+ NULL,
	NULL,
	NULL,
	VDEV_TYPE_ROOT, /* name of this vdev type */
	B_FALSE /* not a leaf vdev */
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c (revision 332525)
	@@ -1,857 +1,860 @@
	/*
	* CDDL HEADER START
	*
	* This file and its contents are supplied under the terms of the
	* Common Development and Distribution License ("CDDL"), version 1.0.
	* You may only use this file in accordance with the terms of version
	* 1.0 of the CDDL.
	*
	* A full copy of the text of the CDDL should have accompanied this
	* source. A copy of the CDDL is also available via the Internet at
	* http://www.illumos.org/license/CDDL.
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2016 by Delphix. All rights reserved.
	*/

	#include "lua.h"
	#include "lualib.h"
	#include "lauxlib.h"

	#include <zfs_prop.h>

	#include <sys/dsl_prop.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dmu_objset.h>
	#include <sys/mntent.h>
	#include <sys/sunddi.h>
	#include <sys/zap.h>
	#include <sys/zcp.h>
	#include <sys/zcp_iter.h>
	#include <sys/zcp_global.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/zfs_znode.h>
	#include <sys/zvol.h>

	#ifdef _KERNEL
	#include <sys/zfs_vfsops.h>
	#endif

	static int
	get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
	{
	int error;
	objset_t *os;
	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
	return (error);
	if (ds->ds_is_snapshot) {
	*type = ZFS_TYPE_SNAPSHOT;
	} else {
	switch (os->os_phys->os_type) {
	case DMU_OST_ZFS:
	*type = ZFS_TYPE_FILESYSTEM;
	break;
	case DMU_OST_ZVOL:
	*type = ZFS_TYPE_VOLUME;
	break;
	default:
	return (EINVAL);
	}
	}
	return (0);
	}

	/*
	* Returns the string name of ds's type in str (a buffer which should be
	* at least 12 bytes long).
	*/
	static int
	get_objset_type_name(dsl_dataset_t *ds, char *str)
	{
	int error;
	zfs_type_t type;
	error = get_objset_type(ds, &type);
	if (error != 0)
	return (error);
	switch (type) {
	case ZFS_TYPE_SNAPSHOT:
	(void) strcpy(str, "snapshot");
	break;
	case ZFS_TYPE_FILESYSTEM:
	(void) strcpy(str, "filesystem");
	break;
	case ZFS_TYPE_VOLUME:
	(void) strcpy(str, "volume");
	break;
	default:
	return (EINVAL);
	}
	return (0);
	}

	/*
	* Determines the source of a property given its setpoint and
	* property type. It pushes the source to the lua stack.
	*/
	static void
	get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
	{
	if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
	lua_pushnil(state);
	} else {
	const char *src;
	if (strcmp("", setpoint) == 0) {
	src = "default";
	} else {
	src = setpoint;
	}
	(void) lua_pushstring(state, src);
	}
	}

	/*
	* Given an error encountered while getting properties, either longjmp's for
	* a fatal error or pushes nothing to the stack for a non fatal one.
	*/
	static int
	zcp_handle_error(lua_State *state, const char *dataset_name,
	const char *property_name, int error)
	{
	ASSERT3S(error, !=, 0);
	if (error == ENOENT) {
	return (0);
	} else if (error == EINVAL) {
	return (luaL_error(state,
	"property '%s' is not a valid property on dataset '%s'",
	property_name, dataset_name));
	} else if (error == EIO) {
	return (luaL_error(state,
	"I/O error while retrieving property '%s' on dataset '%s'",
	property_name, dataset_name));
	} else {
	return (luaL_error(state, "unexpected error %d while "
	"retrieving property '%s' on dataset '%s'",
	error, property_name, dataset_name));
	}
	}

	/*
	* Look up a user defined property in the zap object. If it exists, push it
	* and the setpoint onto the stack, otherwise don't push anything.
	*/
	static int
	zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
	const char *property_name)
	{
	int error;
	char *buf;
	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
	/*
	* zcp_dataset_hold will either successfully return the requested
	* dataset or throw a lua error and longjmp out of the zfs.get_prop call
	* without returning.
	*/
	dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
	if (ds == NULL)
	return (1); /* not reached; zcp_dataset_hold() longjmp'd */

	buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
	error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
	buf, setpoint);
	dsl_dataset_rele(ds, FTAG);

	if (error != 0) {
	kmem_free(buf, ZAP_MAXVALUELEN);
	return (zcp_handle_error(state, dataset_name, property_name,
	error));
	}
	(void) lua_pushstring(state, buf);
	(void) lua_pushstring(state, setpoint);
	kmem_free(buf, ZAP_MAXVALUELEN);
	return (2);
	}

	/*
	* Check if the property we're looking for is stored in the ds_dir. If so,
	* return it in the 'val' argument. Return 0 on success and ENOENT if
	* the property is not present.
	*/
	static int
	get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
	uint64_t *val)
	{
	dsl_dir_t *dd = ds->ds_dir;
	mutex_enter(&dd->dd_lock);
	switch (zfs_prop) {
	case ZFS_PROP_USEDSNAP:
	*val = dsl_dir_get_usedsnap(dd);
	break;
	case ZFS_PROP_USEDCHILD:
	*val = dsl_dir_get_usedchild(dd);
	break;
	case ZFS_PROP_USEDDS:
	*val = dsl_dir_get_usedds(dd);
	break;
	case ZFS_PROP_USEDREFRESERV:
	*val = dsl_dir_get_usedrefreserv(dd);
	break;
	case ZFS_PROP_LOGICALUSED:
	*val = dsl_dir_get_logicalused(dd);
	break;
	default:
	mutex_exit(&dd->dd_lock);
	return (ENOENT);
	}
	mutex_exit(&dd->dd_lock);
	return (0);
	}

	/*
	* Takes a dataset, a property, a value and that value's setpoint as
	* found in the ZAP. Checks if the property has been changed in the vfs.
	* If so, val and setpoint will be overwritten with updated content.
	* Otherwise, they are left unchanged.
	*/
	static int
	get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
	char *setpoint)
	{
	#ifndef _KERNEL
	return (0);
	#else
	int error;
	#ifdef illumos
	zfsvfs_t *zfvp;
	#endif
	vfs_t *vfsp;
	objset_t *os;
	uint64_t tmp = *val;

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
	return (error);

	error = getzfsvfs_impl(os, &vfsp);
	if (error != 0)
	return (error);
	#ifdef illumos
	vfsp = zfvp->z_vfs;
	#endif
	switch (zfs_prop) {
	case ZFS_PROP_ATIME:
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_DEVICES:
	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_EXEC:
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_SETUID:
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_READONLY:
	if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_XATTR:
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
	tmp = 1;
	break;
	case ZFS_PROP_NBMAND:
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
	tmp = 0;
	if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
	tmp = 1;
	break;
	default:
	#ifdef illumos
	VFS_RELE(vfsp);
	#else
	vfs_rel(vfsp);
	#endif
	return (ENOENT);
	}

	#ifdef illumos
	VFS_RELE(vfsp);
	#else
	vfs_rel(vfsp);
	#endif
	if (tmp != *val) {
	(void) strcpy(setpoint, "temporary");
	*val = tmp;
	}
	return (0);
	#endif
	}

	/*
	* Check if the property we're looking for is stored at the dsl_dataset or
	* dsl_dir level. If so, push the property value and source onto the lua stack
	* and return 0. If it is not present or a failure occurs in lookup, return a
	* non-zero error value.
	*/
	static int
	get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
	zfs_prop_t zfs_prop)
	{
	int error = 0;
	objset_t *os;
	uint64_t numval;
	char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
	char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
	"Internal error - setpoint not determined";
	zfs_type_t ds_type;
	zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
	(void) get_objset_type(ds, &ds_type);

	switch (zfs_prop) {
	case ZFS_PROP_REFRATIO:
	numval = dsl_get_refratio(ds);
	break;
	case ZFS_PROP_USED:
	numval = dsl_get_used(ds);
	break;
	case ZFS_PROP_CLONES: {
	nvlist_t *clones = fnvlist_alloc();
	error = get_clones_stat_impl(ds, clones);
	if (error == 0) {
	/* push list to lua stack */
	VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0));
	/* source */
	(void) lua_pushnil(state);
	}
	nvlist_free(clones);
	kmem_free(strval, ZAP_MAXVALUELEN);
	return (error);
	}
	case ZFS_PROP_COMPRESSRATIO:
	numval = dsl_get_compressratio(ds);
	break;
	case ZFS_PROP_CREATION:
	numval = dsl_get_creation(ds);
	break;
	case ZFS_PROP_REFERENCED:
	numval = dsl_get_referenced(ds);
	break;
	case ZFS_PROP_AVAILABLE:
	numval = dsl_get_available(ds);
	break;
	case ZFS_PROP_LOGICALREFERENCED:
	numval = dsl_get_logicalreferenced(ds);
	break;
	case ZFS_PROP_CREATETXG:
	numval = dsl_get_creationtxg(ds);
	break;
	case ZFS_PROP_GUID:
	numval = dsl_get_guid(ds);
	break;
	case ZFS_PROP_UNIQUE:
	numval = dsl_get_unique(ds);
	break;
	case ZFS_PROP_OBJSETID:
	numval = dsl_get_objsetid(ds);
	break;
	case ZFS_PROP_ORIGIN:
	dsl_dir_get_origin(ds->ds_dir, strval);
	break;
	case ZFS_PROP_USERACCOUNTING:
	error = dmu_objset_from_ds(ds, &os);
	if (error == 0)
	numval = dmu_objset_userspace_present(os);
	break;
	case ZFS_PROP_WRITTEN:
	error = dsl_get_written(ds, &numval);
	break;
	case ZFS_PROP_TYPE:
	error = get_objset_type_name(ds, strval);
	break;
	case ZFS_PROP_PREV_SNAP:
	error = dsl_get_prev_snap(ds, strval);
	break;
	case ZFS_PROP_NAME:
	dsl_dataset_name(ds, strval);
	break;
	case ZFS_PROP_MOUNTPOINT:
	error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
	break;
	case ZFS_PROP_VERSION:
	/* should be a snapshot or filesystem */
	ASSERT(ds_type != ZFS_TYPE_VOLUME);
	error = dmu_objset_from_ds(ds, &os);
	/* look in the master node for the version */
	if (error == 0) {
	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	sizeof (numval), 1, &numval);
	}
	break;
	case ZFS_PROP_DEFER_DESTROY:
	numval = dsl_get_defer_destroy(ds);
	break;
	case ZFS_PROP_USERREFS:
	numval = dsl_get_userrefs(ds);
	break;
	case ZFS_PROP_FILESYSTEM_COUNT:
	error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
	(void) strcpy(setpoint, "");
	break;
	case ZFS_PROP_SNAPSHOT_COUNT:
	error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
	(void) strcpy(setpoint, "");
	break;
	+ case ZFS_PROP_REMAPTXG:
	+ error = dsl_dir_get_remaptxg(ds->ds_dir, &numval);
	+ break;
	case ZFS_PROP_NUMCLONES:
	numval = dsl_get_numclones(ds);
	break;
	case ZFS_PROP_INCONSISTENT:
	numval = dsl_get_inconsistent(ds);
	break;
	case ZFS_PROP_RECEIVE_RESUME_TOKEN:
	VERIFY3U(strlcpy(strval, get_receive_resume_stats_impl(ds),
	ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN);
	if (strcmp(strval, "") == 0) {
	VERIFY3U(strlcpy(strval, get_child_receive_stats(ds),
	ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN);
	if (strcmp(strval, "") == 0)
	error = ENOENT;
	}
	break;
	case ZFS_PROP_VOLSIZE:
	ASSERT(ds_type == ZFS_TYPE_VOLUME);
	error = dmu_objset_from_ds(ds, &os);
	if (error == 0) {
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
	sizeof (numval), 1, &numval);
	}
	if (error == 0)
	(void) strcpy(setpoint, dsname);

	break;
	case ZFS_PROP_VOLBLOCKSIZE: {
	ASSERT(ds_type == ZFS_TYPE_VOLUME);
	dmu_object_info_t doi;
	error = dmu_objset_from_ds(ds, &os);
	if (error == 0) {
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	if (error == 0)
	numval = doi.doi_data_block_size;
	}
	break;
	}
	default:
	/* Did not match these props, check in the dsl_dir */
	error = get_dsl_dir_prop(ds, zfs_prop, &numval);
	}
	if (error != 0) {
	kmem_free(strval, ZAP_MAXVALUELEN);
	return (error);
	}

	switch (prop_type) {
	case PROP_TYPE_NUMBER: {
	(void) lua_pushnumber(state, numval);
	break;
	}
	case PROP_TYPE_STRING: {
	(void) lua_pushstring(state, strval);
	break;
	}
	case PROP_TYPE_INDEX: {
	const char *propval;
	error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
	if (error != 0) {
	kmem_free(strval, ZAP_MAXVALUELEN);
	return (error);
	}
	(void) lua_pushstring(state, propval);
	break;
	}
	}
	kmem_free(strval, ZAP_MAXVALUELEN);

	/* Push the source to the stack */
	get_prop_src(state, setpoint, zfs_prop);
	return (0);
	}

	/*
	 * Look up a property and its source through dsl_prop_get_ds(). If the
	 * value is present and successfully retrieved, push the value and then
	 * its source on the lua stack and return 0. On failure, push nothing and
	 * return a non-zero error value.
	 *
	 * String properties are pushed as strings, index properties as their
	 * symbolic name, and everything else as a number.
	 */
	static int
	get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
	{
		int error = 0;
		char setpoint[ZFS_MAX_DATASET_NAME_LEN];
		char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
		uint64_t numval;
		const char *prop_name = zfs_prop_to_name(zfs_prop);
		zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);

		if (prop_type == PROP_TYPE_STRING) {
			/* Push value to lua stack */
			error = dsl_prop_get_ds(ds, prop_name, 1,
			    ZAP_MAXVALUELEN, strval, setpoint);
			if (error == 0)
				(void) lua_pushstring(state, strval);
		} else {
			error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
			    1, &numval, setpoint);

			/*
			 * Only continue after a successful lookup: otherwise
			 * numval holds no value, and the index conversion below
			 * would replace the lookup error with its own result.
			 */
			if (error == 0) {
				/* Fill in temporary value for prop, if applicable */
				(void) get_temporary_prop(ds, zfs_prop, &numval,
				    setpoint);

				/* Push value to lua stack */
				if (prop_type == PROP_TYPE_INDEX) {
					const char *propval;

					error = zfs_prop_index_to_string(zfs_prop,
					    numval, &propval);
					if (error == 0)
						(void) lua_pushstring(state, propval);
				} else {
					(void) lua_pushnumber(state, numval);
				}
			}
		}
		kmem_free(strval, ZAP_MAXVALUELEN);

		/* The source is pushed only once a value has been pushed. */
		if (error == 0)
			get_prop_src(state, setpoint, zfs_prop);
		return (error);
	}

	/*
	 * Report whether a property can be looked up on the given dataset:
	 * B_FALSE for the unsupported properties, for 'origin' on a dataset that
	 * is not a clone, or when the objset type cannot be determined; otherwise
	 * whatever zfs_prop_valid_for_type() says for the dataset's type.
	 */
	boolean_t
	prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
	{
		zfs_type_t ds_kind;

		switch (zfs_prop) {
		case ZFS_PROP_ISCSIOPTIONS:
		case ZFS_PROP_MOUNTED:
			/* properties not supported */
			return (B_FALSE);
		case ZFS_PROP_ORIGIN:
			/* only a clone has an origin */
			if (!dsl_dir_is_clone(ds->ds_dir))
				return (B_FALSE);
			break;
		default:
			break;
		}

		if (get_objset_type(ds, &ds_kind) != 0)
			return (B_FALSE);

		return (zfs_prop_valid_for_type(zfs_prop, ds_kind));
	}

	/*
	 * Look up a given dataset property. On success return 2, the number of
	 * values pushed to the lua stack (property value and source). On a fatal
	 * error, longjmp. On a non fatal error push nothing.
	 *
	 * A property that is not valid for the dataset yields 0 (nothing pushed).
	 * Otherwise get_special_prop() is tried first, and get_zap_prop() only
	 * when that reports ENOENT. Any remaining error goes to
	 * zcp_handle_error().
	 */
	static int
	zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
	    zfs_prop_t zfs_prop)
	{
		int error;
		/*
		 * zcp_dataset_hold will either successfully return the requested
		 * dataset or throw a lua error and longjmp out of the zfs.get_prop
		 * call without returning.
		 */
		dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
		if (ds == NULL)
			return (1); /* not reached; zcp_dataset_hold() longjmp'd */

		/* Check that the property is valid for the given dataset */
		const char *prop_name = zfs_prop_to_name(zfs_prop);
		if (!prop_valid_for_ds(ds, zfs_prop)) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		/* Check if the property can be accessed directly */
		error = get_special_prop(state, ds, dataset_name, zfs_prop);

		/* If we were unable to find it, look in the zap object */
		if (error == ENOENT)
			error = get_zap_prop(state, ds, zfs_prop);

		/* The hold is dropped before any error is raised. */
		dsl_dataset_rele(ds, FTAG);

		if (error != 0) {
			return (zcp_handle_error(state, dataset_name,
			    prop_name, error));
		}

		/* The value and source have been pushed by the successful getter */
		return (2);
	}

	/*
	 * Map a property name onto the zfs_userquota_prop_t whose prefix in
	 * zfs_userquota_prop_prefixes[] it starts with. Returns
	 * ZFS_NUM_USERQUOTA_PROPS when no prefix matches.
	 */
	static zfs_userquota_prop_t
	get_userquota_prop(const char *prop_name)
	{
		zfs_userquota_prop_t idx = 0;

		/* Figure out the property type ({user|group}{quota|used}) */
		while (idx < ZFS_NUM_USERQUOTA_PROPS) {
			const char *prefix = zfs_userquota_prop_prefixes[idx];

			if (strncmp(prop_name, prefix, strlen(prefix)) == 0)
				return (idx);
			idx++;
		}
		return (idx);
	}

	#ifdef _KERNEL
	/*
	 * Given the name of a zfs_userquota_prop, this function determines the
	 * prop type as well as the numeric group/user ids based on the string
	 * following the '@' in the property name. On success, returns 0. On
	 * failure, returns a non-zero error.
	 *
	 * On success '*domain' is either NULL (plain numeric id) or a newly
	 * allocated string that must be free'd by the caller using strfree().
	 * On failure '*domain' is left untouched.
	 */
	static int
	parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
	    char **domain, uint64_t *rid)
	{
		char *cp, *end, *domain_val;

		*type = get_userquota_prop(prop_name);
		if (*type >= ZFS_NUM_USERQUOTA_PROPS)
			return (EINVAL);

		*rid = 0;

		/* Everything after the '@' identifies the user or group. */
		cp = strchr(prop_name, '@');
		if (cp == NULL)
			return (EINVAL);
		cp++;

		if (strncmp(cp, "S-1-", 4) == 0) {
			/*
			 * It's a numeric SID (eg "S-1-234-567-89") and we want to
			 * separate the domain id and the rid
			 */
			int domain_len = strrchr(cp, '-') - cp;
			domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
			(void) strncpy(domain_val, cp, domain_len);
			domain_val[domain_len] = '\0';
			cp += domain_len + 1;

			/* Trailing characters after the number are an error. */
			(void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
			if (*end != '\0') {
				strfree(domain_val);
				return (EINVAL);
			}
		} else {
			/* It's only a user/group ID (eg "12345"), just get the rid */
			domain_val = NULL;
			(void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
			if (*end != '\0')
				return (EINVAL);
		}
		*domain = domain_val;
		return (0);
	}

	/*
	 * Look up {user|group}{quota|used} property for given dataset. On success
	 * push the value (quota or used amount) and the setpoint (the dataset
	 * name) and return 2. On failure, hand the error to zcp_handle_error().
	 *
	 * A quota property whose stored value is 0 is reported as ENOENT.
	 */
	static int
	zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
	    const char *dataset_name, const char *prop_name)
	{
		zfsvfs_t *zfvp;
		zfsvfs_t *zfsvfs;
		int error;
		zfs_userquota_prop_t type;
		char *domain;
		/* value stays 0 unless zfs_userspace_one() fills it in */
		uint64_t rid, value = 0;
		objset_t *os;

		dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
		if (ds == NULL)
			return (1); /* not reached; zcp_dataset_hold() longjmp'd */

		error = parse_userquota_prop(prop_name, &type, &domain, &rid);
		if (error == 0) {
			error = dmu_objset_from_ds(ds, &os);
			if (error == 0) {
				zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
				error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
				if (error == 0) {
					error = zfs_userspace_one(zfvp, type, domain,
					    rid, &value);
					zfsvfs_free(zfvp);
				}
			}
			if (domain != NULL)
				strfree(domain);
		}
		dsl_dataset_rele(ds, FTAG);

		/*
		 * Apply the "zero quota means not found" rule only to a value that
		 * was actually retrieved, so that an earlier failure is reported
		 * as itself rather than being replaced by ENOENT.
		 */
		if (error == 0 && value == 0 &&
		    (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA))
			error = ENOENT;
		if (error != 0) {
			return (zcp_handle_error(state, dataset_name,
			    prop_name, error));
		}

		(void) lua_pushnumber(state, value);
		(void) lua_pushstring(state, dataset_name);
		return (2);
	}
	#endif

	/*
	* Determines the name of the snapshot referenced in the written property
	* name. Returns snapshot name in snap_name, a buffer that must be at least
	* as large as ZFS_MAX_DATASET_NAME_LEN
	*/
	static void
	parse_written_prop(const char dataset_name, const char prop_name,
	char *snap_name)
	{
	ASSERT(zfs_prop_written(prop_name));
	const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
	if (strchr(name, '@') == NULL) {
	(void) sprintf(snap_name, "%s@%s", dataset_name, name);
	} else {
	(void) strcpy(snap_name, name);
	}
	}

	/*
	 * Look up written@ property for given dataset. On success
	 * push the value and the setpoint (the dataset name) and return 2. If
	 * error is fatal, we will longjmp, otherwise push nothing.
	 *
	 * The value pushed is the 'used' figure from dsl_dataset_space_written()
	 * between the named snapshot and the dataset.
	 */
	static int
	zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
	    const char *dataset_name, const char *prop_name)
	{
		char snap_name[ZFS_MAX_DATASET_NAME_LEN];
		uint64_t used, comp, uncomp;
		dsl_dataset_t *old;
		int error = 0;

		parse_written_prop(dataset_name, prop_name, snap_name);
		dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
		if (new == NULL)
			return (1); /* not reached; zcp_dataset_hold() longjmp'd */

		error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
		if (error != 0) {
			dsl_dataset_rele(new, FTAG);
			return (zcp_dataset_hold_error(state, dp, snap_name,
			    error));
		}
		error = dsl_dataset_space_written(old, new,
		    &used, &comp, &uncomp);

		/* Both holds are dropped before any error is raised. */
		dsl_dataset_rele(old, FTAG);
		dsl_dataset_rele(new, FTAG);

		if (error != 0) {
			return (zcp_handle_error(state, dataset_name,
			    snap_name, error));
		}
		(void) lua_pushnumber(state, used);
		(void) lua_pushstring(state, dataset_name);
		return (2);
	}

	/* Forward declaration: the handler is referenced by the table below. */
	static int zcp_get_prop(lua_State *state);
	/*
	 * Registration record for the get_prop library function: its Lua-visible
	 * name, the C handler, and its two positional string arguments (dataset
	 * and property). It declares no keyword arguments. Both argument lists
	 * end with a {NULL, 0} entry.
	 */
	static zcp_lib_info_t zcp_get_prop_info = {
	.name = "get_prop",
	.func = zcp_get_prop,
	.pargs = {
	{ .za_name = "dataset", .za_lua_type = LUA_TSTRING},
	{ .za_name = "property", .za_lua_type = LUA_TSTRING},
	{NULL, 0}
	},
	.kwargs = {
	{NULL, 0}
	}
	};

	/*
	 * Lua entry point for get_prop(dataset, property). Validates the
	 * arguments against zcp_get_prop_info, then dispatches on the kind of
	 * property name: user-defined, userquota, written@, or a system
	 * property. Returns the number of values the chosen getter pushed on
	 * the lua stack; an unrecognized property name raises a lua error.
	 */
	static int
	zcp_get_prop(lua_State *state)
	{
		const char *dataset_name;
		const char *property_name;
		dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
		zcp_lib_info_t *libinfo = &zcp_get_prop_info;

		zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);

		dataset_name = lua_tostring(state, 1);
		property_name = lua_tostring(state, 2);

		/* User defined property */
		if (zfs_prop_user(property_name)) {
			return (zcp_get_user_prop(state, dp,
			    dataset_name, property_name));
		}
		/* userspace property */
		if (zfs_prop_userquota(property_name)) {
	#ifdef _KERNEL
			return (zcp_get_userquota_prop(state, dp,
			    dataset_name, property_name));
	#else
			/* The message has no conversions, so no arguments follow it. */
			return (luaL_error(state,
			    "user quota properties only supported in kernel mode"));
	#endif
		}
		/* written@ property */
		if (zfs_prop_written(property_name)) {
			return (zcp_get_written_prop(state, dp,
			    dataset_name, property_name));
		}

		zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
		/* Valid system property */
		if (zfs_prop != ZPROP_INVAL) {
			return (zcp_get_system_prop(state, dp, dataset_name,
			    zfs_prop));
		}

		/* Invalid property name */
		return (luaL_error(state,
		    "'%s' is not a valid property", property_name));
	}

	/*
	 * Install the get_prop handler as a field, named after its registration
	 * record, of the table at stack index -2. Always returns 1.
	 */
	int
	zcp_load_get_lib(lua_State *state)
	{
		const zcp_lib_info_t *info = &zcp_get_prop_info;

		/* Wrap the C handler in a closure with no upvalues. */
		lua_pushcclosure(state, info->func, 0);
		lua_setfield(state, -2, info->name);

		return (1);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 332525)
	@@ -1,6989 +1,7017 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
	* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
	* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Toomas Soome <tsoome@me.com>
	* Copyright 2017 RackTop Systems.
	* Copyright (c) 2017 Datto Inc.
	* Copyright 2016 Toomas Soome <tsoome@me.com>
	*/

	/*
	* ZFS ioctls.
	*
	* This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
	* pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
	*
	* There are two ways that we handle ioctls: the legacy way where almost
	* all of the logic is in the ioctl callback, and the new way where most
	* of the marshalling is handled in the common entry point, zfsdev_ioctl().
	*
	* Non-legacy ioctls should be registered by calling
	* zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
	* from userland by lzc_ioctl().
	*
	* The registration arguments are as follows:
	*
	* const char *name
	* The name of the ioctl. This is used for history logging. If the
	* ioctl returns successfully (the callback returns 0), and allow_log
	* is true, then a history log entry will be recorded with the input &
	* output nvlists. The log entry can be printed with "zpool history -i".
	*
	* zfs_ioc_t ioc
	* The ioctl request number, which userland will pass to ioctl(2).
	* The ioctl numbers can change from release to release, because
	* the caller (libzfs) must be matched to the kernel.
	*
	* zfs_secpolicy_func_t *secpolicy
	* This function will be called before the zfs_ioc_func_t, to
	* determine if this operation is permitted. It should return EPERM
	* on failure, and 0 on success. Checks include determining if the
	* dataset is visible in this zone, and if the user has either all
	* zfs privileges in the zone (SYS_MOUNT), or has been granted permission
	* to do this operation on this dataset with "zfs allow".
	*
	* zfs_ioc_namecheck_t namecheck
	* This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
	* name, a dataset name, or nothing. If the name is not well-formed,
	* the ioctl will fail and the callback will not be called.
	* Therefore, the callback can assume that the name is well-formed
	* (e.g. is null-terminated, doesn't have more than one '@' character,
	* doesn't have invalid characters).
	*
	* zfs_ioc_poolcheck_t pool_check
	* This specifies requirements on the pool state. If the pool does
	* not meet them (is suspended or is readonly), the ioctl will fail
	* and the callback will not be called. If any checks are specified
	* (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
	* Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED \|
	* POOL_CHECK_READONLY).
	*
	* boolean_t smush_outnvlist
	* If smush_outnvlist is true, then the output is presumed to be a
	* list of errors, and it will be "smushed" down to fit into the
	* caller's buffer, by removing some entries and replacing them with a
	* single "N_MORE_ERRORS" entry indicating how many were removed. See
	* nvlist_smush() for details. If smush_outnvlist is false, and the
	* outnvlist does not fit into the userland-provided buffer, then the
	* ioctl will fail with ENOMEM.
	*
	* zfs_ioc_func_t *func
	* The callback function that will perform the operation.
	*
	* The callback should return 0 on success, or an error number on
	* failure. If the function fails, the userland ioctl will return -1,
	* and errno will be set to the callback's return value. The callback
	* will be called with the following arguments:
	*
	* const char *name
	* The name of the pool or dataset to operate on, from
	* zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
	* expected type (pool, dataset, or none).
	*
	* nvlist_t *innvl
	* The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
	* NULL if no input nvlist was provided. Changes to this nvlist are
	* ignored. If the input nvlist could not be deserialized, the
	* ioctl will fail and the callback will not be called.
	*
	* nvlist_t *outnvl
	* The output nvlist, initially empty. The callback can fill it in,
	* and it will be returned to userland by serializing it into
	* zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
	* fails (e.g. because the caller didn't supply a large enough
	* buffer), then the overall ioctl will fail. See the
	* 'smush_outnvlist' argument above for additional behaviors.
	*
	* There are two typical uses of the output nvlist:
	* - To return state, e.g. property values. In this case,
	* smush_outnvlist should be false. If the buffer was not large
	* enough, the caller will reallocate a larger buffer and try
	* the ioctl again.
	*
	* - To return multiple errors from an ioctl which makes on-disk
	* changes. In this case, smush_outnvlist should be true.
	* Ioctls which make on-disk modifications should generally not
	* use the outnvl if they succeed, because the caller can not
	* distinguish between the operation failing, and
	* deserialization failing.
	*/
	#ifdef __FreeBSD__
	#include "opt_kstack_pages.h"
	#endif

	#include <sys/types.h>
	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/conf.h>
	#include <sys/kernel.h>
	#include <sys/lock.h>
	#include <sys/malloc.h>
	#include <sys/mutex.h>
	#include <sys/proc.h>
	#include <sys/errno.h>
	#include <sys/uio.h>
	#include <sys/buf.h>
	#include <sys/file.h>
	#include <sys/kmem.h>
	#include <sys/conf.h>
	#include <sys/cmn_err.h>
	#include <sys/stat.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/zfs_vfsops.h>
	#include <sys/zfs_znode.h>
	#include <sys/zap.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/vdev.h>
	#include <sys/dmu.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_deleg.h>
	#include <sys/dmu_objset.h>
	#include <sys/dmu_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/sunddi.h>
	#include <sys/policy.h>
	#include <sys/zone.h>
	#include <sys/nvpair.h>
	#include <sys/mount.h>
	#include <sys/taskqueue.h>
	#include <sys/sdt.h>
	#include <sys/varargs.h>
	#include <sys/fs/zfs.h>
	#include <sys/zfs_ctldir.h>
	#include <sys/zfs_dir.h>
	#include <sys/zfs_onexit.h>
	#include <sys/zvol.h>
	#include <sys/dsl_scan.h>
	#include <sys/dmu_objset.h>
	#include <sys/dmu_send.h>
	#include <sys/dsl_destroy.h>
	#include <sys/dsl_bookmark.h>
	#include <sys/dsl_userhold.h>
	#include <sys/zfeature.h>
	#include <sys/zcp.h>
	#include <sys/zio_checksum.h>
	+#include <sys/vdev_removal.h>

	#include "zfs_namecheck.h"
	#include "zfs_prop.h"
	#include "zfs_deleg.h"
	#include "zfs_comutil.h"
	#include "zfs_ioctl_compat.h"

	#include "lua.h"
	#include "lauxlib.h"

	/*
	 * Character device handle; presumably the /dev/zfs node described in the
	 * file header -- confirm where it is created.
	 */
	static struct cdev *zfsdev;

	/* Module-wide init/teardown hooks, defined outside this file. */
	extern void zfs_init(void);
	extern void zfs_fini(void);

	/*
	 * NOTE(review): the names suggest these are thread-specific-data keys;
	 * verify against the code that creates and destroys them.
	 */
	uint_t zfs_fsyncer_key;
	extern uint_t rrw_tsd_key;
	static uint_t zfs_allow_log_key;
	extern uint_t zfs_geom_probe_vdev_key;

	typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
	typedef int zfs_ioc_func_t(const char , nvlist_t , nvlist_t *);
	typedef int zfs_secpolicy_func_t(zfs_cmd_t , nvlist_t , cred_t *);

	/* What kind of name, if any, an ioctl expects in zfs_cmd_t:zc_name. */
	typedef enum {
	NO_NAME,
	POOL_NAME,
	DATASET_NAME
	} zfs_ioc_namecheck_t;

	/*
	 * Requirements on the pool state, expressed as bit flags so that several
	 * checks can be or-ed together.
	 */
	typedef enum {
	POOL_CHECK_NONE = 1 << 0,
	POOL_CHECK_SUSPENDED = 1 << 1,
	POOL_CHECK_READONLY = 1 << 2,
	} zfs_ioc_poolcheck_t;

	/*
	 * One entry of the ioctl dispatch table. The fields hold the registration
	 * arguments described in the comment at the top of this file. An entry
	 * has slots for both a legacy handler and a new-style handler.
	 */
	typedef struct zfs_ioc_vec {
	zfs_ioc_legacy_func_t *zvec_legacy_func;
	zfs_ioc_func_t *zvec_func;
	zfs_secpolicy_func_t *zvec_secpolicy;
	zfs_ioc_namecheck_t zvec_namecheck;
	boolean_t zvec_allow_log;
	zfs_ioc_poolcheck_t zvec_pool_check;
	boolean_t zvec_smush_outnvlist;
	const char *zvec_name;
	} zfs_ioc_vec_t;

	/*
	 * Delegated-permission name for each userquota property.
	 * This array is indexed by zfs_userquota_prop_t, so the order of the
	 * entries must follow that enum.
	 */
	static const char *userquota_perms[] = {
	ZFS_DELEG_PERM_USERUSED,
	ZFS_DELEG_PERM_USERQUOTA,
	ZFS_DELEG_PERM_GROUPUSED,
	ZFS_DELEG_PERM_GROUPQUOTA,
	};

	/* Forward declarations for functions defined later in this file. */
	static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
	static int zfs_check_settable(const char *name, nvpair_t *property,
	    cred_t *cr);
	static int zfs_check_clearable(char *dataset, nvlist_t *props,
	    nvlist_t **errors);
	static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
	    boolean_t *);
	int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
	static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);

	static void zfsdev_close(void *data);

	static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);

	/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
	/*
	 * Format a debug message and hand it to the zfs-dprintf DTrace probe.
	 *
	 * file, func and line identify the call site; fmt and the following
	 * arguments are printf-style. The formatted message is truncated to fit
	 * a 512-byte buffer. Only the final path component of 'file' is passed
	 * to the probe.
	 */
	void
	__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
	{
		const char *newfile;
		char buf[512];
		va_list adx;

		/*
		 * Get rid of annoying "../common/" prefix to filename.
		 */
		newfile = strrchr(file, '/');
		if (newfile != NULL) {
			newfile = newfile + 1; /* Get rid of leading / */
		} else {
			newfile = file;
		}

		va_start(adx, fmt);
		(void) vsnprintf(buf, sizeof (buf), fmt, adx);
		va_end(adx);

		/*
		 * To get this data, use the zfs-dprintf probe as so:
		 * dtrace -q -n 'zfs-dprintf \
		 *	/stringof(arg0) == "dbuf.c"/ \
		 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
		 * arg0 = file name
		 * arg1 = function name
		 * arg2 = line number
		 * arg3 = message
		 */
		DTRACE_PROBE4(zfs__dprintf,
		    char *, newfile, char *, func, int, line, char *, buf);
	}

	/*
	 * Free a history record buffer. The buffer must have been allocated with
	 * size HIS_MAX_RECORD_LEN, as history_str_get() does.
	 */
	static void
	history_str_free(char *buf)
	{
	kmem_free(buf, HIS_MAX_RECORD_LEN);
	}

	/*
	 * Copy in the history string that zc->zc_history points at. Returns a
	 * NUL-terminated buffer of HIS_MAX_RECORD_LEN bytes that the caller
	 * releases with history_str_free(), or NULL when no history string was
	 * supplied or the copy-in failed.
	 */
	static char *
	history_str_get(zfs_cmd_t *zc)
	{
		char *record;
		int copy_err;

		if (zc->zc_history == 0)
			return (NULL);

		record = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
		copy_err = copyinstr((void *)(uintptr_t)zc->zc_history,
		    record, HIS_MAX_RECORD_LEN, NULL);
		if (copy_err != 0) {
			history_str_free(record);
			return (NULL);
		}

		/* Force termination of the buffer. */
		record[HIS_MAX_RECORD_LEN -1] = '\0';

		return (record);
	}

	/*
	 * Report whether the named dataset is the one its pool currently has
	 * recorded as bootfs. A dataset that cannot be held is not bootable.
	 */
	static boolean_t
	zfs_is_bootfs(const char *name)
	{
		objset_t *os;
		boolean_t is_bootfs;

		if (dmu_objset_hold(name, FTAG, &os) != 0)
			return (B_FALSE);

		is_bootfs = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
		dmu_objset_rele(os, FTAG);
		return (is_bootfs);
	}

	/*
	 * Return 1 if the named pool can be opened and its spa version is less
	 * than the requested version; return 0 otherwise, including when the
	 * pool cannot be opened.
	 */
	static int
	zfs_earlier_version(const char *name, int version)
	{
		spa_t *spa;
		int is_earlier;

		if (spa_open(name, &spa, FTAG) != 0)
			return (0);

		is_earlier = (spa_version(spa) < version) ? 1 : 0;
		spa_close(spa, FTAG);
		return (is_earlier);
	}

	/*
	 * Return B_TRUE if the ZPL version of the named dataset is less than the
	 * requested version. B_TRUE is also the answer whenever the version
	 * cannot be established: the objset cannot be held, it is not of type
	 * DMU_OST_ZFS, or the version property cannot be read.
	 */
	static boolean_t
	zpl_earlier_version(const char *name, int version)
	{
		objset_t *os;
		uint64_t zplversion;
		boolean_t is_earlier = B_TRUE;

		if (dmu_objset_hold(name, FTAG, &os) != 0)
			return (B_TRUE);

		/* XXX reading from non-owned objset */
		if (dmu_objset_type(os) == DMU_OST_ZFS &&
		    zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
			is_earlier = zplversion < version;

		dmu_objset_rele(os, FTAG);
		return (is_earlier);
	}

	/*
	 * Record the history string supplied with an ioctl, if there is one, in
	 * the history log of the pool named by zc->zc_name. Nothing is logged
	 * when the pool cannot be opened or its version predates
	 * SPA_VERSION_ZPOOL_HISTORY.
	 */
	static void
	zfs_log_history(zfs_cmd_t *zc)
	{
		spa_t *spa;
		char *record = history_str_get(zc);

		if (record == NULL)
			return;

		if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
			if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
				(void) spa_history_log(spa, record);
			spa_close(spa, FTAG);
		}

		history_str_free(record);
	}

	/*
	 * Policy for top-level read operations (list pools). Requires no
	 * privileges, and can be used in the local zone, as there is no
	 * associated dataset. Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (0);
	}

	/*
	 * Policy for dataset read operations (list children, get statistics).
	 * Requires no privileges, but must be visible in the local zone.
	 * Returns 0 when allowed and ENOENT when the dataset is not visible.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		if (INGLOBALZONE(curthread) ||
		    zone_dataset_visible(zc->zc_name, NULL))
			return (0);

		return (SET_ERROR(ENOENT));
	}

	/*
	 * Zone check shared by zfs_dozonecheck() and zfs_dozonecheck_ds().
	 *
	 * 'zoned' is the dataset's zoned setting as already looked up by the
	 * caller. Returns 0 when access is allowed, ENOENT when the dataset is
	 * not visible from the current zone, and EPERM otherwise.
	 */
	static int
	zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
	{
		int writable = 1;

		/*
		 * The dataset must be visible by this zone -- check this first
		 * so they don't see EPERM on something they shouldn't know about.
		 */
		if (!INGLOBALZONE(curthread) &&
		    !zone_dataset_visible(dataset, &writable))
			return (SET_ERROR(ENOENT));

		if (INGLOBALZONE(curthread)) {
			/*
			 * If the fs is zoned, only root can access it from the
			 * global zone.
			 */
			if (secpolicy_zfs(cr) && zoned)
				return (SET_ERROR(EPERM));
		} else {
			/*
			 * If we are in a local zone, the 'zoned' property must be
			 * set.
			 */
			if (!zoned)
				return (SET_ERROR(EPERM));

			/* must be writable by this zone */
			if (!writable)
				return (SET_ERROR(EPERM));
		}
		return (0);
	}

	/*
	 * Zone check for a dataset given by name. Looks up the "jailed"
	 * property and defers to zfs_dozonecheck_impl(); a failed property
	 * lookup is reported as ENOENT.
	 */
	static int
	zfs_dozonecheck(const char *dataset, cred_t *cr)
	{
		uint64_t zoned;

		if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
			return (SET_ERROR(ENOENT));

		return (zfs_dozonecheck_impl(dataset, zoned, cr));
	}

	/*
	 * Same as zfs_dozonecheck(), for a caller that already holds the
	 * dataset: the "jailed" property is read through 'ds' instead of by
	 * name. A failed property lookup is reported as ENOENT.
	 */
	static int
	zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
	{
		uint64_t zoned;

		if (dsl_prop_get_int_ds(ds, "jailed", &zoned))
			return (SET_ERROR(ENOENT));

		return (zfs_dozonecheck_impl(dataset, zoned, cr));
	}

	/*
	 * Check whether 'cr' may perform the operation named by 'perm' on a
	 * dataset that the caller already holds. The zone check must pass
	 * first; after that the caller needs either to pass secpolicy_zfs() or,
	 * failing that, to have been delegated 'perm' on the dataset.
	 * Returns 0 when allowed, otherwise an error number.
	 */
	static int
	zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
	    const char *perm, cred_t *cr)
	{
		int error;

		error = zfs_dozonecheck_ds(name, ds, cr);
		if (error == 0) {
			error = secpolicy_zfs(cr);
			if (error != 0)
				error = dsl_deleg_access_impl(ds, perm, cr);
		}
		return (error);
	}

	/*
	 * Check whether 'cr' may perform the operation named by 'perm' on the
	 * dataset 'name'. Holds the pool and the dataset for the duration of
	 * the check and defers to zfs_secpolicy_write_perms_ds().
	 * Returns 0 when allowed, otherwise an error number (including any
	 * error from taking the holds).
	 */
	static int
	zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
	{
		int error;
		dsl_dataset_t *ds;
		dsl_pool_t *dp;

		/*
		 * First do a quick check for root in the global zone, which
		 * is allowed to do all write_perms. This ensures that zfs_ioc_*
		 * will get to handle nonexistent datasets.
		 */
		if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
			return (0);

		error = dsl_pool_hold(name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, name, FTAG, &ds);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);

		dsl_dataset_rele(ds, FTAG);
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	#ifdef SECLABEL
	/*
	 * Policy for setting the security label property.
	 *
	 * 'name' is the dataset, 'strval' the requested label (a hex label
	 * string or ZFS_MLSLABEL_DEFAULT), and 'cr' the credential that the
	 * final privilege check is made against.
	 *
	 * Returns 0 for success, non-zero for access and other errors.
	 */
	static int
	zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
	{
		char ds_hexsl[MAXNAMELEN];
		bslabel_t ds_sl, new_sl;
		boolean_t new_default = FALSE;
		uint64_t zoned;
		int needed_priv = -1;
		int error;

		/* First get the existing dataset label. */
		error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
		    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
		if (error != 0)
			return (SET_ERROR(EPERM));

		if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
			new_default = TRUE;

		/* The label must be translatable */
		if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
			return (SET_ERROR(EINVAL));

		/*
		 * In a non-global zone, disallow attempts to set a label that
		 * doesn't match that of the zone; otherwise no other checks
		 * are needed.
		 */
		if (!INGLOBALZONE(curproc)) {
			if (new_default || !blequal(&new_sl, CR_SL(CRED())))
				return (SET_ERROR(EPERM));
			return (0);
		}

		/*
		 * For global-zone datasets (i.e., those whose zoned property is
		 * "off"), verify that the specified new label is valid for the
		 * global zone.
		 */
		if (dsl_prop_get_integer(name,
		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
			return (SET_ERROR(EPERM));
		if (!zoned) {
			if (zfs_check_global_label(name, strval) != 0)
				return (SET_ERROR(EPERM));
		}

		/*
		 * If the existing dataset label is nondefault, check if the
		 * dataset is mounted (label cannot be changed while mounted).
		 * Get the zfsvfs; if there isn't one, then the dataset isn't
		 * mounted (or isn't a dataset, doesn't exist, ...).
		 */
		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
			objset_t *os;
			static char *setsl_tag = "setsl_tag";

			/*
			 * Try to own the dataset; abort if there is any error,
			 * (e.g., already mounted, in use, or other error).
			 */
			error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
			    setsl_tag, &os);
			if (error != 0)
				return (SET_ERROR(EPERM));

			dmu_objset_disown(os, setsl_tag);

			if (new_default) {
				needed_priv = PRIV_FILE_DOWNGRADE_SL;
				goto out_check;
			}

			if (hexstr_to_label(strval, &new_sl) != 0)
				return (SET_ERROR(EPERM));

			/*
			 * Translate the existing label as well: the dominance
			 * comparison below needs both labels in binary form.
			 */
			if (hexstr_to_label(ds_hexsl, &ds_sl) != 0)
				return (SET_ERROR(EPERM));

			if (blstrictdom(&ds_sl, &new_sl))
				needed_priv = PRIV_FILE_DOWNGRADE_SL;
			else if (blstrictdom(&new_sl, &ds_sl))
				needed_priv = PRIV_FILE_UPGRADE_SL;
		} else {
			/* dataset currently has a default label */
			if (!new_default)
				needed_priv = PRIV_FILE_UPGRADE_SL;
		}

	out_check:
		if (needed_priv != -1)
			return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
		return (0);
	}
	#endif /* SECLABEL */

	/*
	 * Check whether 'cr' may set property 'prop' on dataset 'dsname' to the
	 * value carried by 'propval'. A few properties have extra rules that are
	 * applied first; every property then needs write permission for the
	 * permission named after the property itself.
	 * Returns 0 when allowed, otherwise an error number.
	 */
	static int
	zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
	    cred_t *cr)
	{
		/*
		 * Check permissions for special properties.
		 */
		switch (prop) {
		case ZFS_PROP_ZONED:
			/*
			 * Disallow setting of 'zoned' from within a local zone.
			 */
			if (!INGLOBALZONE(curthread))
				return (SET_ERROR(EPERM));
			break;

		case ZFS_PROP_QUOTA:
		case ZFS_PROP_FILESYSTEM_LIMIT:
		case ZFS_PROP_SNAPSHOT_LIMIT:
			if (!INGLOBALZONE(curthread)) {
				uint64_t zoned;
				char setpoint[ZFS_MAX_DATASET_NAME_LEN];
				/*
				 * Unprivileged users are allowed to modify the
				 * limit on things under (ie. contained by)
				 * the thing they own.
				 */
				if (dsl_prop_get_integer(dsname, "jailed", &zoned,
				    setpoint))
					return (SET_ERROR(EPERM));
				if (!zoned || strlen(dsname) <= strlen(setpoint))
					return (SET_ERROR(EPERM));
			}
			break;

		case ZFS_PROP_MLSLABEL:
	#ifdef SECLABEL
		{
			char *strval;

			if (!is_system_labeled())
				return (SET_ERROR(EPERM));

			if (nvpair_value_string(propval, &strval) == 0) {
				int err;

				err = zfs_set_slabel_policy(dsname, strval, CRED());
				if (err != 0)
					return (err);
			}
			break;
		}
	#else
			return (EOPNOTSUPP);
	#endif

		default:
			/* No special rule for other properties. */
			break;
		}

		return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
	}

	/*
	 * Policy for setting delegated permissions on zc->zc_name. Only the zone
	 * check is made here.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int error;

		error = zfs_dozonecheck(zc->zc_name, cr);
		if (error != 0)
			return (error);

		/*
		 * permission to set permissions will be evaluated later in
		 * dsl_deleg_can_allow()
		 */
		return (0);
	}

	/* Policy for rollback: requires the rollback permission on zc->zc_name. */
	/* ARGSUSED */
	static int
	zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_ROLLBACK, cr));
	}

	/*
	 * Policy for the send ioctl that identifies its snapshot by object
	 * number (zc->zc_sendobj). zc->zc_name must contain an '@'; it is
	 * overwritten with the name of the dataset found by object number, and
	 * the send permission is then checked against that dataset.
	 * Returns 0 when allowed, otherwise an error number.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int error;

		/*
		 * Generate the current snapshot name from the given objsetid, then
		 * use that name for the secpolicy/zone checks.
		 */
		if (strchr(zc->zc_name, '@') == NULL)
			return (SET_ERROR(EINVAL));
		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		dsl_dataset_name(ds, zc->zc_name);

		error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
		    ZFS_DELEG_PERM_SEND, cr);
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_rele(dp, FTAG);

		return (error);
	}

	/*
	 * Policy for the send ioctl that names its snapshot directly: requires
	 * the send permission on zc->zc_name.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_SEND, cr));
	}

	/*
	 * Delegated-permission check for sharing. zc->zc_value is looked up as
	 * a path; it must be on a "zfs" filesystem whose resource name equals
	 * zc->zc_name, otherwise EPERM. If so, the caller needs the delegated
	 * share permission on that dataset.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		vnode_t *vp;
		int error;

		if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
		    NO_FOLLOW, NULL, &vp)) != 0)
			return (error);

		/* Now make sure mntpnt and dataset are ZFS */

		if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
		    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
		    zc->zc_name) != 0)) {
			VN_RELE(vp);
			return (SET_ERROR(EPERM));
		}

		VN_RELE(vp);
		return (dsl_deleg_access(zc->zc_name,
		    ZFS_DELEG_PERM_SHARE, cr));
	}

	/*
	 * Security policy for sharing.
	 *
	 * Callers outside the global zone are rejected with EPERM.  A caller
	 * that passes secpolicy_nfs() is allowed outright; otherwise the
	 * decision is left to zfs_secpolicy_deleg_share().
	 */
	int
	zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		if (!INGLOBALZONE(curthread))
			return (SET_ERROR(EPERM));

		if (secpolicy_nfs(cr) == 0)
			return (0);

		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
	}

	/*
	 * Security policy for SMB ACL operations.
	 *
	 * Callers outside the global zone are rejected with EPERM.  A caller
	 * that passes secpolicy_smb() is allowed outright; otherwise the
	 * decision is left to zfs_secpolicy_deleg_share().
	 */
	int
	zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		if (!INGLOBALZONE(curthread))
			return (SET_ERROR(EPERM));

		if (secpolicy_smb(cr) == 0)
			return (0);

		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
	}

	/*
	 * Compute the parent of 'datasetname' into 'parent' (a buffer of
	 * 'parentsize' bytes) by removing the trailing "@bla" or "/bla".
	 *
	 * Returns 0 on success, EINVAL if the buffer has no room at all, or
	 * ENOENT if the name contains neither '@' nor '/'.  On ENOENT 'parent'
	 * holds an unmodified (possibly truncated) copy of the name.
	 */
	static int
	zfs_get_parent(const char *datasetname, char *parent, int parentsize)
	{
		char *cp;

		if (parentsize <= 0)
			return (SET_ERROR(EINVAL));

		/*
		 * strncpy() leaves the destination unterminated when the source
		 * fills it, so copy at most parentsize - 1 bytes and terminate
		 * explicitly; strrchr() below requires a valid C string.
		 */
		(void) strncpy(parent, datasetname, parentsize - 1);
		parent[parentsize - 1] = '\0';

		/*
		 * Remove the @bla or /bla from the end of the name to get the
		 * parent.
		 */
		cp = strrchr(parent, '@');
		if (cp == NULL)
			cp = strrchr(parent, '/');
		if (cp == NULL)
			return (SET_ERROR(ENOENT));
		cp[0] = '\0';

		return (0);
	}

	/*
	 * Check whether 'cr' may destroy the dataset 'name'.  Both the
	 * ZFS_DELEG_PERM_MOUNT and ZFS_DELEG_PERM_DESTROY write-permission
	 * checks must pass; the first failing check's error is returned.
	 */
	int
	zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
	{
		int error;

		if ((error = zfs_secpolicy_write_perms(name,
		    ZFS_DELEG_PERM_MOUNT, cr)) != 0)
			return (error);

		return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
	}

	/*
	 * Security policy for destroy: applies zfs_secpolicy_destroy_perms()
	 * to zc_name.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
	}

	/*
	 * Destroying snapshots with delegated permissions requires
	 * descendant mount and destroy permissions.
	 *
	 * Every name in the "snaps" nvlist of 'innvl' is checked with
	 * zfs_secpolicy_destroy_perms().  Names that fail with ENOENT are
	 * removed from the list; any other failure stops the walk and is
	 * returned.  Returns EINVAL if "snaps" is missing.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		nvlist_t *snaps;
		nvpair_t *pair, *nextpair;
		int error = 0;

		if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
			return (SET_ERROR(EINVAL));

		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
		    pair = nextpair) {
			/* Fetch the successor first: 'pair' may be removed. */
			nextpair = nvlist_next_nvpair(snaps, pair);
			error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
			if (error == ENOENT) {
				/*
				 * Ignore any snapshots that don't exist (we
				 * consider them "already destroyed").  Remove
				 * the name from the nvl here in case the
				 * snapshot is created between now and when we
				 * try to destroy it (in which case we don't
				 * want to destroy it since we haven't checked
				 * for permission).
				 */
				fnvlist_remove_nvpair(snaps, pair);
				error = 0;
			}
			if (error != 0)
				break;
		}

		return (error);
	}

	/*
	 * Check whether 'cr' may rename dataset 'from' to 'to'.
	 *
	 * Required: ZFS_DELEG_PERM_RENAME and ZFS_DELEG_PERM_MOUNT on 'from',
	 * plus ZFS_DELEG_PERM_CREATE and ZFS_DELEG_PERM_MOUNT on the parent of
	 * 'to' (as computed by zfs_get_parent()).  The checks run in that
	 * order and the first failure is returned.
	 */
	int
	zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
	{
		char parentname[ZFS_MAX_DATASET_NAME_LEN];
		int error;

		if ((error = zfs_secpolicy_write_perms(from,
		    ZFS_DELEG_PERM_RENAME, cr)) != 0)
			return (error);

		if ((error = zfs_secpolicy_write_perms(from,
		    ZFS_DELEG_PERM_MOUNT, cr)) != 0)
			return (error);

		if ((error = zfs_get_parent(to, parentname,
		    sizeof (parentname))) != 0)
			return (error);

		if ((error = zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_CREATE, cr)) != 0)
			return (error);

		return (zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_MOUNT, cr));
	}

	/*
	 * Security policy for rename (zc_name -> zc_value).
	 *
	 * When bit 0 of zc_cookie is set the rename is recursive; zc_name must
	 * then contain '@' (EINVAL otherwise) and the permission check is made
	 * against the part of zc_name before the '@'.  zc_name is restored
	 * before returning.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		char *at = NULL;
		int error;

		if ((zc->zc_cookie & 1) != 0) {
			/*
			 * This is recursive rename, so the starting snapshot
			 * might not exist.  Check file system or volume
			 * permission instead.
			 */
			at = strchr(zc->zc_name, '@');
			if (at == NULL)
				return (SET_ERROR(EINVAL));
			/* Temporarily truncate zc_name at the '@'. */
			*at = '\0';
		}

		error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);

		if (at != NULL)
			*at = '@';

		return (error);
	}

	/*
	 * Security policy for promote.
	 *
	 * Requires ZFS_DELEG_PERM_PROMOTE on zc_name, ZFS_DELEG_PERM_MOUNT on
	 * the clone itself, and ZFS_DELEG_PERM_PROMOTE on the clone's origin
	 * dataset (looked up through the clone's dsl_dir dd_origin_obj).
	 * Returns 0 if all checks pass, otherwise the first error encountered.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *clone;
		int error;

		error = zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_PROMOTE, cr);
		if (error != 0)
			return (error);

		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);

		if (error == 0) {
			char parentname[ZFS_MAX_DATASET_NAME_LEN];
			dsl_dataset_t *origin = NULL;
			dsl_dir_t *dd;
			dd = clone->ds_dir;

			error = dsl_dataset_hold_obj(dd->dd_pool,
			    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(clone, FTAG);
				dsl_pool_rele(dp, FTAG);
				return (error);
			}

			error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
			    ZFS_DELEG_PERM_MOUNT, cr);

			dsl_dataset_name(origin, parentname);
			if (error == 0) {
				error = zfs_secpolicy_write_perms_ds(parentname,
				    origin, ZFS_DELEG_PERM_PROMOTE, cr);
			}
			dsl_dataset_rele(clone, FTAG);
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/*
	 * Security policy for receive: the caller must pass the
	 * ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_PERM_MOUNT and
	 * ZFS_DELEG_PERM_CREATE write-permission checks on zc_name, in that
	 * order.  The first failing check's error is returned.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int error;

		if ((error = zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
			return (error);

		if ((error = zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_MOUNT, cr)) != 0)
			return (error);

		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_CREATE, cr));
	}

	/*
	 * Check whether 'cr' passes the ZFS_DELEG_PERM_SNAPSHOT
	 * write-permission check on dataset 'name'.
	 */
	int
	zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
	{
		return (zfs_secpolicy_write_perms(name,
		    ZFS_DELEG_PERM_SNAPSHOT, cr));
	}

	/*
	 * Check for permission to create each snapshot in the nvlist.
	 *
	 * Each name in the "snaps" nvlist of 'innvl' must contain '@'; the part
	 * before the '@' is checked with zfs_secpolicy_snapshot_perms().
	 * Returns EINVAL if "snaps" is missing or a name has no '@', the first
	 * permission error otherwise, and 0 if every entry passes (including
	 * the case of an empty list).
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		nvlist_t *snaps;
		/* Must start at 0: an empty "snaps" list never assigns it. */
		int error = 0;
		nvpair_t *pair;

		if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
			return (SET_ERROR(EINVAL));

		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(snaps, pair)) {
			char *name = nvpair_name(pair);
			char *atp = strchr(name, '@');

			if (atp == NULL) {
				error = SET_ERROR(EINVAL);
				break;
			}
			/* Temporarily cut the name at '@' for the check. */
			*atp = '\0';
			error = zfs_secpolicy_snapshot_perms(name, cr);
			*atp = '@';
			if (error != 0)
				break;
		}
		return (error);
	}

	/*
	 * Check for permission to create each bookmark in the nvlist.
	 *
	 * Each pair name in 'innvl' must contain '#'; the part before the '#'
	 * is checked for ZFS_DELEG_PERM_BOOKMARK.  Returns EINVAL if a name has
	 * no '#', the first permission error otherwise, and 0 if every entry
	 * passes.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int error = 0;

		for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
			char *name = nvpair_name(pair);
			char *hashp = strchr(name, '#');

			if (hashp == NULL) {
				error = SET_ERROR(EINVAL);
				break;
			}
			/* Temporarily cut the name at '#' for the check. */
			*hashp = '\0';
			error = zfs_secpolicy_write_perms(name,
			    ZFS_DELEG_PERM_BOOKMARK, cr);
			*hashp = '#';
			if (error != 0)
				break;
		}
		return (error);
	}

	/*
	 * Security policy for remap (added in this revision): the caller must
	 * pass the ZFS_DELEG_PERM_REMAP write-permission check on zc_name.
	 * NOTE(review): the parameter list appears to have lost its pointer
	 * declarators in this rendering (zc is dereferenced with ->) -- confirm
	 * against the real source.
	 */
	/* ARGSUSED */
	static int
	+zfs_secpolicy_remap(zfs_cmd_t zc, nvlist_t innvl, cred_t *cr)
	+{
	+ return (zfs_secpolicy_write_perms(zc->zc_name,
	+ ZFS_DELEG_PERM_REMAP, cr));
	+}
	+
	/*
	 * Security policy for destroying bookmarks.
	 *
	 * Each pair name in 'innvl' must contain '#' (EINVAL otherwise); the
	 * part before the '#' is checked for ZFS_DELEG_PERM_DESTROY.  Entries
	 * whose check fails with ENOENT are removed from 'innvl'; any other
	 * failure stops the walk and is returned.
	 * NOTE(review): 'pair'/'nextpair' and the zc/innvl parameters are used
	 * as pointers; their declarators appear to have been lost in this
	 * rendering -- confirm against the real source.
	 */
	+/* ARGSUSED */
	+static int
	zfs_secpolicy_destroy_bookmarks(zfs_cmd_t zc, nvlist_t innvl, cred_t *cr)
	{
	nvpair_t pair, nextpair;
	int error = 0;

	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
	pair = nextpair) {
	char *name = nvpair_name(pair);
	char *hashp = strchr(name, '#');
	/* Fetch the successor first: 'pair' may be removed below. */
	nextpair = nvlist_next_nvpair(innvl, pair);

	if (hashp == NULL) {
	error = SET_ERROR(EINVAL);
	break;
	}

	/* Temporarily cut the name at '#' for the permission check. */
	*hashp = '\0';
	error = zfs_secpolicy_write_perms(name,
	ZFS_DELEG_PERM_DESTROY, cr);
	*hashp = '#';
	if (error == ENOENT) {
	/*
	* Ignore any filesystems that don't exist (we consider
	* their bookmarks "already destroyed"). Remove
	* the name from the nvl here in case the filesystem
	* is created between now and when we try to destroy
	* the bookmark (in which case we don't want to
	* destroy it since we haven't checked for permission).
	*/
	fnvlist_remove_nvpair(innvl, pair);
	error = 0;
	}
	if (error != 0)
	break;
	}

	return (error);
	}

	/*
	 * Security policy for logging history: allowed only when the calling
	 * thread has thread-specific data stored under zfs_allow_log_key.
	 * Returns EPERM otherwise.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		/*
		 * Even root must have a proper TSD so that we know what pool
		 * to log to.
		 */
		if (tsd_get(zfs_allow_log_key) == NULL)
			return (SET_ERROR(EPERM));
		return (0);
	}

	/*
	 * Security policy for create and clone.
	 *
	 * The parent of zc_name must pass the ZFS_DELEG_PERM_CREATE and
	 * ZFS_DELEG_PERM_MOUNT checks.  If 'innvl' carries an "origin" string,
	 * that dataset must additionally pass ZFS_DELEG_PERM_CLONE (checked
	 * first).  Returns the first error encountered, or 0.
	 */
	static int
	zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		char parentname[ZFS_MAX_DATASET_NAME_LEN];
		int error;
		char *origin;

		if ((error = zfs_get_parent(zc->zc_name, parentname,
		    sizeof (parentname))) != 0)
			return (error);

		if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
		    (error = zfs_secpolicy_write_perms(origin,
		    ZFS_DELEG_PERM_CLONE, cr)) != 0)
			return (error);

		if ((error = zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_CREATE, cr)) != 0)
			return (error);

		return (zfs_secpolicy_write_perms(parentname,
		    ZFS_DELEG_PERM_MOUNT, cr));
	}

	/*
	 * Policy for pool operations - create/destroy pools, add vdevs, etc.
	 * Requires SYS_CONFIG privilege, which is not available in a local
	 * zone.
	 *
	 * Returns 0 if secpolicy_sys_config() succeeds, EPERM otherwise.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		if (secpolicy_sys_config(cr, B_FALSE) != 0)
			return (SET_ERROR(EPERM));

		return (0);
	}

	/*
	 * Policy for object to name lookups.
	 *
	 * A caller that passes secpolicy_sys_config() is allowed outright;
	 * anyone else must pass the ZFS_DELEG_PERM_DIFF check on zc_name.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		if (secpolicy_sys_config(cr, B_FALSE) == 0)
			return (0);

		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_DIFF, cr));
	}

	/*
	 * Policy for fault injection.  Requires all privileges; the decision
	 * is delegated entirely to secpolicy_zinject().
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (secpolicy_zinject(cr));
	}

	/*
	 * Security policy for inheriting the property named in zc_value.
	 *
	 * A name that is not a native property must be a user property
	 * (EINVAL otherwise) and is checked against ZFS_DELEG_PERM_USERPROP;
	 * a native property is checked with zfs_secpolicy_setprop().
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);

		if (prop != ZPROP_INVAL) {
			return (zfs_secpolicy_setprop(zc->zc_name, prop,
			    NULL, cr));
		}

		if (!zfs_prop_user(zc->zc_value))
			return (SET_ERROR(EINVAL));

		return (zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_USERPROP, cr));
	}

	/*
	 * Security policy for querying one user/group space entry.
	 *
	 * The caller must first pass zfs_secpolicy_read().  zc_objset_type
	 * selects the userquota property and must be below
	 * ZFS_NUM_USERQUOTA_PROPS (EINVAL otherwise).  When zc_value is empty
	 * and the id in zc_guid is the caller's own uid (for the user
	 * properties) or a group the caller belongs to (for the others), access
	 * is granted; in all other cases the matching userquota_perms[]
	 * permission is required on zc_name.
	 */
	static int
	zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int err = zfs_secpolicy_read(zc, innvl, cr);
		if (err)
			return (err);

		/* Bounds check before indexing userquota_perms[] below. */
		if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
			return (SET_ERROR(EINVAL));

		if (zc->zc_value[0] == 0) {
			/*
			 * They are asking about a posix uid/gid.  If it's
			 * themself, allow it.
			 */
			if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
			    zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
				if (zc->zc_guid == crgetuid(cr))
					return (0);
			} else {
				if (groupmember(zc->zc_guid, cr))
					return (0);
			}
		}

		return (zfs_secpolicy_write_perms(zc->zc_name,
		    userquota_perms[zc->zc_objset_type], cr));
	}

	/*
	 * Security policy for querying many user/group space entries.
	 *
	 * The caller must pass zfs_secpolicy_read(); zc_objset_type must be
	 * below ZFS_NUM_USERQUOTA_PROPS (EINVAL otherwise); and the matching
	 * userquota_perms[] permission is required on zc_name.
	 */
	static int
	zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int err = zfs_secpolicy_read(zc, innvl, cr);
		if (err)
			return (err);

		/* Bounds check before indexing userquota_perms[] below. */
		if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
			return (SET_ERROR(EINVAL));

		return (zfs_secpolicy_write_perms(zc->zc_name,
		    userquota_perms[zc->zc_objset_type], cr));
	}

	/*
	 * Security policy for the userspace upgrade: treated the same as
	 * setting ZFS_PROP_VERSION on zc_name via zfs_secpolicy_setprop().
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
		    NULL, cr));
	}

	/*
	 * Security policy for placing holds.
	 *
	 * For each pair name in the "holds" nvlist of 'innvl', dmu_fsname()
	 * derives a filesystem name which must pass the ZFS_DELEG_PERM_HOLD
	 * check.  Returns EINVAL if "holds" is missing, the first error
	 * encountered otherwise, and 0 if every entry passes.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		nvpair_t *pair;
		nvlist_t *holds;
		int error;

		error = nvlist_lookup_nvlist(innvl, "holds", &holds);
		if (error != 0)
			return (SET_ERROR(EINVAL));

		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(holds, pair)) {
			char fsname[ZFS_MAX_DATASET_NAME_LEN];

			error = dmu_fsname(nvpair_name(pair), fsname);
			if (error != 0)
				return (error);
			error = zfs_secpolicy_write_perms(fsname,
			    ZFS_DELEG_PERM_HOLD, cr);
			if (error != 0)
				return (error);
		}
		return (0);
	}

	/*
	 * Security policy for releasing holds.
	 *
	 * For each pair name in 'innvl', dmu_fsname() derives a filesystem
	 * name which must pass the ZFS_DELEG_PERM_RELEASE check.  Returns the
	 * first error encountered, or 0 if every entry passes.
	 */
	/* ARGSUSED */
	static int
	zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		nvpair_t *pair;
		int error;

		for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(innvl, pair)) {
			char fsname[ZFS_MAX_DATASET_NAME_LEN];

			error = dmu_fsname(nvpair_name(pair), fsname);
			if (error != 0)
				return (error);
			error = zfs_secpolicy_write_perms(fsname,
			    ZFS_DELEG_PERM_RELEASE, cr);
			if (error != 0)
				return (error);
		}
		return (0);
	}

	/*
	 * Policy for allowing temporary snapshots to be taken or released.
	 *
	 * A temporary snapshot is the same as a snapshot, hold, destroy and
	 * release all rolled into one.  Delegated diff alone is sufficient
	 * that we allow this; otherwise the snapshot, hold, release and
	 * destroy policies must all pass, in that order.
	 */
	static int
	zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
	{
		int error;

		if ((error = zfs_secpolicy_write_perms(zc->zc_name,
		    ZFS_DELEG_PERM_DIFF, cr)) == 0)
			return (0);

		error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
		if (error == 0)
			error = zfs_secpolicy_hold(zc, innvl, cr);
		if (error == 0)
			error = zfs_secpolicy_release(zc, innvl, cr);
		if (error == 0)
			error = zfs_secpolicy_destroy(zc, innvl, cr);
		return (error);
	}

	/*
	 * Returns the nvlist as specified by the user in the zfs_cmd_t.
	 *
	 * 'nvl' is the address and 'size' the length of the packed nvlist;
	 * 'iflag' is passed through to ddi_copyin().  On success the unpacked
	 * list is stored in *nvp and 0 is returned.  Returns EINVAL for a zero
	 * size, EFAULT if the copy-in fails, or the nvlist_unpack() error;
	 * *nvp is left untouched on failure.
	 */
	static int
	get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
	{
		nvlist_t *unpacked = NULL;
		char *buf;
		int rc;

		if (size == 0)
			return (SET_ERROR(EINVAL));

		/* Read in and unpack the user-supplied nvlist. */
		buf = kmem_alloc(size, KM_SLEEP);

		if (ddi_copyin((void *)(uintptr_t)nvl, buf, size, iflag) != 0)
			rc = SET_ERROR(EFAULT);
		else
			rc = nvlist_unpack(buf, size, &unpacked, 0);

		/* The packed buffer is not needed past this point. */
		kmem_free(buf, size);

		if (rc != 0)
			return (rc);

		*nvp = unpacked;
		return (0);
	}

	/*
	 * Reduce the size of this nvlist until it can be serialized in 'max'
	 * bytes.  Entries will be removed from the end of the nvlist, and one
	 * int32 entry named "N_MORE_ERRORS" will be added indicating how many
	 * entries were removed.
	 *
	 * Returns 0 on success (including when nothing had to be removed) and
	 * ENOMEM when the list is too large and 'max' is below 1024.
	 */
	static int
	nvlist_smush(nvlist_t *errors, size_t max)
	{
		nvpair_t *counter;
		int removed = 0;

		/* Already small enough: nothing to do. */
		if (fnvlist_size(errors) <= max)
			return (0);

		if (max < 1024)
			return (SET_ERROR(ENOMEM));

		/*
		 * Add the counter entry up front, with a placeholder value, so
		 * that the size checks below account for it.
		 */
		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
		counter = nvlist_prev_nvpair(errors, NULL);

		/* Drop the entry preceding the counter until the list fits. */
		do {
			fnvlist_remove_nvpair(errors,
			    nvlist_prev_nvpair(errors, counter));
			removed++;
		} while (fnvlist_size(errors) > max);

		/* Replace the placeholder with the real count. */
		fnvlist_remove_nvpair(errors, counter);
		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, removed);
		ASSERT3U(fnvlist_size(errors), <=, max);

		return (0);
	}

	/*
	 * Pack 'nvl' and copy it out to the user buffer described by
	 * zc_nvlist_dst / zc_nvlist_dst_size.
	 *
	 * If the buffer is too small nothing is copied and 0 is returned (see
	 * below).  In every case zc_nvlist_dst_size is set to the packed size
	 * and zc_nvlist_dst_filled is set to B_TRUE.  Returns EFAULT if the
	 * copy-out fails.
	 */
	static int
	put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
	{
		char *packed = NULL;
		int error = 0;
		size_t size;

		size = fnvlist_size(nvl);

		if (size > zc->zc_nvlist_dst_size) {
			/*
			 * Solaris returns ENOMEM here, because even if an error
			 * is returned from an ioctl(2), new zc_nvlist_dst_size
			 * will be passed to the userland.  This is not the case
			 * for FreeBSD.  We need to return 0, so the kernel will
			 * copy the zc_nvlist_dst_size back and the userland can
			 * discover that a bigger buffer is needed.
			 */
			error = 0;
		} else {
			packed = fnvlist_pack(nvl, &size);
			if (ddi_copyout(packed,
			    (void *)(uintptr_t)zc->zc_nvlist_dst,
			    size, zc->zc_iflags) != 0)
				error = SET_ERROR(EFAULT);
			fnvlist_pack_free(packed, size);
		}

		zc->zc_nvlist_dst_size = size;
		zc->zc_nvlist_dst_filled = B_TRUE;
		return (error);
	}

	/*
	 * Look up the vfs_t associated with objset 'os'.
	 *
	 * Under os_user_ptr_lock the objset's user pointer is read; if it is
	 * set, its z_vfs is stored in *vfsp and a reference is taken on it
	 * with vfs_ref().  Returns EINVAL if the objset is not of type
	 * DMU_OST_ZFS and ESRCH if no user pointer is registered.
	 */
	int
	getzfsvfs_impl(objset_t *os, vfs_t **vfsp)
	{
		zfsvfs_t *zfvp;
		int error = 0;

		if (dmu_objset_type(os) != DMU_OST_ZFS) {
			return (SET_ERROR(EINVAL));
		}

		mutex_enter(&os->os_user_ptr_lock);
		zfvp = dmu_objset_get_user(os);
		if (zfvp) {
			*vfsp = zfvp->z_vfs;
			vfs_ref(zfvp->z_vfs);
		} else {
			error = SET_ERROR(ESRCH);
		}
		mutex_exit(&os->os_user_ptr_lock);
		return (error);
	}

	/*
	 * Find the zfsvfs_t of dataset 'dsname'.
	 *
	 * The objset is held just long enough for getzfsvfs_impl() to take a
	 * reference on its vfs.  The vfs is then busied with vfs_busy() and the
	 * reference dropped.  On success *zfvp is set to the vfs_data of the
	 * busied vfs; if vfs_busy() fails *zfvp is set to NULL and ESRCH is
	 * returned.  Errors from the hold or lookup are returned as-is.
	 */
	int
	getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
	{
		objset_t *os;
		vfs_t *vfsp;
		int error;

		error = dmu_objset_hold(dsname, FTAG, &os);
		if (error != 0)
			return (error);
		error = getzfsvfs_impl(os, &vfsp);
		dmu_objset_rele(os, FTAG);
		if (error != 0)
			return (error);

		error = vfs_busy(vfsp, 0);
		vfs_rel(vfsp);
		if (error != 0) {
			*zfvp = NULL;
			error = SET_ERROR(ESRCH);
		} else {
			*zfvp = vfsp->vfs_data;
		}
		return (error);
	}

	/*
	 * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
	 * case its z_vfs will be NULL, and it will be opened as the owner.
	 * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
	 * which prevents all vnode ops from running.
	 *
	 * 'tag' identifies the holder of z_teardown_lock and must be passed to
	 * the matching zfsvfs_rele().  Returns 0 on success or the error from
	 * zfsvfs_create() (and, on illumos, EBUSY if the filesystem is being
	 * unmounted).
	 */
	static int
	zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
	{
		int error = 0;

		if (getzfsvfs(name, zfvp) != 0)
			error = zfsvfs_create(name, zfvp);
		if (error == 0) {
			rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
			    RW_READER, tag);
	#ifdef illumos
			if ((*zfvp)->z_unmounted) {
				/*
				 * XXX we could probably try again, since the
				 * unmounting thread should be just about to
				 * disassociate the objset from the zfsvfs.
				 */
				rrm_exit(&(*zfvp)->z_teardown_lock, tag);
				return (SET_ERROR(EBUSY));
			}
	#else
			/*
			 * vfs_busy() ensures that the filesystem is not and
			 * can not be unmounted.
			 */
			ASSERT(!(*zfvp)->z_unmounted);
	#endif
		}
		return (error);
	}

	/*
	 * Release a zfsvfs_t obtained from zfsvfs_hold() with the same 'tag'.
	 *
	 * z_teardown_lock is dropped first.  If the zfsvfs has a z_vfs, the
	 * vfs is released (VFS_RELE on illumos, vfs_unbusy otherwise); if not,
	 * the objset is disowned and the zfsvfs freed.
	 */
	static void
	zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
	{
		rrm_exit(&zfsvfs->z_teardown_lock, tag);

		if (zfsvfs->z_vfs) {
	#ifdef illumos
			VFS_RELE(zfsvfs->z_vfs);
	#else
			vfs_unbusy(zfsvfs->z_vfs);
	#endif
		} else {
			dmu_objset_disown(zfsvfs->z_os, zfsvfs);
			zfsvfs_free(zfsvfs);
		}
	}

	/*
	 * Create the pool named zc_name.
	 *
	 * inputs:
	 * zc_name		name of the pool
	 * zc_nvlist_conf	packed vdev configuration nvlist
	 * zc_nvlist_src	optional packed pool-property nvlist (used only
	 *			when zc_nvlist_src_size is non-zero)
	 *
	 * If the properties contain ZPOOL_ROOTFS_PROPS, that nested list is
	 * duplicated, removed from the pool properties, and applied to the
	 * root dataset after spa_create(); if applying it fails the new pool
	 * is destroyed again.
	 */
	static int
	zfs_ioc_pool_create(zfs_cmd_t *zc)
	{
		int error;
		nvlist_t *config, *props = NULL;
		nvlist_t *rootprops = NULL;
		nvlist_t *zplprops = NULL;

		if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &config)) != 0)
			return (error);

		if (zc->zc_nvlist_src_size != 0 && (error =
		    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props))) {
			nvlist_free(config);
			return (error);
		}

		if (props) {
			nvlist_t *nvl = NULL;
			uint64_t version = SPA_VERSION;

			/* 'version' keeps its default if the lookup fails. */
			(void) nvlist_lookup_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
			if (!SPA_VERSION_IS_SUPPORTED(version)) {
				error = SET_ERROR(EINVAL);
				goto pool_props_bad;
			}
			(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
			if (nvl) {
				error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
				if (error != 0) {
					nvlist_free(config);
					nvlist_free(props);
					return (error);
				}
				(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
			}
			VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
			error = zfs_fill_zplprops_root(version, rootprops,
			    zplprops, NULL);
			if (error != 0)
				goto pool_props_bad;
		}

		error = spa_create(zc->zc_name, config, props, zplprops);

		/*
		 * Set the remaining root properties
		 */
		if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
		    ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
			(void) spa_destroy(zc->zc_name);

	pool_props_bad:
		nvlist_free(rootprops);
		nvlist_free(zplprops);
		nvlist_free(config);
		nvlist_free(props);

		return (error);
	}

	/*
	 * Destroy the pool named zc_name.  The request is logged to the pool
	 * history first; the pool's zvol minors are removed only when
	 * spa_destroy() succeeds.  Returns the spa_destroy() result.
	 */
	static int
	zfs_ioc_pool_destroy(zfs_cmd_t *zc)
	{
		int rc;

		zfs_log_history(zc);

		rc = spa_destroy(zc->zc_name);
		if (rc != 0)
			return (rc);

		zvol_remove_minors(zc->zc_name);
		return (0);
	}

	/*
	 * Import the pool described by the packed config in zc_nvlist_conf.
	 *
	 * inputs:
	 * zc_name		name to import the pool as
	 * zc_guid		expected pool guid; must match the config
	 * zc_nvlist_conf	packed pool configuration
	 * zc_nvlist_src	optional packed property nvlist
	 * zc_cookie		flags passed to spa_import()
	 *
	 * outputs:
	 * zc_nvlist_dst	the config, when a destination buffer is given
	 *
	 * Returns EINVAL when the config has no pool guid or it differs from
	 * zc_guid.  A put_nvlist() failure overrides the import result.
	 */
	static int
	zfs_ioc_pool_import(zfs_cmd_t *zc)
	{
		nvlist_t *config, *props = NULL;
		uint64_t guid;
		int error;

		if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &config)) != 0)
			return (error);

		if (zc->zc_nvlist_src_size != 0 && (error =
		    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props))) {
			nvlist_free(config);
			return (error);
		}

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != zc->zc_guid)
			error = SET_ERROR(EINVAL);
		else
			error = spa_import(zc->zc_name, config, props, zc->zc_cookie);

		if (zc->zc_nvlist_dst != 0) {
			int err;

			if ((err = put_nvlist(zc, config)) != 0)
				error = err;
		}

		nvlist_free(config);

		nvlist_free(props);

		return (error);
	}

	/*
	 * Export the pool named zc_name.
	 *
	 * zc_cookie is used as the 'force' flag and zc_guid as the 'hardforce'
	 * flag for spa_export().  The request is logged to the pool history
	 * first; zvol minors are removed only when the export succeeds.
	 */
	static int
	zfs_ioc_pool_export(zfs_cmd_t *zc)
	{
		boolean_t hardforce = (boolean_t)zc->zc_guid;
		boolean_t force = (boolean_t)zc->zc_cookie;
		int rc;

		zfs_log_history(zc);

		rc = spa_export(zc->zc_name, NULL, force, hardforce);
		if (rc != 0)
			return (rc);

		zvol_remove_minors(zc->zc_name);
		return (0);
	}

	/*
	 * Copy out the nvlist returned by spa_all_configs().  zc_cookie is
	 * passed to spa_all_configs() by reference; when that call returns
	 * NULL, EEXIST is returned and nothing is copied out.
	 */
	static int
	zfs_ioc_pool_configs(zfs_cmd_t *zc)
	{
		nvlist_t *all;
		int rc;

		all = spa_all_configs(&zc->zc_cookie);
		if (all == NULL)
			return (SET_ERROR(EEXIST));

		rc = put_nvlist(zc, all);
		nvlist_free(all);

		return (rc);
	}

	/*
	 * inputs:
	 * zc_name		name of the pool
	 *
	 * outputs:
	 * zc_cookie		real errno
	 * zc_nvlist_dst	config nvlist
	 * zc_nvlist_dst_size	size of config nvlist
	 *
	 * zc_value is handed to spa_get_stats() as an additional output
	 * buffer.
	 */
	static int
	zfs_ioc_pool_stats(zfs_cmd_t *zc)
	{
		nvlist_t *config;
		int stats_err;
		int rc;

		stats_err = spa_get_stats(zc->zc_name, &config, zc->zc_value,
		    sizeof (zc->zc_value));

		/* Without a config there is nothing to copy out. */
		if (config == NULL)
			return (stats_err);

		rc = put_nvlist(zc, config);
		nvlist_free(config);

		/*
		 * The config may be present even if spa_get_stats() failed.
		 * In this case we return the put_nvlist() result, and preserve
		 * the real errno in 'zc_cookie'.
		 */
		zc->zc_cookie = stats_err;

		return (rc);
	}

	/*
	 * Try to import the given pool, returning pool stats as appropriate so
	 * that user land knows which devices are available and overall pool
	 * health.
	 *
	 * The packed config in zc_nvlist_conf is handed to spa_tryimport();
	 * the resulting config is copied out through put_nvlist().  Returns
	 * EINVAL when spa_tryimport() yields no config.
	 */
	static int
	zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
	{
		nvlist_t *tryconfig, *config;
		int error;

		if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &tryconfig)) != 0)
			return (error);

		config = spa_tryimport(tryconfig);

		nvlist_free(tryconfig);

		if (config == NULL)
			return (SET_ERROR(EINVAL));

		error = put_nvlist(zc, config);
		nvlist_free(config);

		return (error);
	}

	/*
	 * inputs:
	 * zc_name		name of the pool
	 * zc_cookie		scan func (pool_scan_func_t)
	 * zc_flags		scrub pause/resume flag (pool_scrub_cmd_t)
	 *
	 * POOL_SCRUB_PAUSE in zc_flags pauses the scrub; otherwise a zc_cookie
	 * of POOL_SCAN_NONE stops the scan and any other value starts it.
	 * Returns EINVAL if zc_flags is not below POOL_SCRUB_FLAGS_END.
	 */
	static int
	zfs_ioc_pool_scan(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int error;

		/*
		 * Validate the flags before opening the pool so that the
		 * error return cannot leak the spa reference.
		 */
		if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
			return (SET_ERROR(EINVAL));

		if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
			return (error);

		if (zc->zc_flags == POOL_SCRUB_PAUSE)
			error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
		else if (zc->zc_cookie == POOL_SCAN_NONE)
			error = spa_scan_stop(spa);
		else
			error = spa_scan(spa, zc->zc_cookie);

		spa_close(spa, FTAG);

		return (error);
	}

	/*
	 * Open the pool named zc_name, call spa_freeze() on it and close it
	 * again.  Returns the spa_open() error, or 0.
	 */
	static int
	zfs_ioc_pool_freeze(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		spa_freeze(spa);
		spa_close(spa, FTAG);
		return (0);
	}

	/*
	 * Upgrade the pool named zc_name to the version given in zc_cookie.
	 *
	 * Returns EINVAL if the requested version is lower than the pool's
	 * current version or is not a supported version; otherwise the pool is
	 * upgraded and 0 is returned.
	 */
	static int
	zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int error;

		if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
			return (error);

		if (zc->zc_cookie < spa_version(spa) ||
		    !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
			spa_close(spa, FTAG);
			return (SET_ERROR(EINVAL));
		}

		spa_upgrade(spa, zc->zc_cookie);
		spa_close(spa, FTAG);

		/* spa_open() succeeded, so 'error' is 0 here. */
		return (error);
	}

	/*
	 * Read pool history for the pool named zc_name.
	 *
	 * zc_history_len gives the buffer size on entry (EINVAL if zero);
	 * zc_history_offset and zc_history_len are passed by reference to
	 * spa_history_get(), and zc_history_len bytes are then copied out to
	 * the address in zc_history.  Returns ENOTSUP if the pool version is
	 * older than SPA_VERSION_ZPOOL_HISTORY.
	 */
	static int
	zfs_ioc_pool_get_history(zfs_cmd_t *zc)
	{
		uint64_t buflen = zc->zc_history_len;
		char *buf;
		spa_t *spa;
		int rc;

		if (buflen == 0)
			return (SET_ERROR(EINVAL));

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
			spa_close(spa, FTAG);
			return (SET_ERROR(ENOTSUP));
		}

		/*
		 * Allocate (and later free) with the size captured on entry:
		 * spa_history_get() is given zc_history_len by reference.
		 */
		buf = kmem_alloc(buflen, KM_SLEEP);
		rc = spa_history_get(spa, &zc->zc_history_offset,
		    &zc->zc_history_len, buf);
		if (rc == 0) {
			rc = ddi_copyout(buf,
			    (void *)(uintptr_t)zc->zc_history,
			    zc->zc_history_len, zc->zc_iflags);
		}

		spa_close(spa, FTAG);
		kmem_free(buf, buflen);
		return (rc);
	}

	/*
	 * Open the pool named zc_name and call spa_change_guid() on it.
	 * Returns the spa_open() error, or the spa_change_guid() result.
	 */
	static int
	zfs_ioc_pool_reguid(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = spa_change_guid(spa);
		spa_close(spa, FTAG);
		return (rc);
	}

	/*
	 * Thin wrapper around dsl_dsobj_to_dsname(): passes zc_name, zc_obj
	 * and the zc_value buffer through and returns its result.
	 */
	static int
	zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
	{
		int rc;

		rc = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value);
		return (rc);
	}

	/*
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_obj		object to find
	 *
	 * outputs:
	 * zc_value		name of object
	 *
	 * Returns EINVAL if the objset is not of type DMU_OST_ZFS.
	 */
	static int
	zfs_ioc_obj_to_path(zfs_cmd_t *zc)
	{
		objset_t *os;
		int rc;

		/* XXX reading from objset not owned */
		rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (rc != 0)
			return (rc);

		if (dmu_objset_type(os) == DMU_OST_ZFS) {
			rc = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
			    sizeof (zc->zc_value));
		} else {
			rc = SET_ERROR(EINVAL);
		}

		dmu_objset_rele(os, FTAG);
		return (rc);
	}

	/*
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_obj		object to find
	 *
	 * outputs:
	 * zc_stat		stats on object
	 * zc_value		path to object
	 *
	 * Returns EINVAL if the objset is not of type DMU_OST_ZFS.
	 */
	static int
	zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
	{
		objset_t *os;
		int rc;

		/* XXX reading from objset not owned */
		rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (rc != 0)
			return (rc);

		if (dmu_objset_type(os) == DMU_OST_ZFS) {
			rc = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat,
			    zc->zc_value, sizeof (zc->zc_value));
		} else {
			rc = SET_ERROR(EINVAL);
		}

		dmu_objset_rele(os, FTAG);
		return (rc);
	}

	/*
	 * Add the vdevs described by the packed config in zc_nvlist_conf to
	 * the pool named zc_name.
	 *
	 * Returns the spa_open() or get_nvlist() error if either fails,
	 * otherwise the spa_vdev_add() result (on illumos, EDOM for a root
	 * pool unless only l2cache/spare devices are being added).
	 */
	static int
	zfs_ioc_vdev_add(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int error;
		nvlist_t *config, **l2cache, **spares;
		uint_t nl2cache = 0, nspares = 0;

		error = spa_open(zc->zc_name, &spa, FTAG);
		if (error != 0)
			return (error);

		error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &config);
		if (error != 0) {
			/*
			 * 'config' is not set when get_nvlist() fails, so it
			 * must not be inspected or freed.
			 */
			spa_close(spa, FTAG);
			return (error);
		}

		(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
		    &l2cache, &nl2cache);

		(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
		    &spares, &nspares);

	#ifdef illumos
		/*
		 * A root pool with concatenated devices is not supported.
		 * Thus, can not add a device to a root pool.
		 *
		 * Intent log device can not be added to a rootpool because
		 * during mountroot, zil is replayed, a separated log device
		 * can not be accessed during the mountroot time.
		 *
		 * l2cache and spare devices are ok to be added to a rootpool.
		 */
		if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
			nvlist_free(config);
			spa_close(spa, FTAG);
			return (SET_ERROR(EDOM));
		}
	#endif /* illumos */

		error = spa_vdev_add(spa, config);
		nvlist_free(config);
		spa_close(spa, FTAG);
		return (error);
	}

	/*
	* inputs:
	* zc_name name of the pool
	- * zc_nvlist_conf nvlist of devices to remove
	- * zc_cookie to stop the remove?
	+ * zc_guid guid of vdev to remove
	+ * zc_cookie cancel removal
	*/
	static int
	zfs_ioc_vdev_remove(zfs_cmd_t *zc)
	{
	spa_t *spa;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
	return (error);
	/*
	* In the new revision a non-zero zc_cookie selects
	* spa_vdev_remove_cancel(); otherwise the vdev identified by
	* zc_guid is removed as before.
	*/
	- error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
	+ if (zc->zc_cookie != 0) {
	+ error = spa_vdev_remove_cancel(spa);
	+ } else {
	+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
	+ }
	spa_close(spa, FTAG);
	return (error);
	}

	/*
	 * Change the state of the vdev identified by zc_guid in pool zc_name.
	 *
	 * zc_cookie selects the requested state on input and receives the
	 * value of 'newstate' on output (set only by vdev_online(); otherwise
	 * VDEV_STATE_UNKNOWN).  zc_obj is passed to the vdev_* call; for the
	 * FAULTED and DEGRADED requests any value other than
	 * VDEV_AUX_ERR_EXCEEDED or VDEV_AUX_EXTERNAL is replaced by
	 * VDEV_AUX_ERR_EXCEEDED.  Unknown states yield EINVAL.
	 */
	static int
	zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
	{
		vdev_state_t newstate = VDEV_STATE_UNKNOWN;
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		switch (zc->zc_cookie) {
		case VDEV_STATE_ONLINE:
			rc = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
			break;

		case VDEV_STATE_OFFLINE:
			rc = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
			break;

		case VDEV_STATE_FAULTED:
		case VDEV_STATE_DEGRADED:
			/* Normalize the aux reason shared by both requests. */
			if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
			    zc->zc_obj != VDEV_AUX_EXTERNAL)
				zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

			if (zc->zc_cookie == VDEV_STATE_FAULTED)
				rc = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
			else
				rc = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
			break;

		default:
			rc = SET_ERROR(EINVAL);
			break;
		}

		zc->zc_cookie = newstate;
		spa_close(spa, FTAG);
		return (rc);
	}

	/*
	 * Attach the vdev described by the packed config in zc_nvlist_conf to
	 * the vdev identified by zc_guid in pool zc_name.  zc_cookie is passed
	 * to spa_vdev_attach() as its 'replacing' argument.
	 */
	static int
	zfs_ioc_vdev_attach(zfs_cmd_t *zc)
	{
		int replacing = zc->zc_cookie;
		nvlist_t *newdev;
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &newdev);
		if (rc == 0) {
			rc = spa_vdev_attach(spa, zc->zc_guid, newdev, replacing);
			nvlist_free(newdev);
		}

		spa_close(spa, FTAG);
		return (rc);
	}

	/*
	 * Detach the vdev identified by zc_guid from pool zc_name by calling
	 * spa_vdev_detach(spa, zc_guid, 0, B_FALSE).
	 */
	static int
	zfs_ioc_vdev_detach(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
		spa_close(spa, FTAG);

		return (rc);
	}

	/*
	 * Split a mirror off pool zc_name into a new pool named zc_string.
	 *
	 * zc_nvlist_conf carries the packed config and zc_nvlist_src an
	 * optional packed property list (used only when zc_nvlist_src_size is
	 * non-zero).  The ZPOOL_EXPORT_AFTER_SPLIT bit of zc_cookie is passed
	 * to spa_vdev_split_mirror() as its last argument.
	 */
	static int
	zfs_ioc_vdev_split(zfs_cmd_t *zc)
	{
		spa_t *spa;
		nvlist_t *config, *props = NULL;
		int error;
		boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);

		if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
			return (error);

		if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
		    zc->zc_iflags, &config)) != 0) {
			spa_close(spa, FTAG);
			return (error);
		}

		if (zc->zc_nvlist_src_size != 0 && (error =
		    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props))) {
			spa_close(spa, FTAG);
			nvlist_free(config);
			return (error);
		}

		error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);

		spa_close(spa, FTAG);

		nvlist_free(config);
		nvlist_free(props);

		return (error);
	}

	/*
	 * Set the path of the vdev identified by zc_guid in pool zc_name to
	 * the string in zc_value, via spa_vdev_setpath().
	 */
	static int
	zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
		spa_close(spa, FTAG);

		return (rc);
	}

	/*
	 * Set the FRU of the vdev identified by zc_guid in pool zc_name to the
	 * string in zc_value, via spa_vdev_setfru().
	 */
	static int
	zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = spa_vdev_setfru(spa, zc->zc_guid, zc->zc_value);
		spa_close(spa, FTAG);

		return (rc);
	}

	/*
	 * Fill in zc_objset_stats for 'os' and, when a destination buffer is
	 * supplied (zc_nvlist_dst != 0), copy out its property nvlist.
	 *
	 * For a consistent zvol objset the zvol statistics are added to the
	 * list as well.  Returns 0 on success, EIO if zvol_get_stats() reports
	 * it, or the error from dsl_prop_get_all() / put_nvlist().
	 */
	static int
	zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
	{
		int error = 0;
		nvlist_t *nv;

		dmu_objset_fast_stat(os, &zc->zc_objset_stats);

		if (zc->zc_nvlist_dst != 0 &&
		    (error = dsl_prop_get_all(os, &nv)) == 0) {
			dmu_objset_stats(os, nv);
			/*
			 * NB: zvol_get_stats() will read the objset contents,
			 * which we aren't supposed to do with a
			 * DS_MODE_USER hold, because it could be
			 * inconsistent.  So this is a bit of a workaround...
			 * XXX reading with out owning
			 */
			if (!zc->zc_objset_stats.dds_inconsistent &&
			    dmu_objset_type(os) == DMU_OST_ZVOL) {
				error = zvol_get_stats(os, nv);
				if (error == EIO) {
					/* Don't leak the property list. */
					nvlist_free(nv);
					return (error);
				}
				VERIFY0(error);
			}
			error = put_nvlist(zc, nv);
			nvlist_free(nv);
		}

		return (error);
	}

	/*
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_nvlist_dst_size	size of buffer for property nvlist
	 *
	 * outputs:
	 * zc_objset_stats	stats
	 * zc_nvlist_dst	property nvlist
	 * zc_nvlist_dst_size	size of property nvlist
	 *
	 * An ENOMEM result is reported to the caller as success.
	 */
	static int
	zfs_ioc_objset_stats(zfs_cmd_t *zc)
	{
		objset_t *os;
		int rc;

		rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (rc == 0) {
			rc = zfs_ioc_objset_stats_impl(zc, os);
			dmu_objset_rele(os, FTAG);
		}

		return (rc == ENOMEM ? 0 : rc);
	}

	/*
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_nvlist_dst_size	size of buffer for property nvlist
	 *
	 * outputs:
	 * zc_nvlist_dst	received property nvlist
	 * zc_nvlist_dst_size	size of received property nvlist
	 *
	 * Gets received properties (distinct from local properties on or after
	 * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate
	 * received from local property values.
	 */
	static int
	zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
	{
		nvlist_t *received;
		int rc;

		/*
		 * Without this check, we would return local property values if
		 * the caller has not already received properties on or after
		 * SPA_VERSION_RECVD_PROPS.
		 */
		if (!dsl_prop_get_hasrecvd(zc->zc_name))
			return (SET_ERROR(ENOTSUP));

		/* No destination buffer: nothing to look up or copy out. */
		if (zc->zc_nvlist_dst == 0)
			return (0);

		rc = dsl_prop_get_received(zc->zc_name, &received);
		if (rc != 0)
			return (rc);

		rc = put_nvlist(zc, received);
		nvlist_free(received);

		return (rc);
	}

	/*
	 * Look up the ZPL property 'prop' of objset 'os' and add it to 'props'
	 * as a uint64 keyed by the property's name.  Returns 0 on success or
	 * the error from zfs_get_zplprop().
	 */
	static int
	nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
	{
		uint64_t value;
		int error;

		/*
		 * zfs_get_zplprop() will either find a value or give us
		 * the default value (if there is one).
		 */
		if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
			return (error);
		VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
		return (0);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_nvlist_dst_size size of buffer for zpl property nvlist
	*
	* outputs:
	* zc_nvlist_dst zpl property nvlist
	* zc_nvlist_dst_size size of zpl property nvlist
	*/
	static int
	zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
	{
		objset_t *os;
		int err;

		/* XXX reading without owning */
		if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
			return (err);

		dmu_objset_fast_stat(os, &zc->zc_objset_stats);

		/*
		 * NB: nvl_add_zplprop() will read the objset contents,
		 * which we aren't supposed to do with a DS_MODE_USER
		 * hold, because it could be inconsistent.
		 */
		if (zc->zc_nvlist_dst != 0 &&
		    !zc->zc_objset_stats.dds_inconsistent &&
		    dmu_objset_type(os) == DMU_OST_ZFS) {
			nvlist_t *nv;

			VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
			/* Stop at the first property that cannot be read. */
			if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
			    (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
			    (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
			    (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
				err = put_nvlist(zc, nv);
			nvlist_free(nv);
		} else {
			/*
			 * No destination buffer, an inconsistent dataset, or a
			 * non-ZFS objset type: report ENOENT.
			 */
			err = SET_ERROR(ENOENT);
		}
		dmu_objset_rele(os, FTAG);
		return (err);
	}

	boolean_t
	dataset_name_hidden(const char *name)
	{
		/*
		 * Internal datasets carry a '$' in their name and temporary
		 * datasets carry a '%'; both kinds are always hidden.
		 */
		if (strchr(name, '$') != NULL || strchr(name, '%') != NULL)
			return (B_TRUE);

		/*
		 * Outside the global zone, a dataset is also hidden when it is
		 * not visible in the current zone.
		 */
		return ((!INGLOBALZONE(curthread) &&
		    !zone_dataset_visible(name, NULL)) ? B_TRUE : B_FALSE);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_cookie zap cursor
	* zc_nvlist_dst_size size of buffer for property nvlist
	*
	* outputs:
	* zc_name name of next filesystem
	* zc_cookie zap cursor
	* zc_objset_stats stats
	* zc_nvlist_dst property nvlist
	* zc_nvlist_dst_size size of property nvlist
	*/
	static int
	zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
	{
		objset_t *os;
		int error;
		char *p;
		/* Length of the parent name, so it can be restored on retry. */
		size_t orig_len = strlen(zc->zc_name);

	top:
		if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) {
			if (error == ENOENT)
				error = SET_ERROR(ESRCH);
			return (error);
		}

		/* Make sure the parent name ends in '/' and point p just past it. */
		p = strrchr(zc->zc_name, '/');
		if (p == NULL || p[1] != '\0')
			(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
		p = zc->zc_name + strlen(zc->zc_name);

		/* Advance the cursor until a child that is not hidden is found. */
		do {
			error = dmu_dir_list_next(os,
			    sizeof (zc->zc_name) - (p - zc->zc_name), p,
			    NULL, &zc->zc_cookie);
			if (error == ENOENT)
				error = SET_ERROR(ESRCH);
		} while (error == 0 && dataset_name_hidden(zc->zc_name));
		dmu_objset_rele(os, FTAG);

		/*
		 * If it's an internal dataset (ie. with a '$' in its name),
		 * don't try to get stats for it, otherwise we'll return ENOENT.
		 */
		if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
			error = zfs_ioc_objset_stats(zc); /* fill in the stats */
			if (error == ENOENT) {
				/* We lost a race with destroy, get the next one. */
				zc->zc_name[orig_len] = '\0';
				goto top;
			}
		}
		return (error);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_cookie zap cursor
	* zc_nvlist_dst_size size of buffer for property nvlist
	* zc_simple when set, only name is requested
	*
	* outputs:
	* zc_name name of next snapshot
	* zc_objset_stats stats
	* zc_nvlist_dst property nvlist
	* zc_nvlist_dst_size size of property nvlist
	*/
	static int
	zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
	{
		objset_t *os;
		int error;

		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (error != 0) {
			/*
			 * Translate ENOENT to ESRCH through SET_ERROR(), the same
			 * way the ENOENT case further down does.
			 */
			if (error == ENOENT)
				error = SET_ERROR(ESRCH);
			return (error);
		}

		/*
		 * A dataset name of maximum length cannot have any snapshots,
		 * so exit immediately.
		 */
		if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
		    ZFS_MAX_DATASET_NAME_LEN) {
			dmu_objset_rele(os, FTAG);
			return (SET_ERROR(ESRCH));
		}

		/* The next snapshot's name is written directly after the '@'. */
		error = dmu_snapshot_list_next(os,
		    sizeof (zc->zc_name) - strlen(zc->zc_name),
		    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
		    NULL);

		if (error == 0 && !zc->zc_simple) {
			/* The caller asked for more than the name: add the stats. */
			dsl_dataset_t *ds;
			dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;

			error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
			if (error == 0) {
				objset_t *ossnap;

				error = dmu_objset_from_ds(ds, &ossnap);
				if (error == 0)
					error = zfs_ioc_objset_stats_impl(zc, ossnap);
				dsl_dataset_rele(ds, FTAG);
			}
		} else if (error == ENOENT) {
			error = SET_ERROR(ESRCH);
		}

		dmu_objset_rele(os, FTAG);
		/* if we failed, undo the @ that we tacked on to zc_name */
		if (error != 0)
			*strchr(zc->zc_name, '@') = '\0';
		return (error);
	}

	/*
	* Apply a userquota-style property to the dataset named by 'dsname'.
	*
	* The pair's value (or, when the pair is an nvlist, the ZPROP_VALUE entry
	* inside it) must be a three-element uint64 array, unpacked below as
	* { type, rid, quota }, and the property name must contain a '-'.
	*
	* Returns EINVAL for a malformed pair, otherwise the result of
	* zfsvfs_hold() or zfs_set_userquota().
	*
	* NOTE(review): the pointer declarators on 'dsname' and 'pair' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_prop_set_userquota(const char dsname, nvpair_t pair)
	{
	/* The name is read from the outer pair, before 'pair' may be replaced. */
	const char *propname = nvpair_name(pair);
	uint64_t *valary;
	unsigned int vallen;
	const char *domain;
	char *dash;
	zfs_userquota_prop_t type;
	uint64_t rid;
	uint64_t quota;
	zfsvfs_t *zfsvfs;
	int err;

	/* An nvlist-typed pair wraps the real value under ZPROP_VALUE. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
	nvlist_t *attrs;
	VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
	if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
	&pair) != 0)
	return (SET_ERROR(EINVAL));
	}

	/*
	* A correctly constructed propname is encoded as
	* userquota@<rid>-<domain>.
	*/
	if ((dash = strchr(propname, '-')) == NULL \|\|
	nvpair_value_uint64_array(pair, &valary, &vallen) != 0 \|\|
	vallen != 3)
	return (SET_ERROR(EINVAL));

	/* The domain is whatever follows the first '-' in the property name. */
	domain = dash + 1;
	type = valary[0];
	rid = valary[1];
	quota = valary[2];

	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
	if (err == 0) {
	err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
	zfsvfs_rele(zfsvfs, FTAG);
	}

	return (err);
	}

	/*
	* If the named property is one that has a special function to set its value,
	* return 0 on success and a positive error code on failure; otherwise if it is
	* not one of the special properties handled by this function, return -1.
	*
	* XXX: It would be better for callers of the property interface if we handled
	* these special cases in dsl_prop.c (in the dsl layer).
	*/
	static int
	zfs_prop_set_special(const char *dsname, zprop_source_t source,
	    nvpair_t *pair)
	{
		const char *propname = nvpair_name(pair);
		zfs_prop_t prop = zfs_name_to_prop(propname);
		uint64_t intval;
		int err = -1;

		/* Not a native property: only userquota-style names are special. */
		if (prop == ZPROP_INVAL) {
			if (zfs_prop_userquota(propname))
				return (zfs_prop_set_userquota(dsname, pair));
			return (-1);
		}

		/* An nvlist-typed pair wraps the real value under ZPROP_VALUE. */
		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
			nvlist_t *attrs;
			VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
			    &pair) == 0);
		}

		/* None of the properties handled below are string-valued. */
		if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
			return (-1);

		VERIFY(0 == nvpair_value_uint64(pair, &intval));

		switch (prop) {
		case ZFS_PROP_QUOTA:
			err = dsl_dir_set_quota(dsname, source, intval);
			break;
		case ZFS_PROP_REFQUOTA:
			err = dsl_dataset_set_refquota(dsname, source, intval);
			break;
		case ZFS_PROP_FILESYSTEM_LIMIT:
		case ZFS_PROP_SNAPSHOT_LIMIT:
			if (intval == UINT64_MAX) {
				/* clearing the limit, just do it */
				err = 0;
			} else {
				err = dsl_dir_activate_fs_ss_limit(dsname);
			}
			/*
			 * Set err to -1 to force the zfs_set_prop_nvlist code down the
			 * default path to set the value in the nvlist.
			 */
			if (err == 0)
				err = -1;
			break;
		case ZFS_PROP_RESERVATION:
			err = dsl_dir_set_reservation(dsname, source, intval);
			break;
		case ZFS_PROP_REFRESERVATION:
			err = dsl_dataset_set_refreservation(dsname, source, intval);
			break;
		case ZFS_PROP_VOLSIZE:
			err = zvol_set_volsize(dsname, intval);
			break;
		case ZFS_PROP_VERSION:
		{
			zfsvfs_t *zfsvfs;

			if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
				break;

			err = zfs_set_version(zfsvfs, intval);
			zfsvfs_rele(zfsvfs, FTAG);

			if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
				zfs_cmd_t *zc;

				zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
				/*
				 * Bounded copy: zc_name is a fixed-size buffer, so
				 * never write past it even if dsname is overlong.
				 */
				(void) strlcpy(zc->zc_name, dsname,
				    sizeof (zc->zc_name));
				/* The upgrade result is deliberately discarded. */
				(void) zfs_ioc_userspace_upgrade(zc);
				kmem_free(zc, sizeof (zfs_cmd_t));
			}
			break;
		}
		default:
			/* Not a special property: let the caller handle it. */
			err = -1;
		}

		return (err);
	}

	/*
	* This function is best effort. If it fails to set any of the given properties,
	* it continues to set as many as it can and returns the last error
	* encountered. If the caller provides a non-NULL errlist, it will be filled in
	* with the list of names of all the properties that failed along with the
	* corresponding error numbers.
	*
	* If every property is set successfully, zero is returned and errlist is not
	* modified.
	*/
	/*
	* NOTE(review): the pointer declarators on 'dsname' and 'nvl' appear to have
	* been dropped from this rendering of the signature -- confirm against the
	* original file.
	*/
	int
	zfs_set_prop_nvlist(const char dsname, zprop_source_t source, nvlist_t nvl,
	nvlist_t *errlist)
	{
	nvpair_t *pair;
	nvpair_t *propval;
	int rv = 0;
	uint64_t intval;
	char *strval;
	/* Pairs not handled by zfs_prop_set_special(); set together later. */
	nvlist_t *genericnvl = fnvlist_alloc();
	/* Special pairs that failed in the first pass; tried once more. */
	nvlist_t *retrynvl = fnvlist_alloc();

	/* Runs at most twice: once over the caller's list, once over retrynvl. */
	retry:
	pair = NULL;
	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
	const char *propname = nvpair_name(pair);
	zfs_prop_t prop = zfs_name_to_prop(propname);
	int err = 0;

	/* decode the property value */
	propval = pair;
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
	nvlist_t *attrs;
	attrs = fnvpair_value_nvlist(pair);
	if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
	&propval) != 0)
	err = SET_ERROR(EINVAL);
	}

	/* Validate value type */
	if (err == 0 && prop == ZPROP_INVAL) {
	/* Non-native names must be user properties or userquota names. */
	if (zfs_prop_user(propname)) {
	if (nvpair_type(propval) != DATA_TYPE_STRING)
	err = SET_ERROR(EINVAL);
	} else if (zfs_prop_userquota(propname)) {
	if (nvpair_type(propval) !=
	DATA_TYPE_UINT64_ARRAY)
	err = SET_ERROR(EINVAL);
	} else {
	err = SET_ERROR(EINVAL);
	}
	} else if (err == 0) {
	/* Native property: the nvpair type must match the property type. */
	if (nvpair_type(propval) == DATA_TYPE_STRING) {
	if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
	err = SET_ERROR(EINVAL);
	} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
	const char *unused;

	intval = fnvpair_value_uint64(propval);

	switch (zfs_prop_get_type(prop)) {
	case PROP_TYPE_NUMBER:
	break;
	case PROP_TYPE_STRING:
	err = SET_ERROR(EINVAL);
	break;
	case PROP_TYPE_INDEX:
	/* An index value must map to a known string. */
	if (zfs_prop_index_to_string(prop,
	intval, &unused) != 0)
	err = SET_ERROR(EINVAL);
	break;
	default:
	cmn_err(CE_PANIC,
	"unknown property type");
	}
	} else {
	err = SET_ERROR(EINVAL);
	}
	}

	/* Validate permissions */
	if (err == 0)
	err = zfs_check_settable(dsname, pair, CRED());

	if (err == 0) {
	err = zfs_prop_set_special(dsname, source, pair);
	if (err == -1) {
	/*
	* For better performance we build up a list of
	* properties to set in a single transaction.
	*/
	err = nvlist_add_nvpair(genericnvl, pair);
	} else if (err != 0 && nvl != retrynvl) {
	/*
	* This may be a spurious error caused by
	* receiving quota and reservation out of order.
	* Try again in a second pass.
	*/
	err = nvlist_add_nvpair(retrynvl, pair);
	}
	}

	/* Record the failure; rv ends up holding the last error seen. */
	if (err != 0) {
	if (errlist != NULL)
	fnvlist_add_int32(errlist, propname, err);
	rv = err;
	}
	}

	/* Second pass: re-run the loop over the pairs queued for retry. */
	if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
	nvl = retrynvl;
	goto retry;
	}

	if (!nvlist_empty(genericnvl) &&
	dsl_props_set(dsname, source, genericnvl) != 0) {
	/*
	* If this fails, we still want to set as many properties as we
	* can, so try setting them individually.
	*/
	pair = NULL;
	while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
	const char *propname = nvpair_name(pair);
	int err = 0;

	propval = pair;
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
	nvlist_t *attrs;
	attrs = fnvpair_value_nvlist(pair);
	propval = fnvlist_lookup_nvpair(attrs,
	ZPROP_VALUE);
	}

	/* Anything that is not a string is set as a uint64 here. */
	if (nvpair_type(propval) == DATA_TYPE_STRING) {
	strval = fnvpair_value_string(propval);
	err = dsl_prop_set_string(dsname, propname,
	source, strval);
	} else {
	intval = fnvpair_value_uint64(propval);
	err = dsl_prop_set_int(dsname, propname, source,
	intval);
	}

	if (err != 0) {
	if (errlist != NULL) {
	fnvlist_add_int32(errlist, propname,
	err);
	}
	rv = err;
	}
	}
	}
	nvlist_free(genericnvl);
	nvlist_free(retrynvl);

	return (rv);
	}

	/*
	* Check that all the properties are valid user properties.
	*/
	/*
	 * Validate every pair in 'nvl' as a user property for 'fsname'.
	 *
	 * Returns 0 when all pairs pass (including when the list yields no
	 * pairs), EINVAL for a pair that is not a string-valued user property,
	 * the error from zfs_secpolicy_write_perms() when the caller lacks the
	 * userprop permission, ENAMETOOLONG for a name of ZAP_MAXNAMELEN or
	 * more bytes, and E2BIG for a value of ZAP_MAXVALUELEN or more bytes.
	 * Checking stops at the first failure.
	 */
	static int
	zfs_check_userprops(const char *fsname, nvlist_t *nvl)
	{
		nvpair_t *pair = NULL;
		int error = 0;

		while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
			const char *propname = nvpair_name(pair);

			if (!zfs_prop_user(propname) ||
			    nvpair_type(pair) != DATA_TYPE_STRING)
				return (SET_ERROR(EINVAL));

			if ((error = zfs_secpolicy_write_perms(fsname,
			    ZFS_DELEG_PERM_USERPROP, CRED())) != 0)
				return (error);

			if (strlen(propname) >= ZAP_MAXNAMELEN)
				return (SET_ERROR(ENAMETOOLONG));

			/* Routed through SET_ERROR() like the other failures. */
			if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
				return (SET_ERROR(E2BIG));
		}
		return (0);
	}

	/*
	* Allocate a new nvlist in *newprops and copy into it every pair of
	* 'props' whose name does not also exist in 'skipped'.  Allocation or copy
	* failure is treated as fatal (VERIFY).  The list returned in *newprops is
	* not freed here.
	*
	* NOTE(review): the pointer declarators on 'props' and 'skipped' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static void
	props_skip(nvlist_t props, nvlist_t skipped, nvlist_t **newprops)
	{
	nvpair_t *pair;

	VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	pair = NULL;
	while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
	/* Leave out anything the caller asked to skip. */
	if (nvlist_exists(skipped, nvpair_name(pair)))
	continue;

	VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
	}
	}

	/*
	* Pass every pair of 'props' that is not named in 'skipped' to
	* zfs_set_prop_nvlist() with a source of ZPROP_SRC_NONE, combined with
	* ZPROP_SRC_RECEIVED when dsl_prop_get_hasrecvd() reports true for the
	* dataset.
	*
	* Returns 0 when nothing remains after filtering, otherwise the result of
	* zfs_set_prop_nvlist().
	*
	* NOTE(review): the pointer declarators on 'dsname' and 'props' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	clear_received_props(const char dsname, nvlist_t props,
	nvlist_t *skipped)
	{
	int err = 0;
	nvlist_t *cleared_props = NULL;
	/* Build the filtered copy; props_skip() allocates cleared_props. */
	props_skip(props, skipped, &cleared_props);
	if (!nvlist_empty(cleared_props)) {
	/*
	* Acts on local properties until the dataset has received
	* properties at least once on or after SPA_VERSION_RECVD_PROPS.
	*/
	zprop_source_t flags = (ZPROP_SRC_NONE \|
	(dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
	err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
	}
	nvlist_free(cleared_props);
	return (err);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_value name of property to set
	* zc_nvlist_src{_size} nvlist of properties to apply
	* zc_cookie received properties flag
	*
	* outputs:
	* zc_nvlist_dst{_size} error for each unapplied received property
	*/
	static int
	zfs_ioc_set_prop(zfs_cmd_t *zc)
	{
		boolean_t is_recv = zc->zc_cookie;
		zprop_source_t src;
		nvlist_t *props;
		nvlist_t *errs;
		int rc;

		src = is_recv ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL;

		rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (rc != 0)
			return (rc);

		if (is_recv) {
			nvlist_t *prev;

			/*
			 * Clear the previously received properties that are not
			 * named in the incoming list.  This step is best effort:
			 * its result is discarded.
			 */
			if (dsl_prop_get_received(zc->zc_name, &prev) == 0) {
				(void) clear_received_props(zc->zc_name, prev,
				    props);
				nvlist_free(prev);
			}

			rc = dsl_prop_set_hasrecvd(zc->zc_name);
		}

		/* The error list is allocated even when nothing will be set. */
		errs = fnvlist_alloc();
		if (rc == 0)
			rc = zfs_set_prop_nvlist(zc->zc_name, src, props, errs);

		/* Hand the per-property errors back if the caller gave a buffer. */
		if (zc->zc_nvlist_dst != 0 && errs != NULL)
			(void) put_nvlist(zc, errs);

		nvlist_free(errs);
		nvlist_free(props);
		return (rc);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_value name of property to inherit
	* zc_cookie revert to received value if TRUE
	*
	* outputs: none
	*/
	static int
	zfs_ioc_inherit_prop(zfs_cmd_t *zc)
	{
		const char *name = zc->zc_value;
		zfs_prop_t prop = zfs_name_to_prop(name);
		boolean_t revert = zc->zc_cookie;
		zprop_source_t source;
		zprop_type_t type;
		nvlist_t *scratch;
		int rc;

		if (!revert) {
			/*
			 * Only check this in the non-received case. We want to
			 * allow 'inherit -S' to revert non-inheritable properties
			 * like quota and reservation to the received or default
			 * values even though they are not considered inheritable.
			 */
			if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
				return (SET_ERROR(EINVAL));

			/* explicitly inherit */
			/* property name has been validated by zfs_secpolicy_inherit_prop() */
			return (dsl_prop_inherit(zc->zc_name, zc->zc_value,
			    ZPROP_SRC_INHERITED));
		}

		/* revert to received value, if any */
		source = ZPROP_SRC_NONE;

		/*
		 * zfs_prop_set_special() expects properties in the form of an
		 * nvpair with type info, so work out which type to fabricate.
		 */
		if (prop == ZPROP_INVAL) {
			if (!zfs_prop_user(name))
				return (SET_ERROR(EINVAL));
			type = PROP_TYPE_STRING;
		} else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) {
			return (SET_ERROR(EINVAL));
		} else {
			type = zfs_prop_get_type(prop);
		}

		VERIFY(nvlist_alloc(&scratch, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		/* Add a placeholder value of the matching type. */
		if (type == PROP_TYPE_STRING) {
			VERIFY(0 == nvlist_add_string(scratch, name, ""));
		} else if (type == PROP_TYPE_NUMBER || type == PROP_TYPE_INDEX) {
			VERIFY(0 == nvlist_add_uint64(scratch, name, 0));
		} else {
			nvlist_free(scratch);
			return (SET_ERROR(EINVAL));
		}

		rc = zfs_prop_set_special(zc->zc_name, source,
		    nvlist_next_nvpair(scratch, NULL));
		nvlist_free(scratch);
		if (rc != -1)
			return (rc); /* special property already handled */

		/* property name has been validated by zfs_secpolicy_inherit_prop() */
		return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
	}

	/*
	* Set pool properties from the nvlist supplied in zc_nvlist_src.
	*
	* When the list holds exactly one pair and it is the cachefile property,
	* the pool is first looked up by name under spa_namespace_lock and, if
	* found, updated there without opening it.  In every other case the pool
	* is opened and the list is passed to spa_prop_set().
	*
	* Returns 0 on success, or the error from get_nvlist(), spa_open() or
	* spa_prop_set().
	*/
	static int
	zfs_ioc_pool_set_props(zfs_cmd_t *zc)
	{
	nvlist_t *props;
	spa_t *spa;
	int error;
	nvpair_t *pair;

	if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	zc->zc_iflags, &props))
	return (error);

	/*
	* If the only property is the configfile, then just do a spa_lookup()
	* to handle the faulted case.
	*/
	pair = nvlist_next_nvpair(props, NULL);
	if (pair != NULL && strcmp(nvpair_name(pair),
	zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
	nvlist_next_nvpair(props, pair) == NULL) {
	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(zc->zc_name)) != NULL) {
	spa_configfile_set(spa, props, B_FALSE);
	- spa_config_sync(spa, B_FALSE, B_TRUE);
	+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
	}
	mutex_exit(&spa_namespace_lock);
	/* A successful lookup means the request is complete. */
	if (spa != NULL) {
	nvlist_free(props);
	return (0);
	}
	}

	/* General case (or lookup failed): open the pool and set the props. */
	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
	nvlist_free(props);
	return (error);
	}

	error = spa_prop_set(spa, props);

	nvlist_free(props);
	spa_close(spa, FTAG);

	return (error);
	}

	static int
	zfs_ioc_pool_get_props(zfs_cmd_t *zc)
	{
		nvlist_t *props = NULL;
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc == 0) {
			rc = spa_prop_get(spa, &props);
			spa_close(spa, FTAG);
		} else {
			/*
			 * If the pool is faulted, there may be properties we can
			 * still get (such as altroot and cachefile), so attempt to
			 * get them anyway.
			 */
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(zc->zc_name);
			if (spa != NULL)
				rc = spa_prop_get(spa, &props);
			mutex_exit(&spa_namespace_lock);
		}

		/*
		 * NOTE(review): any failure above, as well as a missing
		 * destination buffer, is reported to the caller as EFAULT; the
		 * original error code is not preserved.
		 */
		if (rc != 0 || zc->zc_nvlist_dst == 0)
			rc = SET_ERROR(EFAULT);
		else
			rc = put_nvlist(zc, props);

		nvlist_free(props);
		return (rc);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	* zc_nvlist_src{_size} nvlist of delegated permissions
	* zc_perm_action allow/unallow flag
	*
	* outputs: none
	*/
	static int
	zfs_ioc_set_fsacl(zfs_cmd_t *zc)
	{
		nvlist_t *acl = NULL;
		int rc;

		rc = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &acl);
		if (rc != 0)
			return (rc);

		/*
		 * A list that fails verification is rejected with EINVAL,
		 * whatever error the verifier itself reported.
		 */
		if (zfs_deleg_verify_nvlist(acl) != 0) {
			nvlist_free(acl);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * If we don't have PRIV_SYS_MOUNT, then validate
		 * that user is allowed to hand out each permission in
		 * the nvlist(s)
		 */
		rc = secpolicy_zfs(CRED());
		if (rc != 0) {
			rc = (zc->zc_perm_action == B_FALSE) ?
			    dsl_deleg_can_allow(zc->zc_name, acl, CRED()) :
			    dsl_deleg_can_unallow(zc->zc_name, acl, CRED());
		}

		if (rc == 0)
			rc = dsl_deleg_set(zc->zc_name, acl, zc->zc_perm_action);

		nvlist_free(acl);
		return (rc);
	}

	/*
	* inputs:
	* zc_name name of filesystem
	*
	* outputs:
	* zc_nvlist_dst{_size} nvlist of delegated permissions
	*/
	static int
	zfs_ioc_get_fsacl(zfs_cmd_t *zc)
	{
		nvlist_t *acl;
		int rc;

		/* Fetch the delegated permissions for the named dataset. */
		rc = dsl_deleg_get(zc->zc_name, &acl);
		if (rc != 0)
			return (rc);

		/* Copy them out to the caller, then drop our copy. */
		rc = put_nvlist(zc, acl);
		nvlist_free(acl);
		return (rc);
	}

	/* ARGSUSED */
	/*
	* Creation callback for DMU_OST_ZFS objsets (selected in zfs_ioc_create()).
	* 'arg' is treated as a zfs_creat_t; its zct_zplprops list is forwarded to
	* zfs_create_fs() together with the objset, credentials and transaction.
	*
	* NOTE(review): the pointer declarators in this signature appear to have
	* been dropped from this rendering -- confirm against the original file.
	*/
	static void
	zfs_create_cb(objset_t os, void arg, cred_t cr, dmu_tx_t tx)
	{
	zfs_creat_t *zct = arg;

	zfs_create_fs(os, cr, zct->zct_zplprops, tx);
	}

	#define ZFS_PROP_UNDEFINED ((uint64_t)-1)

	/*
	* inputs:
	* os parent objset pointer (NULL if root fs)
	* fuids_ok fuids allowed in this version of the spa?
	* sa_ok SAs allowed in this version of the spa?
	* createprops list of properties requested by creator
	*
	* outputs:
	* zplprops values for the zplprops we attach to the master node object
	* is_ci true if requested file system will be purely case-insensitive
	*
	* Determine the settings for utf8only, normalization and
	* casesensitivity. Specific values may have been requested by the
	* creator and/or we can inherit values from the parent dataset. If
	* the file system is of too early a vintage, a creator can not
	* request settings for these properties, even if the requested
	* setting is the default value. We don't actually want to create dsl
	* properties for these, so remove them from the source nvlist after
	* processing.
	*/
	/*
	* NOTE(review): the pointer declarators on 'zplprops' and 'is_ci' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
	boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
	nvlist_t zplprops, boolean_t is_ci)
	{
	/* ZFS_PROP_UNDEFINED marks "the creator did not request a value". */
	uint64_t sense = ZFS_PROP_UNDEFINED;
	uint64_t norm = ZFS_PROP_UNDEFINED;
	uint64_t u8 = ZFS_PROP_UNDEFINED;

	ASSERT(zplprops != NULL);

	/* A parent objset, when given, must be of type DMU_OST_ZFS. */
	if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
	return (SET_ERROR(EINVAL));

	/*
	* Pull out creator prop choices, if any.
	*/
	if (createprops) {
	/* The version is only looked up; the other three are also removed. */
	(void) nvlist_lookup_uint64(createprops,
	zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
	(void) nvlist_lookup_uint64(createprops,
	zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
	(void) nvlist_remove_all(createprops,
	zfs_prop_to_name(ZFS_PROP_NORMALIZE));
	(void) nvlist_lookup_uint64(createprops,
	zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
	(void) nvlist_remove_all(createprops,
	zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
	(void) nvlist_lookup_uint64(createprops,
	zfs_prop_to_name(ZFS_PROP_CASE), &sense);
	(void) nvlist_remove_all(createprops,
	zfs_prop_to_name(ZFS_PROP_CASE));
	}

	/*
	* If the zpl version requested is whacky or the file system
	* or pool version is too "young" to support normalization
	* and the creator tried to set a value for one of the props,
	* error out.
	*/
	if ((zplver < ZPL_VERSION_INITIAL \|\| zplver > ZPL_VERSION) \|\|
	(zplver >= ZPL_VERSION_FUID && !fuids_ok) \|\|
	(zplver >= ZPL_VERSION_SA && !sa_ok) \|\|
	(zplver < ZPL_VERSION_NORMALIZATION &&
	(norm != ZFS_PROP_UNDEFINED \|\| u8 != ZFS_PROP_UNDEFINED \|\|
	sense != ZFS_PROP_UNDEFINED)))
	return (SET_ERROR(ENOTSUP));

	/*
	* Put the version in the zplprops
	*/
	VERIFY(nvlist_add_uint64(zplprops,
	zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);

	/* Values not requested by the creator come from zfs_get_zplprop(). */
	if (norm == ZFS_PROP_UNDEFINED)
	VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);

	/*
	* If we're normalizing, names must always be valid UTF-8 strings.
	*/
	if (norm)
	u8 = 1;
	if (u8 == ZFS_PROP_UNDEFINED)
	VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);

	if (sense == ZFS_PROP_UNDEFINED)
	VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);

	/* Report pure case-insensitivity only if the caller asked for it. */
	if (is_ci)
	*is_ci = (sense == ZFS_CASE_INSENSITIVE);

	return (0);
	}

	/*
	* Fill 'zplprops' for a new dataset named 'dataset', using its parent
	* dataset as the source of values passed on to zfs_fill_zplprops_impl().
	*
	* The zpl version and the fuids_ok/sa_ok flags are derived from the pool's
	* SPA version.  Returns the error from spa_open(), dmu_objset_hold() or
	* zfs_fill_zplprops_impl().
	*
	* NOTE(review): the pointer declarators in this signature appear to have
	* been dropped from this rendering -- confirm against the original file.
	*/
	static int
	zfs_fill_zplprops(const char dataset, nvlist_t createprops,
	nvlist_t zplprops, boolean_t is_ci)
	{
	boolean_t fuids_ok, sa_ok;
	uint64_t zplver = ZPL_VERSION;
	objset_t *os = NULL;
	char parentname[ZFS_MAX_DATASET_NAME_LEN];
	char *cp;
	spa_t *spa;
	uint64_t spa_vers;
	int error;

	/* Derive the parent's name by cutting the copy at the last '/'. */
	(void) strlcpy(parentname, dataset, sizeof (parentname));
	cp = strrchr(parentname, '/');
	ASSERT(cp != NULL);
	cp[0] = '\0';

	/* The pool is opened only long enough to read its version. */
	if ((error = spa_open(dataset, &spa, FTAG)) != 0)
	return (error);

	spa_vers = spa_version(spa);
	spa_close(spa, FTAG);

	zplver = zfs_zpl_version_map(spa_vers);
	fuids_ok = (zplver >= ZPL_VERSION_FUID);
	sa_ok = (zplver >= ZPL_VERSION_SA);

	/*
	* Open parent object set so we can inherit zplprop values.
	*/
	if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
	return (error);

	error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
	zplprops, is_ci);
	dmu_objset_rele(os, FTAG);
	return (error);
	}

	/*
	* Variant of zfs_fill_zplprops() that takes the SPA version directly and
	* passes a NULL parent objset to zfs_fill_zplprops_impl().
	*
	* NOTE(review): the pointer declarators on 'zplprops' and 'is_ci' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
	nvlist_t zplprops, boolean_t is_ci)
	{
	boolean_t fuids_ok;
	boolean_t sa_ok;
	uint64_t zplver = ZPL_VERSION;
	int error;

	/* The initial ZPL_VERSION value is overwritten by the mapped version. */
	zplver = zfs_zpl_version_map(spa_vers);
	fuids_ok = (zplver >= ZPL_VERSION_FUID);
	sa_ok = (zplver >= ZPL_VERSION_SA);

	error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
	createprops, zplprops, is_ci);
	return (error);
	}

	/*
	* innvl: {
	* "type" -> dmu_objset_type_t (int32)
	* (optional) "props" -> { prop -> value }
	* }
	*
	* outnvl: propname -> error code (int32)
	*/
	/*
	* Create a new filesystem or volume named 'fsname', then apply the
	* optional "props" list to it.
	*
	* Returns EINVAL for a missing or unsupported "type", a name containing
	* '@' or '%', or missing/invalid volume properties; otherwise the error
	* from the zplprops setup, dmu_objset_create() or zfs_set_prop_nvlist().
	*
	* NOTE(review): several pointer declarators (the signature and the cbfunc
	* declaration) appear to have been dropped from this rendering -- confirm
	* against the original file.
	*/
	static int
	zfs_ioc_create(const char fsname, nvlist_t innvl, nvlist_t *outnvl)
	{
	int error = 0;
	zfs_creat_t zct = { 0 };
	nvlist_t *nvprops = NULL;
	void (cbfunc)(objset_t os, void arg, cred_t cr, dmu_tx_t *tx);
	int32_t type32;
	dmu_objset_type_t type;
	boolean_t is_insensitive = B_FALSE;

	if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
	return (SET_ERROR(EINVAL));
	type = type32;
	/* "props" is optional; nvprops stays NULL when it is absent. */
	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);

	/* Pick the creation callback for the requested objset type. */
	switch (type) {
	case DMU_OST_ZFS:
	cbfunc = zfs_create_cb;
	break;

	case DMU_OST_ZVOL:
	cbfunc = zvol_create_cb;
	break;

	default:
	cbfunc = NULL;
	break;
	}
	if (strchr(fsname, '@') \|\|
	strchr(fsname, '%'))
	return (SET_ERROR(EINVAL));

	zct.zct_props = nvprops;

	/* Any type other than ZFS or ZVOL is rejected. */
	if (cbfunc == NULL)
	return (SET_ERROR(EINVAL));

	if (type == DMU_OST_ZVOL) {
	uint64_t volsize, volblocksize;

	/* A volume must be given a volsize. */
	if (nvprops == NULL)
	return (SET_ERROR(EINVAL));
	if (nvlist_lookup_uint64(nvprops,
	zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
	return (SET_ERROR(EINVAL));

	/* volblocksize may be absent (ENOENT); any other failure is EINVAL. */
	if ((error = nvlist_lookup_uint64(nvprops,
	zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
	&volblocksize)) != 0 && error != ENOENT)
	return (SET_ERROR(EINVAL));

	if (error != 0)
	volblocksize = zfs_prop_default_numeric(
	ZFS_PROP_VOLBLOCKSIZE);

	if ((error = zvol_check_volblocksize(
	volblocksize)) != 0 \|\|
	(error = zvol_check_volsize(volsize,
	volblocksize)) != 0)
	return (error);
	} else if (type == DMU_OST_ZFS) {
	/*
	* NOTE(review): this declaration shadows the function-scope 'error'.
	* It is harmless here because the only nonzero value is returned
	* directly, but it would trip -Wshadow.
	*/
	int error;

	/*
	* We have to have normalization and
	* case-folding flags correct when we do the
	* file system creation, so go figure them out
	* now.
	*/
	VERIFY(nvlist_alloc(&zct.zct_zplprops,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	error = zfs_fill_zplprops(fsname, nvprops,
	zct.zct_zplprops, &is_insensitive);
	if (error != 0) {
	nvlist_free(zct.zct_zplprops);
	return (error);
	}
	}

	error = dmu_objset_create(fsname, type,
	is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
	nvlist_free(zct.zct_zplprops);

	/*
	* It would be nice to do this atomically.
	*/
	if (error == 0) {
	error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
	nvprops, outnvl);
	/* Undo the creation if the properties could not be applied. */
	if (error != 0)
	(void) dsl_destroy_head(fsname);
	}
	#ifdef __FreeBSD__
	if (error == 0 && type == DMU_OST_ZVOL)
	zvol_create_minors(fsname);
	#endif
	return (error);
	}

	/*
	* innvl: {
	* "origin" -> name of origin snapshot
	* (optional) "props" -> { prop -> value }
	* }
	*
	* outnvl: propname -> error code (int32)
	*/
	/*
	* Clone the snapshot named by "origin" into a new dataset 'fsname', then
	* apply the optional "props" list to the clone.
	*
	* Returns EINVAL for a missing or invalid origin name or a target name
	* containing '@' or '%'; otherwise the error from dmu_objset_clone() or
	* zfs_set_prop_nvlist().
	*
	* NOTE(review): the pointer declarators on 'fsname' and 'innvl' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_ioc_clone(const char fsname, nvlist_t innvl, nvlist_t *outnvl)
	{
	int error = 0;
	nvlist_t *nvprops = NULL;
	char *origin_name;

	if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
	return (SET_ERROR(EINVAL));
	/* "props" is optional; nvprops stays NULL when it is absent. */
	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);

	if (strchr(fsname, '@') \|\|
	strchr(fsname, '%'))
	return (SET_ERROR(EINVAL));

	if (dataset_namecheck(origin_name, NULL, NULL) != 0)
	return (SET_ERROR(EINVAL));
	error = dmu_objset_clone(fsname, origin_name);
	if (error != 0)
	return (error);

	/*
	* It would be nice to do this atomically.
	*/
	/* error is always 0 here; the test mirrors zfs_ioc_create(). */
	if (error == 0) {
	error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
	nvprops, outnvl);
	/* Undo the clone if the properties could not be applied. */
	if (error != 0)
	(void) dsl_destroy_head(fsname);
	}
	#ifdef __FreeBSD__
	if (error == 0)
	zvol_create_minors(fsname);
	#endif
	return (error);
	}

	+/* ARGSUSED */
	+static int
	+zfs_ioc_remap(const char fsname, nvlist_t innvl, nvlist_t *outnvl)
	+{
	+ if (strchr(fsname, '@') \|\|
	+ strchr(fsname, '%'))
	+ return (SET_ERROR(EINVAL)); /* names containing '@' or '%' are rejected */
	+
	+ return (dmu_objset_remap_indirects(fsname)); /* innvl and outnvl are not used */
	+}
	+
	/*
	* innvl: {
	* "snaps" -> { snapshot1, snapshot2 }
	* (optional) "props" -> { prop -> value (string) }
	* }
	*
	* outnvl: snapshot -> error code (int32)
	*/
	/*
	* Validate the requested snapshot names and optional user properties, then
	* hand them to dsl_dataset_snapshot().
	*
	* Returns the error from zfs_check_userprops(), ENOTSUP when properties are
	* supplied but zfs_earlier_version() reports a pool older than
	* SPA_VERSION_SNAP_PROPS, EINVAL for a missing "snaps" list or a malformed
	* snapshot name, EXDEV for a snapshot outside 'poolname' or for two
	* snapshots of the same filesystem, otherwise the result of
	* dsl_dataset_snapshot().
	*
	* NOTE(review): the pointer declarators on 'poolname' and 'innvl' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_ioc_snapshot(const char poolname, nvlist_t innvl, nvlist_t *outnvl)
	{
	nvlist_t *snaps;
	nvlist_t *props = NULL;
	int error, poollen;
	nvpair_t *pair;

	/* "props" is optional; props stays NULL when it is absent. */
	(void) nvlist_lookup_nvlist(innvl, "props", &props);
	if ((error = zfs_check_userprops(poolname, props)) != 0)
	return (error);

	if (!nvlist_empty(props) &&
	zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
	return (SET_ERROR(ENOTSUP));

	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
	return (SET_ERROR(EINVAL));
	poollen = strlen(poolname);
	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	pair = nvlist_next_nvpair(snaps, pair)) {
	const char *name = nvpair_name(pair);
	const char *cp = strchr(name, '@');

	/*
	* The snap name must contain an @, and the part after it must
	* contain only valid characters.
	*/
	if (cp == NULL \|\|
	zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
	return (SET_ERROR(EINVAL));

	/*
	* The snap must be in the specified pool.
	*/
	if (strncmp(name, poolname, poollen) != 0 \|\|
	(name[poollen] != '/' && name[poollen] != '@'))
	return (SET_ERROR(EXDEV));

	/* This must be the only snap of this fs. */
	/* Compares the prefix up to and including '@' with each later entry. */
	for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
	pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
	if (strncmp(name, nvpair_name(pair2), cp - name + 1)
	== 0) {
	return (SET_ERROR(EXDEV));
	}
	}
	}

	error = dsl_dataset_snapshot(snaps, props, outnvl);
	return (error);
	}

	/*
	* innvl: "message" -> string
	*/
	/* ARGSUSED */
	/*
	* Append the "message" string from 'innvl' to the history of the pool whose
	* name was stashed in thread-specific data.
	*
	* Returns the error from spa_open(), EINVAL when "message" is missing,
	* ENOTSUP when the pool version predates SPA_VERSION_ZPOOL_HISTORY,
	* otherwise the result of spa_history_log().
	*
	* NOTE(review): the pointer declarators on 'unused' and 'innvl' appear to
	* have been dropped from this rendering of the signature -- confirm against
	* the original file.
	*/
	static int
	zfs_ioc_log_history(const char unused, nvlist_t innvl, nvlist_t *outnvl)
	{
	char *message;
	spa_t *spa;
	int error;
	char *poolname;

	/*
	* The poolname in the ioctl is not set, we get it from the TSD,
	* which was set at the end of the last successful ioctl that allows
	* logging. The secpolicy func already checked that it is set.
	* Only one log ioctl is allowed after each successful ioctl, so
	* we clear the TSD here.
	*/
	poolname = tsd_get(zfs_allow_log_key);
	(void) tsd_set(zfs_allow_log_key, NULL);
	/* NOTE(review): poolname is used without a NULL check; relies on the */
	/* secpolicy check described above. */
	error = spa_open(poolname, &spa, FTAG);
	/* The stashed name is freed whether or not the open succeeded. */
	strfree(poolname);
	if (error != 0)
	return (error);

	if (nvlist_lookup_string(innvl, "message", &message) != 0) {
	spa_close(spa, FTAG);
	return (SET_ERROR(EINVAL));
	}

	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
	spa_close(spa, FTAG);
	return (SET_ERROR(ENOTSUP));
	}

	error = spa_history_log(spa, message);
	spa_close(spa, FTAG);
	return (error);
	}

	#ifdef __FreeBSD__
	/*
	 * Write the "command" string from 'innvl' to a vdev's label via
	 * vdev_label_write_pad2().
	 *
	 * innvl must carry ZPOOL_CONFIG_POOL_GUID and ZPOOL_CONFIG_GUID (both
	 * uint64) and "command" (string).  'unused' and 'outnvl' are not used.
	 *
	 * Returns EINVAL when an input is missing, ENOENT when no pool matches
	 * the guids, ENODEV when the vdev cannot be found in the opened pool,
	 * otherwise the error from spa_open() or vdev_label_write_pad2().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
	{
		char name[MAXNAMELEN];
		spa_t *spa;
		vdev_t *vd;
		char *command;
		uint64_t pool_guid;
		uint64_t vdev_guid;
		int error;

		if (nvlist_lookup_uint64(innvl,
		    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
			return (SET_ERROR(EINVAL));
		if (nvlist_lookup_uint64(innvl,
		    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
			return (SET_ERROR(EINVAL));
		if (nvlist_lookup_string(innvl,
		    "command", &command) != 0)
			return (SET_ERROR(EINVAL));

		/*
		 * Resolve the guids to a pool name under the namespace lock.
		 * The name is copied out so the pool can be opened by name once
		 * the lock has been dropped.
		 */
		mutex_enter(&spa_namespace_lock);
		spa = spa_by_guid(pool_guid, vdev_guid);
		if (spa != NULL) {
			/* Bounded copy: never write past the local buffer. */
			(void) strlcpy(name, spa_name(spa), sizeof (name));
		}
		mutex_exit(&spa_namespace_lock);
		if (spa == NULL)
			return (SET_ERROR(ENOENT));

		if ((error = spa_open(name, &spa, FTAG)) != 0)
			return (error);
		spa_vdev_state_enter(spa, SCL_ALL);
		vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
		if (vd == NULL) {
			(void) spa_vdev_state_exit(spa, NULL, ENXIO);
			spa_close(spa, FTAG);
			return (SET_ERROR(ENODEV));
		}
		error = vdev_label_write_pad2(vd, command, strlen(command));
		(void) spa_vdev_state_exit(spa, NULL, 0);
		/* Wait for a txg sync before closing the pool. */
		txg_wait_synced(spa->spa_dsl_pool, 0);
		spa_close(spa, FTAG);
		return (error);
	}
	#endif

	/*
	* The dp_config_rwlock must not be held when calling this, because the
	* unmount may need to write out data.
	*
	* This function is best-effort. Callers must deal gracefully if it
	* remains mounted (or is remounted after this call).
	*
	* Does nothing if the argument is not a snapshot name or no mounted
	* filesystem is found for it. Nothing is returned; the result of the
	* unmount attempt is ignored.
	*/
	void
	zfs_unmount_snap(const char *snapname)
	{
	vfs_t *vfsp = NULL;
	zfsvfs_t *zfsvfs = NULL;

	/* Only names containing '@' are considered. */
	if (strchr(snapname, '@') == NULL)
	return;

	/* No zfsvfs for this name: there is nothing to unmount. */
	int err = getzfsvfs(snapname, &zfsvfs);
	if (err != 0) {
	ASSERT3P(zfsvfs, ==, NULL);
	return;
	}
	vfsp = zfsvfs->z_vfs;

	/* See the requirement in the comment above this function. */
	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));

	#ifdef illumos
	/* illumos: take the vfs write lock on the covered vnode, then VFS_RELE. */
	err = vn_vfswlock(vfsp->vfs_vnodecovered);
	VFS_RELE(vfsp);
	if (err != 0)
	return;
	#endif

	/*
	* Always force the unmount for snapshots.
	*/
	#ifdef illumos
	(void) dounmount(vfsp, MS_FORCE, kcred);
	#else
	/* Non-illumos: vfs_ref(), then vfs_unbusy(), before the forced unmount. */
	vfs_ref(vfsp);
	vfs_unbusy(vfsp);
	(void) dounmount(vfsp, MS_FORCE, curthread);
	#endif
	}

	/*
	 * Callback adapter around zfs_unmount_snap(): the second argument is
	 * ignored and success (0) is always reported.
	 */
	/* ARGSUSED */
	static int
	zfs_unmount_snap_cb(const char *snapname, void *arg)
	{
		zfs_unmount_snap(snapname);
		return (0);
	}

	/*
	* When a clone is destroyed, its origin may also need to be destroyed,
	* in which case it must be unmounted. This routine will do that unmount
	* if necessary.
	*/
	void
	zfs_destroy_unmount_origin(const char *fsname)
	{
	int error;
	objset_t *os;
	dsl_dataset_t *ds;

	error = dmu_objset_hold(fsname, FTAG, &os);
	if (error != 0)
	return;
	ds = dmu_objset_ds(os);
	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
	char originname[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dataset_name(ds->ds_prev, originname);
	dmu_objset_rele(os, FTAG);
	zfs_unmount_snap(originname);
	} else {
	dmu_objset_rele(os, FTAG);
	}
	}

	/*
	 * innvl: {
	 *     "snaps" -> { snapshot1, snapshot2 }
	 *     (optional boolean) "defer"
	 * }
	 *
	 * outnvl: snapshot -> error code (int32)
	 *
	 * Returns EINVAL when "snaps" is missing and EXDEV when a listed
	 * snapshot is not inside poolname; otherwise the result of
	 * dsl_destroy_snapshots_nvl().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		size_t poollen;
		nvlist_t *snaps;
		nvpair_t *pair;
		boolean_t defer;

		if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
			return (SET_ERROR(EINVAL));
		defer = nvlist_exists(innvl, "defer");

		poollen = strlen(poolname);
		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(snaps, pair)) {
			const char *name = nvpair_name(pair);

			/*
			 * The snap must be in the specified pool to prevent the
			 * invalid removal of zvol minors below.
			 */
			if (strncmp(name, poolname, poollen) != 0 ||
			    (name[poollen] != '/' && name[poollen] != '@'))
				return (SET_ERROR(EXDEV));

			zfs_unmount_snap(name);
	#if defined(__FreeBSD__)
			zvol_remove_minors(name);
	#endif
		}

		return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
	}

	/*
	 * Create bookmarks.  Bookmark names are of the form <fs>#<bmark>.
	 * All bookmarks must be in the same pool.
	 *
	 * innvl: {
	 *     bookmark1 -> snapshot1, bookmark2 -> snapshot2
	 * }
	 *
	 * outnvl: bookmark -> error code (int32)
	 *
	 * Returns EINVAL when a value is not a string or when two entries share
	 * the same bookmark name; otherwise the result of dsl_bookmark_create().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
			char *snap_name;

			/*
			 * Verify the snapshot argument.
			 */
			if (nvpair_value_string(pair, &snap_name) != 0)
				return (SET_ERROR(EINVAL));

			/* Verify that the keys (bookmarks) are unique */
			for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
			    pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
				if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
					return (SET_ERROR(EINVAL));
			}
		}

		return (dsl_bookmark_create(innvl, outnvl));
	}

	/*
	 * innvl: {
	 *     property 1, property 2, ...
	 * }
	 *
	 * outnvl: {
	 *     bookmark name 1 -> { property 1, property 2, ... },
	 *     bookmark name 2 -> { property 1, property 2, ... }
	 * }
	 *
	 * Thin wrapper: the arguments are forwarded unchanged to
	 * dsl_get_bookmarks() and its result is returned.
	 */
	static int
	zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		return (dsl_get_bookmarks(fsname, innvl, outnvl));
	}

	/*
	 * innvl: {
	 *     bookmark name 1, bookmark name 2
	 * }
	 *
	 * outnvl: bookmark -> error code (int32)
	 *
	 * Returns EINVAL for a name without a valid "#<bmark>" component and
	 * EXDEV for a bookmark outside poolname; otherwise the result of
	 * dsl_bookmark_destroy().
	 */
	static int
	zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
	    nvlist_t *outnvl)
	{
		int error, poollen;

		poollen = strlen(poolname);
		for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
			const char *name = nvpair_name(pair);
			const char *cp = strchr(name, '#');

			/*
			 * The bookmark name must contain an #, and the part after it
			 * must contain only valid characters.
			 */
			if (cp == NULL ||
			    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
				return (SET_ERROR(EINVAL));

			/*
			 * The bookmark must be in the specified pool.
			 */
			if (strncmp(name, poolname, poollen) != 0 ||
			    (name[poollen] != '/' && name[poollen] != '#'))
				return (SET_ERROR(EXDEV));
		}

		error = dsl_bookmark_destroy(innvl, outnvl);
		return (error);
	}

	/*
	 * Run a channel program via zcp_eval().
	 *
	 * innvl: {
	 *     ZCP_ARG_PROGRAM -> program text (string, required)
	 *     ZCP_ARG_ARGLIST -> argument nvpair (required)
	 *     ZCP_ARG_SYNC -> boolean (optional, defaults to B_TRUE)
	 *     ZCP_ARG_INSTRLIMIT -> uint64 (optional, ZCP_DEFAULT_INSTRLIMIT)
	 *     ZCP_ARG_MEMLIMIT -> uint64 (optional, ZCP_DEFAULT_MEMLIMIT)
	 * }
	 *
	 * Returns EINVAL when a required argument is missing or when a limit is
	 * zero or exceeds zfs_lua_max_instrlimit / zfs_lua_max_memlimit;
	 * otherwise the result of zcp_eval().
	 */
	static int
	zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
	    nvlist_t *outnvl)
	{
		char *program;
		uint64_t instrlimit, memlimit;
		boolean_t sync_flag;
		nvpair_t *nvarg = NULL;

		if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) {
			return (EINVAL);
		}
		if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
			sync_flag = B_TRUE;
		}
		if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
			instrlimit = ZCP_DEFAULT_INSTRLIMIT;
		}
		if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
			memlimit = ZCP_DEFAULT_MEMLIMIT;
		}
		if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) {
			return (EINVAL);
		}

		if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
			return (EINVAL);
		if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
			return (EINVAL);

		return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
		    nvarg, outnvl));
	}

	/*
	 * Destroy a dataset or snapshot.
	 *
	 * inputs:
	 * zc_name		name of dataset to destroy
	 * zc_objset_type	type of objset
	 * zc_defer_destroy	mark for deferred destroy
	 *
	 * outputs: none
	 *
	 * Names containing '@' go through dsl_destroy_snapshot(), all others
	 * through dsl_destroy_head().  The zvol minor(s) are removed only after
	 * a successful destroy of a DMU_OST_ZVOL objset.
	 */
	static int
	zfs_ioc_destroy(zfs_cmd_t *zc)
	{
		int err;

		if (zc->zc_objset_type == DMU_OST_ZFS)
			zfs_unmount_snap(zc->zc_name);

		if (strchr(zc->zc_name, '@') != NULL) {
			err = dsl_destroy_snapshot(zc->zc_name,
			    zc->zc_defer_destroy);
		} else {
			err = dsl_destroy_head(zc->zc_name);
		}

		if (err == 0 && zc->zc_objset_type == DMU_OST_ZVOL) {
	#ifdef __FreeBSD__
			zvol_remove_minors(zc->zc_name);
	#else
			(void) zvol_remove_minor(zc->zc_name);
	#endif
		}
		return (err);
	}

	/*
	 * fsname is name of dataset to rollback (to most recent snapshot)
	 *
	 * innvl may contain name of expected target snapshot
	 *
	 * outnvl: "target" -> name of most recent snapshot
	 *
	 * When getzfsvfs() succeeds for fsname, the filesystem is suspended
	 * around the rollback and resumed afterwards; the rollback error takes
	 * precedence over a resume error.  Otherwise the rollback is performed
	 * directly.
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		zfsvfs_t *zfsvfs;
		char *target = NULL;
		int error;

		(void) nvlist_lookup_string(innvl, "target", &target);
		if (target != NULL) {
			const char *cp = strchr(target, '@');

			/*
			 * The snap name must contain an @, and the part after it must
			 * contain only valid characters.
			 */
			if (cp == NULL ||
			    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
				return (SET_ERROR(EINVAL));
		}

		if (getzfsvfs(fsname, &zfsvfs) == 0) {
			dsl_dataset_t *ds;

			ds = dmu_objset_ds(zfsvfs->z_os);
			error = zfs_suspend_fs(zfsvfs);
			if (error == 0) {
				int resume_err;

				error = dsl_dataset_rollback(fsname, target, zfsvfs,
				    outnvl);
				resume_err = zfs_resume_fs(zfsvfs, ds);
				error = error ? error : resume_err;
			}
	#ifdef illumos
			VFS_RELE(zfsvfs->z_vfs);
	#else
			vfs_unbusy(zfsvfs->z_vfs);
	#endif
		} else {
			error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
		}
		return (error);
	}

	/*
	 * Callback that unmounts the snapshot "<fsname>@<arg>", where arg is
	 * the bare snapshot name.  Always returns 0.
	 */
	static int
	recursive_unmount(const char *fsname, void *arg)
	{
		const char *snapname = arg;
		char fullname[ZFS_MAX_DATASET_NAME_LEN];

		(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
		zfs_unmount_snap(fullname);

		return (0);
	}

	/*
	 * Rename a dataset or a snapshot.
	 *
	 * inputs:
	 * zc_name	old name of dataset
	 * zc_value	new name of dataset
	 * zc_cookie	recursive flag (only valid for snapshots)
	 *
	 * outputs: none
	 *
	 * Bit 0 of zc_cookie is the recursive flag.  On FreeBSD bit 1 says that
	 * mounted snapshots may be left mounted; elsewhere that is always
	 * allowed.  Returns EINVAL for an invalid new name and EXDEV when a
	 * snapshot would move to a different filesystem.
	 */
	static int
	zfs_ioc_rename(zfs_cmd_t *zc)
	{
		boolean_t recursive = zc->zc_cookie & 1;
		char *at;
		boolean_t allow_mounted = B_TRUE;

	#ifdef __FreeBSD__
		allow_mounted = (zc->zc_cookie & 2) != 0;
	#endif

		zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
		if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
		    strchr(zc->zc_value, '%'))
			return (SET_ERROR(EINVAL));

		at = strchr(zc->zc_name, '@');
		if (at != NULL) {
			/* snaps must be in same fs */
			int error;

			/*
			 * Comparing through the '@' also guarantees that zc_value
			 * contains an '@', which the strchr() below relies on.
			 */
			if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
				return (SET_ERROR(EXDEV));
			/* Temporarily split zc_name into "<fs>" and "<snap>". */
			*at = '\0';
			if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) {
				error = dmu_objset_find(zc->zc_name,
				    recursive_unmount, at + 1,
				    recursive ? DS_FIND_CHILDREN : 0);
				if (error != 0) {
					*at = '@';
					return (error);
				}
			}
			error = dsl_dataset_rename_snapshot(zc->zc_name,
			    at + 1, strchr(zc->zc_value, '@') + 1, recursive);
			*at = '@';

			return (error);
		} else {
	#ifdef illumos
			if (zc->zc_objset_type == DMU_OST_ZVOL)
				(void) zvol_remove_minor(zc->zc_name);
	#endif
			return (dsl_dir_rename(zc->zc_name, zc->zc_value));
		}
	}

	/*
	 * Check whether the property described by pair may be set on dsname.
	 *
	 * User properties and {user,group}quota properties only need the
	 * matching delegated permission.  Native properties cannot be set on
	 * snapshots, are checked against the pool version / enabled features
	 * where the value requires it, and are finally passed to
	 * zfs_secpolicy_setprop().
	 *
	 * Returns 0 when the property may be set, otherwise an errno: EINVAL
	 * for unknown or unsettable properties, ENOTSUP when the pool lacks the
	 * required version or feature, ERANGE for values not allowed here, or
	 * the error from the permission check / spa_open().
	 */
	static int
	zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
	{
		const char *propname = nvpair_name(pair);
		boolean_t issnap = (strchr(dsname, '@') != NULL);
		zfs_prop_t prop = zfs_name_to_prop(propname);
		uint64_t intval;
		int err;

		if (prop == ZPROP_INVAL) {
			if (zfs_prop_user(propname)) {
				if ((err = zfs_secpolicy_write_perms(dsname,
				    ZFS_DELEG_PERM_USERPROP, cr)) != 0)
					return (err);
				return (0);
			}

			if (!issnap && zfs_prop_userquota(propname)) {
				const char *perm = NULL;
				const char *uq_prefix =
				    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
				const char *gq_prefix =
				    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];

				if (strncmp(propname, uq_prefix,
				    strlen(uq_prefix)) == 0) {
					perm = ZFS_DELEG_PERM_USERQUOTA;
				} else if (strncmp(propname, gq_prefix,
				    strlen(gq_prefix)) == 0) {
					perm = ZFS_DELEG_PERM_GROUPQUOTA;
				} else {
					/* USERUSED and GROUPUSED are read-only */
					return (SET_ERROR(EINVAL));
				}

				if ((err = zfs_secpolicy_write_perms(dsname, perm,
				    cr)) != 0)
					return (err);
				return (0);
			}

			return (SET_ERROR(EINVAL));
		}

		if (issnap)
			return (SET_ERROR(EINVAL));

		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
			/*
			 * dsl_prop_get_all_impl() returns properties in this
			 * format.
			 */
			nvlist_t *attrs;
			VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
			    &pair) == 0);
		}

		/*
		 * Check that this value is valid for this pool version
		 */
		switch (prop) {
		case ZFS_PROP_COMPRESSION:
			/*
			 * If the user specified gzip compression, make sure
			 * the SPA supports it. We ignore any errors here since
			 * we'll catch them later.
			 */
			if (nvpair_value_uint64(pair, &intval) == 0) {
				if (intval >= ZIO_COMPRESS_GZIP_1 &&
				    intval <= ZIO_COMPRESS_GZIP_9 &&
				    zfs_earlier_version(dsname,
				    SPA_VERSION_GZIP_COMPRESSION)) {
					return (SET_ERROR(ENOTSUP));
				}

				if (intval == ZIO_COMPRESS_ZLE &&
				    zfs_earlier_version(dsname,
				    SPA_VERSION_ZLE_COMPRESSION))
					return (SET_ERROR(ENOTSUP));

				if (intval == ZIO_COMPRESS_LZ4) {
					spa_t *spa;

					if ((err = spa_open(dsname, &spa, FTAG)) != 0)
						return (err);

					if (!spa_feature_is_enabled(spa,
					    SPA_FEATURE_LZ4_COMPRESS)) {
						spa_close(spa, FTAG);
						return (SET_ERROR(ENOTSUP));
					}
					spa_close(spa, FTAG);
				}

				/*
				 * If this is a bootable dataset then
				 * verify that the compression algorithm
				 * is supported for booting. We must return
				 * something other than ENOTSUP since it
				 * implies a downrev pool version.
				 */
				if (zfs_is_bootfs(dsname) &&
				    !BOOTFS_COMPRESS_VALID(intval)) {
					return (SET_ERROR(ERANGE));
				}
			}
			break;

		case ZFS_PROP_COPIES:
			if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
				return (SET_ERROR(ENOTSUP));
			break;

		case ZFS_PROP_RECORDSIZE:
			/* Record sizes above 128k need the feature to be enabled */
			if (nvpair_value_uint64(pair, &intval) == 0 &&
			    intval > SPA_OLD_MAXBLOCKSIZE) {
				spa_t *spa;

				/*
				 * We don't allow setting the property above 1MB,
				 * unless the tunable has been changed.
				 */
				if (intval > zfs_max_recordsize ||
				    intval > SPA_MAXBLOCKSIZE)
					return (SET_ERROR(ERANGE));

				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
					return (err);

				if (!spa_feature_is_enabled(spa,
				    SPA_FEATURE_LARGE_BLOCKS)) {
					spa_close(spa, FTAG);
					return (SET_ERROR(ENOTSUP));
				}
				spa_close(spa, FTAG);
			}
			break;

		case ZFS_PROP_SHARESMB:
			if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
				return (SET_ERROR(ENOTSUP));
			break;

		case ZFS_PROP_ACLINHERIT:
			if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
			    nvpair_value_uint64(pair, &intval) == 0) {
				if (intval == ZFS_ACL_PASSTHROUGH_X &&
				    zfs_earlier_version(dsname,
				    SPA_VERSION_PASSTHROUGH_X))
					return (SET_ERROR(ENOTSUP));
			}
			break;

		case ZFS_PROP_CHECKSUM:
		case ZFS_PROP_DEDUP:
		{
			spa_feature_t feature;
			spa_t *spa;

			/* dedup feature version checks */
			if (prop == ZFS_PROP_DEDUP &&
			    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
				return (SET_ERROR(ENOTSUP));

			if (nvpair_value_uint64(pair, &intval) != 0)
				return (SET_ERROR(EINVAL));

			/* check prop value is enabled in features */
			feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
			if (feature == SPA_FEATURE_NONE)
				break;

			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
				return (err);
			/*
			 * Salted checksums are not supported on root pools.
			 */
			if (spa_bootfs(spa) != 0 &&
			    intval < ZIO_CHECKSUM_FUNCTIONS &&
			    (zio_checksum_table[intval].ci_flags &
			    ZCHECKSUM_FLAG_SALTED)) {
				spa_close(spa, FTAG);
				return (SET_ERROR(ERANGE));
			}
			if (!spa_feature_is_enabled(spa, feature)) {
				spa_close(spa, FTAG);
				return (SET_ERROR(ENOTSUP));
			}
			spa_close(spa, FTAG);
			break;
		}
		}

		/*
		 * NOTE(review): the final policy check uses CRED() rather than
		 * the cr argument; kept as in the original.
		 */
		return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
	}

	/*
	 * Checks for a race condition to make sure we don't increment a feature
	 * flag multiple times.
	 *
	 * arg points at the spa_feature_t to activate.  Returns 0 while the
	 * feature is not yet active and EBUSY once it is.
	 */
	static int
	zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
	{
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		spa_feature_t *featurep = arg;

		if (!spa_feature_is_active(spa, *featurep))
			return (0);
		else
			return (SET_ERROR(EBUSY));
	}

	/*
	 * The callback invoked on feature activation in the sync task caused by
	 * zfs_prop_activate_feature.
	 *
	 * arg points at the spa_feature_t whose count is incremented with
	 * spa_feature_incr().
	 */
	static void
	zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
	{
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		spa_feature_t *featurep = arg;

		spa_feature_incr(spa, *featurep, tx);
	}

	/*
	 * Activate a pool feature as a consequence of a property being set.
	 * The work is done by a sync task (check + sync callbacks above) that
	 * updates the pool so the feature is recorded as active.
	 *
	 * Returns 0 on success or if the feature was already active; any other
	 * sync task failure is returned unchanged.
	 */
	static int
	zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
	{
		int rc = dsl_sync_task(spa_name(spa),
		    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
		    &feature, 2, ZFS_SPACE_CHECK_RESERVED);

		/* The check callback reports "already active" as EBUSY. */
		if (rc == EBUSY)
			rc = 0;

		return (rc);
	}

	/*
	 * Removes properties from the given props list that fail permission
	 * checks needed to clear them and to restore them in case of a receive
	 * error. For each property, make sure we have both set and inherit
	 * permissions.
	 *
	 * Returns the first error encountered if any permission checks fail. If
	 * the caller provides a non-NULL errlist, it also gives the complete
	 * list of names of all the properties that failed a permission check
	 * along with the corresponding error numbers. The caller is responsible
	 * for freeing the returned errlist.
	 *
	 * If every property checks out successfully, zero is returned and the
	 * list pointed at by errlist is NULL.
	 */
	static int
	zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
	{
		zfs_cmd_t *zc;
		nvpair_t *pair, *next_pair;
		nvlist_t *errors;
		int err, rv = 0;

		if (props == NULL) {
			/* Honor the documented contract on the trivial path too. */
			if (errlist != NULL)
				*errlist = NULL;
			return (0);
		}

		VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		/*
		 * A scratch zfs_cmd_t is needed because
		 * zfs_secpolicy_inherit_prop() takes its arguments in that form.
		 * Copies into its fixed-size fields are bounded.
		 */
		zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
		(void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name));
		pair = nvlist_next_nvpair(props, NULL);
		while (pair != NULL) {
			/* Fetch the successor first: pair may be removed below. */
			next_pair = nvlist_next_nvpair(props, pair);

			(void) strlcpy(zc->zc_value, nvpair_name(pair),
			    sizeof (zc->zc_value));
			if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
			    (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
				VERIFY(nvlist_remove_nvpair(props, pair) == 0);
				VERIFY(nvlist_add_int32(errors,
				    zc->zc_value, err) == 0);
			}
			pair = next_pair;
		}
		kmem_free(zc, sizeof (zfs_cmd_t));

		if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
			nvlist_free(errors);
			errors = NULL;
		} else {
			VERIFY(nvpair_value_int32(pair, &rv) == 0);
		}

		if (errlist == NULL)
			nvlist_free(errors);
		else
			*errlist = errors;

		return (rv);
	}

	/*
	 * Compare the values of two property nvpairs.  Either pair may be in
	 * the dsl_prop_get_all_impl() format (an nvlist holding the value under
	 * ZPROP_VALUE), in which case the inner pair is compared.  Pairs of
	 * different types are unequal; strings are compared with strcmp() and
	 * everything else is read as a uint64.
	 */
	static boolean_t
	propval_equals(nvpair_t *p1, nvpair_t *p2)
	{
		if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
			/* dsl_prop_get_all_impl() format */
			nvlist_t *attrs;
			VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
			    &p1) == 0);
		}

		if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
			nvlist_t *attrs;
			VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
			    &p2) == 0);
		}

		if (nvpair_type(p1) != nvpair_type(p2))
			return (B_FALSE);

		if (nvpair_type(p1) == DATA_TYPE_STRING) {
			char *valstr1, *valstr2;

			VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
			VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
			return (strcmp(valstr1, valstr2) == 0);
		} else {
			uint64_t intval1, intval2;

			VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
			VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
			return (intval1 == intval2);
		}
	}

	/*
	 * Remove properties from props if they are not going to change (as
	 * determined by comparison with origprops). Remove them from origprops
	 * as well, since we do not need to clear or restore properties that
	 * won't change.
	 */
	static void
	props_reduce(nvlist_t *props, nvlist_t *origprops)
	{
		nvpair_t *pair, *next_pair;

		if (origprops == NULL)
			return; /* all props need to be received */

		pair = nvlist_next_nvpair(props, NULL);
		while (pair != NULL) {
			const char *propname = nvpair_name(pair);
			nvpair_t *match;

			/* Fetch the successor first: pair may be removed below. */
			next_pair = nvlist_next_nvpair(props, pair);

			if ((nvlist_lookup_nvpair(origprops, propname,
			    &match) != 0) || !propval_equals(pair, match))
				goto next; /* need to set received value */

			/* don't clear the existing received value */
			(void) nvlist_remove_nvpair(origprops, match);
			/* don't bother receiving the property */
			(void) nvlist_remove_nvpair(props, pair);
	next:
			pair = next_pair;
		}
	}

	/*
	 * Extract properties that cannot be set PRIOR to the receipt of a
	 * dataset. For example, refquota cannot be set until after the receipt
	 * of a dataset, because in replication streams, an older/earlier
	 * snapshot may exceed the refquota. We want to receive the
	 * older/earlier snapshot, but setting refquota pre-receipt will set the
	 * dsl's ACTUAL quota, which will prevent the older/earlier snapshot
	 * from being received (with EDQUOT).
	 *
	 * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
	 *
	 * libzfs will need to be judicious handling errors encountered by props
	 * extracted by this function.
	 *
	 * The matching pairs are moved out of props into a newly allocated
	 * nvlist, which is returned (the caller frees it); NULL is returned
	 * when nothing was extracted.
	 */
	static nvlist_t *
	extract_delay_props(nvlist_t *props)
	{
		nvlist_t *delayprops;
		nvpair_t *nvp, *tmp;
		static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
		int i;

		VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(props, nvp)) {
			/*
			 * strcmp() is safe because zfs_prop_to_name() always returns
			 * a bounded string.
			 */
			for (i = 0; delayable[i] != 0; i++) {
				if (strcmp(zfs_prop_to_name(delayable[i]),
				    nvpair_name(nvp)) == 0) {
					break;
				}
			}
			if (delayable[i] != 0) {
				/*
				 * Step back to the predecessor before removing nvp
				 * so the loop increment resumes from a pair that is
				 * still in the list.
				 */
				tmp = nvlist_prev_nvpair(props, nvp);
				VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
				VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
				nvp = tmp;
			}
		}

		if (nvlist_empty(delayprops)) {
			nvlist_free(delayprops);
			delayprops = NULL;
		}
		return (delayprops);
	}

	#ifdef DEBUG
	/*
	 * DEBUG-only error injection switch: when set, zfs_ioc_recv() resets it
	 * and forces its own result to an error.
	 */
	static boolean_t zfs_ioc_recv_inject_err;
	#endif

	/*
	 * Receive a send stream into a dataset.
	 *
	 * inputs:
	 * zc_name		name of containing filesystem
	 * zc_nvlist_src{_size}	nvlist of properties to apply
	 * zc_value		name of snapshot to create
	 * zc_string		name of clone origin (if DRR_FLAG_CLONE)
	 * zc_cookie		file descriptor to recv from
	 * zc_begin_record	the BEGIN record of the stream (not byteswapped)
	 * zc_guid		force flag
	 * zc_cleanup_fd	cleanup-on-exit file descriptor
	 * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
	 * zc_resumable		if data is incomplete assume sender will resume
	 *
	 * outputs:
	 * zc_cookie		number of bytes read
	 * zc_nvlist_dst{_size}	error for each unapplied received property
	 * zc_obj		zprop_errflags_t
	 * zc_action_handle	handle for this guid/ds mapping
	 *
	 * The receive error takes precedence in the return value; a property
	 * error is returned only when the receive itself succeeded.
	 */
	static int
	zfs_ioc_recv(zfs_cmd_t *zc)
	{
		file_t *fp;
		dmu_recv_cookie_t drc;
		boolean_t force = (boolean_t)zc->zc_guid;
		int fd;
		int error = 0;
		int props_error = 0;
		nvlist_t *errors;
		offset_t off;
		nvlist_t *props = NULL; /* sent properties */
		nvlist_t *origprops = NULL; /* existing properties */
		nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
		char *origin = NULL;
		char *tosnap;
		char tofs[ZFS_MAX_DATASET_NAME_LEN];
		cap_rights_t rights;
		boolean_t first_recvd_props = B_FALSE;

		if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
		    strchr(zc->zc_value, '@') == NULL ||
		    strchr(zc->zc_value, '%'))
			return (SET_ERROR(EINVAL));

		/* Split "<fs>@<snap>" into tofs and tosnap. */
		(void) strcpy(tofs, zc->zc_value);
		tosnap = strchr(tofs, '@');
		*tosnap++ = '\0';

		if (zc->zc_nvlist_src != 0 &&
		    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props)) != 0)
			return (error);

		fd = zc->zc_cookie;
	#ifdef illumos
		fp = getf(fd);
	#else
		fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
	#endif
		if (fp == NULL) {
			nvlist_free(props);
			return (SET_ERROR(EBADF));
		}

		errors = fnvlist_alloc();

		if (zc->zc_string[0])
			origin = zc->zc_string;

		error = dmu_recv_begin(tofs, tosnap,
		    &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
		if (error != 0)
			goto out;

		/*
		 * Set properties before we receive the stream so that they are applied
		 * to the new data. Note that we must call dmu_recv_stream() if
		 * dmu_recv_begin() succeeds.
		 */
		if (props != NULL && !drc.drc_newfs) {
			if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
			    SPA_VERSION_RECVD_PROPS &&
			    !dsl_prop_get_hasrecvd(tofs))
				first_recvd_props = B_TRUE;

			/*
			 * If new received properties are supplied, they are to
			 * completely replace the existing received properties, so stash
			 * away the existing ones.
			 */
			if (dsl_prop_get_received(tofs, &origprops) == 0) {
				nvlist_t *errlist = NULL;
				/*
				 * Don't bother writing a property if its value won't
				 * change (and avoid the unnecessary security checks).
				 *
				 * The first receive after SPA_VERSION_RECVD_PROPS is a
				 * special case where we blow away all local properties
				 * regardless.
				 */
				if (!first_recvd_props)
					props_reduce(props, origprops);
				if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
					(void) nvlist_merge(errors, errlist, 0);
				nvlist_free(errlist);

				if (clear_received_props(tofs, origprops,
				    first_recvd_props ? NULL : props) != 0)
					zc->zc_obj |= ZPROP_ERR_NOCLEAR;
			} else {
				zc->zc_obj |= ZPROP_ERR_NOCLEAR;
			}
		}

		if (props != NULL) {
			props_error = dsl_prop_set_hasrecvd(tofs);

			if (props_error == 0) {
				delayprops = extract_delay_props(props);
				(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
				    props, errors);
			}
		}

		off = fp->f_offset;
		error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
		    &zc->zc_action_handle);

		if (error == 0) {
			zfsvfs_t *zfsvfs = NULL;

			if (getzfsvfs(tofs, &zfsvfs) == 0) {
				/* online recv */
				dsl_dataset_t *ds;
				int end_err;

				ds = dmu_objset_ds(zfsvfs->z_os);
				error = zfs_suspend_fs(zfsvfs);
				/*
				 * If the suspend fails, then the recv_end will
				 * likely also fail, and clean up after itself.
				 */
				end_err = dmu_recv_end(&drc, zfsvfs);
				if (error == 0)
					error = zfs_resume_fs(zfsvfs, ds);
				error = error ? error : end_err;
	#ifdef illumos
				VFS_RELE(zfsvfs->z_vfs);
	#else
				vfs_unbusy(zfsvfs->z_vfs);
	#endif
			} else {
				error = dmu_recv_end(&drc, NULL);
			}

			/* Set delayed properties now, after we're done receiving. */
			if (delayprops != NULL && error == 0) {
				(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
				    delayprops, errors);
			}
		}

		if (delayprops != NULL) {
			/*
			 * Merge delayed props back in with initial props, in case
			 * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
			 * we have to make sure clear_received_props() includes
			 * the delayed properties).
			 *
			 * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
			 * using ASSERT() will be just like a VERIFY.
			 */
			ASSERT(nvlist_merge(props, delayprops, 0) == 0);
			nvlist_free(delayprops);
		}

		/*
		 * Now that all props, initial and delayed, are set, report the prop
		 * errors to the caller.
		 */
		if (zc->zc_nvlist_dst_size != 0 &&
		    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
		    put_nvlist(zc, errors) != 0)) {
			/*
			 * Caller made zc->zc_nvlist_dst less than the minimum expected
			 * size or supplied an invalid address.
			 */
			props_error = SET_ERROR(EINVAL);
		}

		zc->zc_cookie = off - fp->f_offset;
		if (off >= 0 && off <= MAXOFFSET_T)
			fp->f_offset = off;

	#ifdef DEBUG
		if (zfs_ioc_recv_inject_err) {
			zfs_ioc_recv_inject_err = B_FALSE;
			error = 1;
		}
	#endif

	#ifdef __FreeBSD__
		if (error == 0)
			zvol_create_minors(tofs);
	#endif

		/*
		 * On error, restore the original props.
		 */
		if (error != 0 && props != NULL && !drc.drc_newfs) {
			if (clear_received_props(tofs, props, NULL) != 0) {
				/*
				 * We failed to clear the received properties.
				 * Since we may have left a $recvd value on the
				 * system, we can't clear the $hasrecvd flag.
				 */
				zc->zc_obj |= ZPROP_ERR_NORESTORE;
			} else if (first_recvd_props) {
				dsl_prop_unset_hasrecvd(tofs);
			}

			if (origprops == NULL && !drc.drc_newfs) {
				/* We failed to stash the original properties. */
				zc->zc_obj |= ZPROP_ERR_NORESTORE;
			}

			/*
			 * dsl_props_set() will not convert RECEIVED to LOCAL on or
			 * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
			 * explicitly if we're restoring local properties cleared in the
			 * first new-style receive.
			 */
			if (origprops != NULL &&
			    zfs_set_prop_nvlist(tofs, (first_recvd_props ?
			    ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
			    origprops, NULL) != 0) {
				/*
				 * We stashed the original properties but failed to
				 * restore them.
				 */
				zc->zc_obj |= ZPROP_ERR_NORESTORE;
			}
		}
	out:
		nvlist_free(props);
		nvlist_free(origprops);
		nvlist_free(errors);
		releasef(fd);

		if (error == 0)
			error = props_error;

		return (error);
	}

	/*
	 * Generate a send stream for a snapshot, or only estimate its size.
	 *
	 * inputs:
	 * zc_name name of snapshot to send
	 * zc_cookie file descriptor to send stream to
	 * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
	 * zc_sendobj objsetid of snapshot to send
	 * zc_fromobj objsetid of incremental fromsnap (may be zero)
	 * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
	 * output size in zc_objset_type.
	 * zc_flags lzc_send_flags
	 *
	 * outputs:
	 * zc_objset_type estimated size, if zc_guid is set
	 */
	static int
	zfs_ioc_send(zfs_cmd_t *zc)
	{
	int error;
	offset_t off;
	boolean_t estimate = (zc->zc_guid != 0);
	/* Individual bits of zc_flags select optional stream features. */
	boolean_t embedok = (zc->zc_flags & 0x1);
	boolean_t large_block_ok = (zc->zc_flags & 0x2);
	boolean_t compressok = (zc->zc_flags & 0x4);

	/*
	 * fromorigin requested: if the snapshot to send belongs to a clone,
	 * use the clone's origin as the incremental source.
	 */
	if (zc->zc_obj != 0) {
	dsl_pool_t *dp;
	dsl_dataset_t *tosnap;

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
	return (error);

	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
	if (error != 0) {
	dsl_pool_rele(dp, FTAG);
	return (error);
	}

	if (dsl_dir_is_clone(tosnap->ds_dir))
	zc->zc_fromobj =
	dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
	dsl_dataset_rele(tosnap, FTAG);
	dsl_pool_rele(dp, FTAG);
	}

	if (estimate) {
	/* Size estimate only: no file descriptor is touched. */
	dsl_pool_t *dp;
	dsl_dataset_t *tosnap;
	dsl_dataset_t *fromsnap = NULL;

	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
	if (error != 0)
	return (error);

	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
	if (error != 0) {
	dsl_pool_rele(dp, FTAG);
	return (error);
	}

	if (zc->zc_fromobj != 0) {
	error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
	FTAG, &fromsnap);
	if (error != 0) {
	dsl_dataset_rele(tosnap, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (error);
	}
	}

	/* The estimate is written straight into zc_objset_type. */
	error = dmu_send_estimate(tosnap, fromsnap, compressok,
	&zc->zc_objset_type);

	if (fromsnap != NULL)
	dsl_dataset_rele(fromsnap, FTAG);
	dsl_dataset_rele(tosnap, FTAG);
	dsl_pool_rele(dp, FTAG);
	} else {
	/* Real send: write the stream to the descriptor in zc_cookie. */
	file_t *fp;
	cap_rights_t rights;

	#ifdef illumos
	fp = getf(zc->zc_cookie);
	#else
	fget_write(curthread, zc->zc_cookie,
	cap_rights_init(&rights, CAP_WRITE), &fp);
	#endif
	if (fp == NULL)
	return (SET_ERROR(EBADF));

	off = fp->f_offset;
	/* The last arguments differ per platform: vnode vs. file pointer. */
	error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
	zc->zc_fromobj, embedok, large_block_ok, compressok,
	#ifdef illumos
	zc->zc_cookie, fp->f_vnode, &off);
	#else
	zc->zc_cookie, fp, &off);
	#endif

	/* Publish the new offset only if it is within the valid range. */
	if (off >= 0 && off <= MAXOFFSET_T)
	fp->f_offset = off;
	releasef(zc->zc_cookie);
	}
	return (error);
	}

	/*
	 * Report how far an in-flight send of a snapshot has progressed.
	 *
	 * inputs:
	 * zc_name	name of snapshot on which to report progress
	 * zc_cookie	file descriptor of send stream
	 *
	 * outputs:
	 * zc_cookie	number of bytes written in send stream thus far
	 *
	 * Returns ENOENT when no matching stream is active on the dataset.
	 */
	static int
	zfs_ioc_send_progress(zfs_cmd_t *zc)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		dmu_sendarg_t *stream = NULL;
		int error;

		if ((error = dsl_pool_hold(zc->zc_name, FTAG, &dp)) != 0)
			return (error);

		if ((error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds)) != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		mutex_enter(&ds->ds_sendstream_lock);

		/*
		 * Walk the send streams currently active on this dataset and
		 * stop at the one that both writes to the given file descriptor
		 * and was started by the calling process.
		 */
		stream = list_head(&ds->ds_sendstreams);
		while (stream != NULL &&
		    !(stream->dsa_outfd == zc->zc_cookie &&
		    stream->dsa_proc == curproc)) {
			stream = list_next(&ds->ds_sendstreams, stream);
		}

		if (stream == NULL)
			error = SET_ERROR(ENOENT);
		else
			zc->zc_cookie = *(stream->dsa_off);

		mutex_exit(&ds->ds_sendstream_lock);
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/*
	 * Register a fault injection record (zc_inject_record) for zc_name via
	 * zio_inject_fault(); zc_guid carries the flags in and, on success, the
	 * id of the new injection handler out.
	 */
	static int
	zfs_ioc_inject_fault(zfs_cmd_t *zc)
	{
		int handler_id;
		int rc;

		rc = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &handler_id,
		    &zc->zc_inject_record);
		if (rc != 0)
			return (rc);

		zc->zc_guid = (uint64_t)handler_id;
		return (0);
	}

	/*
	 * Remove the fault injection handler whose id is passed in zc_guid.
	 */
	static int
	zfs_ioc_clear_fault(zfs_cmd_t *zc)
	{
		const int handler_id = (int)zc->zc_guid;

		return (zio_clear_fault(handler_id));
	}

	/*
	 * Step to the next fault injection handler.  zc_guid holds the id to
	 * start from and receives the id produced by zio_inject_list_next(),
	 * whether or not the call succeeds; zc_name and zc_inject_record are
	 * filled in by that call.
	 */
	static int
	zfs_ioc_inject_list_next(zfs_cmd_t *zc)
	{
		int cursor = (int)zc->zc_guid;
		int rc;

		rc = zio_inject_list_next(&cursor, zc->zc_name,
		    sizeof (zc->zc_name), &zc->zc_inject_record);
		zc->zc_guid = cursor;

		return (rc);
	}

	/*
	 * Copy the pool's error log to the buffer described by
	 * zc_nvlist_dst / zc_nvlist_dst_size.  On success zc_nvlist_dst_size is
	 * replaced by the count reported by spa_get_errlog(); on failure it is
	 * set to spa_get_errlog_size().
	 */
	static int
	zfs_ioc_error_log(zfs_cmd_t *zc)
	{
		size_t nentries = (size_t)zc->zc_nvlist_dst_size;
		spa_t *spa;
		int rc;

		rc = spa_open(zc->zc_name, &spa, FTAG);
		if (rc != 0)
			return (rc);

		rc = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
		    &nentries);
		if (rc != 0)
			zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
		else
			zc->zc_nvlist_dst_size = nentries;

		spa_close(spa, FTAG);

		return (rc);
	}

	/*
	 * Clear errors on a pool (zc_guid == 0) or on a single vdev (zc_guid is
	 * the vdev guid) and resume suspended I/O.
	 *
	 * Unless ZPOOL_NO_REWIND is set in zc_cookie, a rewind policy nvlist
	 * must be supplied in zc_nvlist_src; the pool is then opened with
	 * spa_open_rewind() and any resulting config is copied out with
	 * put_nvlist().
	 *
	 * Returns EIO if the pool is not found or zio_resume() fails, EINVAL if
	 * the rewind policy is missing, ENODEV if the vdev guid is not found,
	 * or the error from opening the pool / copying out the config.
	 */
	static int
	zfs_ioc_clear(zfs_cmd_t *zc)
	{
		spa_t *spa;
		vdev_t *vd;
		int error;

		/*
		 * On zpool clear we also fix up missing slogs
		 */
		mutex_enter(&spa_namespace_lock);
		spa = spa_lookup(zc->zc_name);
		if (spa == NULL) {
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EIO));
		}
		if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
			/* we need to let spa_open/spa_load clear the chains */
			spa_set_log_state(spa, SPA_LOG_CLEAR);
		}
		spa->spa_last_open_failed = 0;
		mutex_exit(&spa_namespace_lock);

		if (zc->zc_cookie & ZPOOL_NO_REWIND) {
			error = spa_open(zc->zc_name, &spa, FTAG);
		} else {
			nvlist_t *policy;
			nvlist_t *config = NULL;

			if (zc->zc_nvlist_src == 0)
				return (SET_ERROR(EINVAL));

			if ((error = get_nvlist(zc->zc_nvlist_src,
			    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
				error = spa_open_rewind(zc->zc_name, &spa, FTAG,
				    policy, &config);
				if (config != NULL) {
					int err;

					if ((err = put_nvlist(zc, config)) != 0) {
						/*
						 * The open may have succeeded; drop
						 * its reference before failing so the
						 * pool is not left held.
						 */
						if (error == 0)
							spa_close(spa, FTAG);
						error = err;
					}
					nvlist_free(config);
				}
				nvlist_free(policy);
			}
		}

		if (error != 0)
			return (error);

		spa_vdev_state_enter(spa, SCL_NONE);

		if (zc->zc_guid == 0) {
			/* A NULL vdev is passed to vdev_clear() for the pool case. */
			vd = NULL;
		} else {
			vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
			if (vd == NULL) {
				(void) spa_vdev_state_exit(spa, NULL, ENODEV);
				spa_close(spa, FTAG);
				return (SET_ERROR(ENODEV));
			}
		}

		vdev_clear(spa, vd);

		(void) spa_vdev_state_exit(spa, NULL, 0);

		/*
		 * Resume any suspended I/Os.
		 */
		if (zio_resume(spa) != 0)
			error = SET_ERROR(EIO);

		spa_close(spa, FTAG);

		return (error);
	}

	/*
	 * Reopen the vdev tree of a pool.
	 *
	 * inputs:
	 * zc_name		name of the pool
	 *
	 * Returns the error from spa_open(), otherwise 0.
	 */
	static int
	zfs_ioc_pool_reopen(zfs_cmd_t *zc)
	{
		spa_t *spa;
		int rc = spa_open(zc->zc_name, &spa, FTAG);

		if (rc != 0)
			return (rc);

		spa_vdev_state_enter(spa, SCL_NONE);

		/*
		 * While a resilver is already running, spa_scrub_reopen is
		 * B_TRUE for the duration of the reopen so that the reopen does
		 * not restart the scan as a side effect.  Otherwise vdev_open()
		 * is left to decide whether a resilver is required.  The flag
		 * is always reset afterwards.
		 */
		spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
		vdev_reopen(spa->spa_root_vdev);
		spa->spa_scrub_reopen = B_FALSE;

		(void) spa_vdev_state_exit(spa, NULL, 0);
		spa_close(spa, FTAG);

		return (0);
	}
	/*
	 * Promote a clone, after unmounting the snapshots of its origin.
	 *
	 * inputs:
	 * zc_name		name of filesystem
	 *
	 * outputs:
	 * zc_string		name of conflicting snapshot, if there is one
	 *
	 * Returns EINVAL if zc_name is not a clone, the error from any of the
	 * hold calls, or the result of dsl_dataset_promote().
	 */
	static int
	zfs_ioc_promote(zfs_cmd_t *zc)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds, *ods;
		char origin[ZFS_MAX_DATASET_NAME_LEN];
		char *cp;
		int error;

		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		if (!dsl_dir_is_clone(ds->ds_dir)) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		/* Capture the origin's name, then drop every hold. */
		dsl_dataset_name(ods, origin);
		dsl_dataset_rele(ods, FTAG);
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_rele(dp, FTAG);

		/*
		 * We don't need to unmount all the origin fs's snapshots, but
		 * it's easier.  Strip any "@snapshot" suffix so the walk
		 * starts at the origin filesystem itself.
		 */
		cp = strchr(origin, '@');
		if (cp)
			*cp = '\0';
		(void) dmu_objset_find(origin,
		    zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
		return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
	}

	/*
	 * Retrieve a single {user|group}{used|quota}@... property.
	 *
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_objset_type	zfs_userquota_prop_t
	 * zc_value		domain name (eg. "S-1-234-567-89")
	 * zc_guid		RID/UID/GID
	 *
	 * outputs:
	 * zc_cookie		property value
	 *
	 * Returns EINVAL when zc_objset_type is not below
	 * ZFS_NUM_USERQUOTA_PROPS, otherwise the error from zfsvfs_hold() or
	 * zfs_userspace_one().
	 */
	static int
	zfs_ioc_userspace_one(zfs_cmd_t *zc)
	{
		zfsvfs_t *zfsvfs;
		int rc;

		if (!(zc->zc_objset_type < ZFS_NUM_USERQUOTA_PROPS))
			return (SET_ERROR(EINVAL));

		rc = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
		if (rc == 0) {
			rc = zfs_userspace_one(zfsvfs, zc->zc_objset_type,
			    zc->zc_value, zc->zc_guid, &zc->zc_cookie);
			zfsvfs_rele(zfsvfs, FTAG);
		}

		return (rc);
	}

	/*
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_cookie		zap cursor
	 * zc_objset_type	zfs_userquota_prop_t
	 * zc_nvlist_dst[_size]	buffer to fill (not really an nvlist)
	 *
	 * outputs:
	 * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
	 * zc_cookie		zap cursor
	 *
	 * Returns ENOMEM when the requested size is zero or does not fit in a
	 * positive int, otherwise the error from zfsvfs_hold(),
	 * zfs_userspace_many() or ddi_copyout().
	 */
	static int
	zfs_ioc_userspace_many(zfs_cmd_t *zc)
	{
		zfsvfs_t *zfsvfs;
		int bufsize = zc->zc_nvlist_dst_size;

		/*
		 * The kernel buffer is sized by the (int) bufsize, while
		 * zfs_userspace_many() is handed the full-width
		 * zc_nvlist_dst_size.  Refuse any request where narrowing
		 * changed the value so the two sizes can never disagree.
		 */
		if (bufsize <= 0 || (uint64_t)bufsize != zc->zc_nvlist_dst_size)
			return (SET_ERROR(ENOMEM));

		int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
		if (error != 0)
			return (error);

		void *buf = kmem_alloc(bufsize, KM_SLEEP);

		error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
		    buf, &zc->zc_nvlist_dst_size);

		if (error == 0) {
			/* zc_nvlist_dst_size is the size left by the callee. */
			error = ddi_copyout(buf,
			    (void *)(uintptr_t)zc->zc_nvlist_dst,
			    zc->zc_nvlist_dst_size, zc->zc_iflags);
		}
		kmem_free(buf, bufsize);
		zfsvfs_rele(zfsvfs, FTAG);

		return (error);
	}

	/*
	 * Run dmu_objset_userspace_upgrade() on a filesystem's objset.
	 *
	 * inputs:
	 * zc_name		name of filesystem
	 *
	 * outputs:
	 * none
	 *
	 * When getzfsvfs() finds the filesystem, its live objset is used
	 * (after a suspend/resume cycle if userused accounting is not yet
	 * enabled); otherwise the objset is held by name for the upgrade.
	 */
	static int
	zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
	{
		objset_t *os;
		int error = 0;
		zfsvfs_t *zfsvfs;

		if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
			if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
				/*
				 * If userused is not enabled, it may be because the
				 * objset needs to be closed & reopened (to grow the
				 * objset_phys_t).  Suspend/resume the fs will do that.
				 */
				dsl_dataset_t *ds, *newds;

				ds = dmu_objset_ds(zfsvfs->z_os);
				error = zfs_suspend_fs(zfsvfs);
				if (error == 0) {
					dmu_objset_refresh_ownership(ds, &newds,
					    zfsvfs);
					error = zfs_resume_fs(zfsvfs, newds);
				}
			}
			if (error == 0)
				error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
	#ifdef illumos
			VFS_RELE(zfsvfs->z_vfs);
	#else
			vfs_unbusy(zfsvfs->z_vfs);
	#endif
		} else {
			/* XXX kind of reading contents without owning */
			error = dmu_objset_hold(zc->zc_name, FTAG, &os);
			if (error != 0)
				return (error);

			error = dmu_objset_userspace_upgrade(os);
			dmu_objset_rele(os, FTAG);
		}

		return (error);
	}

	#ifdef illumos
	/*
	 * We don't want to have a hard dependency against some special symbols
	 * in sharefs, nfs, and smbsrv.  Instead they are resolved with
	 * ddi_modopen()/ddi_modsym() when the first file system is shared and
	 * cached in the function pointers and module handles below.
	 * Neither sharefs, nfs or smbsrv are unloadable modules.
	 */
	int (*znfsexport_fs)(void *arg);
	int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
	int (*zsmbexport_fs)(void *arg, boolean_t add_share);

	/* Set to 1 once the corresponding hooks have been resolved. */
	int zfs_nfsshare_inited;
	int zfs_smbshare_inited;

	ddi_modhandle_t nfs_mod;
	ddi_modhandle_t sharefs_mod;
	ddi_modhandle_t smbsrv_mod;
	#endif /* illumos */
	/* Held while the share hooks above are being resolved. */
	kmutex_t zfs_share_lock;

	#ifdef illumos
	/*
	 * Resolve the sharefs module handle and its "sharefs_impl" entry point
	 * into sharefs_mod / zshare_fs, if not already done.  The caller must
	 * hold zfs_share_lock.
	 *
	 * Returns 0 on success, ENOSYS if the module or symbol is unavailable.
	 */
	static int
	zfs_init_sharefs(void)
	{
		int error;

		ASSERT(MUTEX_HELD(&zfs_share_lock));
		/* Both NFS and SMB shares also require sharetab support. */
		if (sharefs_mod == NULL && ((sharefs_mod =
		    ddi_modopen("fs/sharefs",
		    KRTLD_MODE_FIRST, &error)) == NULL)) {
			return (SET_ERROR(ENOSYS));
		}
		if (zshare_fs == NULL && ((zshare_fs =
		    (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
		    ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
			return (SET_ERROR(ENOSYS));
		}
		return (0);
	}
	#endif /* illumos */

	/*
	 * Share or unshare a filesystem over NFS or SMB (illumos only; other
	 * platforms return ENOSYS).
	 *
	 * inputs:
	 * zc_share.z_sharetype		ZFS_{SHARE,UNSHARE}_{NFS,SMB}
	 * zc_share.z_exportdata	user address passed to the export hook
	 * zc_share.z_sharedata		user address passed to zshare_fs()
	 * zc_share.z_sharemax		size passed to zshare_fs()
	 *
	 * The first use of each protocol resolves the needed module hooks
	 * under zfs_share_lock.  Returns EINVAL for an unknown share type,
	 * ENOSYS if a hook cannot be resolved, or the error from the export
	 * hook / zshare_fs().
	 */
	static int
	zfs_ioc_share(zfs_cmd_t *zc)
	{
	#ifdef illumos
		int error;
		int opcode;

		switch (zc->zc_share.z_sharetype) {
		case ZFS_SHARE_NFS:
		case ZFS_UNSHARE_NFS:
			if (zfs_nfsshare_inited == 0) {
				mutex_enter(&zfs_share_lock);
				if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
				    KRTLD_MODE_FIRST, &error)) == NULL)) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				if (znfsexport_fs == NULL &&
				    ((znfsexport_fs = (int (*)(void *))
				    ddi_modsym(nfs_mod,
				    "nfs_export", &error)) == NULL)) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				error = zfs_init_sharefs();
				if (error != 0) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				zfs_nfsshare_inited = 1;
				mutex_exit(&zfs_share_lock);
			}
			break;
		case ZFS_SHARE_SMB:
		case ZFS_UNSHARE_SMB:
			if (zfs_smbshare_inited == 0) {
				mutex_enter(&zfs_share_lock);
				if (smbsrv_mod == NULL && ((smbsrv_mod =
				    ddi_modopen("drv/smbsrv",
				    KRTLD_MODE_FIRST, &error)) == NULL)) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				if (zsmbexport_fs == NULL && ((zsmbexport_fs =
				    (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
				    "smb_server_share", &error)) == NULL)) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				error = zfs_init_sharefs();
				if (error != 0) {
					mutex_exit(&zfs_share_lock);
					return (SET_ERROR(ENOSYS));
				}
				zfs_smbshare_inited = 1;
				mutex_exit(&zfs_share_lock);
			}
			break;
		default:
			return (SET_ERROR(EINVAL));
		}

		/* Hand the export data to the protocol-specific hook. */
		switch (zc->zc_share.z_sharetype) {
		case ZFS_SHARE_NFS:
		case ZFS_UNSHARE_NFS:
			error = znfsexport_fs((void *)
			    (uintptr_t)zc->zc_share.z_exportdata);
			if (error != 0)
				return (error);
			break;
		case ZFS_SHARE_SMB:
		case ZFS_UNSHARE_SMB:
			error = zsmbexport_fs((void *)
			    (uintptr_t)zc->zc_share.z_exportdata,
			    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
			    B_TRUE: B_FALSE);
			if (error != 0)
				return (error);
			break;
		}

		opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS ||
		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ?
		    SHAREFS_ADD : SHAREFS_REMOVE;

		/*
		 * Add or remove share from sharetab
		 */
		error = zshare_fs(opcode,
		    (void *)(uintptr_t)zc->zc_share.z_sharedata,
		    zc->zc_share.z_sharemax);

		return (error);

	#else /* !illumos */
		return (ENOSYS);
	#endif /* illumos */
	}

	/*
	 * Single-entry ACE list; zfs_ioc_smb_acl() installs it as vsa_aclentp
	 * when creating a share resource file for ZFS_SMB_ACL_ADD.
	 * NOTE(review): presumably grants ACE_ALL_PERMS to everyone -- confirm
	 * against the ace_t field order.
	 */
	ace_t full_access[] = {
	{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
	};

	/*
	 * Find the next in-use object after zc_obj.
	 *
	 * inputs:
	 * zc_name		name of containing filesystem
	 * zc_obj		object # beyond which we want next in-use object #
	 *
	 * outputs:
	 * zc_obj		next in-use object #
	 *
	 * Returns the error from dmu_objset_hold() or dmu_object_next().
	 */
	static int
	zfs_ioc_next_obj(zfs_cmd_t *zc)
	{
		objset_t *os = NULL;
		int rc;

		rc = dmu_objset_hold(zc->zc_name, FTAG, &os);
		if (rc == 0) {
			/* The dataset's ds_prev_snap_txg is passed as the txg. */
			rc = dmu_object_next(os, &zc->zc_obj, B_FALSE,
			    dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
			dmu_objset_rele(os, FTAG);
		}

		return (rc);
	}

	/*
	 * Create a temporary snapshot tied to a cleanup-on-exit descriptor.
	 *
	 * inputs:
	 * zc_name		name of filesystem
	 * zc_value		prefix name for snapshot
	 * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
	 *
	 * outputs:
	 * zc_value		short name of new snapshot
	 *
	 * The snapshot is named "<prefix>-<lbolt in hex>" and held under the
	 * tag "%<prefix>".  Returns the error from zfs_onexit_fd_hold() or
	 * dsl_dataset_snapshot_tmp().
	 */
	static int
	zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
	{
		char *snap_name;
		char *hold_name;
		int error;
		minor_t minor;

		error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
		if (error != 0)
			return (error);

		snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
		    (u_longlong_t)ddi_get_lbolt64());
		hold_name = kmem_asprintf("%%%s", zc->zc_value);

		error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
		    hold_name);
		if (error == 0) {
			/*
			 * snap_name is always longer than the prefix it was
			 * built from, so bound the copy back into zc_value.
			 */
			(void) strlcpy(zc->zc_value, snap_name,
			    sizeof (zc->zc_value));
		}
		strfree(snap_name);
		strfree(hold_name);
		zfs_onexit_fd_rele(zc->zc_cleanup_fd);
		return (error);
	}

	/*
	 * inputs:
	 * zc_name		name of "to" snapshot
	 * zc_value		name of "from" snapshot
	 * zc_cookie		file descriptor to write diff data on
	 *
	 * outputs:
	 * dmu_diff_record_t's to the file descriptor
	 *
	 * Returns EBADF if the descriptor cannot be obtained for writing,
	 * otherwise the result of dmu_diff().
	 */
	static int
	zfs_ioc_diff(zfs_cmd_t *zc)
	{
		file_t *fp;
		cap_rights_t rights;
		offset_t off;
		int error;

	#ifdef illumos
		fp = getf(zc->zc_cookie);
	#else
		/*
		 * Do not rely on fget_write() having stored anything in fp
		 * when it fails; treat any failure as "no file".
		 */
		if (fget_write(curthread, zc->zc_cookie,
		    cap_rights_init(&rights, CAP_WRITE), &fp) != 0)
			fp = NULL;
	#endif
		if (fp == NULL)
			return (SET_ERROR(EBADF));

		off = fp->f_offset;

	#ifdef illumos
		error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
	#else
		error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
	#endif

		/* Only store the new offset back if it is in range. */
		if (off >= 0 && off <= MAXOFFSET_T)
			fp->f_offset = off;
		releasef(zc->zc_cookie);

		return (error);
	}

	#ifdef illumos
	/*
	 * Remove all ACL files in shares dir.
	 *
	 * Walks the directory's ZAP with a cursor and VOP_REMOVE()s each entry
	 * by name.  The value returned is whatever stopped the walk: the first
	 * VOP_REMOVE() failure, or the non-zero status with which
	 * zap_cursor_retrieve() ended the iteration.
	 * NOTE(review): that means a fully successful purge returns the
	 * cursor's terminal status rather than 0 -- confirm callers expect it.
	 */
	static int
	zfs_smb_acl_purge(znode_t *dzp)
	{
		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
		zap_cursor_t cursor;
		zap_attribute_t attr;
		int error;

		zap_cursor_init(&cursor, zfsvfs->z_os, dzp->z_id);
		for (;;) {
			error = zap_cursor_retrieve(&cursor, &attr);
			if (error != 0)
				break;

			error = VOP_REMOVE(ZTOV(dzp), attr.za_name, kcred,
			    NULL, 0);
			if (error != 0)
				break;

			zap_cursor_advance(&cursor);
		}
		zap_cursor_fini(&cursor);

		return (error);
	}
	#endif /* illumos */

	/*
	 * Manage the per-share ACL resource files kept in a filesystem's
	 * shares directory (illumos only; other platforms return EOPNOTSUPP).
	 *
	 * inputs:
	 * zc_name		name of the dataset
	 * zc_value		path looked up to find the mounted filesystem
	 * zc_cookie		ZFS_SMB_ACL_{ADD,REMOVE,RENAME,PURGE}
	 * zc_string		resource name (ADD and REMOVE)
	 * zc_nvlist_src[_size]	nvlist with ZFS_SMB_ACL_SRC and
	 *			ZFS_SMB_ACL_TARGET strings (RENAME)
	 *
	 * Returns EINVAL if the path is not on the named ZFS dataset, if
	 * zc_cookie is unknown, or if a RENAME request lacks either name;
	 * otherwise the error from the underlying lookup / vnode operation.
	 */
	static int
	zfs_ioc_smb_acl(zfs_cmd_t *zc)
	{
	#ifdef illumos
		vnode_t *vp;
		znode_t *dzp;
		vnode_t *resourcevp = NULL;
		znode_t *sharedir;
		zfsvfs_t *zfsvfs;
		nvlist_t *nvlist;
		char *src, *target;
		vattr_t vattr;
		vsecattr_t vsec;
		int error = 0;

		if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
		    NO_FOLLOW, NULL, &vp)) != 0)
			return (error);

		/* Now make sure mntpnt and dataset are ZFS */

		if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
		    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
		    zc->zc_name) != 0)) {
			VN_RELE(vp);
			return (SET_ERROR(EINVAL));
		}

		dzp = VTOZ(vp);
		zfsvfs = dzp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);

		/*
		 * Create share dir if its missing.
		 */
		mutex_enter(&zfsvfs->z_lock);
		if (zfsvfs->z_shares_dir == 0) {
			dmu_tx_t *tx;

			tx = dmu_tx_create(zfsvfs->z_os);
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
			    ZFS_SHARES_DIR);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error != 0) {
				dmu_tx_abort(tx);
			} else {
				error = zfs_create_share_dir(zfsvfs, tx);
				dmu_tx_commit(tx);
			}
			if (error != 0) {
				mutex_exit(&zfsvfs->z_lock);
				VN_RELE(vp);
				ZFS_EXIT(zfsvfs);
				return (error);
			}
		}
		mutex_exit(&zfsvfs->z_lock);

		ASSERT(zfsvfs->z_shares_dir);
		if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
			VN_RELE(vp);
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		switch (zc->zc_cookie) {
		case ZFS_SMB_ACL_ADD:
			vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
			vattr.va_type = VREG;
			vattr.va_mode = S_IFREG|0777;
			vattr.va_uid = 0;
			vattr.va_gid = 0;

			vsec.vsa_mask = VSA_ACE;
			vsec.vsa_aclentp = &full_access;
			vsec.vsa_aclentsz = sizeof (full_access);
			vsec.vsa_aclcnt = 1;

			error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
			    &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
			if (resourcevp)
				VN_RELE(resourcevp);
			break;

		case ZFS_SMB_ACL_REMOVE:
			error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
			    NULL, 0);
			break;

		case ZFS_SMB_ACL_RENAME:
			if ((error = get_nvlist(zc->zc_nvlist_src,
			    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0)
				break;
			if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
			    nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
			    &target)) {
				/*
				 * Both names are required; report the missing
				 * pair instead of returning the 0 left in
				 * error by get_nvlist().
				 */
				nvlist_free(nvlist);
				error = SET_ERROR(EINVAL);
				break;
			}
			error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
			    kcred, NULL, 0);
			nvlist_free(nvlist);
			break;

		case ZFS_SMB_ACL_PURGE:
			error = zfs_smb_acl_purge(sharedir);
			break;

		default:
			error = SET_ERROR(EINVAL);
			break;
		}

		/* Common exit: drop both vnode holds and leave the filesystem. */
		VN_RELE(vp);
		VN_RELE(ZTOV(sharedir));

		ZFS_EXIT(zfsvfs);

		return (error);
	#else /* !illumos */
		return (EOPNOTSUPP);
	#endif /* illumos */
	}

	/*
	 * innvl: {
	 *     "holds" -> { snapname -> holdname (string), ... }
	 *     (optional) "cleanup_fd" -> fd (int32)
	 * }
	 *
	 * outnvl: {
	 *     snapname -> error value (int32)
	 *     ...
	 * }
	 *
	 * Returns EINVAL if "holds" is missing or any hold tag is empty, the
	 * error from reading a tag or from zfs_onexit_fd_hold(), or the
	 * result of dsl_dataset_user_hold().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
	{
		nvpair_t *pair;
		nvlist_t *holds;
		int cleanup_fd = -1;
		int error;
		minor_t minor = 0;

		error = nvlist_lookup_nvlist(args, "holds", &holds);
		if (error != 0)
			return (SET_ERROR(EINVAL));

		/* make sure the user didn't pass us any invalid (empty) tags */
		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(holds, pair)) {
			char *htag;

			error = nvpair_value_string(pair, &htag);
			if (error != 0)
				return (SET_ERROR(error));

			if (strlen(htag) == 0)
				return (SET_ERROR(EINVAL));
		}

		if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
			error = zfs_onexit_fd_hold(cleanup_fd, &minor);
			if (error != 0)
				return (error);
		}

		error = dsl_dataset_user_hold(holds, minor, errlist);
		/* minor stays 0 unless the cleanup fd was held above. */
		if (minor != 0)
			zfs_onexit_fd_rele(cleanup_fd);
		return (error);
	}

	/*
	 * innvl is not used.
	 *
	 * outnvl: {
	 *     holdname -> time added (uint64 seconds since epoch)
	 *     ...
	 * }
	 *
	 * Returns the result of dsl_dataset_get_holds() for snapname.
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
	{
		return (dsl_dataset_get_holds(snapname, outnvl));
	}

	/*
	 * innvl: {
	 *     snapname -> { holdname, ... }
	 *     ...
	 * }
	 *
	 * outnvl: {
	 *     snapname -> error value (int32)
	 *     ...
	 * }
	 *
	 * The pool name is unused; returns the result of
	 * dsl_dataset_user_release().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
	{
		return (dsl_dataset_user_release(holds, errlist));
	}

	/*
	 * inputs:
	 * zc_name		name of new filesystem or snapshot
	 * zc_value		full name of old snapshot
	 *
	 * outputs:
	 * zc_cookie		space in bytes
	 * zc_objset_type	compressed space in bytes
	 * zc_perm_action	uncompressed space in bytes
	 *
	 * Returns the error from any of the hold calls, otherwise the result
	 * of dsl_dataset_space_written().
	 */
	static int
	zfs_ioc_space_written(zfs_cmd_t *zc)
	{
		int error;
		dsl_pool_t *dp;
		dsl_dataset_t *new, *old;

		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);
		error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}
		error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
		if (error != 0) {
			dsl_dataset_rele(new, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		error = dsl_dataset_space_written(old, new, &zc->zc_cookie,
		    &zc->zc_objset_type, &zc->zc_perm_action);
		/* Release in the reverse order of acquisition. */
		dsl_dataset_rele(old, FTAG);
		dsl_dataset_rele(new, FTAG);
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/*
	 * innvl: {
	 *     "firstsnap" -> snapshot name
	 * }
	 *
	 * outnvl: {
	 *     "used" -> space in bytes
	 *     "compressed" -> compressed space in bytes
	 *     "uncompressed" -> uncompressed space in bytes
	 * }
	 *
	 * Returns EINVAL if "firstsnap" is missing or either name is not a
	 * snapshot, the error from a hold call, or the result of
	 * dsl_dataset_space_wouldfree().
	 */
	static int
	zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
	{
		int error;
		dsl_pool_t *dp;
		dsl_dataset_t *new, *old;
		char *firstsnap;
		/*
		 * Zeroed because the values are added to outnvl even when the
		 * estimate fails; never export uninitialized stack contents.
		 */
		uint64_t used = 0, comp = 0, uncomp = 0;

		if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
			return (SET_ERROR(EINVAL));

		error = dsl_pool_hold(lastsnap, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
		if (error == 0 && !new->ds_is_snapshot) {
			dsl_dataset_rele(new, FTAG);
			error = SET_ERROR(EINVAL);
		}
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}
		error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
		if (error == 0 && !old->ds_is_snapshot) {
			dsl_dataset_rele(old, FTAG);
			error = SET_ERROR(EINVAL);
		}
		if (error != 0) {
			dsl_dataset_rele(new, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
		dsl_dataset_rele(old, FTAG);
		dsl_dataset_rele(new, FTAG);
		dsl_pool_rele(dp, FTAG);
		fnvlist_add_uint64(outnvl, "used", used);
		fnvlist_add_uint64(outnvl, "compressed", comp);
		fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
		return (error);
	}

	/*
	 * Attach dataset zc_name to the jail identified by zc_jailid, using
	 * the calling thread's credentials.  Returns the result of
	 * zone_dataset_attach().
	 */
	static int
	zfs_ioc_jail(zfs_cmd_t *zc)
	{
		int jid = (int)zc->zc_jailid;

		return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, jid));
	}

	/*
	 * Detach dataset zc_name from the jail identified by zc_jailid, using
	 * the calling thread's credentials.  Returns the result of
	 * zone_dataset_detach().
	 */
	static int
	zfs_ioc_unjail(zfs_cmd_t *zc)
	{
		int jid = (int)zc->zc_jailid;

		return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, jid));
	}

	/*
	 * innvl: {
	 *     "fd" -> file descriptor to write stream to (int32)
	 *     (optional) "fromsnap" -> full snap name to send an incremental from
	 *     (optional) "largeblockok" -> (value ignored)
	 *         indicates that blocks > 128KB are permitted
	 *     (optional) "embedok" -> (value ignored)
	 *         presence indicates DRR_WRITE_EMBEDDED records are permitted
	 *     (optional) "compressok" -> (value ignored)
	 *         presence indicates compressed DRR_WRITE records are permitted
	 *     (optional) "resume_object" and "resume_offset" -> (uint64)
	 *         if present, resume send stream from specified object and offset.
	 * }
	 *
	 * outnvl is unused
	 *
	 * Returns EINVAL if "fd" is missing, EBADF if the descriptor cannot be
	 * obtained for writing, otherwise the result of dmu_send().
	 */
	/* ARGSUSED */
	static int
	zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		cap_rights_t rights;
		file_t *fp;
		int error;
		offset_t off;
		char *fromname = NULL;
		int fd;
		boolean_t largeblockok;
		boolean_t embedok;
		boolean_t compressok;
		uint64_t resumeobj = 0;
		uint64_t resumeoff = 0;

		error = nvlist_lookup_int32(innvl, "fd", &fd);
		if (error != 0)
			return (SET_ERROR(EINVAL));

		/* Optional: fromname stays NULL when "fromsnap" is absent. */
		(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);

		largeblockok = nvlist_exists(innvl, "largeblockok");
		embedok = nvlist_exists(innvl, "embedok");
		compressok = nvlist_exists(innvl, "compressok");

		/* Optional: both default to 0 when absent. */
		(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
		(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);

	#ifdef illumos
		/* fp is already declared above; assign rather than redeclare. */
		fp = getf(fd);
	#else
		/*
		 * Do not rely on fget_write() having stored anything in fp
		 * when it fails; treat any failure as "no file".
		 */
		if (fget_write(curthread, fd,
		    cap_rights_init(&rights, CAP_WRITE), &fp) != 0)
			fp = NULL;
	#endif
		if (fp == NULL)
			return (SET_ERROR(EBADF));

		off = fp->f_offset;
		error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
	#ifdef illumos
		    fd, resumeobj, resumeoff, fp->f_vnode, &off);
	#else
		    fd, resumeobj, resumeoff, fp, &off);
	#endif

	#ifdef illumos
		if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
			fp->f_offset = off;
	#else
		fp->f_offset = off;
	#endif

		releasef(fd);
		return (error);
	}

	/*
	 * Determine approximately how large a zfs send stream will be -- the number
	 * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
	 *
	 * innvl: {
	 *     (optional) "from" -> full snap or bookmark name to send an incremental
	 *         from
	 *     (optional) "largeblockok" -> (value ignored)
	 *         indicates that blocks > 128KB are permitted
	 *     (optional) "embedok" -> (value ignored)
	 *         presence indicates DRR_WRITE_EMBEDDED records are permitted
	 *     (optional) "compressok" -> (value ignored)
	 *         presence indicates compressed DRR_WRITE records are permitted
	 * }
	 *
	 * outnvl: {
	 *     "space" -> bytes of space (uint64)
	 * }
	 *
	 * Returns EINVAL if "from" names neither a snapshot ('@') nor a
	 * bookmark ('#'), the error from a hold/lookup call, or the result of
	 * the estimate.  "space" is not added when the from-side lookup fails.
	 */
	static int
	zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *tosnap;
		int error;
		char *fromname;
		boolean_t compressok;
		/*
		 * Zeroed because the value is added to outnvl even when the
		 * estimate fails; never export uninitialized stack contents.
		 */
		uint64_t space = 0;

		error = dsl_pool_hold(snapname, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		compressok = nvlist_exists(innvl, "compressok");

		error = nvlist_lookup_string(innvl, "from", &fromname);
		if (error == 0) {
			if (strchr(fromname, '@') != NULL) {
				/*
				 * If from is a snapshot, hold it and use the more
				 * efficient dmu_send_estimate to estimate send space
				 * size using deadlists.
				 */
				dsl_dataset_t *fromsnap;
				error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
				if (error != 0)
					goto out;
				error = dmu_send_estimate(tosnap, fromsnap, compressok,
				    &space);
				dsl_dataset_rele(fromsnap, FTAG);
			} else if (strchr(fromname, '#') != NULL) {
				/*
				 * If from is a bookmark, fetch the creation TXG of the
				 * snapshot it was created from and use that to find
				 * blocks that were born after it.
				 */
				zfs_bookmark_phys_t frombm;

				error = dsl_bookmark_lookup(dp, fromname, tosnap,
				    &frombm);
				if (error != 0)
					goto out;
				error = dmu_send_estimate_from_txg(tosnap,
				    frombm.zbm_creation_txg, compressok, &space);
			} else {
				/*
				 * from is not properly formatted as a snapshot or
				 * bookmark
				 */
				error = SET_ERROR(EINVAL);
				goto out;
			}
		} else {
			/*
			 * If estimating the size of a full send, use dmu_send_estimate.
			 */
			error = dmu_send_estimate(tosnap, NULL, compressok, &space);
		}

		fnvlist_add_uint64(outnvl, "space", space);

	out:
		dsl_dataset_rele(tosnap, FTAG);
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/*
	 * Table of registered ioctl handlers.  The registration functions
	 * below index it by (ioc - ZFS_IOC_FIRST).
	 */
	static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];

	/*
	 * Register a handler that uses the legacy (zfs_cmd_t based) function
	 * signature in the zfs_ioc_vec slot for ioc.  Asserts that ioc is in
	 * range and that the slot has no handler of either kind yet.
	 */
	static void
	zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
	    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
	    boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
	{
		zfs_ioc_vec_t *slot;

		ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
		ASSERT3U(ioc, <, ZFS_IOC_LAST);

		slot = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
		ASSERT3P(slot->zvec_legacy_func, ==, NULL);
		ASSERT3P(slot->zvec_func, ==, NULL);

		slot->zvec_legacy_func = func;
		slot->zvec_secpolicy = secpolicy;
		slot->zvec_namecheck = namecheck;
		slot->zvec_allow_log = log_history;
		slot->zvec_pool_check = pool_check;
	}

	/*
	 * Register a handler that uses the nvlist-based function signature in
	 * the zfs_ioc_vec slot for ioc.  Asserts that ioc is in range, that the
	 * slot has no handler of either kind yet, and that a logged ioctl has
	 * a name check other than NO_NAME.
	 *
	 * See the block comment at the beginning of this file for details on
	 * each argument to this function.
	 */
	static void
	zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
	    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
	    zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
	    boolean_t allow_log)
	{
		zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];

		ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
		ASSERT3U(ioc, <, ZFS_IOC_LAST);
		ASSERT3P(vec->zvec_legacy_func, ==, NULL);
		ASSERT3P(vec->zvec_func, ==, NULL);

		/* if we are logging, the name must be valid */
		ASSERT(!allow_log || namecheck != NO_NAME);

		vec->zvec_name = name;
		vec->zvec_func = func;
		vec->zvec_secpolicy = secpolicy;
		vec->zvec_namecheck = namecheck;
		vec->zvec_pool_check = pool_check;
		vec->zvec_smush_outnvlist = smush_outnvlist;
		vec->zvec_allow_log = allow_log;
	}

	/*
	 * Register a legacy-signature ioctl whose name is checked as POOL_NAME.
	 */
	static void
	zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
	    zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
	    zfs_ioc_poolcheck_t pool_check)
	{
		const zfs_ioc_namecheck_t namecheck = POOL_NAME;

		zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
		    log_history, pool_check);
	}

	/*
	 * Register a legacy-signature ioctl whose name is checked as
	 * DATASET_NAME and which is registered with history logging off.
	 */
	static void
	zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
	    zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
	{
		const zfs_ioc_namecheck_t namecheck = DATASET_NAME;
		const boolean_t log_history = B_FALSE;

		zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
		    log_history, pool_check);
	}

	/*
	 * Register a legacy-signature ioctl that modifies a pool: POOL_NAME
	 * name check, zfs_secpolicy_config security policy, history logging
	 * on, and both the suspended and read-only pool checks.
	 */
	static void
	zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
	{
		zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
		    POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
	}

	/*
	 * Register a legacy-signature ioctl with NO_NAME name checking,
	 * history logging off, and POOL_CHECK_NONE.
	 */
	static void
	zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
	    zfs_secpolicy_func_t *secpolicy)
	{
		const zfs_ioc_namecheck_t namecheck = NO_NAME;
		const boolean_t log_history = B_FALSE;
		const zfs_ioc_poolcheck_t pool_check = POOL_CHECK_NONE;

		zfs_ioctl_register_legacy(ioc, func, secpolicy, namecheck,
		    log_history, pool_check);
	}

	/*
	 * Register a legacy-signature, read-only dataset ioctl with a
	 * caller-chosen security policy: DATASET_NAME name check, history
	 * logging off, and the suspended pool check only.
	 */
	static void
	zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
	    zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
	{
		zfs_ioctl_register_legacy(ioc, func, secpolicy,
		    DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
	}

	/*
	 * Register a legacy-signature, read-only dataset ioctl that uses the
	 * zfs_secpolicy_read security policy.
	 */
	static void
	zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
	{
		zfs_secpolicy_func_t *secpolicy = zfs_secpolicy_read;

		zfs_ioctl_register_dataset_read_secpolicy(ioc, func, secpolicy);
	}

	/*
	 * Register a legacy-signature ioctl that modifies a dataset:
	 * DATASET_NAME name check, history logging on, and both the suspended
	 * and read-only pool checks.
	 */
	static void
	zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
	    zfs_secpolicy_func_t *secpolicy)
	{
		zfs_ioctl_register_legacy(ioc, func, secpolicy,
		    DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
	}

	/*
	 * Populate zfs_ioc_vec by registering every ioctl handler.  Handlers
	 * with the nvlist-based signature go through zfs_ioctl_register();
	 * handlers with the legacy signature go through
	 * zfs_ioctl_register_legacy() or one of its convenience wrappers, which
	 * fix the name check, logging flag and pool checks for a class of
	 * ioctls.
	 *
	 * NOTE(review): the "\|" sequences and the lines starting with "+"
	 * below look like artifacts of how this text was captured (escaped "|"
	 * and diff markers) -- confirm against the real source file.
	 */
	static void
	zfs_ioctl_init(void)
	{
	/* IOCTLS that use the nvlist-based function signature */
	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
	zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
	zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_FALSE, B_FALSE);

	zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
	zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
	POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);

	zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
	zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
	POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);

	zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
	zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
	POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);

	zfs_ioctl_register("create", ZFS_IOC_CREATE,
	zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("clone", ZFS_IOC_CLONE,
	zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);
	+
	+ zfs_ioctl_register("remap", ZFS_IOC_REMAP,
	+ zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
	+ POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_FALSE, B_TRUE);

	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
	zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("hold", ZFS_IOC_HOLD,
	zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);
	zfs_ioctl_register("release", ZFS_IOC_RELEASE,
	zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
	zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
	POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);

	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
	zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_FALSE, B_TRUE);

	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
	zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
	zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
	POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);

	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
	zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
	POOL_NAME,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE, B_TRUE);

	zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
	zfs_ioc_channel_program, zfs_secpolicy_config,
	POOL_NAME, POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY, B_TRUE,
	B_TRUE);

	/* IOCTLS that use the legacy function signature */

	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
	zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
	zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
	/* Pool-modifying ioctls: logged, refused on suspended/read-only pools. */
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
	zfs_ioc_pool_scan);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
	zfs_ioc_pool_upgrade);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
	zfs_ioc_vdev_add);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
	zfs_ioc_vdev_remove);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
	zfs_ioc_vdev_set_state);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
	zfs_ioc_vdev_attach);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
	zfs_ioc_vdev_detach);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
	zfs_ioc_vdev_setpath);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
	zfs_ioc_vdev_setfru);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
	zfs_ioc_pool_set_props);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
	zfs_ioc_vdev_split);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
	zfs_ioc_pool_reguid);

	/* Ioctls registered with NO_NAME and no pool check. */
	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
	zfs_ioc_pool_configs, zfs_secpolicy_none);
	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
	zfs_ioc_pool_tryimport, zfs_secpolicy_config);
	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
	zfs_ioc_inject_fault, zfs_secpolicy_inject);
	zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
	zfs_ioc_clear_fault, zfs_secpolicy_inject);
	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
	zfs_ioc_inject_list_next, zfs_secpolicy_inject);

	/*
	* pool destroy, and export don't log the history as part of
	* zfsdev_ioctl, but rather zfs_ioc_pool_export
	* does the logging of those commands.
	*/
	zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
	zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
	zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
	zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
	zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);

	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
	zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
	zfs_ioc_dsobj_to_dsname,
	zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
	zfs_ioc_pool_get_history,
	zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
	zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);

	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
	zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
	zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);

	/* Read-only dataset ioctls using zfs_secpolicy_read. */
	zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
	zfs_ioc_space_written);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
	zfs_ioc_objset_recvd_props);
	zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
	zfs_ioc_next_obj);
	zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
	zfs_ioc_get_fsacl);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
	zfs_ioc_objset_stats);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
	zfs_ioc_objset_zplprops);
	zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
	zfs_ioc_dataset_list_next);
	zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
	zfs_ioc_snapshot_list_next);
	zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
	zfs_ioc_send_progress);

	/* Read-only dataset ioctls with their own security policy. */
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
	zfs_ioc_diff, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
	zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
	zfs_ioc_obj_to_path, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
	zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
	zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
	zfs_ioc_send, zfs_secpolicy_send);

	/* Dataset-modifying ioctls: logged, refused on suspended/read-only pools. */
	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
	zfs_secpolicy_none);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
	zfs_secpolicy_destroy);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
	zfs_secpolicy_rename);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
	zfs_secpolicy_recv);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
	zfs_secpolicy_promote);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
	zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
	zfs_secpolicy_set_fsacl);

	/* Dataset ioctls registered without history logging. */
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
	zfs_secpolicy_share, POOL_CHECK_NONE);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
	zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
	zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
	zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
	POOL_CHECK_SUSPENDED \| POOL_CHECK_READONLY);

	#ifdef __FreeBSD__
	/* FreeBSD-only ioctls. */
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
	zfs_secpolicy_config, POOL_CHECK_NONE);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
	zfs_secpolicy_config, POOL_CHECK_NONE);
	zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
	zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
	POOL_CHECK_NONE, B_FALSE, B_FALSE);
	#endif
	}

	/*
	 * Check whether the pool behind `name' is in a state allowed by `check'.
	 *
	 * Returns 0 immediately (without opening the pool) when POOL_CHECK_NONE
	 * is set in `check'.  Otherwise the pool is opened and:
	 *   - EAGAIN is returned if POOL_CHECK_SUSPENDED is requested and the
	 *     pool is suspended;
	 *   - EROFS is returned if POOL_CHECK_READONLY is requested and the
	 *     pool is not writeable;
	 *   - the spa_open() error is returned if the pool cannot be opened.
	 * `type' is only sanity-checked by the assertion.
	 */
	int
	pool_status_check(const char *name, zfs_ioc_namecheck_t type,
	    zfs_ioc_poolcheck_t check)
	{
		spa_t *spa;
		int error;

		ASSERT(type == POOL_NAME || type == DATASET_NAME);

		if (check & POOL_CHECK_NONE)
			return (0);

		error = spa_open(name, &spa, FTAG);
		if (error != 0)
			return (error);

		/* The suspended check takes precedence over the read-only check. */
		if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
			error = SET_ERROR(EAGAIN);
		else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
			error = SET_ERROR(EROFS);
		spa_close(spa, FTAG);

		return (error);
	}

	/*
	 * Locate an unused minor number.
	 *
	 * The search starts just after the most recently handed-out minor and
	 * wraps back to 1 once ZFSDEV_MAX_MINOR is passed.  Returns the minor
	 * found (and remembers it for the next search), or 0 if the scan comes
	 * back around to its starting point.  Caller holds spa_namespace_lock.
	 */
	minor_t
	zfsdev_minor_alloc(void)
	{
		static minor_t last_minor;
		minor_t candidate;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		candidate = last_minor + 1;
		while (candidate != last_minor) {
			if (candidate > ZFSDEV_MAX_MINOR)
				candidate = 1;
			if (ddi_get_soft_state(zfsdev_state, candidate) == NULL) {
				last_minor = candidate;
				return (candidate);
			}
			candidate++;
		}

		return (0);
	}

	static int
	zfs_ctldev_init(struct cdev *devp)
	{
	minor_t minor;
	zfs_soft_state_t *zs;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	minor = zfsdev_minor_alloc();
	if (minor == 0)
	return (SET_ERROR(ENXIO));

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
	return (SET_ERROR(EAGAIN));

	devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_CTLDEV;
	zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);

	return (0);
	}

	/*
	 * Tear down control-device state: destroy the onexit list `zo' first,
	 * then release the soft-state slot for `minor'.
	 * Caller holds spa_namespace_lock.
	 */
	static void
	zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
	{
		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		zfs_onexit_destroy(zo);

		ddi_soft_state_free(zfsdev_state, minor);
	}

	/*
	 * Return the data pointer stored in the soft-state slot for `minor',
	 * but only if the slot exists and its type matches `which'.
	 * Returns NULL otherwise.
	 */
	void *
	zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
	{
		zfs_soft_state_t *zp;

		zp = ddi_get_soft_state(zfsdev_state, minor);
		if (zp == NULL || zp->zss_type != which)
			return (NULL);

		return (zp->zss_data);
	}

	/*
	 * Open entry point for the ZFS control device.
	 *
	 * A plain open needs no state.  An exclusive open (FEXCL) allocates a
	 * private minor via zfs_ctldev_init() under spa_namespace_lock and
	 * returns its error code.
	 */
	static int
	zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
	{
		int error = 0;

	#ifdef illumos
		if (getminor(*devp) != 0)
			return (zvol_open(devp, flag, otyp, cr));
	#endif

		/* This is the control device. Allocate a new minor if requested. */
		if (flag & FEXCL) {
			mutex_enter(&spa_namespace_lock);
			error = zfs_ctldev_init(devp);
			mutex_exit(&spa_namespace_lock);
		}

		return (error);
	}

	/*
	 * cdevpriv destructor for the control device.  `data' carries the
	 * minor number encoded as a pointer.  Minor 0 has nothing to release;
	 * for any other minor, the control-device state (if present) is
	 * destroyed under spa_namespace_lock.
	 */
	static void
	zfsdev_close(void *data)
	{
		minor_t minor = (minor_t)(uintptr_t)data;
		zfs_onexit_t *onexit;

		if (minor == 0)
			return;

		mutex_enter(&spa_namespace_lock);
		onexit = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
		if (onexit != NULL)
			zfs_ctldev_destroy(onexit, minor);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * ioctl entry point for the ZFS control device.
	 *
	 * Steps, in order:
	 *  1. Derive the command number from the low byte of `zcmd' and, from
	 *     the ioctl parameter length, decide whether the caller uses the
	 *     current zfs_iocparm_t wrapper or one of the older zfs_cmd layouts
	 *     (the "compat" paths).
	 *  2. Bounds-check the vector index and select the zfs_ioc_vec entry.
	 *  3. Copy the command in (translating older layouts to zfs_cmd_t).
	 *  4. Unpack the input nvlist, validate the pool/dataset name, and run
	 *     the vector's security policy.
	 *  5. Dispatch to either the nvlist-based handler (with optional
	 *     history logging) or the legacy handler.
	 *  6. Copy the command back out (translating again for compat callers)
	 *     and release all temporary allocations.
	 *
	 * Returns 0 or an errno value.
	 */
	static int
	zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
	    struct thread *td)
	{
		zfs_cmd_t *zc;
		uint_t vecnum;
		int error, rc, len;
	#ifdef illumos
		minor_t minor = getminor(dev);
	#else
		zfs_iocparm_t *zc_iocparm;
		int cflag, cmd, oldvecnum;
		boolean_t newioc, compat;
		void *compat_zc = NULL;
		cred_t *cr = td->td_ucred;
	#endif
		const zfs_ioc_vec_t *vec;
		char *saved_poolname = NULL;
		nvlist_t *innvl = NULL;

		cflag = ZFS_CMD_COMPAT_NONE;
		compat = B_FALSE;
		newioc = B_TRUE;	/* "new" style (zfs_iocparm_t) ioctl */

		len = IOCPARM_LEN(zcmd);
		vecnum = cmd = zcmd & 0xff;

		/*
		 * Check if we are talking to supported older binaries
		 * and translate zfs_cmd if necessary
		 */
		if (len != sizeof(zfs_iocparm_t)) {
			newioc = B_FALSE;
			compat = B_TRUE;

			vecnum = cmd;

			switch (len) {
			case sizeof(zfs_cmd_zcmd_t):
				cflag = ZFS_CMD_COMPAT_LZC;
				break;
			case sizeof(zfs_cmd_deadman_t):
				cflag = ZFS_CMD_COMPAT_DEADMAN;
				break;
			case sizeof(zfs_cmd_v28_t):
				cflag = ZFS_CMD_COMPAT_V28;
				break;
			case sizeof(zfs_cmd_v15_t):
				/*
				 * cmd comes straight from the caller (0..255);
				 * reject values outside the translation table
				 * before using it as an index.
				 */
				if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
				    sizeof(zfs_ioctl_v15_to_v28[0]))
					return (EINVAL);

				cflag = ZFS_CMD_COMPAT_V15;
				vecnum = zfs_ioctl_v15_to_v28[cmd];

				/*
				 * Return without further handling
				 * if the command is blacklisted.
				 */
				if (vecnum == ZFS_IOC_COMPAT_PASS)
					return (0);
				else if (vecnum == ZFS_IOC_COMPAT_FAIL)
					return (ENOTSUP);
				break;
			default:
				return (EINVAL);
			}
		}

	#ifdef illumos
		vecnum = cmd - ZFS_IOC_FIRST;
		ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
	#endif

		if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
			return (SET_ERROR(EINVAL));
		vec = &zfs_ioc_vec[vecnum];

		zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);

	#ifdef illumos
		error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
		if (error != 0) {
			error = SET_ERROR(EFAULT);
			goto out;
		}
	#else	/* !illumos */
		bzero(zc, sizeof(zfs_cmd_t));

		if (newioc) {
			zc_iocparm = (void *)arg;

			/*
			 * The wrapper names the layout version; each known
			 * version must come with the matching structure size.
			 */
			switch (zc_iocparm->zfs_ioctl_version) {
			case ZFS_IOCVER_CURRENT:
				if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
					error = SET_ERROR(EINVAL);
					goto out;
				}
				break;
			case ZFS_IOCVER_INLANES:
				if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
				compat = B_TRUE;
				cflag = ZFS_CMD_COMPAT_INLANES;
				break;
			case ZFS_IOCVER_RESUME:
				if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
				compat = B_TRUE;
				cflag = ZFS_CMD_COMPAT_RESUME;
				break;
			case ZFS_IOCVER_EDBP:
				if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
				compat = B_TRUE;
				cflag = ZFS_CMD_COMPAT_EDBP;
				break;
			case ZFS_IOCVER_ZCMD:
				if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
				    zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
				compat = B_TRUE;
				cflag = ZFS_CMD_COMPAT_ZCMD;
				break;
			default:
				error = SET_ERROR(EINVAL);
				goto out;
				/* NOTREACHED */
			}

			if (compat) {
				ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
				compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
				bzero(compat_zc, sizeof(zfs_cmd_t));

				error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
				    compat_zc, zc_iocparm->zfs_cmd_size, flag);
				if (error != 0) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
			} else {
				error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
				    zc, zc_iocparm->zfs_cmd_size, flag);
				if (error != 0) {
					error = SET_ERROR(EFAULT);
					goto out;
				}
			}
		}

		if (compat) {
			if (newioc) {
				ASSERT(compat_zc != NULL);
				zfs_cmd_compat_get(zc, compat_zc, cflag);
			} else {
				ASSERT(compat_zc == NULL);
				zfs_cmd_compat_get(zc, arg, cflag);
			}
			oldvecnum = vecnum;
			error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
			if (error != 0)
				goto out;
			/* The compat layer may redirect to a different vector. */
			if (oldvecnum != vecnum)
				vec = &zfs_ioc_vec[vecnum];
		}
	#endif	/* !illumos */

		zc->zc_iflags = flag & FKIOCTL;
		if (zc->zc_nvlist_src_size != 0) {
			error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
			    zc->zc_iflags, &innvl);
			if (error != 0)
				goto out;
		}

		/* rewrite innvl for backwards compatibility */
		if (compat)
			innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);

		/*
		 * Ensure that all pool/dataset names are valid before we pass down to
		 * the lower layers.
		 */
		zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
		switch (vec->zvec_namecheck) {
		case POOL_NAME:
			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
				error = SET_ERROR(EINVAL);
			else
				error = pool_status_check(zc->zc_name,
				    vec->zvec_namecheck, vec->zvec_pool_check);
			break;

		case DATASET_NAME:
			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
				error = SET_ERROR(EINVAL);
			else
				error = pool_status_check(zc->zc_name,
				    vec->zvec_namecheck, vec->zvec_pool_check);
			break;

		case NO_NAME:
			break;
		}

		if (error == 0)
			error = vec->zvec_secpolicy(zc, innvl, cr);

		if (error != 0)
			goto out;

		/*
		 * legacy ioctls can modify zc_name, so save the pool component
		 * (everything before the first '/', '@' or '#') now.
		 */
		len = strcspn(zc->zc_name, "/@#") + 1;
		saved_poolname = kmem_alloc(len, KM_SLEEP);
		(void) strlcpy(saved_poolname, zc->zc_name, len);

		if (vec->zvec_func != NULL) {
			nvlist_t *outnvl;
			int puterror = 0;
			spa_t *spa;
			nvlist_t *lognv = NULL;

			ASSERT(vec->zvec_legacy_func == NULL);

			/*
			 * Add the innvl to the lognv before calling the func,
			 * in case the func changes the innvl.
			 */
			if (vec->zvec_allow_log) {
				lognv = fnvlist_alloc();
				fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
				    vec->zvec_name);
				if (!nvlist_empty(innvl)) {
					fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
					    innvl);
				}
			}

			outnvl = fnvlist_alloc();
			error = vec->zvec_func(zc->zc_name, innvl, outnvl);

			/*
			 * Some commands can partially execute, modify state, and still
			 * return an error.  In these cases, attempt to record what
			 * was modified.
			 */
			if ((error == 0 ||
			    (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
			    vec->zvec_allow_log &&
			    spa_open(zc->zc_name, &spa, FTAG) == 0) {
				if (!nvlist_empty(outnvl)) {
					fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
					    outnvl);
				}
				if (error != 0) {
					fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
					    error);
				}
				(void) spa_history_log_nvl(spa, lognv);
				spa_close(spa, FTAG);
			}
			fnvlist_free(lognv);

			/* rewrite outnvl for backwards compatibility */
			if (compat)
				outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
				    cflag);

			if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
				int smusherror = 0;
				if (vec->zvec_smush_outnvlist) {
					smusherror = nvlist_smush(outnvl,
					    zc->zc_nvlist_dst_size);
				}
				if (smusherror == 0)
					puterror = put_nvlist(zc, outnvl);
			}

			if (puterror != 0)
				error = puterror;

			nvlist_free(outnvl);
		} else {
			error = vec->zvec_legacy_func(zc);
		}

	out:
		nvlist_free(innvl);

	#ifdef illumos
		rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
		if (error == 0 && rc != 0)
			error = SET_ERROR(EFAULT);
	#else
		if (compat) {
			zfs_ioctl_compat_post(zc, cmd, cflag);
			if (newioc) {
				ASSERT(compat_zc != NULL);
				ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);

				zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
				rc = ddi_copyout(compat_zc,
				    (void *)(uintptr_t)zc_iocparm->zfs_cmd,
				    zc_iocparm->zfs_cmd_size, flag);
				if (error == 0 && rc != 0)
					error = SET_ERROR(EFAULT);
				kmem_free(compat_zc, sizeof (zfs_cmd_t));
			} else {
				zfs_cmd_compat_put(zc, arg, vecnum, cflag);
			}
		} else {
			ASSERT(newioc);

			rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
			    sizeof (zfs_cmd_t), flag);
			if (error == 0 && rc != 0)
				error = SET_ERROR(EFAULT);
		}
	#endif
		/*
		 * On success of a loggable command, hand the saved pool name to
		 * the thread-specific slot (freeing any previous value);
		 * otherwise free it here.
		 */
		if (error == 0 && vec->zvec_allow_log) {
			char *s = tsd_get(zfs_allow_log_key);
			if (s != NULL)
				strfree(s);
			(void) tsd_set(zfs_allow_log_key, saved_poolname);
		} else {
			if (saved_poolname != NULL)
				strfree(saved_poolname);
		}

		kmem_free(zc, sizeof (zfs_cmd_t));
		return (error);
	}

	#ifdef illumos
	/*
	 * illumos attach(9E) handler.  Only DDI_ATTACH is accepted; it creates
	 * the "zfs" character minor node, records the dev_info pointer in
	 * zfs_dip and reports the device.  Returns DDI_SUCCESS or DDI_FAILURE.
	 */
	static int
	zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	{
		int rv;

		if (cmd != DDI_ATTACH)
			return (DDI_FAILURE);

		rv = ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0);
		if (rv == DDI_FAILURE)
			return (DDI_FAILURE);

		zfs_dip = dip;
		ddi_report_dev(dip);

		return (DDI_SUCCESS);
	}

	/*
	 * illumos detach(9E) handler.  Refuses (DDI_FAILURE) while any of the
	 * spa, zfs or zvol layers report busy, or for any command other than
	 * DDI_DETACH.  Otherwise clears zfs_dip and removes the node's
	 * properties and minor nodes.
	 */
	static int
	zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
	{
		if (spa_busy() || zfs_busy() || zvol_busy())
			return (DDI_FAILURE);

		if (cmd != DDI_DETACH)
			return (DDI_FAILURE);

		zfs_dip = NULL;

		ddi_prop_remove_all(dip);
		ddi_remove_minor_node(dip, NULL);

		return (DDI_SUCCESS);
	}

	/ARGSUSED/
	static int
	zfs_info(dev_info_t dip, ddi_info_cmd_t infocmd, void arg, void **result)
	{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
	*result = zfs_dip;
	return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
	result = (void )0;
	return (DDI_SUCCESS);
	}

	return (DDI_FAILURE);
	}
	#endif /* illumos */

	/*
	* OK, so this is a little weird.
	*
	* /dev/zfs is the control node, i.e. minor 0.
	* /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
	*
	* /dev/zfs has basically nothing to do except serve up ioctls,
	* so most of the standard driver entry points are in zvol.c.
	*/
	#ifdef illumos
	/*
	 * illumos character/block entry points.  The initializer is
	 * positional; each slot is named by its trailing comment.  Only open,
	 * close and ioctl are served by this file's zfsdev_* routines; the
	 * data-path slots point at zvol_* routines.
	 */
	static struct cb_ops zfs_cb_ops = {
		zfsdev_open,	/* open */
		zfsdev_close,	/* close */
		zvol_strategy,	/* strategy */
		nodev,		/* print */
		zvol_dump,	/* dump */
		zvol_read,	/* read */
		zvol_write,	/* write */
		zfsdev_ioctl,	/* ioctl */
		nodev,		/* devmap */
		nodev,		/* mmap */
		nodev,		/* segmap */
		nochpoll,	/* poll */
		ddi_prop_op,	/* prop_op */
		NULL,		/* streamtab */
		D_NEW | D_MP | D_64BIT,	/* Driver compatibility flag */
		CB_REV,		/* version */
		nodev,		/* async read */
		nodev,		/* async write */
	};

	/*
	 * illumos driver operations table.  The initializer is positional;
	 * each slot is named by its trailing comment.  It wires in this file's
	 * zfs_info/zfs_attach/zfs_detach handlers and the zfs_cb_ops table.
	 */
	static struct dev_ops zfs_dev_ops = {
	DEVO_REV, /* version */
	0, /* refcnt */
	zfs_info, /* info */
	nulldev, /* identify */
	nulldev, /* probe */
	zfs_attach, /* attach */
	zfs_detach, /* detach */
	nodev, /* reset */
	&zfs_cb_ops, /* driver operations */
	NULL, /* no bus operations */
	NULL, /* power */
	ddi_quiesce_not_needed, /* quiesce */
	};

	/*
	 * illumos driver linkage record: ties the "ZFS storage pool" driver
	 * description to zfs_dev_ops.
	 */
	static struct modldrv zfs_modldrv = {
	&mod_driverops,
	"ZFS storage pool",
	&zfs_dev_ops
	};

	/*
	 * illumos module linkage: a NULL-terminated list holding the
	 * filesystem linkage (zfs_modlfs) and the driver linkage (zfs_modldrv).
	 * Passed to mod_install()/mod_remove()/mod_info() below.
	 */
	static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&zfs_modlfs,
	(void *)&zfs_modldrv,
	NULL
	};
	#endif /* illumos */

	/*
	 * Character device switch for the control device: only open and ioctl
	 * are provided.
	 */
	static struct cdevsw zfs_cdevsw = {
		.d_version =	D_VERSION,
		.d_name =	ZFS_DEV_NAME,
		.d_open =	zfsdev_open,
		.d_ioctl =	zfsdev_ioctl
	};

	/*
	 * Destructor registered for zfs_allow_log_key: the stored value is a
	 * pool-name string, which is released with strfree().
	 */
	static void
	zfs_allow_log_destroy(void *arg)
	{
		strfree((char *)arg);
	}

	/*
	 * Create the control device node named ZFS_DEV_NAME (owner root,
	 * group operator, mode 0666) and remember it in `zfsdev'.
	 */
	static void
	zfsdev_init(void)
	{
		zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR,
		    0666, ZFS_DEV_NAME);
	}

	/*
	 * Destroy the control device node, if one was created.
	 */
	static void
	zfsdev_fini(void)
	{
		if (zfsdev == NULL)
			return;

		destroy_dev(zfsdev);
	}

	/* Root-mount hold taken and released by zfs__init(). */
	static struct root_hold_token *zfs_root_token;
	/* NOTE(review): not referenced in this part of the file; presumably set elsewhere. */
	struct proc *zfsproc;

	#ifdef illumos
	/*
	 * illumos module load entry point.
	 *
	 * Initializes the spa, zfs, zvol and ioctl layers, then installs the
	 * module.  If installation fails, the zvol, zfs and spa layers are
	 * torn down again and the error is returned.  On success the
	 * thread-specific-data keys, the LDI identity and zfs_share_lock are
	 * created.
	 */
	int
	_init(void)
	{
		int error;

		spa_init(FREAD | FWRITE);
		zfs_init();
		zvol_init();
		zfs_ioctl_init();

		if ((error = mod_install(&modlinkage)) != 0) {
			zvol_fini();
			zfs_fini();
			spa_fini();
			return (error);
		}

		tsd_create(&zfs_fsyncer_key, NULL);
		tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
		tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);

		error = ldi_ident_from_mod(&modlinkage, &zfs_li);
		ASSERT(error == 0);
		mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);

		return (0);
	}

	int
	_fini(void)
	{
	int error;

	if (spa_busy() \|\| zfs_busy() \|\| zvol_busy() \|\| zio_injection_enabled)
	return (SET_ERROR(EBUSY));

	if ((error = mod_remove(&modlinkage)) != 0)
	return (error);

	zvol_fini();
	zfs_fini();
	spa_fini();
	if (zfs_nfsshare_inited)
	(void) ddi_modclose(nfs_mod);
	if (zfs_smbshare_inited)
	(void) ddi_modclose(smbsrv_mod);
	if (zfs_nfsshare_inited \|\| zfs_smbshare_inited)
	(void) ddi_modclose(sharefs_mod);

	tsd_destroy(&zfs_fsyncer_key);
	ldi_ident_release(zfs_li);
	zfs_li = NULL;
	mutex_destroy(&zfs_share_lock);

	return (error);
	}

	/*
	 * illumos module info entry point: forwards to mod_info() with this
	 * module's linkage.
	 */
	int
	_info(struct modinfo *modinfop)
	{
		int rv = mod_info(&modlinkage, modinfop);

		return (rv);
	}
	#endif /* illumos */

	/* Module load/unload workers and the shutdown hook, defined below. */
	static int zfs__init(void);
	static int zfs__fini(void);
	static void zfs_shutdown(void *, int);

	/* Tag of the shutdown_post_sync handler registered in zfs_modevent(). */
	static eventhandler_tag zfs_shutdown_event_tag;

	/*
	 * KSTACK_PAGES threshold below which zfs__init() prints a
	 * stack-overflow notice.
	 */
	#ifdef __FreeBSD__
	#define ZFS_MIN_KSTACK_PAGES 4
	#endif

	/*
	 * FreeBSD module load worker.
	 *
	 * Prints a notice when the kernel was built with fewer than
	 * ZFS_MIN_KSTACK_PAGES stack pages, then, while holding a root-mount
	 * hold, initializes zfs_share_lock, the spa/zfs/zvol/ioctl layers and
	 * the thread-specific-data keys.  The hold is released before the
	 * control device is created.  Always returns 0.
	 */
	int
	zfs__init(void)
	{

	#ifdef __FreeBSD__
	#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
		printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
		    "overflow panic!\nPlease consider adding "
		    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
		    ZFS_MIN_KSTACK_PAGES);
	#endif
	#endif
		zfs_root_token = root_mount_hold("ZFS");

		mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);

		spa_init(FREAD | FWRITE);
		zfs_init();
		zvol_init();
		zfs_ioctl_init();

		tsd_create(&zfs_fsyncer_key, NULL);
		tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
		tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
		tsd_create(&zfs_geom_probe_vdev_key, NULL);

		printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
		root_mount_rel(zfs_root_token);

		zfsdev_init();

		return (0);
	}

	int
	zfs__fini(void)
	{
	if (spa_busy() \|\| zfs_busy() \|\| zvol_busy() \|\|
	zio_injection_enabled) {
	return (EBUSY);
	}

	zfsdev_fini();
	zvol_fini();
	zfs_fini();
	spa_fini();

	tsd_destroy(&zfs_fsyncer_key);
	tsd_destroy(&rrw_tsd_key);
	tsd_destroy(&zfs_allow_log_key);

	mutex_destroy(&zfs_share_lock);

	return (0);
	}

	/*
	 * shutdown_post_sync event handler: run the module teardown unless the
	 * system has panicked.
	 */
	static void
	zfs_shutdown(void *arg __unused, int howto __unused)
	{

		/*
		 * The ZFS fini routines cannot work properly in a panicked
		 * system, so skip them in that case.
		 */
		if (panicstr != NULL)
			return;

		(void)zfs__fini();
	}


	/*
	 * Module event handler.
	 *
	 * MOD_LOAD runs zfs__init() and, on success, registers zfs_shutdown as
	 * a shutdown_post_sync handler.  MOD_UNLOAD runs zfs__fini() and, on
	 * success, deregisters that handler if one was registered.
	 * MOD_SHUTDOWN is accepted as a no-op.  Any other event yields
	 * EOPNOTSUPP.
	 */
	static int
	zfs_modevent(module_t mod, int type, void *unused __unused)
	{
		int err;

		if (type == MOD_LOAD) {
			err = zfs__init();
			if (err == 0) {
				zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
				    shutdown_post_sync, zfs_shutdown, NULL,
				    SHUTDOWN_PRI_FIRST);
			}
			return (err);
		}

		if (type == MOD_UNLOAD) {
			err = zfs__fini();
			if (err == 0 && zfs_shutdown_event_tag != NULL) {
				EVENTHANDLER_DEREGISTER(shutdown_post_sync,
				    zfs_shutdown_event_tag);
			}
			return (err);
		}

		if (type == MOD_SHUTDOWN)
			return (0);

		return (EOPNOTSUPP);
	}

	/*
	 * Kernel module glue: registers "zfsctrl" with zfs_modevent as its
	 * event handler and declares its dependencies on the opensolaris,
	 * krpc and acl_nfs4 modules.
	 */
	static moduledata_t zfs_mod = {
	"zfsctrl",
	zfs_modevent,
	0
	};
	DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
	MODULE_VERSION(zfsctrl, 1);
	MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
	MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
	MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 332525)
	@@ -1,2564 +1,2564 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
	* All rights reserved.
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/types.h>
	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/sysmacros.h>
	#include <sys/kmem.h>
	#include <sys/acl.h>
	#include <sys/vnode.h>
	#include <sys/vfs.h>
	#include <sys/mntent.h>
	#include <sys/mount.h>
	#include <sys/cmn_err.h>
	#include <sys/zfs_znode.h>
	#include <sys/zfs_dir.h>
	#include <sys/zil.h>
	#include <sys/fs/zfs.h>
	#include <sys/dmu.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_deleg.h>
	#include <sys/spa.h>
	#include <sys/zap.h>
	#include <sys/sa.h>
	#include <sys/sa_impl.h>
	#include <sys/varargs.h>
	#include <sys/policy.h>
	#include <sys/atomic.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/zfs_ctldir.h>
	#include <sys/zfs_fuid.h>
	#include <sys/sunddi.h>
	#include <sys/dnlc.h>
	#include <sys/dmu_objset.h>
	#include <sys/spa_boot.h>
	#include <sys/jail.h>
	#include "zfs_comutil.h"

	/* Mutex named "zfs_debug", initialized at boot via MTX_SYSINIT. */
	struct mtx zfs_debug_mtx;
	MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

	/* Root of the vfs.zfs sysctl tree. */
	SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

	/* vfs.zfs.super_owner: read-write integer knob. */
	int zfs_super_owner;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
	"File system owner can perform privileged operation on his file systems");

	/* vfs.zfs.debug: read-write integer, also settable as a tunable. */
	int zfs_debug_level;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

	/* vfs.zfs.version.*: read-only exports of compile-time version constants. */
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
	static int zfs_version_acl = ZFS_ACL_VERSION;
	SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
	"ZFS_ACL_VERSION");
	static int zfs_version_spa = SPA_VERSION;
	SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
	"SPA_VERSION");
	static int zfs_version_zpl = ZPL_VERSION;
	SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
	"ZPL_VERSION");

	/* Forward declarations of the VFS operations wired into zfs_vfsops. */
	static int zfs_mount(vfs_t *vfsp);
	static int zfs_umount(vfs_t *vfsp, int fflag);
	static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
	static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
	static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
	static int zfs_sync(vfs_t *vfsp, int waitfor);
	static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
	    struct ucred **credanonp, int *numsecflavors, int **secflavors);
	static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
	static void zfs_objset_close(zfsvfs_t *zfsvfs);
	static void zfs_freevfs(vfs_t *vfsp);

	/*
	 * VFS operations vector for ZFS, registered under the name "zfs" with
	 * the VFCF_JAIL and VFCF_DELEGADMIN flags.
	 */
	struct vfsops zfs_vfsops = {
		.vfs_mount =		zfs_mount,
		.vfs_unmount =		zfs_umount,
		.vfs_root =		zfs_root,
		.vfs_statfs =		zfs_statfs,
		.vfs_vget =		zfs_vget,
		.vfs_sync =		zfs_sync,
		.vfs_checkexp =		zfs_checkexp,
		.vfs_fhtovp =		zfs_fhtovp,
	};

	VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);

	/*
	* We need to keep a count of active fs's.
	* This is necessary to prevent our module
	* from being unloaded after a umount -f
	*/
	static uint32_t zfs_active_fs_count = 0;

	/ARGSUSED/
	static int
	zfs_sync(vfs_t *vfsp, int waitfor)
	{

	/*
	* Data integrity is job one. We don't want a compromised kernel
	* writing to the storage pool, so we never sync during panic.
	*/
	if (panicstr)
	return (0);

	/*
	* Ignore the system syncher. ZFS already commits async data
	* at zfs_txg_timeout intervals.
	*/
	if (waitfor == MNT_LAZY)
	return (0);

	if (vfsp != NULL) {
	/*
	* Sync a specific filesystem.
	*/
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	dsl_pool_t *dp;
	int error;

	error = vfs_stdsync(vfsp, waitfor);
	if (error != 0)
	return (error);

	ZFS_ENTER(zfsvfs);
	dp = dmu_objset_pool(zfsvfs->z_os);

	/*
	* If the system is shutting down, then skip any
	* filesystems which may exist on a suspended pool.
	*/
	if (sys_shutdown && spa_suspended(dp->dp_spa)) {
	ZFS_EXIT(zfsvfs);
	return (0);
	}

	if (zfsvfs->z_log != NULL)
	zil_commit(zfsvfs->z_log, 0);

	ZFS_EXIT(zfsvfs);
	} else {
	/*
	* Sync all ZFS filesystems. This is what happens when you
	* run sync(1M). Unlike other filesystems, ZFS honors the
	* request by waiting for all pools to commit all dirty data.
	*/
	spa_sync_allpools();
	}

	return (0);
	}

	#ifndef __FreeBSD_kernel__
	/*
	 * Pick a (major, minor) device number that is not currently mounted
	 * and store it in *dev.
	 *
	 * The inner loop advances zfs_minor (wrapping at MAXMIN32) until
	 * vfs_devismounted() reports the candidate free or the search returns
	 * to its starting minor.  If it wrapped all the way around, a fresh
	 * major is requested with getudev() and the search restarts.
	 *
	 * Returns 0 on success, -1 if no new major number can be obtained.
	 */
	static int
	zfs_create_unique_device(dev_t *dev)
	{
	major_t new_major;

	do {
	ASSERT3U(zfs_minor, <=, MAXMIN32);
	/* Remember where this pass began so a full wrap can be detected. */
	minor_t start = zfs_minor;
	do {
	mutex_enter(&zfs_dev_mtx);
	if (zfs_minor >= MAXMIN32) {
	/*
	* If we're still using the real major
	* keep out of /dev/zfs and /dev/zvol minor
	* number space. If we're using a getudev()'ed
	* major number, we can use all of its minors.
	*/
	if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
	zfs_minor = ZFS_MIN_MINOR;
	else
	zfs_minor = 0;
	} else {
	zfs_minor++;
	}
	*dev = makedevice(zfs_major, zfs_minor);
	mutex_exit(&zfs_dev_mtx);
	/* NOTE(review): zfs_minor is re-read here after zfs_dev_mtx is dropped. */
	} while (vfs_devismounted(*dev) && zfs_minor != start);
	if (zfs_minor == start) {
	/*
	* We are using all ~262,000 minor numbers for the
	* current major number. Create a new major number.
	*/
	if ((new_major = getudev()) == (major_t)-1) {
	cmn_err(CE_WARN,
	"zfs_mount: Can't get unique major "
	"device number.");
	return (-1);
	}
	mutex_enter(&zfs_dev_mtx);
	zfs_major = new_major;
	zfs_minor = 0;

	mutex_exit(&zfs_dev_mtx);
	} else {
	break;
	}
	/* CONSTANTCONDITION */
	} while (1);

	return (0);
	}
	#endif /* !__FreeBSD_kernel__ */

	/*
	 * Property callback for "atime".  `arg' is the zfsvfs_t.  Updates
	 * z_atime, the MNT_NOATIME bit in vfs_flag, and the atime/noatime
	 * mount options so all three agree with `newval'.
	 */
	static void
	atime_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *zfsvfs = arg;

		if (newval == TRUE) {
			zfsvfs->z_atime = TRUE;
			zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
		} else {
			zfsvfs->z_atime = FALSE;
			zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
		}
	}

	/*
	 * Property callback for "xattr".  `arg' is the zfsvfs_t.  Swaps the
	 * xattr/noxattr mount options to match `newval'.  The VFS_XATTR flag
	 * update is compiled only when TODO is defined.
	 */
	static void
	xattr_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *zfsvfs = arg;

		if (newval == TRUE) {
			/* XXX locking on vfs_flag? */
	#ifdef TODO
			zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
	#endif
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
		} else {
			/* XXX locking on vfs_flag? */
	#ifdef TODO
			zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
	#endif
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
		}
	}

	/*
	 * Property callback for "recordsize".  `arg' is the zfsvfs_t.  After
	 * asserting that the new size is a power of two between
	 * SPA_MINBLOCKSIZE and the pool's maximum block size, records it as
	 * both the filesystem's maximum block size and the mount's f_iosize.
	 */
	static void
	blksz_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *fs = arg;

		ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(fs->z_os)));
		ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
		ASSERT(ISP2(newval));

		fs->z_max_blksz = newval;
		fs->z_vfs->mnt_stat.f_iosize = newval;
	}

	/*
	 * Property callback for "readonly".  `arg' is the zfsvfs_t.  Sets or
	 * clears VFS_RDONLY in vfs_flag and swaps the ro/rw mount options to
	 * match `newval' (any non-zero value means read-only).
	 */
	static void
	readonly_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *zfsvfs = arg;

		if (newval) {
			/* XXX locking on vfs_flag? */
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
		} else {
			/* XXX locking on vfs_flag? */
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
		}
	}

	/*
	 * Property callback for "setuid".  `arg' is the zfsvfs_t.  When
	 * `newval' is FALSE, sets VFS_NOSETUID and the nosetuid mount option;
	 * otherwise clears the flag and sets the setuid mount option.
	 */
	static void
	setuid_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *zfsvfs = arg;

		if (newval == FALSE) {
			zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
		} else {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
		}
	}

	/*
	 * Property callback for "exec".  `arg' is the zfsvfs_t.  When `newval'
	 * is FALSE, sets VFS_NOEXEC and the noexec mount option; otherwise
	 * clears the flag and sets the exec mount option.
	 */
	static void
	exec_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *zfsvfs = arg;

		if (newval == FALSE) {
			zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
		} else {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
			vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
			vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
		}
	}

	/*
	 * The nbmand mount option may only be chosen at mount time.  Toggling
	 * it on a live file system is not allowed, because CIFS clients could
	 * then observe incorrect behavior.
	 *
	 * This property is not registered through dsl_prop_register(); the
	 * callback is instead invoked when a file system is first mounted.
	 * It swaps the nbmand/nonbmand mount options to match `newval'.
	 */
	static void
	nbmand_changed_cb(void *arg, uint64_t newval)
	{
		zfsvfs_t *fs = arg;

		if (newval != FALSE) {
			vfs_clearmntopt(fs->z_vfs, MNTOPT_NONBMAND);
			vfs_setmntopt(fs->z_vfs, MNTOPT_NBMAND, NULL, 0);
			return;
		}

		vfs_clearmntopt(fs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(fs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	}

	/*
	 * Property callback for "snapdir": stores the new value in the
	 * filesystem's z_show_ctldir field.  `arg' is the zfsvfs_t.
	 */
	static void
	snapdir_changed_cb(void *arg, uint64_t newval)
	{
		((zfsvfs_t *)arg)->z_show_ctldir = newval;
	}

	/*
	 * Property callback for "vscan": stores the new value in the
	 * filesystem's z_vscan field.  `arg' is the zfsvfs_t.
	 */
	static void
	vscan_changed_cb(void *arg, uint64_t newval)
	{
		((zfsvfs_t *)arg)->z_vscan = newval;
	}

	/*
	 * Property callback for "aclmode": stores the new value in the
	 * filesystem's z_acl_mode field.  `arg' is the zfsvfs_t.
	 */
	static void
	acl_mode_changed_cb(void *arg, uint64_t newval)
	{
		((zfsvfs_t *)arg)->z_acl_mode = newval;
	}

	/*
	 * Property callback for "aclinherit": stores the new value in the
	 * filesystem's z_acl_inherit field.  `arg' is the zfsvfs_t.
	 */
	static void
	acl_inherit_changed_cb(void *arg, uint64_t newval)
	{
		((zfsvfs_t *)arg)->z_acl_inherit = newval;
	}

	static int
	zfs_register_callbacks(vfs_t *vfsp)
	{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
	#ifdef illumos
	boolean_t devices = B_FALSE;
	boolean_t do_devices = B_FALSE;
	#endif
	boolean_t xattr = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	* This function can be called for a snapshot when we update snapshot's
	* mount point, which isn't really supported.
	*/
	if (dmu_objset_is_snapshot(os))
	return (EOPNOTSUPP);

	/*
	* The act of registering our callbacks will destroy any mount
	* options we may have. In order to enable temporary overrides
	* of mount options, we stash away the current values and
	* restore them after we register the callbacks.
	*/
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) \|\|
	!spa_writeable(dmu_objset_spa(os))) {
	readonly = B_TRUE;
	do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
	readonly = B_FALSE;
	do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
	setuid = B_FALSE;
	do_setuid = B_TRUE;
	} else {
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
	setuid = B_FALSE;
	do_setuid = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
	setuid = B_TRUE;
	do_setuid = B_TRUE;
	}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
	exec = B_FALSE;
	do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
	exec = B_TRUE;
	do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
	xattr = B_FALSE;
	do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
	xattr = B_TRUE;
	do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
	atime = B_FALSE;
	do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
	atime = B_TRUE;
	do_atime = B_TRUE;
	}

	/*
	* We need to enter pool configuration here, so that we can use
	* dsl_prop_get_int_ds() to handle the special nbmand property below.
	* dsl_prop_get_integer() can not be used, because it has to acquire
	* spa_namespace_lock and we can not do that because we already hold
	- * z_teardown_lock. The problem is that spa_config_sync() is called
	+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
	* with spa_namespace_lock held and the function calls ZFS vnode
	* operations to write the cache file and thus z_teardown_lock is
	* acquired after spa_namespace_lock.
	*/
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	* nbmand is a special property. It can only be changed at
	* mount time.
	*
	* This is weird, but it is documented to only be changeable
	* at mount time.
	*/
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
	nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
	nbmand = B_TRUE;
	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	return (error);
	}

	/*
	* Register property callbacks.
	*
	* It would probably be fine to just check for i/o error from
	* the first prop_register(), but I guess I like to go
	* overboard...
	*/
	error = dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
	#ifdef illumos
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
	#endif
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
	goto unregister;

	/*
	* Invoke our callbacks to restore temporary mount options.
	*/
	if (do_readonly)
	readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
	setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
	exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
	xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
	atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

	unregister:
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
	}

	/*
	 * Report the owning user and group ids stored in an object's bonus data.
	 *
	 * bonustype	type of the bonus buffer; only DMU_OT_ZNODE and DMU_OT_SA
	 *		are tracked.
	 * data		pointer to the bonus data, or NULL.
	 * userp	out: owning user id.
	 * groupp	out: owning group id.
	 *
	 * Returns 0 with *userp and *groupp filled in, ENOENT if the bonus type
	 * is not tracked, or EEXIST if data is NULL (ids are assumed unchanged).
	 */
	static int
	zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
	    uint64_t *userp, uint64_t *groupp)
	{
		/*
		 * Is it a valid type of object to track?
		 */
		if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
			return (SET_ERROR(ENOENT));

		/*
		 * If we have a NULL data pointer
		 * then assume the id's aren't changing and
		 * return EEXIST to the dmu to let it know to
		 * use the same ids
		 */
		if (data == NULL)
			return (SET_ERROR(EEXIST));

		if (bonustype == DMU_OT_ZNODE) {
			znode_phys_t *znp = data;
			*userp = znp->zp_uid;
			*groupp = znp->zp_gid;
		} else {
			int hdrsize;
			sa_hdr_phys_t *sap = data;
			/* Work on a copy so the header can be byte-swapped. */
			sa_hdr_phys_t sa = *sap;
			boolean_t swap = B_FALSE;

			ASSERT(bonustype == DMU_OT_SA);

			if (sa.sa_magic == 0) {
				/*
				 * This should only happen for newly created
				 * files that haven't had the znode data filled
				 * in yet.
				 */
				*userp = 0;
				*groupp = 0;
				return (0);
			}
			if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
				/* Header is in the opposite byte order. */
				sa.sa_magic = SA_MAGIC;
				sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
				swap = B_TRUE;
			} else {
				VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
			}

			hdrsize = sa_hdrsize(&sa);
			VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
			/* The ids live at fixed offsets past the SA header. */
			*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
			    SA_UID_OFFSET));
			*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
			    SA_GID_OFFSET));
			if (swap) {
				*userp = BSWAP_64(*userp);
				*groupp = BSWAP_64(*groupp);
			}
		}
		return (0);
	}

	/*
	 * Split a FUID given as a numeric string into its domain name and RID.
	 *
	 * fuidstr	FUID as text, parsed with zfs_strtonum().
	 * domainbuf	out: domain name, or the empty string when no domain is
	 *		found for the FUID's index.
	 * buflen	size of domainbuf in bytes.
	 * ridp		out: the RID portion of the FUID.
	 */
	static void
	fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
	    char *domainbuf, int buflen, uid_t *ridp)
	{
		uint64_t fuid;
		const char *domain;

		fuid = zfs_strtonum(fuidstr, NULL);

		domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
		if (domain)
			(void) strlcpy(domainbuf, domain, buflen);
		else
			domainbuf[0] = '\0';
		*ridp = FUID_RID(fuid);
	}

	/*
	 * Map a user/group accounting property to the object number of the ZAP
	 * that holds it.  The "used" properties map to fixed DMU objects, the
	 * quota properties to the object numbers cached in the zfsvfs.  Any other
	 * property yields 0.
	 */
	static uint64_t
	zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
	{
		uint64_t objnum = 0;

		switch (type) {
		case ZFS_PROP_USERUSED:
			objnum = DMU_USERUSED_OBJECT;
			break;
		case ZFS_PROP_GROUPUSED:
			objnum = DMU_GROUPUSED_OBJECT;
			break;
		case ZFS_PROP_USERQUOTA:
			objnum = zfsvfs->z_userquota_obj;
			break;
		case ZFS_PROP_GROUPQUOTA:
			objnum = zfsvfs->z_groupquota_obj;
			break;
		default:
			break;
		}
		return (objnum);
	}

	/*
	 * Copy out as many user/group accounting records as fit in the caller's
	 * buffer.
	 *
	 * type		which accounting property to enumerate.
	 * cookiep	in: serialized ZAP cursor to resume from;
	 *		out: cursor position after the last record returned.
	 * vbuf		out: array of zfs_useracct_t records.
	 * bufsizep	in: size of vbuf in bytes; out: bytes actually filled.
	 *
	 * Returns 0 on success (including reaching the end of the ZAP), ENOTSUP
	 * if the objset has no userspace accounting, or the error from the ZAP
	 * cursor.
	 */
	int
	zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
	    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
	{
		int error;
		zap_cursor_t zc;
		zap_attribute_t za;
		zfs_useracct_t *buf = vbuf;
		uint64_t obj;

		if (!dmu_objset_userspace_present(zfsvfs->z_os))
			return (SET_ERROR(ENOTSUP));

		obj = zfs_userquota_prop_to_obj(zfsvfs, type);
		if (obj == 0) {
			/* No backing object: nothing to report. */
			*bufsizep = 0;
			return (0);
		}

		for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
		    zap_cursor_advance(&zc)) {
			/* Stop before writing a record that would not fit. */
			if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
			    *bufsizep)
				break;

			fuidstr_to_sid(zfsvfs, za.za_name,
			    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);

			buf->zu_space = za.za_first_integer;
			buf++;
		}
		/* Running off the end of the ZAP is not an error. */
		if (error == ENOENT)
			error = 0;

		ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
		*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
		*cookiep = zap_cursor_serialize(&zc);
		zap_cursor_fini(&zc);
		return (error);
	}

	/*
	 * Encode a (domain, rid) pair as a FUID and format it as a hex string.
	 *
	 * domain	domain name; NULL or "" means domain index 0.
	 * rid		the RID to encode.
	 * buf		out: hex string; must be big enough (eg, 32 bytes).
	 * addok	passed through to zfs_fuid_find_by_domain().
	 *
	 * Returns 0 on success or ENOENT if the domain lookup returns -1.
	 */
	static int
	id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
	    char *buf, boolean_t addok)
	{
		uint64_t fuid;
		int domainid = 0;

		if (domain && domain[0]) {
			domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
			if (domainid == -1)
				return (SET_ERROR(ENOENT));
		}
		fuid = FUID_ENCODE(domainid, rid);
		(void) sprintf(buf, "%llx", (longlong_t)fuid);
		return (0);
	}

	/*
	 * Look up a single user/group accounting value.
	 *
	 * type		which accounting property to read.
	 * domain, rid	identify the user or group (see id_to_fuidstr()).
	 * valp		out: the value; set to 0 when there is no entry.
	 *
	 * Returns 0 on success (a missing entry is not an error), ENOTSUP if the
	 * objset has no userspace accounting, or the error from the id
	 * conversion or ZAP lookup.
	 */
	int
	zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
	    const char *domain, uint64_t rid, uint64_t *valp)
	{
		char buf[32];
		int err;
		uint64_t obj;

		*valp = 0;

		if (!dmu_objset_userspace_present(zfsvfs->z_os))
			return (SET_ERROR(ENOTSUP));

		obj = zfs_userquota_prop_to_obj(zfsvfs, type);
		if (obj == 0)
			return (0);

		err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
		if (err)
			return (err);

		err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
		if (err == ENOENT)
			err = 0;
		return (err);
	}

	/*
	 * Set (or, when quota is 0, remove) a user or group quota.
	 *
	 * type		ZFS_PROP_USERQUOTA or ZFS_PROP_GROUPQUOTA.
	 * domain, rid	identify the user or group (see id_to_fuidstr()).
	 * quota	new quota value; 0 removes the entry.
	 *
	 * Returns 0 on success, EINVAL for any other property type, ENOTSUP if
	 * the ZPL version predates userspace accounting, or the error from the
	 * id conversion, transaction assignment or ZAP update.
	 */
	int
	zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
	    const char *domain, uint64_t rid, uint64_t quota)
	{
		char buf[32];
		int err;
		dmu_tx_t *tx;
		uint64_t *objp;
		boolean_t fuid_dirtied;

		if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
			return (SET_ERROR(EINVAL));

		if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
			return (SET_ERROR(ENOTSUP));

		objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
		    &zfsvfs->z_groupquota_obj;

		err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
		if (err)
			return (err);
		fuid_dirtied = zfsvfs->z_fuid_dirty;

		tx = dmu_tx_create(zfsvfs->z_os);
		/* Hold the existing quota ZAP, or reserve a new one. */
		dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
		if (*objp == 0) {
			/* The new object must also be recorded in the master node. */
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
			    zfs_userquota_prop_prefixes[type]);
		}
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		/* Create the quota ZAP on first use, under z_lock. */
		mutex_enter(&zfsvfs->z_lock);
		if (*objp == 0) {
			*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
			    DMU_OT_NONE, 0, tx);
			VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
			    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
		}
		mutex_exit(&zfsvfs->z_lock);

		if (quota == 0) {
			err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
			/* Removing a quota that was never set is fine. */
			if (err == ENOENT)
				err = 0;
		} else {
			err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
		}
		ASSERT(err == 0);
		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);
		dmu_tx_commit(tx);
		return (err);
	}

	/*
	 * Report whether the user or group identified by fuid has used at least
	 * as much space as its quota allows.
	 *
	 * Returns B_FALSE when no quota object exists, during replay, or when
	 * either the quota or the usage entry cannot be looked up.
	 */
	boolean_t
	zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
	{
		char buf[32];
		uint64_t used, quota, usedobj, quotaobj;
		int err;

		usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
		quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

		if (quotaobj == 0 || zfsvfs->z_replay)
			return (B_FALSE);

		/* A 64-bit value in hex is at most 16 digits, well within buf. */
		(void) sprintf(buf, "%llx", (longlong_t)fuid);
		err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
		if (err != 0)
			return (B_FALSE);

		err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
		if (err != 0)
			return (B_FALSE);
		return (used >= quota);
	}

	/*
	 * Report whether the owning user (or group, if isgroup) of znode zp is
	 * over quota.  Returns B_FALSE when no quota object exists or during
	 * replay; otherwise defers to zfs_fuid_overquota().
	 */
	boolean_t
	zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
	{
		uint64_t fuid;
		uint64_t quotaobj;

		quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

		fuid = isgroup ? zp->z_gid : zp->z_uid;

		if (quotaobj == 0 || zfsvfs->z_replay)
			return (B_FALSE);

		return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
	}

	/*
	 * Associate this zfsvfs with the given objset, which must be owned.
	 * This will cache a bunch of on-disk state from the objset in the
	 * zfsvfs.
	 *
	 * Returns 0 on success, ENOTSUP if the file system's ZPL version is newer
	 * than the pool supports, or the error from a failed property or ZAP
	 * lookup.  Optional master-node entries that are absent are recorded
	 * as 0.
	 */
	static int
	zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
	{
		int error;
		uint64_t val;

		zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
		zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
		zfsvfs->z_os = os;

		error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
		if (error != 0)
			return (error);
		if (zfsvfs->z_version >
		    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
			(void) printf("Can't mount a version %lld file system "
			    "on a version %lld pool\n. Pool must be upgraded to mount "
			    "this file system.", (u_longlong_t)zfsvfs->z_version,
			    (u_longlong_t)spa_version(dmu_objset_spa(os)));
			return (SET_ERROR(ENOTSUP));
		}
		error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
		if (error != 0)
			return (error);
		zfsvfs->z_norm = (int)val;

		error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
		if (error != 0)
			return (error);
		zfsvfs->z_utf8 = (val != 0);

		error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
		if (error != 0)
			return (error);
		zfsvfs->z_case = (uint_t)val;

		/*
		 * Fold case on file systems that are always or sometimes case
		 * insensitive.
		 */
		if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    zfsvfs->z_case == ZFS_CASE_MIXED)
			zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

		zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
		zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

		uint64_t sa_obj = 0;
		if (zfsvfs->z_use_sa) {
			/* should either have both of these objects or none */
			error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
			    &sa_obj);
			if (error != 0)
				return (error);
		}

		error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
		    &zfsvfs->z_attr_table);
		if (error != 0)
			return (error);

		if (zfsvfs->z_version >= ZPL_VERSION_SA)
			sa_register_update_callback(os, zfs_sa_upgrade);

		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
		    &zfsvfs->z_root);
		if (error != 0)
			return (error);
		ASSERT(zfsvfs->z_root != 0);

		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
		    &zfsvfs->z_unlinkedobj);
		if (error != 0)
			return (error);

		/* The remaining master-node entries are optional. */
		error = zap_lookup(os, MASTER_NODE_OBJ,
		    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
		    8, 1, &zfsvfs->z_userquota_obj);
		if (error == ENOENT)
			zfsvfs->z_userquota_obj = 0;
		else if (error != 0)
			return (error);

		error = zap_lookup(os, MASTER_NODE_OBJ,
		    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
		    8, 1, &zfsvfs->z_groupquota_obj);
		if (error == ENOENT)
			zfsvfs->z_groupquota_obj = 0;
		else if (error != 0)
			return (error);

		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
		    &zfsvfs->z_fuid_obj);
		if (error == ENOENT)
			zfsvfs->z_fuid_obj = 0;
		else if (error != 0)
			return (error);

		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
		    &zfsvfs->z_shares_dir);
		if (error == ENOENT)
			zfsvfs->z_shares_dir = 0;
		else if (error != 0)
			return (error);

		/*
		 * Only use the name cache if we are looking for a
		 * name on a file system that does not require normalization
		 * or case folding.  We can also look there if we happen to be
		 * on a non-normalizing, mixed sensitivity file system IF we
		 * are looking for the exact name (which is always the case on
		 * FreeBSD).
		 */
		zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
		    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
		    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));

		return (0);
	}

	/*
	 * Allocate a zfsvfs for the named objset, take ownership of the objset
	 * and initialize the zfsvfs from it.
	 *
	 * osname	dataset name; must be shorter than MNAMELEN.
	 * zfvp		out: the new zfsvfs (set by zfsvfs_create_impl()).
	 *
	 * Returns 0 on success, ENAMETOOLONG if osname is too long, or the error
	 * from dmu_objset_own() or zfsvfs_create_impl().  On failure the objset
	 * is disowned and the zfsvfs has been freed.
	 */
	int
	zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
	{
		objset_t *os;
		zfsvfs_t *zfsvfs;
		int error;

		/*
		 * XXX: Fix struct statfs so this isn't necessary!
		 *
		 * The 'osname' is used as the filesystem's special node, which means
		 * it must fit in statfs.f_mntfromname, or else it can't be
		 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
		 * 'zfs unmount' to think it's not mounted when it is.
		 */
		if (strlen(osname) >= MNAMELEN)
			return (SET_ERROR(ENAMETOOLONG));

		zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

		/*
		 * We claim to always be readonly so we can open snapshots;
		 * other ZPL code will prevent us from writing to snapshots.
		 */

		error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
		if (error != 0) {
			kmem_free(zfsvfs, sizeof (zfsvfs_t));
			return (error);
		}

		/* On failure zfsvfs_create_impl() has already freed zfsvfs. */
		error = zfsvfs_create_impl(zfvp, zfsvfs, os);
		if (error != 0) {
			dmu_objset_disown(os, zfsvfs);
		}
		return (error);
	}


	/*
	 * Initialize the locks and lists of a freshly allocated zfsvfs and load
	 * its state from the objset via zfsvfs_init().
	 *
	 * zfvp		out: zfsvfs on success, NULL on failure.
	 * zfsvfs	zeroed zfsvfs to initialize; freed here on failure.
	 * os		the objset to associate with the zfsvfs.
	 *
	 * Returns 0 on success or the error from zfsvfs_init().
	 */
	int
	zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
	{
		int error;

		zfsvfs->z_vfs = NULL;
		zfsvfs->z_parent = zfsvfs;

		mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
		    offsetof(znode_t, z_link_node));
	#ifdef DIAGNOSTIC
		rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
	#else
		rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
	#endif
		rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
		rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
		for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
			mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

		error = zfsvfs_init(zfsvfs, os);
		if (error != 0) {
			*zfvp = NULL;
			kmem_free(zfsvfs, sizeof (zfsvfs_t));
			return (error);
		}

		*zfvp = zfsvfs;
		return (0);
	}

	/*
	 * Finish bringing a zfsvfs online: register property callbacks, open the
	 * intent log and, when mounting, drain the unlinked set and replay (or
	 * destroy) the log.
	 *
	 * mounting	B_TRUE for a real mount; B_FALSE otherwise (ie: online
	 *		recv), in which case no replay is needed.
	 *
	 * Returns 0 on success or the error from zfs_register_callbacks().
	 */
	static int
	zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
	{
		int error;

		error = zfs_register_callbacks(zfsvfs->z_vfs);
		if (error)
			return (error);

		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

		/*
		 * If we are not mounting (ie: online recv), then we don't
		 * have to worry about replaying the log as we blocked all
		 * operations out since we closed the ZIL.
		 */
		if (mounting) {
			boolean_t readonly;

			/*
			 * During replay we remove the read only flag to
			 * allow replays to succeed.
			 */
			readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
			if (readonly != 0)
				zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
			else
				zfs_unlinked_drain(zfsvfs);

			/*
			 * Parse and replay the intent log.
			 *
			 * Because of ziltest, this must be done after
			 * zfs_unlinked_drain().  (Further note: ziltest
			 * doesn't use readonly mounts, where
			 * zfs_unlinked_drain() isn't called.)  This is because
			 * ziltest causes spa_sync() to think it's committed,
			 * but actually it is not, so the intent log contains
			 * many txg's worth of changes.
			 *
			 * In particular, if object N is in the unlinked set in
			 * the last txg to actually sync, then it could be
			 * actually freed in a later txg and then reallocated
			 * in a yet later txg.  This would write a "create
			 * object N" record to the intent log.  Normally, this
			 * would be fine because the spa_sync() would have
			 * written out the fact that object N is free, before
			 * we could write the "create object N" intent log
			 * record.
			 *
			 * But when we are in ziltest mode, we advance the "open
			 * txg" without actually spa_sync()-ing the changes to
			 * disk.  So we would see that object N is still
			 * allocated and in the unlinked set, and there is an
			 * intent log record saying to allocate it.
			 */
			if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
				if (zil_replay_disable) {
					zil_destroy(zfsvfs->z_log, B_FALSE);
				} else {
					zfsvfs->z_replay = B_TRUE;
					zil_replay(zfsvfs->z_os, zfsvfs,
					    zfs_replay_vector);
					zfsvfs->z_replay = B_FALSE;
				}
			}
			zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
		}

		/*
		 * Set the objset user_ptr to track its zfsvfs.
		 */
		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

		return (0);
	}

	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

	/*
	 * Tear down a zfsvfs: destroy its FUID state, locks and znode list, then
	 * release the structure itself.
	 */
	void
	zfsvfs_free(zfsvfs_t *zfsvfs)
	{
		/*
		 * Taking and immediately dropping zfsvfs_lock acts as a barrier to
		 * prevent the filesystem from going away in zfs_znode_move() until
		 * we can safely ensure that the filesystem is not unmounted.  We
		 * consider the filesystem valid before the barrier and invalid
		 * after the barrier.
		 */
		rw_enter(&zfsvfs_lock, RW_READER);
		rw_exit(&zfsvfs_lock);

		zfs_fuid_destroy(zfsvfs);

		mutex_destroy(&zfsvfs->z_znodes_lock);
		mutex_destroy(&zfsvfs->z_lock);
		list_destroy(&zfsvfs->z_all_znodes);
		rrm_destroy(&zfsvfs->z_teardown_lock);
		rw_destroy(&zfsvfs->z_teardown_inactive_lock);
		rw_destroy(&zfsvfs->z_fuid_lock);
		for (int idx = 0; idx != ZFS_OBJ_MTX_SZ; idx++)
			mutex_destroy(&zfsvfs->z_hold_mtx[idx]);

		kmem_free(zfsvfs, sizeof (zfsvfs_t));
	}

	/*
	 * Recompute z_use_fuids and z_use_sa from the ZPL version and, when the
	 * zfsvfs is attached to a vfs, set or clear the vfs features that depend
	 * on FUID support.
	 */
	static void
	zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
	{
		zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
		zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

		/* Nothing more to do when no vfs is attached. */
		if (!zfsvfs->z_vfs)
			return;

		if (!zfsvfs->z_use_fuids) {
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
			return;
		}

		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
	}

	/*
	 * Mount the dataset named osname on vfsp: create the zfsvfs, fill in the
	 * generic vfs fields and fsid, set features, and either configure a
	 * snapshot mount or run the full zfsvfs_setup().
	 *
	 * Returns 0 on success.  On any failure after the zfsvfs has been
	 * created, the objset is disowned and the zfsvfs is freed.
	 */
	static int
	zfs_domount(vfs_t *vfsp, char *osname)
	{
		uint64_t recordsize, fsid_guid;
		int error = 0;
		zfsvfs_t *zfsvfs;

		ASSERT(vfsp);
		ASSERT(osname);

		error = zfsvfs_create(osname, &zfsvfs);
		if (error)
			return (error);
		zfsvfs->z_vfs = vfsp;

	#ifdef illumos
		/* Initialize the generic filesystem structure. */
		vfsp->vfs_bcount = 0;
		vfsp->vfs_data = NULL;

		if (zfs_create_unique_device(&mount_dev) == -1) {
			error = SET_ERROR(ENODEV);
			goto out;
		}
		ASSERT(vfs_devismounted(mount_dev) == 0);
	#endif

		if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
		    NULL)) != 0)
			goto out;
		zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
		zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

		vfsp->vfs_data = zfsvfs;
		vfsp->mnt_flag |= MNT_LOCAL;
		vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
		vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
		vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
		vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */

		/*
		 * The fsid is 64 bits, composed of an 8-bit fs type, which
		 * separates our fsid from any other filesystem types, and a
		 * 56-bit objset unique ID.  The objset unique ID is unique to
		 * all objsets open on this system, provided by unique_create().
		 * The 8-bit fs type must be put in the low bits of fsid[1]
		 * because that's where other Solaris filesystems put it.
		 */
		fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
		ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
		vfsp->vfs_fsid.val[0] = fsid_guid;
		vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
		    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

		/*
		 * Set features for file system.
		 */
		zfs_set_fuid_feature(zfsvfs);
		if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
			vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
			vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
			vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
		} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
			vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
			vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		}
		vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

		if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
			uint64_t pval;

			atime_changed_cb(zfsvfs, B_FALSE);
			readonly_changed_cb(zfsvfs, B_TRUE);
			if ((error = dsl_prop_get_integer(osname, "xattr", &pval,
			    NULL)) != 0)
				goto out;
			xattr_changed_cb(zfsvfs, pval);
			zfsvfs->z_issnap = B_TRUE;
			zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

			mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
			dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
			mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
		} else {
			/*
			 * Bail out on failure: the zfsvfs is freed below, so it
			 * must not be handed to zfsctl_create() first.
			 */
			if ((error = zfsvfs_setup(zfsvfs, B_TRUE)) != 0)
				goto out;
		}

		vfs_mountedfrom(vfsp, osname);

		if (!zfsvfs->z_issnap)
			zfsctl_create(zfsvfs);
	out:
		if (error) {
			dmu_objset_disown(zfsvfs->z_os, zfsvfs);
			zfsvfs_free(zfsvfs);
		} else {
			atomic_inc_32(&zfs_active_fs_count);
		}

		return (error);
	}

	/*
	 * Drop every property callback registered for this zfsvfs.  Nothing is
	 * unregistered when the objset is a snapshot.
	 */
	void
	zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
	{
		objset_t *objset = zfsvfs->z_os;

		if (dmu_objset_is_snapshot(objset))
			return;

		dsl_prop_unregister_all(dmu_objset_ds(objset), zfsvfs);
	}

	#ifdef SECLABEL
	/*
	 * Convert a decimal digit string to a uint64_t integer.
	 *
	 * str		NUL-terminated string of decimal digits.
	 * objnum	out: the parsed value; written only on success.
	 *
	 * Returns 0 on success or EINVAL if any character is not a digit.  No
	 * overflow check is made, and an empty string parses as 0.
	 */
	static int
	str_to_uint64(char *str, uint64_t *objnum)
	{
		uint64_t num = 0;

		while (*str) {
			if (*str < '0' || *str > '9')
				return (SET_ERROR(EINVAL));

			num = num*10 + *str++ - '0';
		}

		*objnum = num;
		return (0);
	}

	/*
	 * The boot path passed from the boot loader is in the form of
	 * "rootpool-name/root-filesystem-object-number".  Convert this
	 * string to a dataset name: "rootpool-name/root-filesystem-name".
	 *
	 * bpath	boot path; temporarily modified but restored on return.
	 * outpath	out: dataset name.  It receives a copy of bpath when bpath
	 *		has no '/' or the part after the '/' is not a number.
	 *
	 * Returns 0 on success, EINVAL if bpath is empty or starts with '/', or
	 * the error from dsl_dsobj_to_dsname().
	 */
	static int
	zfs_parse_bootfs(char *bpath, char *outpath)
	{
		char *slashp;
		uint64_t objnum;
		int error;

		if (*bpath == 0 || *bpath == '/')
			return (SET_ERROR(EINVAL));

		(void) strcpy(outpath, bpath);

		slashp = strchr(bpath, '/');

		/* if no '/', just return the pool name */
		if (slashp == NULL) {
			return (0);
		}

		/* if not a number, just return the root dataset name */
		if (str_to_uint64(slashp+1, &objnum)) {
			return (0);
		}

		/* Cut bpath at the '/' so it names only the pool for the lookup. */
		*slashp = '\0';
		error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
		*slashp = '/';

		return (error);
	}

	/*
	 * Check that the hex label string is appropriate for the dataset being
	 * mounted into the global_zone proper.
	 *
	 * Return an error if the hex label string is not default or
	 * admin_low/admin_high.  For admin_low labels, the corresponding
	 * dataset must be readonly.
	 *
	 * Returns 0 if the label is acceptable, EACCES otherwise (including when
	 * the readonly property cannot be read).
	 */
	int
	zfs_check_global_label(const char *dsname, const char *hexsl)
	{
		if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
			return (0);
		if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
			return (0);
		if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
			/* must be readonly */
			uint64_t rdonly;

			if (dsl_prop_get_integer(dsname,
			    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
				return (SET_ERROR(EACCES));
			return (rdonly ? 0 : SET_ERROR(EACCES));
		}
		return (SET_ERROR(EACCES));
	}

	/*
	 * Determine whether the mount is allowed according to MAC check.
	 * by comparing (where appropriate) label of the dataset against
	 * the label of the zone being mounted into.  If the dataset has
	 * no label, create one.
	 *
	 * Returns 0 if access allowed, error otherwise (e.g. EACCES)
	 */
	static int
	zfs_mount_label_policy(vfs_t *vfsp, char *osname)
	{
		int error, retv;
		zone_t *mntzone = NULL;
		ts_label_t *mnt_tsl;
		bslabel_t *mnt_sl;
		bslabel_t ds_sl;
		char ds_hexsl[MAXNAMELEN];

		retv = EACCES;	/* assume the worst */

		/*
		 * Start by getting the dataset label if it exists.
		 */
		error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
		    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
		if (error)
			return (SET_ERROR(EACCES));

		/*
		 * If labeling is NOT enabled, then disallow the mount of datasets
		 * which have a non-default label already.  No other label checks
		 * are needed.
		 */
		if (!is_system_labeled()) {
			if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
				return (0);
			return (SET_ERROR(EACCES));
		}

		/*
		 * Get the label of the mountpoint.  If mounting into the global
		 * zone (i.e. mountpoint is not within an active zone and the
		 * zoned property is off), the label must be default or
		 * admin_low/admin_high only; no other checks are needed.
		 */
		mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
		if (mntzone->zone_id == GLOBAL_ZONEID) {
			uint64_t zoned;

			zone_rele(mntzone);

			if (dsl_prop_get_integer(osname,
			    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
				return (SET_ERROR(EACCES));
			if (!zoned)
				return (zfs_check_global_label(osname, ds_hexsl));
			else
				/*
				 * This is the case of a zone dataset being mounted
				 * initially, before the zone has been fully created;
				 * allow this mount into global zone.
				 */
				return (0);
		}

		mnt_tsl = mntzone->zone_slabel;
		ASSERT(mnt_tsl != NULL);
		label_hold(mnt_tsl);
		mnt_sl = label2bslabel(mnt_tsl);

		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
			/*
			 * The dataset doesn't have a real label, so fabricate one.
			 */
			char *str = NULL;

			if (l_to_str_internal(mnt_sl, &str) == 0 &&
			    dsl_prop_set_string(osname,
			    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
			    ZPROP_SRC_LOCAL, str) == 0)
				retv = 0;
			if (str != NULL)
				kmem_free(str, strlen(str) + 1);
		} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
			/*
			 * Now compare labels to complete the MAC check.  If the
			 * labels are equal then allow access.  If the mountpoint
			 * label dominates the dataset label, allow readonly access.
			 * Otherwise, access is denied.
			 */
			if (blequal(mnt_sl, &ds_sl))
				retv = 0;
			else if (bldominates(mnt_sl, &ds_sl)) {
				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
				retv = 0;
			}
		}

		label_rele(mnt_tsl);
		zone_rele(mntzone);
		return (retv);
	}
	#endif /* SECLABEL */

	#ifdef OPENSOLARIS_MOUNTROOT
	/*
	 * Root file system entry point.
	 *
	 * why	ROOT_INIT:    import the root pool and mount the boot dataset.
	 *	ROOT_REMOUNT: make the root writable and refresh mount options.
	 *	ROOT_UNMOUNT: unregister callbacks and sync.
	 *
	 * Returns 0 on success, EBUSY if ROOT_INIT is requested more than once,
	 * EINVAL if the boot property is missing, ENOTSUP for any other "why",
	 * or the error from the failing step.
	 */
	static int
	zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
	{
		int error = 0;
		static int zfsrootdone = 0;
		zfsvfs_t *zfsvfs = NULL;
		znode_t *zp = NULL;
		vnode_t *vp = NULL;
		char *zfs_bootfs;
		char *zfs_devid;

		ASSERT(vfsp);

		/*
		 * The filesystem that we mount as root is defined in the
		 * boot property "zfs-bootfs" with a format of
		 * "poolname/root-dataset-objnum".
		 */
		if (why == ROOT_INIT) {
			if (zfsrootdone++)
				return (SET_ERROR(EBUSY));
			/*
			 * the process of doing a spa_load will require the
			 * clock to be set before we could (for example) do
			 * something better by looking at the timestamp on
			 * an uberblock, so just set it to -1.
			 */
			clkset(-1);

			if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
				cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
				    "bootfs name");
				return (SET_ERROR(EINVAL));
			}
			zfs_devid = spa_get_bootprop("diskdevid");
			error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
			if (zfs_devid)
				spa_free_bootprop(zfs_devid);
			if (error) {
				spa_free_bootprop(zfs_bootfs);
				cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
				    error);
				return (error);
			}
			if ((error = zfs_parse_bootfs(zfs_bootfs,
			    rootfs.bo_name)) != 0) {
				spa_free_bootprop(zfs_bootfs);
				cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
				    error);
				return (error);
			}

			spa_free_bootprop(zfs_bootfs);

			if ((error = vfs_lock(vfsp)) != 0)
				return (error);

			if ((error = zfs_domount(vfsp, rootfs.bo_name)) != 0) {
				cmn_err(CE_NOTE, "zfs_domount: error %d", error);
				goto out;
			}

			zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
			ASSERT(zfsvfs);
			if ((error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) != 0) {
				cmn_err(CE_NOTE, "zfs_zget: error %d", error);
				goto out;
			}

			vp = ZTOV(zp);
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VROOT;
			mutex_exit(&vp->v_lock);
			rootvp = vp;

			/*
			 * Leave rootvp held.  The root file system is never unmounted.
			 */

			vfs_add((struct vnode *)0, vfsp,
			    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
	out:
			vfs_unlock(vfsp);
			return (error);
		} else if (why == ROOT_REMOUNT) {
			readonly_changed_cb(vfsp->vfs_data, B_FALSE);
			vfsp->vfs_flag |= VFS_REMOUNT;

			/* refresh mount options */
			zfs_unregister_callbacks(vfsp->vfs_data);
			return (zfs_register_callbacks(vfsp));

		} else if (why == ROOT_UNMOUNT) {
			zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
			(void) zfs_sync(vfsp, 0, 0);
			return (0);
		}

		/*
		 * if "why" is equal to anything else other than ROOT_INIT,
		 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
		 */
		return (SET_ERROR(ENOTSUP));
	}
	#endif /* OPENSOLARIS_MOUNTROOT */

	/*
	 * Extract the pool name (everything before the first '/') from a dataset
	 * name.
	 *
	 * osname	dataset name.
	 * poolname	out: NUL-terminated pool name; the length checks below
	 *		assume a buffer of at least MAXNAMELEN bytes.
	 *
	 * Returns 0 on success or ENAMETOOLONG if the pool name would not fit.
	 */
	static int
	getpoolname(const char *osname, char *poolname)
	{
		const char *p;

		p = strchr(osname, '/');
		if (p == NULL) {
			/* No '/': the whole name is the pool name. */
			if (strlen(osname) >= MAXNAMELEN)
				return (ENAMETOOLONG);
			(void) strcpy(poolname, osname);
		} else {
			if (p - osname >= MAXNAMELEN)
				return (ENAMETOOLONG);
			/* strncpy() does not terminate here; do it explicitly. */
			(void) strncpy(poolname, osname, p - osname);
			poolname[p - osname] = '\0';
		}
		return (0);
	}

	/ARGSUSED/
	static int
	zfs_mount(vfs_t *vfsp)
	{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;

	#ifdef illumos
	if (mvp->v_type != VDIR)
	return (SET_ERROR(ENOTDIR));

	mutex_enter(&mvp->v_lock);
	if ((uap->flags & MS_REMOUNT) == 0 &&
	(uap->flags & MS_OVERLAY) == 0 &&
	(mvp->v_count != 1 \|\| (mvp->v_flag & VROOT))) {
	mutex_exit(&mvp->v_lock);
	return (SET_ERROR(EBUSY));
	}
	mutex_exit(&mvp->v_lock);

	/*
	* ZFS does not support passing unparsed data in via MS_DATA.
	* Users should use the MS_OPTIONSTR interface; this means
	* that all option parsing is already done and the options struct
	* can be interrogated.
	*/
	if ((uap->flags & MS_DATA) && uap->datalen > 0)
	return (SET_ERROR(EINVAL));

	/*
	* Get the objset name (the "special" mount argument).
	*/
	if (error = pn_get(uap->spec, fromspace, &spn))
	return (error);

	osname = spn.pn_path;
	#else /* !illumos */
	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
	return (SET_ERROR(EPERM));

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
	return (SET_ERROR(EINVAL));

	/*
	* If full-owner-access is enabled and delegated administration is
	* turned on, we must set nosuid.
	*/
	if (zfs_super_owner &&
	dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
	secpolicy_fs_mount_clearopts(cr, vfsp);
	}
	#endif /* illumos */

	/*
	* Check for mount privilege?
	*
	* If we don't have privilege then see if
	* we have local permission to allow it
	*/
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error) {
	if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
	goto out;

	if (!(vfsp->vfs_flag & MS_REMOUNT)) {
	vattr_t vattr;

	/*
	* Make sure user is the owner of the mount point
	* or has sufficient privileges.
	*/

	vattr.va_mask = AT_UID;

	vn_lock(mvp, LK_SHARED \| LK_RETRY);
	if (VOP_GETATTR(mvp, &vattr, cr)) {
	VOP_UNLOCK(mvp, 0);
	goto out;
	}

	if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
	VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
	VOP_UNLOCK(mvp, 0);
	goto out;
	}
	VOP_UNLOCK(mvp, 0);
	}

	secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	* Refuse to mount a filesystem if we are in a local zone and the
	* dataset is not visible.
	*/
	if (!INGLOBALZONE(curthread) &&
	(!zone_dataset_visible(osname, &canwrite) \|\| !canwrite)) {
	error = SET_ERROR(EPERM);
	goto out;
	}

	#ifdef SECLABEL
	error = zfs_mount_label_policy(vfsp, osname);
	if (error)
	goto out;
	#endif

	vfsp->vfs_flag \|= MNT_NFS4ACLS;

	/*
	* When doing a remount, we simply refresh our temporary properties
	* according to those options set in the current VFS options.
	*/
	if (vfsp->vfs_flag & MS_REMOUNT) {
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	* Refresh mount options with z_teardown_lock blocking I/O while
	* the filesystem is in an inconsistent state.
	* The lock also serializes this code with filesystem
	* manipulations between entry to zfs_suspend_fs() and return
	* from zfs_resume_fs().
	*/
	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
	zfs_unregister_callbacks(zfsvfs);
	error = zfs_register_callbacks(vfsp);
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
	goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	(vfsp->vfs_flag & MNT_UPDATE) == 0) {
	char pname[MAXNAMELEN];

	error = getpoolname(osname, pname);
	if (error == 0)
	error = spa_import_rootpool(pname);
	if (error)
	goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

	#ifdef illumos
	/*
	* Add an extra VFS_HOLD on our parent vfs so that it can't
	* disappear due to a forced unmount.
	*/
	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
	VFS_HOLD(mvp->v_vfsp);
	#endif

	out:
	return (error);
	}

	/*
	 * Fill in file system statistics for vfsp.
	 *
	 * Sizes are reported in units of SPA_MINBLOCKSIZE; the preferred I/O
	 * size is the f_iosize value stored in the mount's mnt_stat.  Returns 0
	 * (ZFS_ENTER may return early on its own — TODO confirm its error
	 * behavior).
	 */
	static int
	zfs_statfs(vfs_t *vfsp, struct statfs *statp)
	{
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		uint64_t refdbytes, availbytes, usedobjs, availobjs;

		statp->f_version = STATFS_VERSION;

		ZFS_ENTER(zfsvfs);

		dmu_objset_space(zfsvfs->z_os,
		    &refdbytes, &availbytes, &usedobjs, &availobjs);

		/*
		 * The underlying storage pool actually uses multiple block sizes.
		 * We report the fragsize as the smallest block size we support,
		 * and we report our blocksize as the filesystem's maximum blocksize.
		 */
		statp->f_bsize = SPA_MINBLOCKSIZE;
		statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

		/*
		 * The following report "total" blocks of various kinds in the
		 * file system, but reported in terms of f_frsize - the
		 * "fragment" size.
		 */

		statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
		statp->f_bfree = availbytes / statp->f_bsize;
		statp->f_bavail = statp->f_bfree; /* no root reservation */

		/*
		 * statvfs() should really be called statufs(), because it assumes
		 * static metadata.  ZFS doesn't preallocate files, so the best
		 * we can do is report the max that could possibly fit in f_files,
		 * and that minus the number actually used in f_ffree.
		 * For f_ffree, report the smaller of the number of object available
		 * and the number of blocks (each object will take at least a block).
		 */
		statp->f_ffree = MIN(availobjs, statp->f_bfree);
		statp->f_files = statp->f_ffree + usedobjs;

		/*
		 * We're a zfs filesystem.
		 */
		(void) strlcpy(statp->f_fstypename, "zfs",
		    sizeof (statp->f_fstypename));

		(void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
		    sizeof (statp->f_mntfromname));
		(void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
		    sizeof (statp->f_mntonname));

		statp->f_namemax = MAXNAMELEN - 1;

		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * VFS root entry point: return the root vnode of the filesystem in
	 * '*vpp', locked according to 'flags'.
	 *
	 * Returns 0 on success.  If the root znode cannot be obtained the error
	 * from zfs_zget() is returned; if locking fails the vnode reference is
	 * dropped, '*vpp' is set to NULL and the vn_lock() error is returned.
	 */
	static int
	zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
	{
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		znode_t *rootzp;
		int error;

		ZFS_ENTER(zfsvfs);

		error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
		if (error == 0)
			*vpp = ZTOV(rootzp);

		ZFS_EXIT(zfsvfs);

		/* The vnode lock is taken only after leaving the ZFS_ENTER section. */
		if (error == 0) {
			error = vn_lock(*vpp, flags);
			if (error != 0) {
				VN_RELE(*vpp);
				*vpp = NULL;
			}
		}
		return (error);
	}

	/*
	 * Teardown the zfsvfs::z_os.
	 *
	 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
	 * and 'z_teardown_inactive_lock' held.
	 *
	 * Returns 0 on success, or EIO when called for a non-unmount teardown
	 * on a filesystem that was already unmounted or has no open objset (in
	 * which case both locks are released before returning).
	 */
	static int
	zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
	{
		znode_t	*zp;

		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

		if (!unmounting) {
			/*
			 * We purge the parent filesystem's vfsp as the parent
			 * filesystem and all of its snapshots have their vnode's
			 * v_vfsp set to the parent's filesystem's vfsp.  Note,
			 * 'z_parent' is self referential for non-snapshots.
			 */
			(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	#ifdef FREEBSD_NAMECACHE
			cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
	#endif
		}

		/*
		 * Close the zil. NB: Can't close the zil while zfs_inactive
		 * threads are blocked as zil_close can call zfs_inactive.
		 */
		if (zfsvfs->z_log) {
			zil_close(zfsvfs->z_log);
			zfsvfs->z_log = NULL;
		}

		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

		/*
		 * If we are not unmounting (ie: online recv) and someone already
		 * unmounted this file system while we were doing the switcheroo,
		 * or a reopen of z_os failed then just bail out now.
		 */
		if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
			rw_exit(&zfsvfs->z_teardown_inactive_lock);
			rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
			return (SET_ERROR(EIO));
		}

		/*
		 * At this point there are no vops active, and any new vops will
		 * fail with EIO since we have z_teardown_lock for writer (only
		 * relevant for forced unmount).
		 *
		 * Release all holds on dbufs.
		 */
		mutex_enter(&zfsvfs->z_znodes_lock);
		for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
			if (zp->z_sa_hdl) {
				ASSERT(ZTOV(zp)->v_count >= 0);
				zfs_znode_dmu_fini(zp);
			}
		}
		mutex_exit(&zfsvfs->z_znodes_lock);

		/*
		 * If we are unmounting, set the unmounted flag and let new vops
		 * unblock.  zfs_inactive will have the unmounted behavior, and all
		 * other vops will fail with EIO.
		 */
		if (unmounting) {
			zfsvfs->z_unmounted = B_TRUE;
			rw_exit(&zfsvfs->z_teardown_inactive_lock);
			rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		}

		/*
		 * z_os will be NULL if there was an error in attempting to reopen
		 * zfsvfs, so just return as the properties had already been
		 * unregistered and cached data had been evicted before.
		 */
		if (zfsvfs->z_os == NULL)
			return (0);

		/*
		 * Unregister properties.
		 */
		zfs_unregister_callbacks(zfsvfs);

		/*
		 * Evict cached data.  A dirty dataset on a writable mount is synced
		 * out first.
		 */
		if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
		    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		dmu_objset_evict_dbufs(zfsvfs->z_os);

		return (0);
	}

	/ARGSUSED/
	static int
	zfs_umount(vfs_t *vfsp, int fflag)
	{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
	if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
	ZFS_DELEG_PERM_MOUNT, cr))
	return (ret);
	}

	/*
	* We purge the parent filesystem's vfsp as the parent filesystem
	* and all of its snapshots have their vnode's v_vfsp set to the
	* parent's filesystem's vfsp. Note, 'z_parent' is self
	* referential for non-snapshots.
	*/
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	* Unmount any snapshots mounted under .zfs before unmounting the
	* dataset itself.
	*/
	if (zfsvfs->z_ctldir != NULL) {
	if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
	return (ret);
	}

	if (fflag & MS_FORCE) {
	/*
	* Mark file system as unmounted before calling
	* vflush(FORCECLOSE). This way we ensure no future vnops
	* will be called and risk operating on DOOMED vnodes.
	*/
	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
	zfsvfs->z_unmounted = B_TRUE;
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
	}

	/*
	* Flush all the files.
	*/
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
	return (ret);

	#ifdef illumos
	if (!(fflag & MS_FORCE)) {
	/*
	* Check the number of active vnodes in the file system.
	* Our count is maintained in the vfs structure, but the
	* number is off by 1 to indicate a hold on the vfs
	* structure itself.
	*
	* The '.zfs' directory maintains a reference of its
	* own, and any active references underneath are
	* reflected in the vnode count.
	*/
	if (zfsvfs->z_ctldir == NULL) {
	if (vfsp->vfs_count > 1)
	return (SET_ERROR(EBUSY));
	} else {
	if (vfsp->vfs_count > 2 \|\|
	zfsvfs->z_ctldir->v_count > 1)
	return (SET_ERROR(EBUSY));
	}
	}
	#endif

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	* z_os will be NULL if there was an error in
	* attempting to reopen zfsvfs.
	*/
	if (os != NULL) {
	/*
	* Unset the objset user_ptr.
	*/
	mutex_enter(&os->os_user_ptr_lock);
	dmu_objset_set_user(os, NULL);
	mutex_exit(&os->os_user_ptr_lock);

	/*
	* Finally release the objset
	*/
	dmu_objset_disown(os, zfsvfs);
	}

	/*
	* We can now safely destroy the '.zfs' directory node.
	*/
	if (zfsvfs->z_ctldir != NULL)
	zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
	}

	/*
	 * VFS vget entry point: look up the vnode for object number 'ino' and
	 * return it in '*vpp', locked according to 'flags'.
	 *
	 * Returns 0 on success.  EOPNOTSUPP is returned for the virtual .zfs
	 * entries, EINVAL for an unlinked znode; otherwise the error from
	 * zfs_zget() or vn_lock().  On any failure '*vpp' is set to NULL.
	 */
	static int
	zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
	{
		zfsvfs_t	*zfsvfs = vfsp->vfs_data;
		znode_t		*zp;
		int 		err;

		/*
		 * zfs_zget() can't operate on virtual entries like .zfs/ or
		 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
		 * This will make NFS to switch to LOOKUP instead of using VGET.
		 */
		if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
		    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
			return (EOPNOTSUPP);

		ZFS_ENTER(zfsvfs);
		err = zfs_zget(zfsvfs, ino, &zp);
		if (err == 0 && zp->z_unlinked) {
			vrele(ZTOV(zp));
			err = EINVAL;
		}
		if (err == 0)
			*vpp = ZTOV(zp);
		ZFS_EXIT(zfsvfs);
		if (err == 0) {
			err = vn_lock(*vpp, flags);
			/*
			 * A failed lock does not consume the reference taken by
			 * zfs_zget(); drop it here so the vnode is not leaked.
			 */
			if (err != 0)
				vrele(*vpp);
		}
		if (err != 0)
			*vpp = NULL;
		return (err);
	}

	/*
	 * VFS checkexp entry point: check the export settings for the client
	 * address 'nam' by delegating to vfs_stdcheckexp() on the parent
	 * filesystem's mount.  All output parameters are passed through
	 * unchanged.
	 */
	static int
	zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
	    struct ucred **credanonp, int *numsecflavors, int **secflavors)
	{
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * If this is regular file system vfsp is the same as
		 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
		 * zfsvfs->z_parent->z_vfs represents parent file system
		 * which we have to use here, because only this file system
		 * has mnt_export configured.
		 */
		return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
		    credanonp, numsecflavors, secflavors));
	}

	CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
	CTASSERT(LONG_FID_LEN <= sizeof(struct fid));

	/*
	 * VFS fhtovp entry point: translate the file identifier 'fidp' into a
	 * vnode returned in '*vpp'.
	 *
	 * A short fid carries an object number and generation; a long fid
	 * additionally carries an objset id and generation, used to locate a
	 * snapshot's zfsvfs.  Fids naming the .zfs control directories are
	 * resolved through zfsctl_root() and VOP_LOOKUP().
	 *
	 * Returns 0 on success.  EINVAL is returned for an unrecognized fid
	 * length, a failed objset lookup, an unlinked znode or a generation
	 * mismatch; otherwise the error from zfs_zget() or vn_lock().  '*vpp'
	 * is NULL on failure.
	 */
	static int
	zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
	{
		struct componentname cn;
		zfsvfs_t	*zfsvfs = vfsp->vfs_data;
		znode_t		*zp;
		vnode_t		*dvp;
		uint64_t	object = 0;
		uint64_t	fid_gen = 0;
		uint64_t	gen_mask;
		uint64_t	zp_gen;
		int 		i, err;

		*vpp = NULL;

		ZFS_ENTER(zfsvfs);

		/*
		 * On FreeBSD we can get snapshot's mount point or its parent file
		 * system mount point depending if snapshot is already mounted or not.
		 */
		if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
			zfid_long_t	*zlfid = (zfid_long_t *)fidp;
			uint64_t	objsetid = 0;
			uint64_t	setgen = 0;

			/* Both fields are stored as little-endian byte arrays. */
			for (i = 0; i < sizeof (zlfid->zf_setid); i++)
				objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

			for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
				setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

			ZFS_EXIT(zfsvfs);

			/* From here on 'zfsvfs' refers to the looked-up objset. */
			err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
			if (err)
				return (SET_ERROR(EINVAL));
			ZFS_ENTER(zfsvfs);
		}

		if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
			zfid_short_t	*zfid = (zfid_short_t *)fidp;

			for (i = 0; i < sizeof (zfid->zf_object); i++)
				object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

			for (i = 0; i < sizeof (zfid->zf_gen); i++)
				fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
		} else {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
		 * directory tree. If the object == zfsvfs->z_shares_dir, then
		 * we are in the .zfs/shares directory tree.
		 */
		if ((fid_gen == 0 &&
		    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
		    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
			ZFS_EXIT(zfsvfs);
			VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
			if (object == ZFSCTL_INO_SNAPDIR) {
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = LOOKUP;
				cn.cn_flags = ISLASTCN | LOCKLEAF;
				cn.cn_lkflags = flags;
				VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
				vput(dvp);
			} else if (object == zfsvfs->z_shares_dir) {
				/*
				 * XXX This branch must not be taken,
				 * if it is, then the lookup below will
				 * explode.
				 */
				cn.cn_nameptr = "shares";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = LOOKUP;
				cn.cn_flags = ISLASTCN;
				cn.cn_lkflags = flags;
				VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
				vput(dvp);
			} else {
				*vpp = dvp;
			}
			/*
			 * Every step above is VERIFY0()'d, so reaching this point
			 * means success.  'err' may never have been assigned on
			 * this path, so it must not be returned.
			 */
			return (0);
		}

		/*
		 * 'i' still holds sizeof (zfid->zf_gen) from the loop above, so the
		 * mask covers exactly the generation bytes stored in the fid.
		 */
		gen_mask = -1ULL >> (64 - 8 * i);

		dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
		if ((err = zfs_zget(zfsvfs, object, &zp)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
		    sizeof (uint64_t));
		zp_gen = zp_gen & gen_mask;
		if (zp_gen == 0)
			zp_gen = 1;
		if (zp->z_unlinked || zp_gen != fid_gen) {
			dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
			vrele(ZTOV(zp));
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		*vpp = ZTOV(zp);
		ZFS_EXIT(zfsvfs);
		err = vn_lock(*vpp, flags);
		if (err == 0) {
			vnode_create_vobject(*vpp, zp->z_size, curthread);
		} else {
			/*
			 * A failed lock does not consume the reference taken by
			 * zfs_zget(); drop it so the vnode is not leaked.
			 */
			vrele(*vpp);
			*vpp = NULL;
		}
		return (err);
	}

	/*
	 * Suspend the filesystem: block out VOPs and close zfsvfs_t::z_os by
	 * running a non-unmount teardown.
	 *
	 * On success (return value 0) the 'z_teardown_lock' and
	 * 'z_teardown_inactive_lock' are left write held for the caller.
	 * Ownership of the underlying dataset and objset is left intact so
	 * that they can be atomically handed off during a subsequent rollback
	 * or recv operation and the resume thereafter.
	 */
	int
	zfs_suspend_fs(zfsvfs_t *zfsvfs)
	{
		int rc = zfsvfs_teardown(zfsvfs, B_FALSE);

		return (rc != 0 ? rc : 0);
	}

	/*
	 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
	 * is an invariant across any of the operations that can be performed while the
	 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
	 * are the same: the relevant objset and associated dataset are owned by
	 * zfsvfs, held, and long held on entry.
	 *
	 * The caller must hold 'z_teardown_lock' and 'z_teardown_inactive_lock'
	 * for writing; both are released before returning.
	 *
	 * Returns 0 on success or the error from zfsvfs_init(), in which case a
	 * forced unmount of the filesystem is attempted.
	 */
	int
	zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
	{
		int err;
		znode_t *zp;

		ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
		ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

		/*
		 * We already own this, so just update the objset_t, as the one we
		 * had before may have been evicted.
		 */
		objset_t *os;
		VERIFY3P(ds->ds_owner, ==, zfsvfs);
		VERIFY(dsl_dataset_long_held(ds));
		VERIFY0(dmu_objset_from_ds(ds, &os));

		err = zfsvfs_init(zfsvfs, os);
		if (err != 0)
			goto bail;

		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

		zfs_set_fuid_feature(zfsvfs);

		/*
		 * Attempt to re-establish all the active znodes with
		 * their dbufs.  If a zfs_rezget() fails, then we'll let
		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
		 * when they try to use their znode.
		 */
		mutex_enter(&zfsvfs->z_znodes_lock);
		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
			(void) zfs_rezget(zp);
		}
		mutex_exit(&zfsvfs->z_znodes_lock);

	bail:
		/* release the VOPs */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

		if (err) {
			/*
			 * Since we couldn't setup the sa framework, try to force
			 * unmount this file system.
			 */
			if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
				vfs_ref(zfsvfs->z_vfs);
				(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
			}
		}
		return (err);
	}

	/*
	 * Release the zfsvfs attached to 'vfsp' and decrement the count of
	 * active ZFS filesystems.
	 */
	static void
	zfs_freevfs(vfs_t *vfsp)
	{
		zfsvfs_t *zv;

		zv = vfsp->vfs_data;

	#ifdef illumos
		/*
		 * A snapshot mounted through zfs_mount() carries one extra
		 * VFS_HOLD on its parent; drop it now.  A filesystem mounted via
		 * zfs_mountroot() never took that hold, so rootvfs is excluded.
		 */
		if (zv->z_issnap && (vfsp != rootvfs)) {
			VFS_RELE(zv->z_parent->z_vfs);
		}
	#endif

		zfsvfs_free(zv);
		atomic_dec_32(&zfs_active_fs_count);
	}

	#ifdef __i386__
	/* Value of desiredvnodes saved by zfs_vnodes_adjust() for later restore. */
	static int desiredvnodes_backup;
	#endif

	/*
	 * On i386 only: save the current desiredvnodes and, if it still equals
	 * the value computed below, lower it to three quarters of that value.
	 * On other architectures this function does nothing.
	 */
	static void
	zfs_vnodes_adjust(void)
	{
	#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	* We calculate newdesiredvnodes the same way it is done in
	* vntblinit(). If it is equal to desiredvnodes, it means that
	* it wasn't tuned by the administrator and we can tune it down.
	*/
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	vm_kmem_size / (5 * (sizeof(struct vm_object) +
	sizeof(struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
	desiredvnodes = (3 * newdesiredvnodes) / 4;
	#endif
	}

	/*
	 * On i386 only: restore desiredvnodes from desiredvnodes_backup.
	 * On other architectures this function does nothing.
	 */
	static void
	zfs_vnodes_adjust_back(void)
	{

	#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
	#endif
	}

	/*
	 * ZPL initialization: print the ZPL version banner, initialize the .zfs
	 * control directory and znode subsystems, adjust the vnode limit, and
	 * register zfs_space_delta_cb for DMU_OST_ZFS objsets.
	 */
	void
	zfs_init(void)
	{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	* Initialize .zfs directory structures
	*/
	zfsctl_init();

	/*
	* Initialize znode cache, vnode ops, etc...
	*/
	zfs_znode_init();

	/*
	* Reduce number of vnodes. Originally number of vnodes is calculated
	* with UFS inode in mind. We reduce it here, because it's too big for
	* ZFS/i386.
	*/
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
	}

	/*
	 * ZPL teardown: finalize the .zfs control directory and znode subsystems
	 * and undo the vnode limit adjustment made by zfs_vnodes_adjust().
	 */
	void
	zfs_fini(void)
	{
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
	}

	/*
	 * Report whether any ZFS filesystem is currently active.
	 * Returns 1 when zfs_active_fs_count is non-zero, 0 otherwise.
	 */
	int
	zfs_busy(void)
	{
		if (zfs_active_fs_count == 0)
			return (0);
		return (1);
	}

	/*
	 * Upgrade the ZPL version of the filesystem to 'newvers'.
	 *
	 * The new version is written to the master node in a single
	 * transaction; when crossing into ZPL_VERSION_SA on a filesystem not yet
	 * using SA, the SA master node object is created and registered as well.
	 *
	 * Returns 0 on success; EINVAL if 'newvers' is out of range or lower
	 * than the current version; ENOTSUP if the pool version is too old;
	 * otherwise the error from dmu_tx_assign() or zap_update().
	 */
	int
	zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
	{
		int error;
		objset_t *os = zfsvfs->z_os;
		dmu_tx_t *tx;

		if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
			return (SET_ERROR(EINVAL));

		/* Downgrades are not allowed. */
		if (newvers < zfsvfs->z_version)
			return (SET_ERROR(EINVAL));

		if (zfs_spa_version_map(newvers) >
		    spa_version(dmu_objset_spa(zfsvfs->z_os)))
			return (SET_ERROR(ENOTSUP));

		tx = dmu_tx_create(os);
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
		if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
			    ZFS_SA_ATTRS);
			dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
		}
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			return (error);
		}

		error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
		    8, 1, &newvers, tx);

		if (error) {
			/* The tx is already assigned, so it is committed, not aborted. */
			dmu_tx_commit(tx);
			return (error);
		}

		if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
			uint64_t sa_obj;

			ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
			    SPA_VERSION_SA);
			sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
			    DMU_OT_NONE, 0, tx);

			error = zap_add(os, MASTER_NODE_OBJ,
			    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
			ASSERT0(error);

			VERIFY(0 == sa_set_sa_object(os, sa_obj));
			sa_register_update_callback(os, zfs_sa_upgrade);
		}

		spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
		    "from %llu to %llu", zfsvfs->z_version, newvers);

		dmu_tx_commit(tx);

		zfsvfs->z_version = newvers;

		zfs_set_fuid_feature(zfsvfs);

		return (0);
	}

	/*
	 * Read a property stored within the master node.
	 *
	 * os     - objset to read from; may be NULL, in which case only the
	 *          built-in default is reported
	 * prop   - property to look up
	 * value  - receives the property value
	 *
	 * Returns 0 on success.  When the property is not stored (or 'os' is
	 * NULL) a default is supplied for version, normalize, utf8only and case;
	 * for any other property ENOENT is returned.  Other zap_lookup() errors
	 * are returned unchanged.
	 */
	int
	zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
	{
		const char *pname;
		int error = ENOENT;

		/*
		 * Look up the file system's value for the property.  For the
		 * version property, we look up a slightly different string.
		 */
		if (prop == ZFS_PROP_VERSION)
			pname = ZPL_VERSION_STR;
		else
			pname = zfs_prop_to_name(prop);

		if (os != NULL) {
			ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
			error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
		}

		if (error == ENOENT) {
			/* No value set, use the default value */
			switch (prop) {
			case ZFS_PROP_VERSION:
				*value = ZPL_VERSION;
				break;
			case ZFS_PROP_NORMALIZE:
			case ZFS_PROP_UTF8ONLY:
				*value = 0;
				break;
			case ZFS_PROP_CASE:
				*value = ZFS_CASE_SENSITIVE;
				break;
			default:
				return (error);
			}
			error = 0;
		}
		return (error);
	}

	/*
	 * Tell whether an unmount has been initiated on the VFS that owns 'os'.
	 *
	 * Returns B_TRUE when the objset has a zfsvfs user pointer whose mount
	 * has MNTK_UNMOUNT set in mnt_kern_flag; B_FALSE in every other case.
	 * The user pointer is examined under os_user_ptr_lock.
	 */
	boolean_t
	zfs_get_vfs_flag_unmounted(objset_t *os)
	{
		zfsvfs_t *zv;
		boolean_t result;

		ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);

		mutex_enter(&os->os_user_ptr_lock);
		zv = dmu_objset_get_user(os);
		if (zv == NULL || zv->z_vfs == NULL) {
			result = B_FALSE;
		} else if (zv->z_vfs->mnt_kern_flag & MNTK_UNMOUNT) {
			result = B_TRUE;
		} else {
			result = B_FALSE;
		}
		mutex_exit(&os->os_user_ptr_lock);

		return (result);
	}

	#ifdef _KERNEL
	/*
	 * Rewrite f_mntfromname of every mounted filesystem after a dataset
	 * rename from 'oldname' to 'newname'.
	 *
	 * A mount whose source equals 'oldname' is renamed outright; a mount
	 * whose source starts with 'oldname' followed by '/' or '@' has that
	 * prefix replaced.  The mount list is walked under mountlist_mtx.
	 */
	void
	zfsvfs_update_fromname(const char *oldname, const char *newname)
	{
		char tmpbuf[MAXPATHLEN];
		struct mount *mp;
		char *fromname;
		size_t oldlen;

		oldlen = strlen(oldname);

		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			fromname = mp->mnt_stat.f_mntfromname;
			if (strcmp(fromname, oldname) == 0) {
				(void)strlcpy(fromname, newname,
				    sizeof(mp->mnt_stat.f_mntfromname));
				continue;
			}
			if (strncmp(fromname, oldname, oldlen) == 0 &&
			    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
				/* Build the new name aside: source and target overlap. */
				(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
				    newname, fromname + oldlen);
				(void)strlcpy(fromname, tmpbuf,
				    sizeof(mp->mnt_stat.f_mntfromname));
				continue;
			}
		}
		mtx_unlock(&mountlist_mtx);
	}
	#endif
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 332525)
	@@ -1,6060 +1,6072 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Nexenta Systems, Inc.
	*/

	/* Portions Copyright 2007 Jeremy Teo */
	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/types.h>
	#include <sys/param.h>
	#include <sys/time.h>
	#include <sys/systm.h>
	#include <sys/sysmacros.h>
	#include <sys/resource.h>
	#include <sys/vfs.h>
	#include <sys/vm.h>
	#include <sys/vnode.h>
	#include <sys/file.h>
	#include <sys/stat.h>
	#include <sys/kmem.h>
	#include <sys/taskq.h>
	#include <sys/uio.h>
	#include <sys/atomic.h>
	#include <sys/namei.h>
	#include <sys/mman.h>
	#include <sys/cmn_err.h>
	#include <sys/errno.h>
	#include <sys/unistd.h>
	#include <sys/zfs_dir.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/fs/zfs.h>
	#include <sys/dmu.h>
	#include <sys/dmu_objset.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/dbuf.h>
	#include <sys/zap.h>
	#include <sys/sa.h>
	#include <sys/dirent.h>
	#include <sys/policy.h>
	#include <sys/sunddi.h>
	#include <sys/filio.h>
	#include <sys/sid.h>
	#include <sys/zfs_ctldir.h>
	#include <sys/zfs_fuid.h>
	#include <sys/zfs_sa.h>
	#include <sys/zfs_rlock.h>
	#include <sys/extdirent.h>
	#include <sys/kidmap.h>
	#include <sys/bio.h>
	#include <sys/buf.h>
	#include <sys/sched.h>
	#include <sys/acl.h>
	#include <sys/vmmeter.h>
	#include <vm/vm_param.h>
	#include <sys/zil.h>

	/*
	* Programming rules.
	*
	* Each vnode op performs some logical unit of work. To do this, the ZPL must
	* properly lock its in-core state, create a DMU transaction, do the work,
	* record this work in the intent log (ZIL), commit the DMU transaction,
	* and wait for the intent log to commit if it is a synchronous operation.
	* Moreover, the vnode ops must work in both normal and log replay context.
	* The ordering of events is important to avoid deadlocks and references
	* to freed memory. The example below illustrates the following Big Rules:
	*
	* (1) A check must be made in each zfs thread for a mounted file system.
	* This is done avoiding races using ZFS_ENTER(zfsvfs).
	* A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
	* must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
	* can return EIO from the calling function.
	*
	* (2) VN_RELE() should always be the last thing except for zil_commit()
	* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
	* First, if it's the last reference, the vnode/znode
	* can be freed, so the zp may point to freed memory. Second, the last
	* reference will call zfs_zinactive(), which may induce a lot of work --
	* pushing cached pages (which acquires range locks) and syncing out
	* cached atime changes. Third, zfs_zinactive() may require a new tx,
	* which could deadlock the system if you were already holding one.
	* If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
	*
	* (3) All range locks must be grabbed before calling dmu_tx_assign(),
	* as they can span dmu_tx_assign() calls.
	*
	* (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
	* dmu_tx_assign(). This is critical because we don't want to block
	* while holding locks.
	*
	* If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
	* reduces lock contention and CPU usage when we must wait (note that if
	* throughput is constrained by the storage, nearly every transaction
	* must wait).
	*
	* Note, in particular, that if a lock is sometimes acquired before
	* the tx assigns, and sometimes after (e.g. z_lock), then failing
	* to use a non-blocking assign can deadlock the system. The scenario:
	*
	* Thread A has grabbed a lock before calling dmu_tx_assign().
	* Thread B is in an already-assigned tx, and blocks for this lock.
	* Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
	* forever, because the previous txg can't quiesce until B's tx commits.
	*
	* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
	* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
	* calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
	* to indicate that this operation has already called dmu_tx_wait().
	* This will ensure that we don't retry forever, waiting a short bit
	* each time.
	*
	* (5) If the operation succeeded, generate the intent log entry for it
	* before dropping locks. This ensures that the ordering of events
	* in the intent log matches the order in which they actually occurred.
	* During ZIL replay the zfs_log_* functions will update the sequence
	* number to indicate the zil transaction has replayed.
	*
	* (6) At the end of each vnode op, the DMU tx must always commit,
	* regardless of whether there were any errors.
	*
	* (7) After dropping all locks, invoke zil_commit(zilog, foid)
	* to ensure that synchronous semantics are provided when necessary.
	*
	* In general, this is how things should be ordered in each vnode op:
	*
	* ZFS_ENTER(zfsvfs); // exit if unmounted
	* top:
	* zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
	* rw_enter(...); // grab any other locks you need
	* tx = dmu_tx_create(...); // get DMU tx
	* dmu_tx_hold_*(); // hold each object you might modify
	* error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	* if (error) {
	* rw_exit(...); // drop locks
	* zfs_dirent_unlock(dl); // unlock directory entry
	* VN_RELE(...); // release held vnodes
	* if (error == ERESTART) {
	* waited = B_TRUE;
	* dmu_tx_wait(tx);
	* dmu_tx_abort(tx);
	* goto top;
	* }
	* dmu_tx_abort(tx); // abort DMU tx
	* ZFS_EXIT(zfsvfs); // finished in zfs
	* return (error); // really out of space
	* }
	* error = do_real_work(); // do whatever this VOP does
	* if (error == 0)
	* zfs_log_*(...); // on success, make ZIL entry
	* dmu_tx_commit(tx); // commit DMU tx -- error or not
	* rw_exit(...); // drop locks
	* zfs_dirent_unlock(dl); // unlock directory entry
	* VN_RELE(...); // release held vnodes
	* zil_commit(zilog, foid); // synchronous when necessary
	* ZFS_EXIT(zfsvfs); // finished in zfs
	* return (error); // done, report error
	*/

	/*
	 * Open entry point.
	 *
	 * Rejects a write open without FAPPEND on an append-only file (EPERM),
	 * runs the virus scan hook on eligible regular files (EACCES when the
	 * scan fails), and counts synchronous opens in the znode.
	 *
	 * Returns 0 on success.
	 */
	/* ARGSUSED */
	static int
	zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
	{
		znode_t	*zp = VTOZ(*vpp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
		    ((flag & FAPPEND) == 0)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		/* Scan only non-empty, non-quarantined regular files. */
		if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
		    ZTOV(zp)->v_type == VREG &&
		    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
			if (fs_vscan(*vpp, cr, 0) != 0) {
				ZFS_EXIT(zfsvfs);
				return (SET_ERROR(EACCES));
			}
		}

		/* Keep a count of the synchronous opens in the znode */
		if (flag & (FSYNC | FDSYNC))
			atomic_inc_32(&zp->z_sync_cnt);

		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Close entry point.
	 *
	 * Cleans up locks and shares held by the calling process on 'vp', drops
	 * the synchronous-open count on the last close (count == 1) of a
	 * synchronous open, and runs the virus scan hook on eligible regular
	 * files.
	 *
	 * Returns 0.
	 */
	/* ARGSUSED */
	static int
	zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
	    caller_context_t *ct)
	{
		znode_t	*zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		/*
		 * Clean up any locks held by this process on the vp.
		 */
		cleanlocks(vp, ddi_get_pid(), 0);
		cleanshares(vp, ddi_get_pid());

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* Decrement the synchronous opens in the znode */
		if ((flag & (FSYNC | FDSYNC)) && (count == 1))
			atomic_dec_32(&zp->z_sync_cnt);

		/* Scan only non-empty, non-quarantined regular files. */
		if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
		    ZTOV(zp)->v_type == VREG &&
		    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
			VERIFY(fs_vscan(vp, cr, 1) == 0);

		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
	 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
	 *
	 * Returns ENXIO if the starting offset is at or beyond end-of-file or
	 * if dmu_offset_next() reports ESRCH; otherwise the result of
	 * dmu_offset_next().  '*off' is updated only when the offset found is
	 * not below the starting offset.
	 */
	static int
	zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
	{
		znode_t	*zp = VTOZ(vp);
		uint64_t noff = (uint64_t)*off; /* new offset */
		uint64_t file_sz;
		int error;
		boolean_t hole;

		file_sz = zp->z_size;
		if (noff >= file_sz) {
			return (SET_ERROR(ENXIO));
		}

		if (cmd == _FIO_SEEK_HOLE)
			hole = B_TRUE;
		else
			hole = B_FALSE;

		error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

		if (error == ESRCH)
			return (SET_ERROR(ENXIO));

		/*
		 * We could find a hole that begins after the logical end-of-file,
		 * because dmu_offset_next() only works on whole blocks.  If the
		 * EOF falls mid-block, then indicate that the "virtual hole"
		 * at the end of the file begins at the logical EOF, rather than
		 * at the end of the last block.
		 */
		if (noff > file_sz) {
			ASSERT(hole);
			noff = file_sz;
		}

		/* Never move the caller's offset backwards. */
		if (noff < *off)
			return (error);
		*off = noff;
		return (error);
	}

	/*
	 * Ioctl entry point.
	 *
	 * _FIOFFS, _FIOGDIO and _FIOSDIO are accepted and ignored.
	 * _FIO_SEEK_DATA / _FIO_SEEK_HOLE read an offset through 'data', pass
	 * it to zfs_holey() and write the resulting offset back.
	 * _FIO_COUNT_FILLED (illumos only) returns the object's fill count.
	 *
	 * Returns 0 on success, ENOTTY for an unsupported command, or the error
	 * from the underlying operation.
	 */
	/* ARGSUSED */
	static int
	zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
	    int *rvalp, caller_context_t *ct)
	{
		offset_t off;
		offset_t ndata;
		dmu_object_info_t doi;
		int error;
		zfsvfs_t *zfsvfs;
		znode_t *zp;

		switch (com) {
		case _FIOFFS:
		{
			return (0);

			/*
			 * The following two ioctls are used by bfu.  Faking out,
			 * necessary to avoid bfu errors.
			 */
		}
		case _FIOGDIO:
		case _FIOSDIO:
		{
			return (0);
		}

		case _FIO_SEEK_DATA:
		case _FIO_SEEK_HOLE:
		{
	#ifdef illumos
			if (ddi_copyin((void *)data, &off, sizeof (off), flag))
				return (SET_ERROR(EFAULT));
	#else
			/* 'data' is used directly as a pointer to the offset. */
			off = *(offset_t *)data;
	#endif
			zp = VTOZ(vp);
			zfsvfs = zp->z_zfsvfs;
			ZFS_ENTER(zfsvfs);
			ZFS_VERIFY_ZP(zp);

			/* offset parameter is in/out */
			error = zfs_holey(vp, com, &off);
			ZFS_EXIT(zfsvfs);
			if (error)
				return (error);
	#ifdef illumos
			if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
				return (SET_ERROR(EFAULT));
	#else
			*(offset_t *)data = off;
	#endif
			return (0);
		}
	#ifdef illumos
		case _FIO_COUNT_FILLED:
		{
			/*
			 * _FIO_COUNT_FILLED adds a new ioctl command which
			 * exposes the number of filled blocks in a
			 * ZFS object.
			 */
			zp = VTOZ(vp);
			zfsvfs = zp->z_zfsvfs;
			ZFS_ENTER(zfsvfs);
			ZFS_VERIFY_ZP(zp);

			/*
			 * Wait for all dirty blocks for this object
			 * to get synced out to disk, and the DMU info
			 * updated.
			 */
			error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
			if (error) {
				ZFS_EXIT(zfsvfs);
				return (error);
			}

			/*
			 * Retrieve fill count from DMU object.
			 */
			error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
			if (error) {
				ZFS_EXIT(zfsvfs);
				return (error);
			}

			ndata = doi.doi_fill_count;

			ZFS_EXIT(zfsvfs);
			if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
				return (SET_ERROR(EFAULT));
			return (0);
		}
	#endif
		}
		return (SET_ERROR(ENOTTY));
	}

	/*
	 * Look up the page of vp's VM object that contains byte offset 'start'
	 * and, if it exists and is valid, shared-busy it, add one
	 * paging-in-progress reference to the object, remove write mappings, and
	 * clear the dirty bits for the DEV_BSIZE-aligned part of the range
	 * [off, off + nbytes) within the page.
	 *
	 * The object must be write-locked by the caller (asserted below).  The
	 * lock is dropped and re-taken while sleeping on an exclusively busied
	 * page.
	 *
	 * Returns the busied page, or NULL if there is no valid page at that
	 * index.
	 */
	static vm_page_t
	page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
	{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	* At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	* aligned boundaries, if the range is not aligned. As a result a
	* DEV_BSIZE subrange with partially dirty data may get marked as clean.
	* It may happen that all DEV_BSIZE subranges are marked clean and thus
	* the whole page would be considered clean despite having some dirty data.
	* For this reason we should shrink the range to DEV_BSIZE aligned
	* boundaries before calling vm_page_clear_dirty.
	*/
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	/* Retry the lookup each time we had to sleep on a busy page. */
	for (;;) {
	if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
	pp->valid) {
	if (vm_page_xbusied(pp)) {
	/*
	* Reference the page before unlocking and
	* sleeping so that the page daemon is less
	* likely to reclaim it.
	*/
	vm_page_reference(pp);
	vm_page_lock(pp);
	zfs_vmobject_wunlock(obj);
	vm_page_busy_sleep(pp, "zfsmwb", true);
	zfs_vmobject_wlock(obj);
	continue;
	}
	vm_page_sbusy(pp);
	} else if (pp != NULL) {
	/* Page exists but holds no valid data: treat it as absent. */
	ASSERT(!pp->valid);
	pp = NULL;
	}

	if (pp != NULL) {
	ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
	vm_object_pip_add(obj, 1);
	pmap_remove_write(pp);
	/* nbytes may be zero after shrinking to DEV_BSIZE alignment. */
	if (nbytes != 0)
	vm_page_clear_dirty(pp, off, nbytes);
	}
	break;
	}
	return (pp);
	}

	/*
	* Undo page_busy(): drop the shared-busy state on pp and remove the
	* count that page_busy() added with vm_object_pip_add().
	*/
	static void
	page_unbusy(vm_page_t pp)
	{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
	}

	/*
	* Look up the page of vp's VM object that covers file offset "start"
	* and, if it is resident and valid, place a hold on it.
	*
	* The caller must hold the object write lock (asserted below); the lock
	* is dropped and re-taken while sleeping on an exclusively busied page.
	*
	* Returns the held page, or NULL when no valid page is cached.
	* mappedread() releases a non-NULL result with page_unhold().
	*/
	static vm_page_t
	page_hold(vnode_t *vp, int64_t start)
	{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
	if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
	pp->valid) {
	if (vm_page_xbusied(pp)) {
	/*
	* Reference the page before unlocking and
	* sleeping so that the page daemon is less
	* likely to reclaim it.
	*/
	vm_page_reference(pp);
	vm_page_lock(pp);
	zfs_vmobject_wunlock(obj);
	vm_page_busy_sleep(pp, "zfsmwb", true);
	zfs_vmobject_wlock(obj);
	/* The lock was dropped, so redo the lookup. */
	continue;
	}

	ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
	/* The hold is taken under the page lock. */
	vm_page_lock(pp);
	vm_page_hold(pp);
	vm_page_unlock(pp);

	} else
	pp = NULL;
	break;
	}
	return (pp);
	}

	/*
	* Undo page_hold(): drop the hold on pp while holding the page lock.
	*/
	static void
	page_unhold(vm_page_t pp)
	{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
	}

	/*
	* When a file is memory mapped, we must keep the IO data synchronized
	* between the DMU cache and the memory mapped pages.
	*
	* On Write: for every valid cached page overlapping the written range
	* [start, start + len), read the affected bytes back from the DMU into
	* the page so that both copies agree. Pages that are not resident or
	* not valid are skipped (page_busy() returns NULL for them).
	*
	* vp - vnode whose VM object is updated.
	* start - file offset of the first byte of the range.
	* len - length of the range in bytes.
	* os, oid - objset and object the data is read back from.
	* segflg - uio segment flag of the write; must not be UIO_NOCOPY.
	* tx - transaction of the write; not referenced in this function.
	*/
	static void
	update_pages(vnode_t vp, int64_t start, int len, objset_t os, uint64_t oid,
	int segflg, dmu_tx_t *tx)
	{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	/* Only the first page can start at a non-zero in-page offset. */
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
	vm_page_t pp;
	int nbytes = imin(PAGESIZE - off, len);

	if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
	/* Drop the object lock across the mapping and DMU read. */
	zfs_vmobject_wunlock(obj);

	va = zfs_map_page(pp, &sf);
	/* The return value of the read-back is explicitly discarded. */
	(void) dmu_read(os, oid, start+off, nbytes,
	va+off, DMU_READ_PREFETCH);
	zfs_unmap_page(sf);

	zfs_vmobject_wlock(obj);
	page_unbusy(pp);
	}
	len -= nbytes;
	off = 0;
	}
	vm_object_pip_wakeupn(obj, 0);
	zfs_vmobject_wunlock(obj);
	}

	/*
	* Read with UIO_NOCOPY flag means that sendfile(2) requests
	* ZFS to populate a range of page cache pages with data.
	*
	* Each page in the range is grabbed shared-busy. A page with no valid
	* bits is filled from the DMU (the tail of a short final page is
	* zeroed) and then marked fully valid; a page that already has valid
	* bits is asserted to be fully valid and left untouched. The uio
	* residual count and offset advance by the bytes covered per page.
	*
	* vp - vnode being read.
	* nbytes - number of bytes to populate.
	* uio - must have uio_segflg == UIO_NOCOPY and a page-aligned
	* uio_loffset (both asserted).
	*
	* RETURN: 0 on success, or the first dmu_read() error, after which the
	* loop stops.
	*
	* NOTE: this function could be optimized to pre-allocate
	* all pages in advance, drain exclusive busy on all of them,
	* map them into contiguous KVA region and populate them
	* in one single dmu_read() call.
	*/
	static int
	mappedread_sf(vnode_t vp, int nbytes, uio_t uio)
	{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
	int bytes = MIN(PAGESIZE, len);

	pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY \|
	VM_ALLOC_NORMAL \| VM_ALLOC_IGN_SBUSY);
	if (pp->valid == 0) {
	/* Drop the object lock across the mapping and DMU read. */
	zfs_vmobject_wunlock(obj);
	va = zfs_map_page(pp, &sf);
	error = dmu_read(os, zp->z_id, start, bytes, va,
	DMU_READ_PREFETCH);
	/* Zero the rest of a partially filled last page. */
	if (bytes != PAGESIZE && error == 0)
	bzero(va + bytes, PAGESIZE - bytes);
	zfs_unmap_page(sf);
	zfs_vmobject_wlock(obj);
	vm_page_sunbusy(pp);
	vm_page_lock(pp);
	if (error) {
	/* Free the page only if it is unwired, invalid and idle. */
	if (pp->wire_count == 0 && pp->valid == 0 &&
	!vm_page_busied(pp))
	vm_page_free(pp);
	} else {
	pp->valid = VM_PAGE_BITS_ALL;
	vm_page_activate(pp);
	}
	vm_page_unlock(pp);
	} else {
	ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
	vm_page_sunbusy(pp);
	}
	if (error)
	break;
	uio->uio_resid -= bytes;
	uio->uio_offset += bytes;
	len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
	}

	/*
	* When a file is memory mapped, we must keep the IO data synchronized
	* between the DMU cache and the memory mapped pages. What this means:
	*
	* On Read: We "read" preferentially from memory mapped pages,
	* else we default from the dmu buffer.
	*
	* vp - vnode being read.
	* nbytes - number of bytes to transfer.
	* uio - destination; the transfer starts at uio->uio_loffset.
	*
	* RETURN: 0 on success, or the first error from the copy-out or the
	* DMU read, after which the loop stops.
	*
	* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
	* the file is memory mapped.
	*/
	static int
	mappedread(vnode_t vp, int nbytes, uio_t uio)
	{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	/* Only the first page can start at a non-zero in-page offset. */
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
	vm_page_t pp;
	uint64_t bytes = MIN(PAGESIZE - off, len);

	if ((pp = page_hold(vp, start)) != NULL) {
	struct sf_buf *sf;
	caddr_t va;

	/* A valid page is cached: copy out of the page itself. */
	zfs_vmobject_wunlock(obj);
	va = zfs_map_page(pp, &sf);
	#ifdef illumos
	error = uiomove(va + off, bytes, UIO_READ, uio);
	#else
	error = vn_io_fault_uiomove(va + off, bytes, uio);
	#endif
	zfs_unmap_page(sf);
	zfs_vmobject_wlock(obj);
	page_unhold(pp);
	} else {
	/* No valid page: read this piece through the DMU. */
	zfs_vmobject_wunlock(obj);
	error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
	uio, bytes);
	zfs_vmobject_wlock(obj);
	}
	len -= bytes;
	off = 0;
	if (error)
	break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
	}

	/*
	* Upper bound, in bytes, on what zfs_read() transfers per loop iteration.
	* zfs_read() also subtracts P2PHASE(offset, zfs_read_chunk_size) from it,
	* so presumably no single transfer crosses a multiple of this size.
	*/
	offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

	/*
	* Read bytes from specified file into supplied buffer.
	*
	* IN: vp - vnode of file to be read from.
	* uio - structure supplying read location, range info,
	* and return buffer.
	* ioflag - SYNC flags; used to provide FRSYNC semantics.
	* cr - credentials of caller.
	* ct - caller context
	*
	* OUT: uio - updated offset and range, buffer filled.
	*
	* RETURN: 0 on success, error code on failure.
	* EACCES if the file is flagged ZFS_AV_QUARANTINED, EINVAL for a
	* negative offset, EIO in place of ECKSUM from the read path.
	* A zero-length read and a read starting at or past end-of-file
	* both return 0 without transferring data.
	*
	* Side Effects:
	* vp - atime updated if byte count > 0
	*/
	/* ARGSUSED */
	static int
	zfs_read(vnode_t vp, uio_t uio, int ioflag, cred_t cr, caller_context_t ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	ssize_t n, nbytes;
	int error = 0;
	rl_t *rl;
	xuio_t *xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EACCES));
	}

	/*
	* Validate file offset
	*/
	if (uio->uio_loffset < (offset_t)0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	/*
	* Fasttrack empty reads
	*/
	if (uio->uio_resid == 0) {
	ZFS_EXIT(zfsvfs);
	return (0);
	}

	/*
	* Check for mandatory locks
	*/
	if (MANDMODE(zp->z_mode)) {
	if (error = chklock(vp, FREAD,
	uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	}

	/*
	* If we're in FRSYNC mode, sync out this znode before reading it.
	*/
	if (zfsvfs->z_log &&
	(ioflag & FRSYNC \|\| zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
	zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	* Lock the range against changes.
	*/
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	* If we are reading past end-of-file we can skip
	* to the end; but we might still need to set atime.
	*/
	if (uio->uio_loffset >= zp->z_size) {
	error = 0;
	goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Never read beyond the current end of file. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
	int nblk;
	int blksz = zp->z_blksz;
	uint64_t offset = uio->uio_loffset;

	xuio = (xuio_t *)uio;
	if ((ISP2(blksz))) {
	nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
	blksz)) / blksz;
	} else {
	ASSERT(offset + n <= blksz);
	nblk = 1;
	}
	(void) dmu_xuio_init(xuio, nblk);

	if (vn_has_cached_data(vp)) {
	/*
	* For simplicity, we always allocate a full buffer
	* even if we only expect to read a portion of a block.
	*/
	while (--nblk >= 0) {
	(void) dmu_xuio_add(xuio,
	dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
	blksz), 0, blksz);
	}
	}
	}
	#endif /* illumos */

	/*
	* Transfer the data in pieces of at most zfs_read_chunk_size bytes.
	* Three paths: sendfile-style page population (UIO_NOCOPY, FreeBSD
	* only), cached pages via mappedread(), or a plain DMU read.
	*/
	while (n > 0) {
	nbytes = MIN(n, zfs_read_chunk_size -
	P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

	#ifdef __FreeBSD__
	if (uio->uio_segflg == UIO_NOCOPY)
	error = mappedread_sf(vp, nbytes, uio);
	else
	#endif /* __FreeBSD__ */
	if (vn_has_cached_data(vp)) {
	error = mappedread(vp, nbytes, uio);
	} else {
	error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
	uio, nbytes);
	}
	if (error) {
	/* convert checksum errors into IO errors */
	if (error == ECKSUM)
	error = SET_ERROR(EIO);
	break;
	}

	n -= nbytes;
	}
	out:
	/* Common exit once the range lock is held. */
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Write the bytes to a file.
	*
	* IN: vp - vnode of file to be written to.
	* uio - structure supplying write location, range info,
	* and data buffer.
	* ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
	* set if in append mode.
	* cr - credentials of caller.
	* ct - caller context (NFS/CIFS fem monitor only)
	*
	* OUT: uio - updated offset and range.
	*
	* RETURN: 0 on success, error code on failure.
	* A partial write (some bytes consumed before an error) returns 0,
	* except during replay and, on FreeBSD, for EFAULT.
	*
	* Timestamps:
	* vp - ctime\|mtime updated if byte count > 0
	*/

	/* ARGSUSED */
	static int
	zfs_write(vnode_t vp, uio_t uio, int ioflag, cred_t cr, caller_context_t ct)
	{
	znode_t *zp = VTOZ(vp);
	rlim64_t limit = MAXOFFSET_T;
	ssize_t start_resid = uio->uio_resid;
	ssize_t tx_bytes;
	uint64_t end_size;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	offset_t woff;
	ssize_t n, nbytes;
	rl_t *rl;
	int max_blksz = zfsvfs->z_max_blksz;
	int error = 0;
	arc_buf_t *abuf;
	iovec_t *aiov = NULL;
	xuio_t *xuio = NULL;
	int i_iov = 0;
	int iovcnt = uio->uio_iovcnt;
	iovec_t *iovp = uio->uio_iov;
	int write_eof;
	int count = 0;
	sa_bulk_attr_t bulk[4];
	uint64_t mtime[2], ctime[2];

	/*
	* Fasttrack empty write
	*/
	n = start_resid;
	if (n == 0)
	return (0);

	/*
	* NOTE(review): limit is initialized to MAXOFFSET_T above, so this
	* check leaves it at MAXOFFSET_T either way.
	*/
	if (limit == RLIM64_INFINITY \|\| limit > MAXOFFSET_T)
	limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Attributes written together by sa_bulk_update() after each chunk. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	&zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	&zp->z_pflags, 8);

	/*
	* In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	* callers might not be able to detect properly that we are read-only,
	* so check it explicitly here.
	*/
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EROFS));
	}

	/*
	* If immutable or not appending then return EPERM.
	* Intentionally allow ZFS_READONLY through here.
	* See zfs_zaccess_common()
	*/
	if ((zp->z_pflags & ZFS_IMMUTABLE) \|\|
	((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	(uio->uio_loffset < zp->z_size))) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	* Validate file offset
	*/
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	/*
	* Check for mandatory locks before calling zfs_range_lock()
	* in order to prevent a deadlock with locks set via fcntl().
	*/
	if (MANDMODE((mode_t)zp->z_mode) &&
	(error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	#ifdef illumos
	/*
	* Pre-fault the pages to ensure slow (eg NFS) pages
	* don't hold up txg.
	* Skip this if uio contains loaned arc_buf.
	*/
	if ((uio->uio_extflg == UIO_XUIO) &&
	(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
	xuio = (xuio_t *)uio;
	else
	uio_prefaultpages(MIN(n, max_blksz), uio);
	#endif

	/*
	* If in append mode, set the io offset pointer to eof.
	*/
	if (ioflag & FAPPEND) {
	/*
	* Obtain an appending range lock to guarantee file append
	* semantics. We reset the write offset once we have the lock.
	*/
	rl = zfs_range_lock(zp, 0, n, RL_APPEND);
	woff = rl->r_off;
	if (rl->r_len == UINT64_MAX) {
	/*
	* We overlocked the file because this write will cause
	* the file block size to increase.
	* Note that zp_size cannot change with this lock held.
	*/
	woff = zp->z_size;
	}
	uio->uio_loffset = woff;
	} else {
	/*
	* Note that if the file block size will change as a result of
	* this write, then this range lock will lock the entire file
	* so that we can re-write the block safely.
	*/
	rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/*
	* NOTE(review): unlike the neighbouring error returns, this EFBIG is
	* not wrapped in SET_ERROR() - confirm whether that is intentional.
	*/
	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
	zfs_range_unlock(rl);
	ZFS_EXIT(zfsvfs);
	return (EFBIG);
	}

	if (woff >= limit) {
	zfs_range_unlock(rl);
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EFBIG));
	}

	/* Clamp the write so that it ends at the offset limit. */
	if ((woff + n) > limit \|\| woff > (limit - n))
	n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	* Write the file in reasonable size chunks. Each chunk is written
	* in a separate transaction; this keeps the intent log records small
	* and allows us to do more fine-grained space accounting.
	*/
	while (n > 0) {
	abuf = NULL;
	woff = uio->uio_loffset;
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) \|\|
	zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
	/*
	* NOTE(review): abuf was set to NULL at the top of this
	* iteration, so this release cannot trigger here.
	*/
	if (abuf != NULL)
	dmu_return_arcbuf(abuf);
	error = SET_ERROR(EDQUOT);
	break;
	}

	if (xuio && abuf == NULL) {
	/* Zero-copy write: take the next loaned buffer from xuio. */
	ASSERT(i_iov < iovcnt);
	aiov = &iovp[i_iov];
	abuf = dmu_xuio_arcbuf(xuio, i_iov);
	dmu_xuio_clear(xuio, i_iov);
	DTRACE_PROBE3(zfs_cp_write, int, i_iov,
	iovec_t , aiov, arc_buf_t , abuf);
	ASSERT((aiov->iov_base == abuf->b_data) \|\|
	((char )aiov->iov_base - (char )abuf->b_data +
	aiov->iov_len == arc_buf_size(abuf)));
	i_iov++;
	} else if (abuf == NULL && n >= max_blksz &&
	woff >= zp->z_size &&
	P2PHASE(woff, max_blksz) == 0 &&
	zp->z_blksz == max_blksz) {
	/*
	* This write covers a full block. "Borrow" a buffer
	* from the dmu so that we can fill it before we enter
	* a transaction. This avoids the possibility of
	* holding up the transaction if the data copy hangs
	* up on a pagefault (e.g., from an NFS server mapping).
	*/
	size_t cbytes;

	abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
	max_blksz);
	ASSERT(abuf != NULL);
	ASSERT(arc_buf_size(abuf) == max_blksz);
	if (error = uiocopy(abuf->b_data, max_blksz,
	UIO_WRITE, uio, &cbytes)) {
	dmu_return_arcbuf(abuf);
	break;
	}
	ASSERT(cbytes == max_blksz);
	}

	/*
	* Start a transaction.
	*/
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	dmu_tx_abort(tx);
	if (abuf != NULL)
	dmu_return_arcbuf(abuf);
	break;
	}

	/*
	* If zfs_range_lock() over-locked we grow the blocksize
	* and then reduce the lock range. This will only happen
	* on the first iteration since zfs_range_reduce() will
	* shrink down r_len to the appropriate size.
	*/
	if (rl->r_len == UINT64_MAX) {
	uint64_t new_blksz;

	if (zp->z_blksz > max_blksz) {
	/*
	* File's blocksize is already larger than the
	* "recordsize" property. Only let it grow to
	* the next power of 2.
	*/
	ASSERT(!ISP2(zp->z_blksz));
	new_blksz = MIN(end_size,
	1 << highbit64(zp->z_blksz));
	} else {
	new_blksz = MIN(end_size, max_blksz);
	}
	zfs_grow_blocksize(zp, new_blksz, tx);
	zfs_range_reduce(rl, woff, n);
	}

	/*
	* XXX - should we really limit each write to z_max_blksz?
	* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
	*/
	nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

	if (woff + nbytes > zp->z_size)
	vnode_pager_setsize(vp, woff + nbytes);

	if (abuf == NULL) {
	/* tx_bytes becomes the number of bytes the uio advanced. */
	tx_bytes = uio->uio_resid;
	error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
	uio, nbytes, tx);
	tx_bytes -= uio->uio_resid;
	} else {
	tx_bytes = nbytes;
	ASSERT(xuio == NULL \|\| tx_bytes == aiov->iov_len);
	/*
	* If this is not a full block write, but we are
	* extending the file past EOF and this data starts
	* block-aligned, use assign_arcbuf(). Otherwise,
	* write via dmu_write().
	*/
	if (tx_bytes < max_blksz && (!write_eof \|\|
	aiov->iov_base != abuf->b_data)) {
	ASSERT(xuio);
	dmu_write(zfsvfs->z_os, zp->z_id, woff,
	aiov->iov_len, aiov->iov_base, tx);
	dmu_return_arcbuf(abuf);
	xuio_stat_wbuf_copied();
	} else {
	ASSERT(xuio \|\| tx_bytes == max_blksz);
	dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
	woff, abuf, tx);
	}
	ASSERT(tx_bytes <= uio->uio_resid);
	uioskip(uio, tx_bytes);
	}
	/* Keep any cached pages in step with what was just written. */
	if (tx_bytes && vn_has_cached_data(vp)) {
	update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
	zp->z_id, uio->uio_segflg, tx);
	}

	/*
	* If we made no progress, we're done. If we made even
	* partial progress, update the znode and ZIL accordingly.
	*/
	if (tx_bytes == 0) {
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	(void *)&zp->z_size, sizeof (uint64_t), tx);
	dmu_tx_commit(tx);
	ASSERT(error != 0);
	break;
	}

	/*
	* Clear Set-UID/Set-GID bits on successful write if not
	* privileged and at least one of the execute bits is set.
	*
	* It would be nice to do this after all writes have
	* been done, but that would still expose the ISUID/ISGID
	* to another app after the partial write is committed.
	*
	* Note: we don't call zfs_fuid_map_id() here because
	* user 0 is not an ephemeral uid.
	*/
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR \| (S_IXUSR >> 3) \|
	(S_IXUSR >> 6))) != 0 &&
	(zp->z_mode & (S_ISUID \| S_ISGID)) != 0 &&
	secpolicy_vnode_setid_retain(vp, cr,
	(zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
	uint64_t newmode;
	zp->z_mode &= ~(S_ISUID \| S_ISGID);
	newmode = zp->z_mode;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
	(void *)&newmode, sizeof (uint64_t), tx);
	}
	mutex_exit(&zp->z_acl_lock);

	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
	B_TRUE);

	/*
	* Update the file size (zp_size) if it has changed;
	* account for possible concurrent updates.
	*/
	while ((end_size = zp->z_size) < uio->uio_loffset) {
	(void) atomic_cas_64(&zp->z_size, end_size,
	uio->uio_loffset);
	#ifdef illumos
	ASSERT(error == 0);
	#else
	ASSERT(error == 0 \|\| error == EFAULT);
	#endif
	}
	/*
	* If we are replaying and eof is non zero then force
	* the file size to the specified eof. Note, there's no
	* concurrency during replay.
	*/
	if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
	zp->z_size = zfsvfs->z_replay_eof;

	/* Keep an earlier write error; only a clean chunk takes this result. */
	if (error == 0)
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	else
	(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

	zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
	dmu_tx_commit(tx);

	if (error != 0)
	break;
	ASSERT(tx_bytes == nbytes);
	n -= nbytes;

	#ifdef illumos
	if (!xuio && n > 0)
	uio_prefaultpages(MIN(n, max_blksz), uio);
	#endif
	}

	zfs_range_unlock(rl);

	/*
	* If we're in replay mode, or we made no progress, return error.
	* Otherwise, it's at least a partial write, so it's successful.
	*/
	if (zfsvfs->z_replay \|\| uio->uio_resid == start_resid) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	#ifdef __FreeBSD__
	/*
	* EFAULT means that at least one page of the source buffer was not
	* available. VFS will re-try remaining I/O upon this error.
	*/
	if (error == EFAULT) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	#endif

	if (ioflag & (FSYNC \| FDSYNC) \|\|
	zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
	}

	/*
	* Completion routine for zfs_get_data(): releases everything recorded in
	* the zgd (dbuf hold if any, range lock, vnode reference) and frees it.
	* zfs_get_data() calls it directly on its failure and immediate-write
	* paths and passes it to dmu_sync() as the callback otherwise.
	*
	* zgd - state allocated by zfs_get_data(); freed here.
	* error - result of the operation; when it is 0 and zgd_bp is set,
	* the block pointer is handed to zil_lwb_add_block().
	*/
	void
	zfs_get_done(zgd_t *zgd, int error)
	{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	/* zgd_db is only set on the indirect-write path. */
	if (zgd->zgd_db)
	dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	* Release the vnode asynchronously as we currently have the
	* txg stopped from syncing.
	*/
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
	zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
	}

	#ifdef DEBUG
	/*
	* Debug-only fault injection: when non-zero, the next indirect write in
	* zfs_get_data() fails with EIO and the flag is reset to 0.
	*/
	static int zil_fault_io = 0;
	#endif

	/*
	* Get data to generate a TX_WRITE intent log record.
	*
	* IN: arg - the zfsvfs_t of the file system.
	* lr - write log record; supplies object, offset and length.
	* buf - destination for an immediate write, or NULL to request
	* an indirect write through dmu_sync().
	* lwb - log write block recorded in the zgd (must not be NULL).
	* zio - parent zio passed to dmu_sync() (must not be NULL).
	*
	* RETURN: 0 on success, ENOENT if the file is gone, unlinked, or
	* truncated below the record's offset, other error codes on failure.
	* When dmu_sync() returns 0, cleanup is left to the zfs_get_done()
	* callback; on every other path zfs_get_done() is called here.
	*/
	int
	zfs_get_data(void arg, lr_write_t lr, char buf, struct lwb lwb, zio_t *zio)
	{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	* Nothing to do if the file has been removed
	*/
	if (zfs_zget(zfsvfs, object, &zp) != 0)
	return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
	/*
	* Release the vnode asynchronously as we currently have the
	* txg stopped from syncing.
	*/
	VN_RELE_ASYNC(ZTOV(zp),
	dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
	return (SET_ERROR(ENOENT));
	}

	/* From here on zfs_get_done() owns the cleanup of zgd and zp. */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	* Write records come in two flavors: immediate and indirect.
	* For small writes it's cheaper to store the data with the
	* log record (immediate); for large writes it's cheaper to
	* sync the data and get a pointer to it (indirect) so that
	* we don't have to write the data twice.
	*/
	if (buf != NULL) { /* immediate write */
	zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
	/* test for truncation needs to be done while range locked */
	if (offset >= zp->z_size) {
	error = SET_ERROR(ENOENT);
	} else {
	error = dmu_read(os, object, offset, size, buf,
	DMU_READ_NO_PREFETCH);
	}
	ASSERT(error == 0 \|\| error == ENOENT);
	} else { /* indirect write */
	/*
	* Have to lock the whole block to ensure when it's
	* written out and its checksum is being calculated
	* that no one can change the data. We need to re-check
	* blocksize after we get the lock in case it's changed!
	*/
	for (;;) {
	uint64_t blkoff;
	size = zp->z_blksz;
	/* Round offset down to the start of its block. */
	blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
	offset -= blkoff;
	zgd->zgd_rl = zfs_range_lock(zp, offset, size,
	RL_READER);
	if (zp->z_blksz == size)
	break;
	/* Block size changed: restore offset, unlock and retry. */
	offset += blkoff;
	zfs_range_unlock(zgd->zgd_rl);
	}
	/* test for truncation needs to be done while range locked */
	if (lr->lr_offset >= zp->z_size)
	error = SET_ERROR(ENOENT);
	#ifdef DEBUG
	if (zil_fault_io) {
	error = SET_ERROR(EIO);
	zil_fault_io = 0;
	}
	#endif
	if (error == 0)
	error = dmu_buf_hold(os, object, offset, zgd, &db,
	DMU_READ_NO_PREFETCH);

	if (error == 0) {
	blkptr_t *bp = &lr->lr_blkptr;

	zgd->zgd_db = db;
	zgd->zgd_bp = bp;

	ASSERT(db->db_offset == offset);
	ASSERT(db->db_size == size);

	error = dmu_sync(zio, lr->lr_common.lrc_txg,
	zfs_get_done, zgd);
	ASSERT(error \|\| lr->lr_length <= size);

	/*
	* On success, we need to wait for the write I/O
	* initiated by dmu_sync() to complete before we can
	* release this dbuf. We will finish everything up
	* in the zfs_get_done() callback.
	*/
	if (error == 0)
	return (0);

	if (error == EALREADY) {
	lr->lr_common.lrc_txtype = TX_WRITE2;
	+ /*
	+ * TX_WRITE2 relies on the data previously
	+ * written by the TX_WRITE that caused
	+ * EALREADY. We zero out the BP because
	+ * it is the old, currently-on-disk BP,
	+ * so there's no need to zio_flush() its
	+ * vdevs (flushing would needlessly hurt
	+ * performance, and doesn't work on
	+ * indirect vdevs).
	+ */
	+ zgd->zgd_bp = NULL;
	+ BP_ZERO(bp);
	error = 0;
	}
	}
	}

	/* Reached when dmu_sync() did not take over the zgd. */
	zfs_get_done(zgd, error);

	return (error);
	}

	/* ARGSUSED */
	/*
	* Check whether the caller may access vp with the requested mode.
	*
	* When flag carries V_ACE_MASK the mode is handed to zfs_zaccess();
	* otherwise it is handed to zfs_zaccess_rwx(). Returns the result of
	* whichever check ran.
	*/
	static int
	zfs_access(vnode_t vp, int mode, int flag, cred_t cr,
	caller_context_t *ct)
	{
	znode_t *znode = VTOZ(vp);
	zfsvfs_t *fs = znode->z_zfsvfs;
	int rc;

	ZFS_ENTER(fs);
	ZFS_VERIFY_ZP(znode);

	rc = (flag & V_ACE_MASK) ?
	zfs_zaccess(znode, mode, flag, B_FALSE, cr) :
	zfs_zaccess_rwx(znode, mode, flag, cr);

	ZFS_EXIT(fs);
	return (rc);
	}

	/*
	* Callback that zfs_lookup_lock() hands to vn_vget_ino_gen() for "..".
	* Stores arg in *vpp and locks it with lkflags; if locking fails the
	* vnode reference is dropped and the lock error is returned.
	*/
	static int
	zfs_dd_callback(struct mount mp, void arg, int lkflags, struct vnode **vpp)
	{
	int rc;

	*vpp = arg;
	rc = vn_lock(*vpp, lkflags);
	if (rc == 0)
	return (0);

	vrele(*vpp);
	return (rc);
	}

	/*
	* Lock vp, the vnode that a lookup of "name" in directory dvp produced,
	* as requested by lkflags. dvp must already be locked (asserted).
	*
	* Three cases, selected by name:
	* "" or "." - vp is dvp itself: take an extra reference and convert
	* the existing lock to the requested type if it differs.
	* ".." - delegate to vn_vget_ino_gen() with zfs_dd_callback().
	* other - plain vn_lock() on vp.
	*
	* RETURN: 0 on success, error code on failure. ENOENT is returned when
	* dvp turns out to be doomed after relocking for ".". On the failure
	* paths visible here the vnode reference is dropped with vrele().
	*/
	static int
	zfs_lookup_lock(vnode_t dvp, vnode_t vp, const char *name, int lkflags)
	{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
	#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
	VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
	#endif

	if (name[0] == 0 \|\| (name[0] == '.' && name[1] == 0)) {
	ASSERT3P(dvp, ==, vp);
	vref(dvp);
	ltype = lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(dvp)) {
	if (ltype == LK_EXCLUSIVE)
	vn_lock(dvp, LK_UPGRADE \| LK_RETRY);
	else /* if (ltype == LK_SHARED) */
	vn_lock(dvp, LK_DOWNGRADE \| LK_RETRY);

	/*
	* Relock for the "." case could leave us with
	* reclaimed vnode.
	*/
	if (dvp->v_iflag & VI_DOOMED) {
	vrele(dvp);
	return (SET_ERROR(ENOENT));
	}
	}
	return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
	/*
	* Note that in this case, dvp is the child vnode, and we
	* are looking up the parent vnode - exactly reverse from
	* normal operation. Unlocking dvp requires some rather
	* tricky unlock/relock dance to prevent mp from being freed;
	* use vn_vget_ino_gen() which takes care of all that.
	*
	* XXX Note that there is a time window when both vnodes are
	* unlocked. It is possible, although highly unlikely, that
	* during that window the parent-child relationship between
	* the vnodes may change, for example, get reversed.
	* In that case we would have a wrong lock order for the vnodes.
	* All other filesystems seem to ignore this problem, so we
	* do the same here.
	* A potential solution could be implemented as follows:
	* - using LK_NOWAIT when locking the second vnode and retrying
	* if necessary
	* - checking that the parent-child relationship still holds
	* after locking both vnodes and retrying if it doesn't
	*/
	error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
	return (error);
	} else {
	error = vn_lock(vp, lkflags);
	if (error != 0)
	vrele(vp);
	return (error);
	}
	}

	/*
	* Lookup an entry in a directory, or an extended attribute directory.
	* If it exists, return a held vnode reference for it.
	*
	* IN: dvp - vnode of directory to search.
	* nm - name of entry to lookup.
	* cnp - component name of the lookup; its cn_flags and
	* cn_lkflags are consulted and SAVENAME may be set.
	* nameiop - lookup operation (LOOKUP, CREATE, RENAME, DELETE).
	* cr - credentials of caller.
	* td - calling thread [not referenced in this function].
	* flags - LOOKUP_XATTR set if looking for an attribute.
	*
	* OUT: vpp - vnode of located entry, NULL if not found.
	*
	* RETURN: 0 on success, error code on failure.
	* For the last component of a CREATE or RENAME, ENOENT is turned
	* into EJUSTRETURN.
	*
	* Timestamps:
	* NA
	*/
	/* ARGSUSED */
	static int
	zfs_lookup(vnode_t dvp, char nm, vnode_t *vpp, struct componentname cnp,
	int nameiop, cred_t cr, kthread_t td, int flags)
	{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/*
	* Fast path lookup, however we must skip DNLC lookup
	* for case folding or normalizing lookups because the
	* DNLC code only stores the passed in name. This means
	* creating 'a' and removing 'A' on a case insensitive
	* file system would work, but DNLC still thinks 'a'
	* exists and won't let you create it again on the next
	* pass through fast path.
	*
	* NOTE(review): no DNLC lookup is visible below; only the directory
	* type and SA handle are checked here.
	*/
	if (!(flags & LOOKUP_XATTR)) {
	if (dvp->v_type != VDIR) {
	return (SET_ERROR(ENOTDIR));
	} else if (zdp->z_sa_hdl == NULL) {
	return (SET_ERROR(EIO));
	}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t , dvp, char , nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
	#ifdef TODO
	/*
	* If the xattr property is off, refuse the lookup request.
	*/
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}
	#endif

	/*
	* We don't allow recursive attributes..
	* Maybe someday we will.
	*/
	if (zdp->z_pflags & ZFS_XATTR) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Do we have permission to get into attribute directory?
	*/
	if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
	B_FALSE, cr)) {
	vrele(*vpp);
	*vpp = NULL;
	}

	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Check accessibility of directory.
	*/
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/* On UTF-8-only file systems reject names that fail validation. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EILSEQ));
	}


	/*
	* First handle the special cases.
	*/
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
	/*
	* If we are a snapshot mounted under .zfs, return
	* the vp for the snapshot directory.
	*/
	if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
	struct componentname cn;
	vnode_t *zfsctl_vp;
	int ltype;

	ZFS_EXIT(zfsvfs);
	/* dvp is unlocked for the lookup and relocked afterwards. */
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp, 0);
	error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
	&zfsctl_vp);
	if (error == 0) {
	cn.cn_nameptr = "snapshot";
	cn.cn_namelen = strlen(cn.cn_nameptr);
	cn.cn_nameiop = cnp->cn_nameiop;
	cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
	cn.cn_lkflags = cnp->cn_lkflags;
	error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
	vput(zfsctl_vp);
	}
	vn_lock(dvp, ltype \| LK_RETRY);
	return (error);
	}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
	ZFS_EXIT(zfsvfs);
	/* Only a plain LOOKUP may end on the control directory. */
	if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
	return (SET_ERROR(ENOTSUP));
	error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
	return (error);
	}

	/*
	* The loop retries the lookup if the parent-child relationship
	* changes during the dot-dot locking complexities.
	*/
	for (;;) {
	uint64_t parent;

	error = zfs_dirlook(zdp, nm, &zp);
	if (error == 0)
	*vpp = ZTOV(zp);

	ZFS_EXIT(zfsvfs);
	if (error != 0)
	break;

	error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
	if (error != 0) {
	/*
	* If we've got a locking error, then the vnode
	* got reclaimed because of a force unmount.
	* We never enter doomed vnodes into the name cache.
	*/
	*vpp = NULL;
	return (error);
	}

	if ((cnp->cn_flags & ISDOTDOT) == 0)
	break;

	/*
	* For "..", confirm that the vnode we locked is still the parent
	* recorded for dvp; otherwise drop it and look up again.
	*/
	ZFS_ENTER(zfsvfs);
	if (zdp->z_sa_hdl == NULL) {
	error = SET_ERROR(EIO);
	} else {
	error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	&parent, sizeof (parent));
	}
	if (error != 0) {
	ZFS_EXIT(zfsvfs);
	vput(ZTOV(zp));
	break;
	}
	if (zp->z_id == parent) {
	ZFS_EXIT(zfsvfs);
	break;
	}
	vput(ZTOV(zp));
	}

	/* NOTE(review): no "goto out" appears in this function. */
	out:
	if (error != 0)
	*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
	switch (nameiop) {
	case CREATE:
	case RENAME:
	if (error == ENOENT) {
	error = EJUSTRETURN;
	cnp->cn_flags \|= SAVENAME;
	break;
	}
	/* FALLTHROUGH */
	case DELETE:
	if (error == 0)
	cnp->cn_flags \|= SAVENAME;
	break;
	}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
	cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	error == 0 && (cnp->cn_flags & MAKEENTRY)) {
	/* The last component of a DELETE or RENAME is not cached. */
	if (!(cnp->cn_flags & ISLASTCN) \|\|
	(nameiop != DELETE && nameiop != RENAME)) {
	cache_enter(dvp, *vpp, cnp);
	}
	}

	return (error);
	}

	/*
	* Attempt to create a new entry in a directory and return the vnode of
	* the created file.
	*
	* NOTE(review): older descriptions of this routine also mention
	* truncating an existing file. No truncate path is present in this body:
	* the directory lookup is done with ZNEW and its result is asserted to be
	* NULL, so only creation of a new entry is handled here.
	*
	* IN: dvp - vnode of directory to put new file entry in.
	* name - name of new file entry.
	* vap - attributes of new file.
	* excl - flag indicating exclusive or non-exclusive mode
	* (not referenced in this body).
	* mode - mode to open file with (not referenced in this body).
	* cr - credentials of caller.
	* td - not referenced in this body.
	*
	* "flag" and "vsecp" are not parameters here; they are locals fixed at
	* 0 and NULL respectively.
	*
	* OUT: vpp - vnode of created entry.
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* dvp - ctime|mtime updated if new entry created
	* vp - ctime|mtime always, atime if new
	*
	* NOTE(review): the body dereferences dzp (dzp->z_zfsvfs), passes &zp as
	* a znode pointer slot, calls strlen(name) and stores through *vpp, so
	* the declarations below appear to have lost some '*' declarators --
	* confirm against the upstream file.
	*/

	/* ARGSUSED */
	static int
	zfs_create(vnode_t dvp, char name, vattr_t *vap, int excl, int mode,
	vnode_t *vpp, cred_t cr, kthread_t *td)
	{
	znode_t zp, dzp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	ksid_t *ksid;
	uid_t uid;
	gid_t gid = crgetgid(cr);
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	/* No ACL is supplied through this entry point; vsecp stays NULL. */
	void *vsecp = NULL;
	/* Never read below. */
	int flag = 0;
	uint64_t txtype;

	/*
	* If we have an ephemeral id, ACL, or XVATTR then
	* make sure file system is at proper version
	*/

	/* Prefer the id from the owner SID when the credential carries one. */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
	uid = ksid_getid(ksid);
	else
	uid = crgetuid(cr);

	/*
	* This check runs before ZFS_ENTER(), which is why it returns
	* without a matching ZFS_EXIT().
	*/
	if (zfsvfs->z_use_fuids == B_FALSE &&
	(vsecp \|\| (vap->va_mask & AT_XVATTR) \|\|
	IS_EPHEMERAL(uid) \|\| IS_EPHEMERAL(gid)))
	return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* When z_utf8 is set, a name that fails u8_validate() is rejected. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EILSEQ));
	}

	/* Optional (xvattr) attributes need their own policy check. */
	if (vap->va_mask & AT_XVATTR) {
	if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
	crgetuid(cr), cr, vap->va_type)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	}

	*vpp = NULL;

	/*
	* Drop the sticky bit from the requested mode when
	* secpolicy_vnode_stky_modify() returns non-zero for this caller.
	*/
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
	vap->va_mode &= ~S_ISVTX;

	/*
	* Look the name up with ZNEW. Any error ends the call here
	* (presumably this includes "already exists" -- confirm in
	* zfs_dirent_lookup()). On success no znode is expected back.
	*/
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	* Create a new file object and update the directory
	* to reference it.
	*/
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
	goto out;
	}

	/*
	* We only support the creation of regular files in
	* extended attribute directories.
	*/

	if ((dzp->z_pflags & ZFS_XATTR) &&
	(vap->va_type != VREG)) {
	error = SET_ERROR(EINVAL);
	goto out;
	}

	/*
	* From here on acl_ids is populated and must be released with
	* zfs_acl_ids_free() on every path.
	*/
	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	cr, vsecp, &acl_ids)) != 0)
	goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
	zfs_acl_ids_free(&acl_ids);
	error = SET_ERROR(EDQUOT);
	goto out;
	}

	/*
	* The vnode reservation taken here is dropped again on both the
	* dmu_tx_assign() failure path and the success path below.
	*/
	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	/*
	* Declare the holds for this transaction before assigning it:
	* SA creation for the new object, the FUID table (only if it is
	* dirty), the directory's ZAP entry for "name", the directory's SA,
	* and a write to a new object for the ACL when SA is not in use and
	* the ACL is larger than ZFS_ACE_SPACE.
	*/
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
	zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
	0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	/* Assignment failed: release everything acquired above. */
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_abort(tx);
	getnewvnode_drop_reserve();
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
	zfs_fuid_sync(zfsvfs, tx);

	/* The result of zfs_link_create() is deliberately discarded. */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	/*
	* Common exit: reached on success and from the "goto out" error
	* paths above. The vnode is handed back only when error is 0.
	*/
	out:
	if (error == 0) {
	*vpp = ZTOV(zp);
	}

	/* With sync=always the intent log is committed before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Remove an entry from a directory.
	*
	* IN: dvp - vnode of directory to remove entry from.
	* name - name of entry to remove.
	* cr - credentials of caller.
	* ct - caller context
	* flags - case flags
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* dvp - ctime\|mtime
	* vp - ctime (if nlink > 0)
	*/

	/ARGSUSED/
	static int
	zfs_remove(vnode_t dvp, vnode_t vp, char name, cred_t cr)
	{
	znode_t *dzp = VTOZ(dvp);
	znode_t *zp = VTOZ(vp);
	znode_t *xzp;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t acl_obj, xattr_obj;
	uint64_t obj = 0;
	dmu_tx_t *tx;
	boolean_t unlinked, toobig = FALSE;
	uint64_t txtype;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
	goto out;
	}

	/*
	* Need to use rmdir for removing directories.
	*/
	if (vp->v_type == VDIR) {
	error = SET_ERROR(EPERM);
	goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	&xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
	error = zfs_zget(zfsvfs, xattr_obj, &xzp);
	ASSERT0(error);
	}

	/*
	* We may delete the znode now, or we may put it in the unlinked set;
	* it depends on whether we're the last link, and on whether there are
	* other holds on the vnode. So we dmu_tx_hold() the right things to
	* allow for either case.
	*/
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	* Mark this transaction as typically resulting in a net free of space
	*/
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	dmu_tx_abort(tx);
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Remove the directory entry.
	*/
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
	dmu_tx_commit(tx);
	goto out;
	}

	if (unlinked) {
	zfs_unlinked_add(zp, tx);
	vp->v_vflag \|= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
	out:

	if (xzp)
	vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Create a new directory and insert it into dvp using the name
	* provided. Return a pointer to the inserted directory.
	*
	* IN: dvp - vnode of directory to add subdir to.
	* dirname - name of new directory.
	* vap - attributes of new directory; va_type must be VDIR
	* (asserted).
	* cr - credentials of caller.
	*
	* No ACL is passed in: zfs_acl_ids_create() and the log calls below
	* are given NULL for it.
	*
	* OUT: vpp - vnode of created directory.
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* dvp - ctime|mtime updated
	* vp - ctime|mtime|atime updated
	*
	* NOTE(review): the body dereferences vap (vap->va_type) and dzp, passes
	* &zp as a znode pointer slot, calls strlen(dirname) and stores through
	* *vpp, so the declarations below appear to have lost some '*'
	* declarators, and the "/ARGSUSED/" line looks like a damaged lint
	* comment -- confirm against the upstream file.
	*/
	/ARGSUSED/
	static int
	zfs_mkdir(vnode_t dvp, char dirname, vattr_t vap, vnode_t vpp, cred_t cr)
	{
	znode_t zp, dzp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t txtype;
	dmu_tx_t *tx;
	int error;
	ksid_t *ksid;
	uid_t uid;
	gid_t gid = crgetgid(cr);
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	* If we have an ephemeral id, ACL, or XVATTR then
	* make sure file system is at proper version
	*/

	/* Prefer the id from the owner SID when the credential carries one. */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
	uid = ksid_getid(ksid);
	else
	uid = crgetuid(cr);
	/*
	* This check runs before ZFS_ENTER(), which is why it returns
	* without a matching ZFS_EXIT().
	*/
	if (zfsvfs->z_use_fuids == B_FALSE &&
	((vap->va_mask & AT_XVATTR) \|\|
	IS_EPHEMERAL(uid) \|\| IS_EPHEMERAL(gid)))
	return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* A parent flagged ZFS_XATTR cannot get subdirectories. */
	if (dzp->z_pflags & ZFS_XATTR) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	/* When z_utf8 is set, a name that fails u8_validate() is rejected. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EILSEQ));
	}

	/* Optional (xvattr) attributes need their own policy check. */
	if (vap->va_mask & AT_XVATTR) {
	if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
	crgetuid(cr), cr, vap->va_type)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	}

	/*
	* From here on acl_ids is populated; every later exit path calls
	* zfs_acl_ids_free().
	*/
	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	NULL, &acl_ids)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* First make sure the new directory doesn't exist.
	*
	* Existence is checked first to make sure we don't return
	* EACCES instead of EEXIST which can cause some applications
	* to fail.
	*/
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	/* A successful ZNEW lookup is expected to hand back no znode. */
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EDQUOT));
	}

	/*
	* Add a new entry to the directory.
	*/
	/*
	* The vnode reservation taken here is dropped again on both the
	* dmu_tx_assign() failure path and the success path below.
	*/
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	/*
	* Declare the holds for this transaction before assigning it: the
	* parent's ZAP entry for "dirname", a ZAP for the new object, the
	* FUID table (only if it is dirty), a write to a new object for the
	* ACL when SA is not in use and the ACL is larger than ZFS_ACE_SPACE,
	* and SA creation for the new object.
	*/
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
	zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
	acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	/* Assignment failed: release everything acquired above. */
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_abort(tx);
	getnewvnode_drop_reserve();
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Create new node.
	*/
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
	zfs_fuid_sync(zfsvfs, tx);

	/*
	* Now put new name in parent dir.
	*/
	/* The result of zfs_link_create() is deliberately discarded. */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	/* With sync=always the intent log is committed before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
	}

	/*
	 * Remove a subdirectory entry from a directory.
	 *
	 *	IN:	dvp	- vnode of directory to remove from.
	 *		vp	- vnode of the directory being removed.
	 *		name	- name of directory to be removed.
	 *		cr	- credentials of caller.
	 *
	 *	RETURN:	0 on success, error code on failure
	 *		(ENOTDIR when vp is not a directory).
	 *
	 * Timestamps:
	 *	dvp - ctime|mtime updated
	 */
	/ARGSUSED/
	static int
	zfs_rmdir(vnode_t dvp, vnode_t vp, char name, cred_t cr)
	{
		znode_t *parent_zp = VTOZ(dvp);
		znode_t *child_zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = parent_zp->z_zfsvfs;
		zilog_t *zilog;
		dmu_tx_t *tx;
		int error;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(parent_zp);
		ZFS_VERIFY_ZP(child_zp);
		zilog = zfsvfs->z_log;

		/* The caller must be allowed to delete the child from the parent. */
		error = zfs_zaccess_delete(parent_zp, child_zp, cr);
		if (error != 0)
			goto out;

		/* Only directories may be removed through this entry point. */
		if (vp->v_type != VDIR) {
			error = SET_ERROR(ENOTDIR);
			goto out;
		}

		vnevent_rmdir(vp, dvp, name, ct);

		/*
		 * Declare the holds before assigning the transaction: the
		 * parent's ZAP entry, the child's SA, and the unlinked-set ZAP.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_zap(tx, parent_zp->z_id, FALSE, name);
		dmu_tx_hold_sa(tx, child_zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		zfs_sa_upgrade_txholds(tx, child_zp);
		zfs_sa_upgrade_txholds(tx, parent_zp);
		dmu_tx_mark_netfree(tx);

		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/* Purge the parent's name cache before the entry is destroyed. */
		cache_purge(dvp);

		error = zfs_link_destroy(parent_zp, name, child_zp, tx, ZEXISTS,
		    NULL);
		if (error == 0) {
			zfs_log_remove(zilog, tx, TX_RMDIR, parent_zp, name,
			    ZFS_NO_OBJECT);
		}

		/* The transaction is committed whether or not the unlink worked. */
		dmu_tx_commit(tx);

		cache_purge(vp);
	out:
		/* With sync=always the intent log is committed before returning. */
		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			zil_commit(zilog, 0);

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Read as many directory entries as will fit into the provided
	 * buffer from the given directory cursor position (specified in
	 * the uio structure).
	 *
	 *	IN:	vp	- vnode of directory to read.
	 *		uio	- structure supplying read location, range info,
	 *			  and return buffer.
	 *		cr	- credentials of caller.
	 *
	 *	OUT:	uio	- updated offset and range, buffer filled.
	 *		eofp	- set to true if end-of-file detected; may be
	 *			  NULL, in which case a local is used.
	 *		ncookies - when non-NULL, a cookie array is allocated
	 *			  and the number of cookies stored is returned
	 *			  here.
	 *		cookies	- receives the malloc(M_TEMP) cookie array; each
	 *			  cookie is the offset following one processed
	 *			  entry.  On error the array is freed and
	 *			  *cookies is set to NULL.
	 *
	 *	RETURN:	0 on success, error code on failure.
	 *
	 * Timestamps:
	 *	vp - atime updated
	 *
	 * Note that the low 4 bits of the cookie returned by zap is always zero.
	 * This allows us to use the low range for "special" directory entries:
	 * We use 0 for '.', and 1 for '..'.  If this is the root of the
	 * filesystem, we use the offset 2 for the '.zfs' directory.
	 */
	/* ARGSUSED */
	static int
	zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies,
	    u_long **cookies)
	{
		znode_t		*zp = VTOZ(vp);
		iovec_t		*iovp;
		edirent_t	*eodp;
		dirent64_t	*odp;
		zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
		objset_t	*os;
		caddr_t		outbuf;
		size_t		bufsize;
		zap_cursor_t	zc;
		zap_attribute_t	zap;
		uint_t		bytes_wanted;
		uint64_t	offset; /* must be unsigned; checks for < 1 */
		uint64_t	parent;
		int		local_eof;
		int		outcount;
		int		error;
		uint8_t		prefetch;
		boolean_t	check_sysattrs;
		uint8_t		type;
		int		ncooks;
		u_long		*cooks = NULL;
		/*
		 * flags is never modified, so the V_RDDIR_ACCFILTER and
		 * V_RDDIR_ENTFLAGS branches below are not taken.
		 */
		int		flags = 0;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* The parent object number is needed for the ".." entry. */
		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
		    &parent, sizeof (parent))) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * If we are not given an eof variable,
		 * use a local one.
		 */
		if (eofp == NULL)
			eofp = &local_eof;

		/*
		 * Check for valid iov_len.
		 */
		if (uio->uio_iov->iov_len <= 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * Quit if directory has been removed (posix)
		 */
		if ((*eofp = zp->z_unlinked) != 0) {
			ZFS_EXIT(zfsvfs);
			return (0);
		}

		error = 0;
		os = zfsvfs->z_os;
		offset = uio->uio_loffset;
		prefetch = zp->z_zn_prefetch;

		/*
		 * Initialize the iterator cursor.
		 */
		if (offset <= 3) {
			/*
			 * Start iteration from the beginning of the directory.
			 */
			zap_cursor_init(&zc, os, zp->z_id);
		} else {
			/*
			 * The offset is a serialized cursor.
			 */
			zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
		}

		/*
		 * Get space to change directory entries into fs independent
		 * format.
		 */
		iovp = uio->uio_iov;
		bytes_wanted = iovp->iov_len;
		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
			/*
			 * Entries are staged in a bounce buffer and copied out
			 * with uiomove() below.  The buffer is zero-filled
			 * because only individual dirent fields are written
			 * into it; record padding after each name would
			 * otherwise carry uninitialised kernel memory out to
			 * the caller.
			 */
			bufsize = bytes_wanted;
			outbuf = kmem_zalloc(bufsize, KM_SLEEP);
			odp = (struct dirent64 *)outbuf;
		} else {
			/* Single kernel-space iovec: fill it in place. */
			bufsize = bytes_wanted;
			outbuf = NULL;
			odp = (struct dirent64 *)iovp->iov_base;
		}
		eodp = (struct edirent *)odp;

		if (ncookies != NULL) {
			/*
			 * Minimum entry size is dirent size and 1 byte for a
			 * file name.
			 */
			ncooks = uio->uio_resid / (sizeof(struct dirent) -
			    sizeof(((struct dirent *)NULL)->d_name) + 1);
			cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
			*cookies = cooks;
			*ncookies = ncooks;
		}
		/*
		 * If this VFS supports the system attribute view interface; and
		 * we're looking at an extended attribute directory; and we care
		 * about normalization conflicts on this vfs; then we must check
		 * for normalization conflicts with the sysattr name space.
		 */
	#ifdef TODO
		check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
		    (flags & V_RDDIR_ENTFLAGS);
	#else
		check_sysattrs = 0;
	#endif

		/*
		 * Transform to file-system independent format
		 */
		outcount = 0;
		while (outcount < bytes_wanted) {
			ino64_t objnum;
			ushort_t reclen;
			off64_t *next = NULL;

			/*
			 * Special case `.', `..', and `.zfs'.
			 */
			if (offset == 0) {
				(void) strcpy(zap.za_name, ".");
				zap.za_normalization_conflict = 0;
				objnum = zp->z_id;
				type = DT_DIR;
			} else if (offset == 1) {
				(void) strcpy(zap.za_name, "..");
				zap.za_normalization_conflict = 0;
				objnum = parent;
				type = DT_DIR;
			} else if (offset == 2 && zfs_show_ctldir(zp)) {
				(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
				zap.za_normalization_conflict = 0;
				objnum = ZFSCTL_INO_ROOT;
				type = DT_DIR;
			} else {
				/*
				 * Grab next entry.  ENOENT from the cursor means
				 * end of directory; any other error aborts.
				 */
				if ((error = zap_cursor_retrieve(&zc, &zap)) != 0) {
					if ((*eofp = (error == ENOENT)) != 0)
						break;
					else
						goto update;
				}

				if (zap.za_integer_length != 8 ||
				    zap.za_num_integers != 1) {
					cmn_err(CE_WARN, "zap_readdir: bad directory "
					    "entry, obj = %lld, offset = %lld\n",
					    (u_longlong_t)zp->z_id,
					    (u_longlong_t)offset);
					error = SET_ERROR(ENXIO);
					goto update;
				}

				objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
				/*
				 * MacOS X can extract the object type here such as:
				 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
				 */
				type = ZFS_DIRENT_TYPE(zap.za_first_integer);

				if (check_sysattrs && !zap.za_normalization_conflict) {
	#ifdef TODO
					zap.za_normalization_conflict =
					    xattr_sysattr_casechk(zap.za_name);
	#else
					panic("%s:%u: TODO", __func__, __LINE__);
	#endif
				}
			}

			if (flags & V_RDDIR_ACCFILTER) {
				/*
				 * If we have no access at all, don't include
				 * this entry in the returned information
				 */
				znode_t *ezp;
				if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
					goto skip_entry;
				if (!zfs_has_access(ezp, cr)) {
					vrele(ZTOV(ezp));
					goto skip_entry;
				}
				vrele(ZTOV(ezp));
			}

			if (flags & V_RDDIR_ENTFLAGS)
				reclen = EDIRENT_RECLEN(strlen(zap.za_name));
			else
				reclen = DIRENT64_RECLEN(strlen(zap.za_name));

			/*
			 * Will this entry fit in the buffer?
			 */
			if (outcount + reclen > bufsize) {
				/*
				 * Did we manage to fit anything in the buffer?
				 */
				if (!outcount) {
					error = SET_ERROR(EINVAL);
					goto update;
				}
				break;
			}
			if (flags & V_RDDIR_ENTFLAGS) {
				/*
				 * Add extended flag entry:
				 */
				eodp->ed_ino = objnum;
				eodp->ed_reclen = reclen;
				/* NOTE: ed_off is the offset for the next entry */
				next = &(eodp->ed_off);
				eodp->ed_eflags = zap.za_normalization_conflict ?
				    ED_CASE_CONFLICT : 0;
				(void) strncpy(eodp->ed_name, zap.za_name,
				    EDIRENT_NAMELEN(reclen));
				eodp = (edirent_t *)((intptr_t)eodp + reclen);
			} else {
				/*
				 * Add normal entry:
				 */
				odp->d_ino = objnum;
				odp->d_reclen = reclen;
				odp->d_namlen = strlen(zap.za_name);
				(void) strlcpy(odp->d_name, zap.za_name,
				    odp->d_namlen + 1);
				odp->d_type = type;
				odp = (dirent64_t *)((intptr_t)odp + reclen);
			}
			outcount += reclen;

			ASSERT(outcount <= bufsize);

			/* Prefetch znode */
			if (prefetch)
				dmu_prefetch(os, objnum, 0, 0, 0,
				    ZIO_PRIORITY_SYNC_READ);

		skip_entry:
			/*
			 * Move to the next entry, fill in the previous offset.
			 */
			if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
				zap_cursor_advance(&zc);
				offset = zap_cursor_serialize(&zc);
			} else {
				offset += 1;
			}

			/* Record the offset that follows this entry. */
			if (cooks != NULL) {
				*cooks++ = offset;
				ncooks--;
				KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
			}
		}
		zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

		/* Subtract unused cookies */
		if (ncookies != NULL)
			*ncookies -= ncooks;

		if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
			/* Entries were written in place; just advance the uio. */
			iovp->iov_base += outcount;
			iovp->iov_len -= outcount;
			uio->uio_resid -= outcount;
		} else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
		    uio)) != 0) {
			/*
			 * Reset the pointer.
			 */
			offset = uio->uio_loffset;
		}

	update:
		zap_cursor_fini(&zc);
		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
			kmem_free(outbuf, bufsize);

		/* Running off the end of the directory is not an error. */
		if (error == ENOENT)
			error = 0;

		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

		uio->uio_loffset = offset;
		ZFS_EXIT(zfsvfs);
		/* On failure the caller must not see a cookie array. */
		if (error != 0 && cookies != NULL) {
			free(*cookies, M_TEMP);
			*cookies = NULL;
			*ncookies = 0;
		}
		return (error);
	}

	/* Value stored under zfs_fsyncer_key by every zfs_fsync() call. */
	ulong_t zfs_fsync_sync_cnt = 4;

	/*
	 * Commit the intent log for the object backing vp, unless the
	 * dataset's sync setting is ZFS_SYNC_DISABLED.
	 *
	 * syncflag, cr and ct are not referenced.  Returns 0 on every path
	 * written out here (the ZFS_ENTER/ZFS_VERIFY_ZP macros may return
	 * early; their definitions are not visible in this block).
	 */
	static int
	zfs_fsync(vnode_t vp, int syncflag, cred_t cr, caller_context_t *ct)
	{
		znode_t *znode = VTOZ(vp);
		zfsvfs_t *zfsvfs = znode->z_zfsvfs;

		(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

		/* Nothing to commit when sync is disabled for this dataset. */
		if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
			return (0);

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(znode);
		zil_commit(zfsvfs->z_log, znode->z_id);
		ZFS_EXIT(zfsvfs);

		return (0);
	}


	/*
	* Get the requested file attributes and place them in the provided
	* vattr structure.
	*
	* IN: vp - vnode of file.
	* vap - va_mask identifies requested attributes.
	* If AT_XVATTR set, then optional attrs are requested
	* flags - ATTR_NOACLCHECK (CIFS server context)
	* cr - credentials of caller.
	* ct - caller context (not referenced in this body)
	*
	* OUT: vap - attribute values.
	*
	* RETURN: 0 on success; otherwise the error returned by
	* sa_bulk_lookup() or by the ACE_READ_ATTRIBUTES access check.
	*
	* NOTE(review): the body dereferences vp (vp->v_type) and vap
	* (vap->va_uid) and passes xvap to the XVA_* macros as a pointer, so the
	* declarations below appear to have lost some '*' declarators -- confirm
	* against the upstream file.
	*/
	/* ARGSUSED */
	static int
	zfs_getattr(vnode_t vp, vattr_t vap, int flags, cred_t *cr,
	caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	/* rdev is only filled in, and only read, for VBLK/VCHR vnodes. */
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t xvap = (xvattr_t )vap; /* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	/* Room for the four attributes added below: three times plus rdev. */
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Fill in va_uid/va_gid first; va_uid is used by the owner test below. */
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/*
	* Fetch mtime, ctime and crtime (16 bytes each) and, for block and
	* character devices only, rdev (8 bytes) with one bulk lookup.
	*/
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK \|\| vp->v_type == VCHR)
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
	&rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	* Also, if we are the owner don't bother, since owner should
	* always be allowed to read basic attributes of file.
	*/
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	(vap->va_uid != crgetuid(cr))) {
	if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
	skipaclchk, cr)) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	}

	/*
	* Return all attributes. It's cheaper to provide the answer
	* than to determine whether we were asked the question.
	*/

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
	#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	#endif
	vap->va_nodeid = zp->z_id;
	/*
	* One extra link is reported for a root vnode when
	* zfs_show_ctldir() is true.
	*/
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
	links = zp->z_links + 1;
	else
	links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */
	vap->va_size = zp->z_size;
	#ifdef illumos
	vap->va_rdev = vp->v_rdev;
	#else
	/* In this branch va_rdev is left untouched for non-device vnodes. */
	if (vp->v_type == VBLK \|\| vp->v_type == VCHR)
	vap->va_rdev = zfs_cmpldev(rdev);
	#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	* Add in any requested optional attributes and the create time.
	* Also set the corresponding bits in the returned attribute bitmap.
	*/
	/*
	* Skipped entirely when xva_getxoptattr() returns NULL or the file
	* system does not use FUIDs. Each requested attribute is derived
	* from one z_pflags bit and then marked as returned.
	*/
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
	xoap->xoa_archive =
	((zp->z_pflags & ZFS_ARCHIVE) != 0);
	XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}

	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
	xoap->xoa_readonly =
	((zp->z_pflags & ZFS_READONLY) != 0);
	XVA_SET_RTN(xvap, XAT_READONLY);
	}

	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
	xoap->xoa_system =
	((zp->z_pflags & ZFS_SYSTEM) != 0);
	XVA_SET_RTN(xvap, XAT_SYSTEM);
	}

	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
	xoap->xoa_hidden =
	((zp->z_pflags & ZFS_HIDDEN) != 0);
	XVA_SET_RTN(xvap, XAT_HIDDEN);
	}

	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
	xoap->xoa_nounlink =
	((zp->z_pflags & ZFS_NOUNLINK) != 0);
	XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}

	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
	xoap->xoa_immutable =
	((zp->z_pflags & ZFS_IMMUTABLE) != 0);
	XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}

	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
	xoap->xoa_appendonly =
	((zp->z_pflags & ZFS_APPENDONLY) != 0);
	XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}

	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
	xoap->xoa_nodump =
	((zp->z_pflags & ZFS_NODUMP) != 0);
	XVA_SET_RTN(xvap, XAT_NODUMP);
	}

	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
	xoap->xoa_opaque =
	((zp->z_pflags & ZFS_OPAQUE) != 0);
	XVA_SET_RTN(xvap, XAT_OPAQUE);
	}

	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
	xoap->xoa_av_quarantined =
	((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
	XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}

	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
	xoap->xoa_av_modified =
	((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
	XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}

	/*
	* The scanstamp is only looked up for regular files. No
	* XVA_SET_RTN() is issued here; presumably
	* zfs_sa_get_scanstamp() marks the attribute itself -- confirm.
	*/
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
	vp->v_type == VREG) {
	zfs_sa_get_scanstamp(zp, xvap);
	}

	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
	xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
	XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	/* The generation comes from z_gen rather than a z_pflags bit. */
	if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
	xoap->xoa_generation = zp->z_gen;
	XVA_SET_RTN(xvap, XAT_GEN);
	}

	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
	xoap->xoa_offline =
	((zp->z_pflags & ZFS_OFFLINE) != 0);
	XVA_SET_RTN(xvap, XAT_OFFLINE);
	}

	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
	xoap->xoa_sparse =
	((zp->z_pflags & ZFS_SPARSE) != 0);
	XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	}

	/*
	* atime is taken from the znode; mtime, ctime and crtime come from
	* the bulk lookup above.
	*/
	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9; /* nblocks * 512 */

	if (zp->z_blksz == 0) {
	/*
	* Block size hasn't been set; suggest maximal I/O transfers.
	*/
	vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
	}

	/*
	* Set the file attributes to the values contained in the
	* vattr structure.
	*
	* IN: vp - vnode of file to be modified.
	* vap - new attribute values.
	* If AT_XVATTR set, then optional attrs are being set
	* flags - ATTR_UTIME set if non-default time values provided.
	* - ATTR_NOACLCHECK (CIFS context only).
	* cr - credentials of caller.
	* ct - caller context
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* vp - ctime updated, mtime updated if size changed.
	*/
	/* ARGSUSED */
	static int
	zfs_setattr(vnode_t vp, vattr_t vap, int flags, cred_t *cr,
	caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	dmu_tx_t *tx;
	vattr_t oldva;
	xvattr_t tmpxvattr;
	uint_t mask = vap->va_mask;
	uint_t saved_mask = 0;
	uint64_t saved_mode;
	int trim_mask = 0;
	uint64_t new_mode;
	uint64_t new_uid, new_gid;
	uint64_t xattr_obj;
	uint64_t mtime[2], ctime[2];
	znode_t *attrzp;
	int need_policy = FALSE;
	int err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t xvap = (xvattr_t )vap; /* vap may be an xvattr_t * */
	xoptattr_t *xoap;
	zfs_acl_t *aclp;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t fuid_dirtied = B_FALSE;
	sa_bulk_attr_t bulk[7], xattr_bulk[7];
	int count = 0, xattr_count = 0;

	if (mask == 0)
	return (0);

	if (mask & AT_NOSET)
	return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zilog = zfsvfs->z_log;

	/*
	* Make sure that if we have ephemeral uid/gid or xvattr specified
	* that file system is at proper version level
	*/

	if (zfsvfs->z_use_fuids == B_FALSE &&
	(((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) \|\|
	((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) \|\|
	(mask & AT_XVATTR))) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EISDIR));
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EINVAL));
	}

	/*
	* If this is an xvattr_t, then get a pointer to the structure of
	* optional attributes. If this is NULL, then we have a vattr_t.
	*/
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	* Immutable files can only alter immutable bit and atime
	*/
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	((mask & (AT_SIZE\|AT_UID\|AT_GID\|AT_MTIME\|AT_MODE)) \|\|
	((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EPERM));
	}

	/*
	* Note: ZFS_READONLY is handled in zfs_zaccess_common.
	*/

	/*
	* Verify timestamps doesn't overflow 32 bits.
	* ZFS can handle large timestamps, but 32bit syscalls can't
	* handle times greater than 2039. This check should be removed
	* once large timestamps are fully supported.
	*/
	if (mask & (AT_ATIME \| AT_MTIME)) {
	if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) \|\|
	((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EOVERFLOW));
	}
	}
	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
	TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EOVERFLOW));
	}

	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EROFS));
	}

	/*
	* First validate permissions
	*/

	if (mask & AT_SIZE) {
	/*
	* XXX - Note, we are not providing any open
	* mode flags here (like FNDELAY), so we may
	* block if there are locks present... this
	* should be addressed in openat().
	*/
	/* XXX - would it be OK to generate a log record here? */
	err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
	if (err) {
	ZFS_EXIT(zfsvfs);
	return (err);
	}
	}

	if (mask & (AT_ATIME\|AT_MTIME) \|\|
	((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) \|\|
	XVA_ISSET_REQ(xvap, XAT_READONLY) \|\|
	XVA_ISSET_REQ(xvap, XAT_ARCHIVE) \|\|
	XVA_ISSET_REQ(xvap, XAT_OFFLINE) \|\|
	XVA_ISSET_REQ(xvap, XAT_SPARSE) \|\|
	XVA_ISSET_REQ(xvap, XAT_CREATETIME) \|\|
	XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
	need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
	skipaclchk, cr);
	}

	if (mask & (AT_UID\|AT_GID)) {
	int idmask = (mask & (AT_UID\|AT_GID));
	int take_owner;
	int take_group;

	/*
	* NOTE: even if a new mode is being set,
	* we may clear S_ISUID/S_ISGID bits.
	*/

	if (!(mask & AT_MODE))
	vap->va_mode = zp->z_mode;

	/*
	* Take ownership or chgrp to group we are a member of
	*/

	take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
	take_group = (mask & AT_GID) &&
	zfs_groupmember(zfsvfs, vap->va_gid, cr);

	/*
	* If both AT_UID and AT_GID are set then take_owner and
	* take_group must both be set in order to allow taking
	* ownership.
	*
	* Otherwise, send the check through secpolicy_vnode_setattr()
	*
	*/

	if (((idmask == (AT_UID\|AT_GID)) && take_owner && take_group) \|\|
	((idmask == AT_UID) && take_owner) \|\|
	((idmask == AT_GID) && take_group)) {
	if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
	skipaclchk, cr) == 0) {
	/*
	* Remove setuid/setgid for non-privileged users
	*/
	secpolicy_setid_clear(vap, vp, cr);
	trim_mask = (mask & (AT_UID\|AT_GID));
	} else {
	need_policy = TRUE;
	}
	} else {
	need_policy = TRUE;
	}
	}

	oldva.va_mode = zp->z_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
	/*
	* Update xvattr mask to include only those attributes
	* that are actually changing.
	*
	* the bits will be restored prior to actually setting
	* the attributes so the caller thinks they were set.
	*/
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
	if (xoap->xoa_appendonly !=
	((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
	if (xoap->xoa_nounlink !=
	((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
	if (xoap->xoa_immutable !=
	((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
	if (xoap->xoa_nodump !=
	((zp->z_pflags & ZFS_NODUMP) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_NODUMP);
	XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
	if (xoap->xoa_av_modified !=
	((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
	XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
	if ((vp->v_type != VREG &&
	xoap->xoa_av_quarantined) \|\|
	xoap->xoa_av_quarantined !=
	((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
	need_policy = TRUE;
	} else {
	XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
	XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
	}
	}

	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EPERM));
	}

	if (need_policy == FALSE &&
	(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) \|\|
	XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
	need_policy = TRUE;
	}
	}

	if (mask & AT_MODE) {
	if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
	err = secpolicy_setid_setsticky_clear(vp, vap,
	&oldva, cr);
	if (err) {
	ZFS_EXIT(zfsvfs);
	return (err);
	}
	trim_mask \|= AT_MODE;
	} else {
	need_policy = TRUE;
	}
	}

	if (need_policy) {
	/*
	* If trim_mask is set then take ownership
	* has been granted or write_acl is present and user
	* has the ability to modify mode. In that case remove
	* UID\|GID and or MODE from mask so that
	* secpolicy_vnode_setattr() doesn't revoke it.
	*/

	if (trim_mask) {
	saved_mask = vap->va_mask;
	vap->va_mask &= ~trim_mask;
	if (trim_mask & AT_MODE) {
	/*
	* Save the mode, as secpolicy_vnode_setattr()
	* will overwrite it with ova.va_mode.
	*/
	saved_mode = vap->va_mode;
	}
	}
	err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
	(int ()(void , int, cred_t *))zfs_zaccess_unix, zp);
	if (err) {
	ZFS_EXIT(zfsvfs);
	return (err);
	}

	if (trim_mask) {
	vap->va_mask \|= saved_mask;
	if (trim_mask & AT_MODE) {
	/*
	* Recover the mode after
	* secpolicy_vnode_setattr().
	*/
	vap->va_mode = saved_mode;
	}
	}
	}

	/*
	* secpolicy_vnode_setattr, or take ownership may have
	* changed va_mask
	*/
	mask = vap->va_mask;

	if ((mask & (AT_UID \| AT_GID))) {
	err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	&xattr_obj, sizeof (xattr_obj));

	if (err == 0 && xattr_obj) {
	err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
	if (err == 0) {
	err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
	if (err != 0)
	vrele(ZTOV(attrzp));
	}
	if (err)
	goto out2;
	}
	if (mask & AT_UID) {
	new_uid = zfs_fuid_create(zfsvfs,
	(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
	if (new_uid != zp->z_uid &&
	zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
	if (attrzp)
	vput(ZTOV(attrzp));
	err = SET_ERROR(EDQUOT);
	goto out2;
	}
	}

	if (mask & AT_GID) {
	new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
	cr, ZFS_GROUP, &fuidp);
	if (new_gid != zp->z_gid &&
	zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
	if (attrzp)
	vput(ZTOV(attrzp));
	err = SET_ERROR(EDQUOT);
	goto out2;
	}
	}
	}
	tx = dmu_tx_create(zfsvfs->z_os);

	if (mask & AT_MODE) {
	uint64_t pmode = zp->z_mode;
	uint64_t acl_obj;
	new_mode = (pmode & S_IFMT) \| (vap->va_mode & ~S_IFMT);

	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
	!(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
	err = SET_ERROR(EPERM);
	goto out;
	}

	if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
	goto out;

	if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
	/*
	* Are we upgrading ACL from old V0 format
	* to V1 format?
	*/
	if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
	zfs_znode_acl_version(zp) ==
	ZFS_ACL_VERSION_INITIAL) {
	dmu_tx_hold_free(tx, acl_obj, 0,
	DMU_OBJECT_END);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
	0, aclp->z_acl_bytes);
	} else {
	dmu_tx_hold_write(tx, acl_obj, 0,
	aclp->z_acl_bytes);
	}
	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
	0, aclp->z_acl_bytes);
	}
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	} else {
	if ((mask & AT_XVATTR) &&
	XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	else
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	}

	if (attrzp) {
	dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
	}

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
	zfs_fuid_txhold(zfsvfs, tx);

	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err)
	goto out;

	count = 0;
	/*
	* Set each attribute requested.
	* We group settings according to the locks they need to acquire.
	*
	* Note: you cannot set ctime directly, although it will be
	* updated as a side-effect of calling this function.
	*/

	if (mask & (AT_UID\|AT_GID\|AT_MODE))
	mutex_enter(&zp->z_acl_lock);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	&zp->z_pflags, sizeof (zp->z_pflags));

	if (attrzp) {
	if (mask & (AT_UID\|AT_GID\|AT_MODE))
	mutex_enter(&attrzp->z_acl_lock);
	SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
	SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
	sizeof (attrzp->z_pflags));
	}

	if (mask & (AT_UID\|AT_GID)) {

	if (mask & AT_UID) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	&new_uid, sizeof (new_uid));
	zp->z_uid = new_uid;
	if (attrzp) {
	SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
	SA_ZPL_UID(zfsvfs), NULL, &new_uid,
	sizeof (new_uid));
	attrzp->z_uid = new_uid;
	}
	}

	if (mask & AT_GID) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
	NULL, &new_gid, sizeof (new_gid));
	zp->z_gid = new_gid;
	if (attrzp) {
	SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
	SA_ZPL_GID(zfsvfs), NULL, &new_gid,
	sizeof (new_gid));
	attrzp->z_gid = new_gid;
	}
	}
	if (!(mask & AT_MODE)) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
	NULL, &new_mode, sizeof (new_mode));
	new_mode = zp->z_mode;
	}
	err = zfs_acl_chown_setattr(zp);
	ASSERT(err == 0);
	if (attrzp) {
	err = zfs_acl_chown_setattr(attrzp);
	ASSERT(err == 0);
	}
	}

	if (mask & AT_MODE) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	&new_mode, sizeof (new_mode));
	zp->z_mode = new_mode;
	ASSERT3U((uintptr_t)aclp, !=, 0);
	err = zfs_aclset_common(zp, aclp, cr, tx);
	ASSERT0(err);
	if (zp->z_acl_cached)
	zfs_acl_free(zp->z_acl_cached);
	zp->z_acl_cached = aclp;
	aclp = NULL;
	}


	if (mask & AT_ATIME) {
	ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	&zp->z_atime, sizeof (zp->z_atime));
	}

	if (mask & AT_MTIME) {
	ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	mtime, sizeof (mtime));
	}

	/* XXX - shouldn't this be done before the ATIME/MTIME checks? */
	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
	NULL, mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	&ctime, sizeof (ctime));
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
	B_TRUE);
	} else if (mask != 0) {
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	&ctime, sizeof (ctime));
	zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
	B_TRUE);
	if (attrzp) {
	SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
	SA_ZPL_CTIME(zfsvfs), NULL,
	&ctime, sizeof (ctime));
	zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
	mtime, ctime, B_TRUE);
	}
	}
	/*
	* Do this after setting timestamps to prevent timestamp
	* update from toggling bit
	*/

	if (xoap && (mask & AT_XVATTR)) {

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
	xoap->xoa_createtime = vap->va_birthtime;
	/*
	* restore trimmed off masks
	* so that return masks can be set for caller.
	*/

	if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
	XVA_SET_REQ(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
	XVA_SET_REQ(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
	XVA_SET_REQ(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
	XVA_SET_REQ(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
	XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
	XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
	}

	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
	ASSERT(vp->v_type == VREG);

	zfs_xvattr_set(zp, xvap, tx);
	}

	if (fuid_dirtied)
	zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
	zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (mask & (AT_UID\|AT_GID\|AT_MODE))
	mutex_exit(&zp->z_acl_lock);

	if (attrzp) {
	if (mask & (AT_UID\|AT_GID\|AT_MODE))
	mutex_exit(&attrzp->z_acl_lock);
	}
	out:
	if (err == 0 && attrzp) {
	err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
	xattr_count, tx);
	ASSERT(err2 == 0);
	}

	if (attrzp)
	vput(ZTOV(attrzp));

	if (aclp)
	zfs_acl_free(aclp);

	if (fuidp) {
	zfs_fuid_info_free(fuidp);
	fuidp = NULL;
	}

	if (err) {
	dmu_tx_abort(tx);
	} else {
	err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	dmu_tx_commit(tx);
	}

	out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
	}

	/*
	* We acquire all but fdvp locks using non-blocking acquisitions. If we
	* fail to acquire any lock in the path we will drop all held locks,
	* acquire the new lock in a blocking fashion, and then release it and
	* restart the rename. This acquire/release step ensures that we do not
	* spin on a lock waiting for release. On error release all vnode locks
	* and decrement references the way tmpfs_rename() would do.
	*/
	static int
	zfs_rename_relock(struct vnode sdvp, struct vnode *svpp,
	struct vnode tdvp, struct vnode *tvpp,
	const struct componentname scnp, const struct componentname tcnp)
	{
	zfsvfs_t *zfsvfs;
	struct vnode nvp, svp, *tvp;
	znode_t sdzp, tdzp, szp, tzp;
	const char *snm = scnp->cn_nameptr;
	const char *tnm = tcnp->cn_nameptr;
	int error;

	VOP_UNLOCK(tdvp, 0);
	if (tvpp != NULL && tvpp != tdvp)
	VOP_UNLOCK(*tvpp, 0);

	relock:
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
	goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE \| LK_NOWAIT);
	if (error != 0) {
	VOP_UNLOCK(sdvp, 0);
	if (error != EBUSY)
	goto out;
	error = vn_lock(tdvp, LK_EXCLUSIVE);
	if (error)
	goto out;
	VOP_UNLOCK(tdvp, 0);
	goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	* Before using sdzp and tdzp we must ensure that they are live.
	* As a porting legacy from illumos we have two things to worry
	* about. One is typical for FreeBSD and it is that the vnode is
	* not reclaimed (doomed). The other is that the znode is live.
	* The current code can invalidate the znode without acquiring the
	* corresponding vnode lock if the object represented by the znode
	* and vnode is no longer valid after a rollback or receive operation.
	* z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	* that protects the znodes from the invalidation.
	*/
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	* We can not use ZFS_VERIFY_ZP() here because it could directly return
	* bypassing the cleanup code in the case of an error.
	*/
	if (tdzp->z_sa_hdl == NULL \|\| sdzp->z_sa_hdl == NULL) {
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(sdvp, 0);
	VOP_UNLOCK(tdvp, 0);
	error = SET_ERROR(EIO);
	goto out;
	}

	/*
	* Re-resolve svp to be certain it still exists and fetch the
	* correct vnode.
	*/
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
	/* Source entry invalid or not there. */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(sdvp, 0);
	VOP_UNLOCK(tdvp, 0);
	if ((scnp->cn_flags & ISDOTDOT) != 0 \|\|
	(scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
	error = SET_ERROR(EINVAL);
	goto out;
	}
	svp = ZTOV(szp);

	/*
	* Re-resolve tvp, if it disappeared we just carry on.
	*/
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(sdvp, 0);
	VOP_UNLOCK(tdvp, 0);
	vrele(svp);
	if ((tcnp->cn_flags & ISDOTDOT) != 0)
	error = SET_ERROR(EINVAL);
	goto out;
	}
	if (tzp != NULL)
	tvp = ZTOV(tzp);
	else
	tvp = NULL;

	/*
	* At present the vnode locks must be acquired before z_teardown_lock,
	* although it would be more logical to use the opposite order.
	*/
	ZFS_EXIT(zfsvfs);

	/*
	* Now try acquire locks on svp and tvp.
	*/
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE \| LK_NOWAIT);
	if (error != 0) {
	VOP_UNLOCK(sdvp, 0);
	VOP_UNLOCK(tdvp, 0);
	if (tvp != NULL)
	vrele(tvp);
	if (error != EBUSY) {
	vrele(nvp);
	goto out;
	}
	error = vn_lock(nvp, LK_EXCLUSIVE);
	if (error != 0) {
	vrele(nvp);
	goto out;
	}
	VOP_UNLOCK(nvp, 0);
	/*
	* Concurrent rename race.
	* XXX ?
	*/
	if (nvp == tdvp) {
	vrele(nvp);
	error = SET_ERROR(EINVAL);
	goto out;
	}
	vrele(*svpp);
	*svpp = nvp;
	goto relock;
	}
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
	vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
	nvp = tvp;
	error = vn_lock(nvp, LK_EXCLUSIVE \| LK_NOWAIT);
	if (error != 0) {
	VOP_UNLOCK(sdvp, 0);
	VOP_UNLOCK(tdvp, 0);
	VOP_UNLOCK(*svpp, 0);
	if (error != EBUSY) {
	vrele(nvp);
	goto out;
	}
	error = vn_lock(nvp, LK_EXCLUSIVE);
	if (error != 0) {
	vrele(nvp);
	goto out;
	}
	vput(nvp);
	goto relock;
	}
	*tvpp = nvp;
	}

	return (0);

	out:
	return (error);
	}

	/*
	 * Verify that moving szp from directory sdzp into directory tdzp would
	 * not make a directory its own descendant.  The check walks the parent
	 * chain of tdzp upwards until it reaches szp (error), the filesystem
	 * root, or sdzp.
	 *
	 * Note that we must use VRELE_ASYNC in this function as it walks
	 * up the directory tree and vrele may need to acquire an exclusive
	 * lock if a last reference to a vnode is dropped.
	 *
	 *	IN:	szp	- znode being renamed.
	 *		sdzp	- znode of the source directory.
	 *		tdzp	- znode of the target directory.
	 *
	 *	RETURN:	0 if the move is allowed, EINVAL if tdzp is szp or lies
	 *		beneath it, or the error from sa_lookup()/zfs_zget().
	 */
	static int
	zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
	{
		zfsvfs_t *zfsvfs;
		znode_t *zp, *zp1;
		uint64_t parent;
		int error;

		zfsvfs = tdzp->z_zfsvfs;
		if (tdzp == szp)
			return (SET_ERROR(EINVAL));
		if (tdzp == sdzp)
			return (0);
		if (tdzp->z_id == zfsvfs->z_root)
			return (0);
		zp = tdzp;
		for (;;) {
			ASSERT(!zp->z_unlinked);
			if ((error = sa_lookup(zp->z_sa_hdl,
			    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
				break;

			if (parent == szp->z_id) {
				error = SET_ERROR(EINVAL);
				break;
			}
			if (parent == zfsvfs->z_root)
				break;
			if (parent == sdzp->z_id)
				break;

			error = zfs_zget(zfsvfs, parent, &zp1);
			if (error != 0)
				break;

			/* tdzp belongs to the caller; only release znodes we got. */
			if (zp != tdzp)
				VN_RELE_ASYNC(ZTOV(zp),
				    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
			zp = zp1;
		}

		if (error == ENOTDIR)
			panic("checkpath: .. not a directory\n");
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		return (error);
	}

	/*
	* Move an entry from the provided source directory to the target
	* directory. Change the entry name as indicated.
	*
	* IN: sdvp - Source directory containing the "old entry".
	* snm - Old entry name.
	* tdvp - Target directory to contain the "new entry".
	* tnm - New entry name.
	* cr - credentials of caller.
	* ct - caller context
	* flags - case flags
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* sdvp,tdvp - ctime\|mtime updated
	*/
	/ARGSUSED/
	static int
	zfs_rename(vnode_t sdvp, vnode_t svpp, struct componentname scnp,
	vnode_t tdvp, vnode_t tvpp, struct componentname tcnp,
	cred_t *cr)
	{
	zfsvfs_t *zfsvfs;
	znode_t sdzp, tdzp, szp, tzp;
	zilog_t *zilog = NULL;
	dmu_tx_t *tx;
	char *snm = scnp->cn_nameptr;
	char *tnm = tcnp->cn_nameptr;
	int error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount \|\|
	((tvpp) != NULL && (svpp)->v_mount != (*tvpp)->v_mount)) {
	error = SET_ERROR(EXDEV);
	goto out;
	}

	if (zfsctl_is_node(tdvp)) {
	error = SET_ERROR(EXDEV);
	goto out;
	}

	/*
	* Lock all four vnodes to ensure safety and semantics of renaming.
	*/
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
	/* no vnodes are locked in the case of error here */
	return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	* After we re-enter ZFS_ENTER() we will have to revalidate all
	* znodes involved.
	*/
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
	error = SET_ERROR(EILSEQ);
	goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((svpp) == (tvpp)) {
	error = 0;
	goto unlockout;
	}

	if (((svpp)->v_type == VDIR && (svpp)->v_mountedhere != NULL) \|\|
	((tvpp) != NULL && (tvpp)->v_type == VDIR &&
	(*tvpp)->v_mountedhere != NULL)) {
	error = SET_ERROR(EXDEV);
	goto unlockout;
	}

	/*
	* We can not use ZFS_VERIFY_ZP() here because it could directly return
	* bypassing the cleanup code in the case of an error.
	*/
	if (tdzp->z_sa_hdl == NULL \|\| sdzp->z_sa_hdl == NULL) {
	error = SET_ERROR(EIO);
	goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = tvpp == NULL ? NULL : VTOZ(tvpp);
	if (szp->z_sa_hdl == NULL \|\| (tzp != NULL && tzp->z_sa_hdl == NULL)) {
	error = SET_ERROR(EIO);
	goto unlockout;
	}

	/*
	* This is to prevent the creation of links into attribute space
	* by renaming a linked file into/outof an attribute directory.
	* See the comment in zfs_link() for why this is considered bad.
	*/
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
	error = SET_ERROR(EINVAL);
	goto unlockout;
	}

	/*
	* Must have write access at the source to remove the old entry
	* and write access at the target to create the new entry.
	* Note that if target and source are the same, this can be
	* done in a single check.
	*/
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
	goto unlockout;

	if ((*svpp)->v_type == VDIR) {
	/*
	* Avoid ".", "..", and aliases of "." for obvious reasons.
	*/
	if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') \|\|
	sdzp == szp \|\|
	(scnp->cn_flags \| tcnp->cn_flags) & ISDOTDOT) {
	error = EINVAL;
	goto unlockout;
	}

	/*
	* Check to make sure rename is valid.
	* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
	*/
	if (error = zfs_rename_check(szp, sdzp, tdzp))
	goto unlockout;
	}

	/*
	* Does target exist?
	*/
	if (tzp) {
	/*
	* Source and target must be the same type.
	*/
	if ((*svpp)->v_type == VDIR) {
	if ((*tvpp)->v_type != VDIR) {
	error = SET_ERROR(ENOTDIR);
	goto unlockout;
	} else {
	cache_purge(tdvp);
	if (sdvp != tdvp)
	cache_purge(sdvp);
	}
	} else {
	if ((*tvpp)->v_type == VDIR) {
	error = SET_ERROR(EISDIR);
	goto unlockout;
	}
	}
	}

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
	vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	* notify the target directory if it is not the same
	* as source directory.
	*/
	if (tdvp != sdvp) {
	vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
	dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
	dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	dmu_tx_abort(tx);
	goto unlockout;
	}


	if (tzp) /* Attempt to remove the existing target */
	error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
	error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
	if (error == 0) {
	szp->z_pflags \|= ZFS_AV_MODIFIED;

	error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
	(void *)&szp->z_pflags, sizeof (uint64_t), tx);
	ASSERT0(error);

	error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
	NULL);
	if (error == 0) {
	zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
	snm, tdzp, tnm, szp);

	/*
	* Update path information for the target vnode
	*/
	vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
	} else {
	/*
	* At this point, we have successfully created
	* the target name, but have failed to remove
	* the source name. Since the create was done
	* with the ZRENAMING flag, there are
	* complications; for one, the link count is
	* wrong. The easiest way to deal with this
	* is to remove the newly created target, and
	* return the original error. This must
	* succeed; fortunately, it is very unlikely to
	* fail, since we just created it.
	*/
	VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
	ZRENAMING, NULL), ==, 0);
	}
	}
	if (error == 0) {
	cache_purge(*svpp);
	if (*tvpp != NULL)
	cache_purge(*tvpp);
	cache_purge_negative(tdvp);
	}
	}

	dmu_tx_commit(tx);

	unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

	out: /* original two vnodes are locked */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	if (*tvpp != NULL)
	VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
	VOP_UNLOCK(tdvp, 0);
	return (error);
	}

	/*
	* Insert the indicated symbolic reference entry into the directory.
	*
	* IN: dvp - Directory to contain new symbolic link.
	* link - Name for new symlink entry.
	* vap - Attributes of new entry.
	* cr - credentials of caller.
	* ct - caller context
	* flags - case flags
	*
	* RETURN: 0 on success, error code on failure.
	*
	* Timestamps:
	* dvp - ctime\|mtime updated
	*/
	/ARGSUSED/
	static int
	zfs_symlink(vnode_t dvp, vnode_t vpp, char name, vattr_t vap, char link,
	cred_t cr, kthread_t td)
	{
	znode_t zp, dzp = VTOZ(dvp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t len = strlen(link);
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	uint64_t txtype = TX_SYMLINK;
	int flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	vap, cr, NULL, &acl_ids)) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Attempt to lock directory; fail if entry already exists.
	*/
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
	zfs_acl_ids_free(&acl_ids);
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(EDQUOT));
	}

	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
	acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
	zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_abort(tx);
	getnewvnode_drop_reserve();
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	* Create a new object for the symlink.
	* for version 4 ZPL datsets the symlink will be an SA attribute
	*/
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
	zfs_fuid_sync(zfsvfs, tx);

	if (zp->z_is_sa)
	error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
	link, len, tx);
	else
	zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	&zp->z_size, sizeof (zp->z_size), tx);
	/*
	* Insert the new object into the directory.
	*/
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	 * Return, in the buffer contained in the provided uio structure,
	 * the symbolic path referred to by vp.
	 *
	 *	IN:	vp	- vnode of symbolic link.
	 *		uio	- structure to contain the link path.
	 *		cr	- credentials of caller (not used here).
	 *		ct	- caller context (not used here).
	 *
	 *	OUT:	uio	- structure containing the link path.
	 *
	 *	RETURN:	0 on success, error code on failure.
	 *
	 * Timestamps:
	 *	vp - atime updated
	 */
	/* ARGSUSED */
	static int
	zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
	{
		znode_t		*zp = VTOZ(vp);
		zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
		int		error;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* SA znodes keep the target in an attribute; others use the helper. */
		if (zp->z_is_sa)
			error = sa_lookup_uio(zp->z_sa_hdl,
			    SA_ZPL_SYMLINK(zfsvfs), uio);
		else
			error = zfs_sa_readlink(zp, uio);

		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Insert a new entry into directory tdvp referencing svp.
	 *
	 *	IN:	tdvp	- Directory to contain new entry.
	 *		svp	- vnode of new entry.
	 *		name	- name of new entry.
	 *		cr	- credentials of caller.
	 *		ct	- caller context
	 *		flags	- not used here.
	 *
	 *	RETURN:	0 on success, error code on failure.
	 *
	 * Timestamps:
	 *	tdvp - ctime|mtime updated
	 *	 svp - ctime updated
	 */
	/* ARGSUSED */
	static int
	zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
	    caller_context_t *ct, int flags)
	{
		znode_t		*dzp = VTOZ(tdvp);
		znode_t		*tzp, *szp;
		zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
		zilog_t		*zilog;
		dmu_tx_t	*tx;
		int		error;
		uint64_t	parent;
		uid_t		owner;

		ASSERT(tdvp->v_type == VDIR);

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(dzp);
		zilog = zfsvfs->z_log;

		/*
		 * POSIX dictates that we return EPERM here.
		 * Better choices include ENOTSUP or EISDIR.
		 */
		if (svp->v_type == VDIR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		szp = VTOZ(svp);
		ZFS_VERIFY_ZP(szp);

		if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		/* Prevent links to .zfs/shares files */

		if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
		    &parent, sizeof (uint64_t))) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		if (parent == zfsvfs->z_shares_dir) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		if (zfsvfs->z_utf8 && u8_validate(name,
		    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EILSEQ));
		}

		/*
		 * We do not support links between attributes and non-attributes
		 * because of the potential security risk of creating links
		 * into "normal" file space in order to circumvent restrictions
		 * imposed in attribute space.
		 */
		if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}


		/* Non-owners need secpolicy_basic_link() to approve the link. */
		owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
		if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Attempt to lock directory; fail if entry already exists.
		 */
		error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		zfs_sa_upgrade_txholds(tx, szp);
		zfs_sa_upgrade_txholds(tx, dzp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		error = zfs_link_create(dzp, name, szp, tx, 0);

		if (error == 0) {
			uint64_t txtype = TX_LINK;
			zfs_log_link(zilog, tx, txtype, dzp, szp, name);
		}

		dmu_tx_commit(tx);

		if (error == 0) {
			vnevent_link(svp, ct);
		}

		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			zil_commit(zilog, 0);

		ZFS_EXIT(zfsvfs);
		return (error);
	}


	/ARGSUSED/
	void
	zfs_inactive(vnode_t vp, cred_t cr, caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
	/*
	* The fs has been unmounted, or we did a
	* suspend/resume and this file no longer exists.
	*/
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	vrecycle(vp);
	return;
	}

	if (zp->z_unlinked) {
	/*
	* Fast path to recycle a vnode of a removed file.
	*/
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	vrecycle(vp);
	return;
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
	dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
	dmu_tx_abort(tx);
	} else {
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
	(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
	zp->z_atime_dirty = 0;
	dmu_tx_commit(tx);
	}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	}


	/*
	 * Compile-time checks that both ZFS file-id layouts fit inside the
	 * generic struct fid; zfs_fid() casts a fid pointer to these types.
	 */
	CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
	CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

	/ARGSUSED/
	static int
	zfs_fid(vnode_t vp, fid_t fidp, caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint32_t gen;
	uint64_t gen64;
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	&gen64, sizeof (uint64_t))) != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	gen = (uint32_t)gen64;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

	#ifdef illumos
	if (fidp->fid_len < size) {
	fidp->fid_len = size;
	ZFS_EXIT(zfsvfs);
	return (SET_ERROR(ENOSPC));
	}
	#else
	fidp->fid_len = size;
	#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
	zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
	gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
	zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
	uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
	zfid_long_t *zlfid;

	zlfid = (zfid_long_t *)fidp;

	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
	zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

	/* XXX - this should be the generation number for the objset */
	for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
	zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
	}

	/*
	 * Report a configurable pathname limit or capability for vp.
	 *
	 *	IN:	vp	- vnode being queried.
	 *		cmd	- _PC_* selector.
	 *		cr	- credentials of caller (not used here).
	 *		ct	- caller context (not used here).
	 *
	 *	OUT:	valp	- value for the selector.
	 *
	 *	RETURN:	0 on success, EOPNOTSUPP for selectors not handled here,
	 *		or (illumos only) an error from the xattr lookup.
	 */
	static int
	zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
	    caller_context_t *ct)
	{
		znode_t		*zp, *xzp;
		zfsvfs_t	*zfsvfs;
		int		error;

		switch (cmd) {
		case _PC_LINK_MAX:
			*valp = INT_MAX;
			return (0);

		case _PC_FILESIZEBITS:
			*valp = 64;
			return (0);
	#ifdef illumos
		case _PC_XATTR_EXISTS:
			zp = VTOZ(vp);
			zfsvfs = zp->z_zfsvfs;
			ZFS_ENTER(zfsvfs);
			ZFS_VERIFY_ZP(zp);
			*valp = 0;
			error = zfs_dirent_lookup(zp, "", &xzp,
			    ZXATTR | ZEXISTS | ZSHARED);
			if (error == 0) {
				if (!zfs_dirempty(xzp))
					*valp = 1;
				vrele(ZTOV(xzp));
			} else if (error == ENOENT) {
				/*
				 * If there aren't extended attributes, it's the
				 * same as having zero of them.
				 */
				error = 0;
			}
			ZFS_EXIT(zfsvfs);
			return (error);

		case _PC_SATTR_ENABLED:
		case _PC_SATTR_EXISTS:
			*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
			    (vp->v_type == VREG || vp->v_type == VDIR);
			return (0);

		case _PC_ACCESS_FILTERING:
			*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
			    vp->v_type == VDIR;
			return (0);

		case _PC_ACL_ENABLED:
			*valp = _ACL_ACE_ENABLED;
			return (0);
	#endif /* illumos */
		case _PC_MIN_HOLE_SIZE:
			*valp = (int)SPA_MINBLOCKSIZE;
			return (0);
	#ifdef illumos
		case _PC_TIMESTAMP_RESOLUTION:
			/* nanosecond timestamp resolution */
			*valp = 1L;
			return (0);
	#endif
		case _PC_ACL_EXTENDED:
			*valp = 0;
			return (0);

		case _PC_ACL_NFS4:
			*valp = 1;
			return (0);

		case _PC_ACL_PATH_MAX:
			*valp = ACL_MAX_ENTRIES;
			return (0);

		default:
			return (EOPNOTSUPP);
		}
	}

	/ARGSUSED/
	static int
	zfs_getsecattr(vnode_t vp, vsecattr_t vsecp, int flag, cred_t *cr,
	caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
	}

	/ARGSUSED/
	int
	zfs_setsecattr(vnode_t vp, vsecattr_t vsecp, int flag, cred_t *cr,
	caller_context_t *ct)
	{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
	zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
	}

	/*
	 * Page in the requested pages of vp, optionally extending the read
	 * backwards and forwards within the enclosing block.
	 *
	 *	IN:	vp	- vnode backing the pages.
	 *		ma	- array of the pages to fill.
	 *		count	- number of entries in ma.
	 *
	 *	IN/OUT:	rbehind	- if non-NULL, upper bound on pages to read
	 *			  before ma[0]; set to the count passed back by
	 *			  dmu_read_pages().
	 *		rahead	- if non-NULL, upper bound on pages to read
	 *			  after the last page; set likewise.
	 *
	 *	RETURN:	zfs_vm_pagerret_ok, zfs_vm_pagerret_bad when the last
	 *		requested page starts at or beyond the object size, or
	 *		zfs_vm_pagerret_error when dmu_read_pages() fails.
	 */
	static int
	zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
	    int *rahead)
	{
		znode_t *zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
		objset_t *os = zp->z_zfsvfs->z_os;
		rl_t *rl;
		vm_object_t object;
		off_t start, end, obj_size;
		uint_t blksz;
		int pgsin_b, pgsin_a;
		int error;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		start = IDX_TO_OFF(ma[0]->pindex);
		end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

		/*
		 * Lock a range covering all required and optional pages.
		 * Note that we need to handle the case of the block size growing.
		 */
		for (;;) {
			blksz = zp->z_blksz;
			rl = zfs_range_lock(zp, rounddown(start, blksz),
			    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
			/* Retry if the block size changed while we took the lock. */
			if (blksz == zp->z_blksz)
				break;
			zfs_range_unlock(rl);
		}

		object = ma[0]->object;
		zfs_vmobject_wlock(object);
		obj_size = object->un_pager.vnp.vnp_size;
		zfs_vmobject_wunlock(object);
		if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
			zfs_range_unlock(rl);
			ZFS_EXIT(zfsvfs);
			return (zfs_vm_pagerret_bad);
		}

		/* Read-behind is limited to the start of the enclosing block. */
		pgsin_b = 0;
		if (rbehind != NULL) {
			pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
			pgsin_b = MIN(*rbehind, pgsin_b);
		}

		/* Read-ahead is limited to the end of the block and of the object. */
		pgsin_a = 0;
		if (rahead != NULL) {
			pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
			if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
				pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
			pgsin_a = MIN(*rahead, pgsin_a);
		}

		/*
		 * NB: we need to pass the exact byte size of the data that we expect
		 * to read after accounting for the file size.  This is required because
		 * ZFS will panic if we request DMU to read beyond the end of the last
		 * allocated block.
		 */
		error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
		    MIN(end, obj_size) - (end - PAGE_SIZE));

		zfs_range_unlock(rl);
		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
		ZFS_EXIT(zfsvfs);

		if (error != 0)
			return (zfs_vm_pagerret_error);

		PCPU_INC(cnt.v_vnodein);
		PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
		if (rbehind != NULL)
			*rbehind = pgsin_b;
		if (rahead != NULL)
			*rahead = pgsin_a;
		return (zfs_vm_pagerret_ok);
	}

	static int
	zfs_freebsd_getpages(ap)
	struct vop_getpages_args /* {
	struct vnode *a_vp;
	vm_page_t *a_m;
	int a_count;
	int *a_rbehind;
	int *a_rahead;
	} / ap;
	{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
	ap->a_rahead));
	}

	/*
	 * Page-out handler: write the pages in `ma' (covering `len' bytes, a
	 * multiple of PAGE_SIZE) back to the DMU inside a single transaction.
	 *
	 * rtvals[] receives one zfs_vm_pagerret_* code per page; the function
	 * returns rtvals[0].  Pages that lie entirely beyond the object's size
	 * are marked zfs_vm_pagerret_bad and are not written.
	 */
	static int
	zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
	    int *rtvals)
	{
		znode_t *zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
		rl_t *rl;
		dmu_tx_t *tx;
		struct sf_buf *sf;
		vm_object_t object;
		vm_page_t m;
		caddr_t va;
		size_t tocopy;
		size_t lo_len;
		vm_ooffset_t lo_off;
		vm_ooffset_t off;
		uint_t blksz;
		int ncount;
		int pcount;
		int err;
		int i;

		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		object = vp->v_object;
		pcount = btoc(len);
		ncount = pcount;

		KASSERT(ma[0]->object == object, ("mismatching object"));
		KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

		/* Pessimistically mark every page as failed until proven written. */
		for (i = 0; i < pcount; i++)
			rtvals[i] = zfs_vm_pagerret_error;

		off = IDX_TO_OFF(ma[0]->pindex);
		blksz = zp->z_blksz;
		lo_off = rounddown(off, blksz);
		lo_len = roundup(len + (off - lo_off), blksz);
		rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

		zfs_vmobject_wlock(object);
		if (len + off > object->un_pager.vnp.vnp_size) {
			/* Trim the write so that it ends at the object's size. */
			if (object->un_pager.vnp.vnp_size > off) {
				int pgoff;

				len = object->un_pager.vnp.vnp_size - off;
				ncount = btoc(len);
				if ((pgoff = (int)len & PAGE_MASK) != 0) {
					/*
					 * If the object is locked and the following
					 * conditions hold, then the page's dirty
					 * field cannot be concurrently changed by a
					 * pmap operation.
					 */
					m = ma[ncount - 1];
					vm_page_assert_sbusied(m);
					KASSERT(!pmap_page_is_write_mapped(m),
					    ("zfs_putpages: page %p is not read-only", m));
					vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
					    pgoff);
				}
			} else {
				len = 0;
				ncount = 0;
			}
			if (ncount < pcount) {
				for (i = ncount; i < pcount; i++) {
					rtvals[i] = zfs_vm_pagerret_bad;
				}
			}
		}
		zfs_vmobject_wunlock(object);

		if (ncount == 0)
			goto out;

		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			goto out;
		}

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_write(tx, zp->z_id, off, len);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);
			goto out;
		}

		if (zp->z_blksz < PAGE_SIZE) {
			/*
			 * Copy page by page using loop-local cursors.  `off' and
			 * `len' must keep describing the whole write because
			 * zfs_log_write() below consumes them; iterating on them
			 * directly would hand it the end offset and a zero length.
			 */
			vm_ooffset_t woff = off;
			size_t wlen = len;

			for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
				tocopy = wlen > PAGE_SIZE ? PAGE_SIZE : wlen;
				va = zfs_map_page(ma[i], &sf);
				dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va,
				    tx);
				zfs_unmap_page(sf);
			}
		} else {
			err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
		}

		if (err == 0) {
			uint64_t mtime[2], ctime[2];
			sa_bulk_attr_t bulk[3];
			int count = 0;

			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
			    &mtime, 16);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
			    &ctime, 16);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
			    &zp->z_pflags, 8);
			zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
			    B_TRUE);
			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
			ASSERT0(err);
			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

			zfs_vmobject_wlock(object);
			for (i = 0; i < ncount; i++) {
				rtvals[i] = zfs_vm_pagerret_ok;
				vm_page_undirty(ma[i]);
			}
			zfs_vmobject_wunlock(object);
			PCPU_INC(cnt.v_vnodeout);
			PCPU_ADD(cnt.v_vnodepgsout, ncount);
		}
		dmu_tx_commit(tx);

	out:
		zfs_range_unlock(rl);
		if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
		    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
		return (rtvals[0]);
	}

	int
	zfs_freebsd_putpages(ap)
	struct vop_putpages_args /* {
	struct vnode *a_vp;
	vm_page_t *a_m;
	int a_count;
	int a_sync;
	int *a_rtvals;
	} / ap;
	{

	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
	ap->a_rtvals));
	}

	static int
	zfs_freebsd_bmap(ap)
	struct vop_bmap_args /* {
	struct vnode *a_vp;
	daddr_t a_bn;
	struct bufobj **a_bop;
	daddr_t *a_bnp;
	int *a_runp;
	int *a_runb;
	} / ap;
	{

	if (ap->a_bop != NULL)
	*ap->a_bop = &ap->a_vp->v_bufobj;
	if (ap->a_bnp != NULL)
	*ap->a_bnp = ap->a_bn;
	if (ap->a_runp != NULL)
	*ap->a_runp = 0;
	if (ap->a_runb != NULL)
	*ap->a_runb = 0;

	return (0);
	}

	static int
	zfs_freebsd_open(ap)
	struct vop_open_args /* {
	struct vnode *a_vp;
	int a_mode;
	struct ucred *a_cred;
	struct thread *a_td;
	} / ap;
	{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
	vnode_create_vobject(vp, zp->z_size, ap->a_td);
	return (error);
	}

	static int
	zfs_freebsd_close(ap)
	struct vop_close_args /* {
	struct vnode *a_vp;
	int a_fflag;
	struct ucred *a_cred;
	struct thread *a_td;
	} / ap;
	{

	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
	}

	static int
	zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
	struct vnode *a_vp;
	u_long a_command;
	caddr_t a_data;
	int a_fflag;
	struct ucred *cred;
	struct thread *td;
	} / ap;
	{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	ap->a_fflag, ap->a_cred, NULL, NULL));
	}

	/*
	 * Translate FreeBSD IO_* request flags into the F* file flags expected by
	 * zfs_read() / zfs_write().  IO_SYNC maps to all three of FSYNC, FDSYNC
	 * and FRSYNC; IO_* bits not listed here are dropped.
	 */
	static int
	ioflags(int ioflags)
	{
		int flags = 0;

		if (ioflags & IO_APPEND)
			flags |= FAPPEND;
		if (ioflags & IO_NDELAY)
			flags |= FNONBLOCK;
		if (ioflags & IO_SYNC)
			flags |= (FSYNC | FDSYNC | FRSYNC);

		return (flags);
	}

	static int
	zfs_freebsd_read(ap)
	struct vop_read_args /* {
	struct vnode *a_vp;
	struct uio *a_uio;
	int a_ioflag;
	struct ucred *a_cred;
	} / ap;
	{

	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	ap->a_cred, NULL));
	}

	static int
	zfs_freebsd_write(ap)
	struct vop_write_args /* {
	struct vnode *a_vp;
	struct uio *a_uio;
	int a_ioflag;
	struct ucred *a_cred;
	} / ap;
	{

	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	ap->a_cred, NULL));
	}

	static int
	zfs_freebsd_access(ap)
	struct vop_access_args /* {
	struct vnode *a_vp;
	accmode_t a_accmode;
	struct ucred *a_cred;
	struct thread *a_td;
	} / ap;
	{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	accmode_t accmode;
	int error = 0;

	/*
	* ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
	*/
	accmode = ap->a_accmode & (VREAD\|VWRITE\|VEXEC\|VAPPEND);
	if (accmode != 0)
	error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);

	/*
	* VADMIN has to be handled by vaccess().
	*/
	if (error == 0) {
	accmode = ap->a_accmode & ~(VREAD\|VWRITE\|VEXEC\|VAPPEND);
	if (accmode != 0) {
	error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
	zp->z_gid, accmode, ap->a_cred, NULL);
	}
	}

	/*
	* For VEXEC, ensure that at least one execute bit is set for
	* non-directories.
	*/
	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
	(zp->z_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH)) == 0) {
	error = EACCES;
	}

	return (error);
	}

	static int
	zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	} / ap;
	{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	cnp->cn_cred, cnp->cn_thread, 0));
	}

	static int
	zfs_cache_lookup(ap)
	struct vop_lookup_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	} / ap;
	{
	zfsvfs_t *zfsvfs;

	zfsvfs = ap->a_dvp->v_mount->mnt_data;
	if (zfsvfs->z_use_namecache)
	return (vfs_cache_lookup(ap));
	else
	return (zfs_freebsd_lookup(ap));
	}

	static int
	zfs_freebsd_create(ap)
	struct vop_create_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
	} / ap;
	{
	zfsvfs_t *zfsvfs;
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int error, mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;
	zfsvfs = ap->a_dvp->v_mount->mnt_data;

	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
	if (zfsvfs->z_use_namecache &&
	error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
	cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
	return (error);
	}

	static int
	zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
	struct vnode *a_dvp;
	struct vnode *a_vp;
	struct componentname *a_cnp;
	} / ap;
	{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
	ap->a_cnp->cn_cred));
	}

	static int
	zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
	} / ap;
	{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	ap->a_cnp->cn_cred));
	}

	static int
	zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
	struct vnode *a_dvp;
	struct vnode *a_vp;
	struct componentname *a_cnp;
	} / ap;
	{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
	}

	static int
	zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
	struct vnode *a_vp;
	struct uio *a_uio;
	struct ucred *a_cred;
	int *a_eofflag;
	int *a_ncookies;
	u_long **a_cookies;
	} / ap;
	{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	ap->a_ncookies, ap->a_cookies));
	}

	static int
	zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
	struct vnode *a_vp;
	int a_waitfor;
	struct thread *a_td;
	} / ap;
	{

	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
	}

	static int
	zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
	} / ap;
	{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask \|= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
	return (error);

	/* Convert ZFS xattr into chflags. */
	#define FLAG_CHECK(fflag, xflag, xfield) do { \
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
	fflags \|= (fflag); \
	} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	xvap.xva_xoptattrs.xoa_sparse);

	#undef FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
	}

	static int
	zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
	} / ap;
	{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	u_long fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	zflags = VTOZ(vp)->z_pflags;

	if (vap->va_flags != VNOVAL) {
	zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
	int error;

	if (zfsvfs->z_use_fuids == B_FALSE)
	return (EOPNOTSUPP);

	fflags = vap->va_flags;
	/*
	* XXX KDM
	* We need to figure out whether it makes sense to allow
	* UF_REPARSE through, since we don't really have other
	* facilities to handle reparse points and zfs_setattr()
	* doesn't currently allow setting that attribute anyway.
	*/
	if ((fflags & ~(SF_IMMUTABLE\|SF_APPEND\|SF_NOUNLINK\|UF_ARCHIVE\|
	UF_NODUMP\|UF_SYSTEM\|UF_HIDDEN\|UF_READONLY\|UF_REPARSE\|
	UF_OFFLINE\|UF_SPARSE)) != 0)
	return (EOPNOTSUPP);
	/*
	* Unprivileged processes are not permitted to unset system
	* flags, or modify flags if any system flags are set.
	* Privileged non-jail processes may not modify system flags
	* if securelevel > 0 and any existing system flags are set.
	* Privileged jail processes behave like privileged non-jail
	* processes if the security.jail.chflags_allowed sysctl is
	* is non-zero; otherwise, they behave like unprivileged
	* processes.
	*/
	if (secpolicy_fs_owner(vp->v_mount, cred) == 0 \|\|
	priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
	if (zflags &
	(ZFS_IMMUTABLE \| ZFS_APPENDONLY \| ZFS_NOUNLINK)) {
	error = securelevel_gt(cred, 0);
	if (error != 0)
	return (error);
	}
	} else {
	/*
	* Callers may only modify the file flags on objects they
	* have VADMIN rights for.
	*/
	if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
	return (error);
	if (zflags &
	(ZFS_IMMUTABLE \| ZFS_APPENDONLY \| ZFS_NOUNLINK)) {
	return (EPERM);
	}
	if (fflags &
	(SF_IMMUTABLE \| SF_APPEND \| SF_NOUNLINK)) {
	return (EPERM);
	}
	}

	#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
	if (((fflags & (fflag)) && !(zflags & (zflag))) \|\| \
	((zflags & (zflag)) && !(fflags & (fflag)))) { \
	XVA_SET_REQ(&xvap, (xflag)); \
	(xfield) = ((fflags & (fflag)) != 0); \
	} \
	} while (0)
	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
	xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
	xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
	xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
	xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
	xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
	xvap.xva_xoptattrs.xoa_system);
	FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
	xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
	xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
	xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
	xvap.xva_xoptattrs.xoa_sparse);
	#undef FLAG_CHANGE
	}
	if (vap->va_birthtime.tv_sec != VNOVAL) {
	xvap.xva_vattr.va_mask \|= AT_XVATTR;
	XVA_SET_REQ(&xvap, XAT_CREATETIME);
	}
	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
	}

	static int
	zfs_freebsd_rename(ap)
	struct vop_rename_args /* {
	struct vnode *a_fdvp;
	struct vnode *a_fvp;
	struct componentname *a_fcnp;
	struct vnode *a_tdvp;
	struct vnode *a_tvp;
	struct componentname *a_tcnp;
	} / ap;
	{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME\|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME\|SAVESTART));

	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
	ap->a_tcnp, ap->a_fcnp->cn_cred);

	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
	vrele(tvp);

	return (error);
	}

	static int
	zfs_freebsd_symlink(ap)
	struct vop_symlink_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
	char *a_target;
	} / ap;
	{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	ap->a_target, cnp->cn_cred, cnp->cn_thread));
	}

	static int
	zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
	struct vnode *a_vp;
	struct uio *a_uio;
	struct ucred *a_cred;
	} / ap;
	{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
	}

	static int
	zfs_freebsd_link(ap)
	struct vop_link_args /* {
	struct vnode *a_tdvp;
	struct vnode *a_vp;
	struct componentname *a_cnp;
	} / ap;
	{
	struct componentname *cnp = ap->a_cnp;
	vnode_t *vp = ap->a_vp;
	vnode_t *tdvp = ap->a_tdvp;

	if (tdvp->v_mount != vp->v_mount)
	return (EXDEV);

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
	}

	static int
	zfs_freebsd_inactive(ap)
	struct vop_inactive_args /* {
	struct vnode *a_vp;
	struct thread *a_td;
	} / ap;
	{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
	return (0);
	}

	static int
	zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
	struct vnode *a_vp;
	struct thread *a_td;
	} / ap;
	{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	* z_teardown_inactive_lock protects from a race with
	* zfs_znode_dmu_fini in zfsvfs_teardown during
	* force unmount.
	*/
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
	zfs_znode_free(zp);
	else
	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	vp->v_data = NULL;
	return (0);
	}

	static int
	zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
	struct vnode *a_vp;
	struct fid *a_fid;
	} / ap;
	{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
	}

	static int
	zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
	struct vnode *a_vp;
	int a_name;
	register_t *a_retval;
	} / ap;
	{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
	if (error == 0) {
	*ap->a_retval = val;
	return (error);
	}
	if (error != EOPNOTSUPP)
	return (error);

	switch (ap->a_name) {
	case _PC_NAME_MAX:
	*ap->a_retval = NAME_MAX;
	return (0);
	case _PC_PIPE_BUF:
	if (ap->a_vp->v_type == VDIR \|\| ap->a_vp->v_type == VFIFO) {
	*ap->a_retval = PIPE_BUF;
	return (0);
	}
	return (EINVAL);
	default:
	return (vop_stdpathconf(ap));
	}
	}

	/*
	 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
	 * extended attribute name:
	 *
	 *	NAMESPACE	PREFIX
	 *	system		freebsd:system:
	 *	user		(none, can be used to access ZFS fsattr(5) attributes
	 *			created on Solaris)
	 *
	 * Builds the on-disk attribute file name for `name' in `attrnamespace'
	 * into `attrname' (a buffer of `size' bytes, zeroed first).
	 *
	 * Returns 0 on success, EINVAL for a name containing '/' or starting
	 * with "freebsd:" or for an unsupported namespace, and ENAMETOOLONG when
	 * the result does not fit in the buffer.
	 */
	static int
	zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
	    size_t size)
	{
		const char *namespace, *prefix, *suffix;

		/* We don't allow '/' character in attribute name. */
		if (strchr(name, '/') != NULL)
			return (EINVAL);
		/* We don't allow attribute names that start with "freebsd:" string. */
		if (strncmp(name, "freebsd:", 8) == 0)
			return (EINVAL);

		bzero(attrname, size);

		switch (attrnamespace) {
		case EXTATTR_NAMESPACE_USER:
	#if 0
			prefix = "freebsd:";
			namespace = EXTATTR_NAMESPACE_USER_STRING;
			suffix = ":";
	#else
			/*
			 * This is the default namespace by which we can access all
			 * attributes created on Solaris.
			 */
			prefix = namespace = suffix = "";
	#endif
			break;
		case EXTATTR_NAMESPACE_SYSTEM:
			prefix = "freebsd:";
			namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
			suffix = ":";
			break;
		case EXTATTR_NAMESPACE_EMPTY:
		default:
			return (EINVAL);
		}
		if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
		    name) >= size) {
			return (ENAMETOOLONG);
		}
		return (0);
	}

	/*
	 * Vnode operation to retrieve a named extended attribute.
	 *
	 * The attribute is stored as a file inside the vnode's extended
	 * attribute directory.  When a_size is non-NULL only the attribute's
	 * size is reported; otherwise, when a_uio is non-NULL, its contents are
	 * read into the uio.  A missing attribute file is reported as ENOATTR.
	 */
	static int
	zfs_getextattr(struct vop_getextattr_args *ap)
	/*
	vop_getextattr {
		IN struct vnode *a_vp;
		IN int a_attrnamespace;
		IN const char *a_name;
		INOUT struct uio *a_uio;
		OUT size_t *a_size;
		IN struct ucred *a_cred;
		IN struct thread *a_td;
	};
	*/
	{
		zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
		struct thread *td = ap->a_td;
		struct nameidata nd;
		char attrname[255];
		struct vattr va;
		vnode_t *xvp = NULL, *vp;
		int error, flags;

		error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
		    ap->a_cred, ap->a_td, VREAD);
		if (error != 0)
			return (error);

		error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
		    sizeof(attrname));
		if (error != 0)
			return (error);

		ZFS_ENTER(zfsvfs);

		/* Locate the extended attribute directory of the vnode. */
		error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
		    LOOKUP_XATTR);
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/* Open the attribute file relative to that directory. */
		flags = FREAD;
		NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
		    xvp, td);
		error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
		vp = nd.ni_vp;
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			if (error == ENOENT)
				error = ENOATTR;
			return (error);
		}

		if (ap->a_size != NULL) {
			error = VOP_GETATTR(vp, &va, ap->a_cred);
			if (error == 0)
				*ap->a_size = (size_t)va.va_size;
		} else if (ap->a_uio != NULL)
			error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

		VOP_UNLOCK(vp, 0);
		vn_close(vp, flags, ap->a_cred, td);
		ZFS_EXIT(zfsvfs);

		return (error);
	}

	/*
	 * Vnode operation to remove a named attribute.
	 *
	 * Looks the attribute file up inside the vnode's extended attribute
	 * directory and removes it with VOP_REMOVE().  A missing attribute file
	 * is reported as ENOATTR.
	 */
	int
	zfs_deleteextattr(struct vop_deleteextattr_args *ap)
	/*
	vop_deleteextattr {
		IN struct vnode *a_vp;
		IN int a_attrnamespace;
		IN const char *a_name;
		IN struct ucred *a_cred;
		IN struct thread *a_td;
	};
	*/
	{
		zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
		struct thread *td = ap->a_td;
		struct nameidata nd;
		char attrname[255];
		vnode_t *xvp = NULL, *vp;
		int error;

		error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
		    ap->a_cred, ap->a_td, VWRITE);
		if (error != 0)
			return (error);

		error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
		    sizeof(attrname));
		if (error != 0)
			return (error);

		ZFS_ENTER(zfsvfs);

		/* Locate the extended attribute directory of the vnode. */
		error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
		    LOOKUP_XATTR);
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
		    UIO_SYSSPACE, attrname, xvp, td);
		error = namei(&nd);
		vp = nd.ni_vp;
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			NDFREE(&nd, NDF_ONLY_PNBUF);
			if (error == ENOENT)
				error = ENOATTR;
			return (error);
		}

		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
		NDFREE(&nd, NDF_ONLY_PNBUF);

		/* Release parent and leaf; they may be the same vnode. */
		vput(nd.ni_dvp);
		if (vp == nd.ni_dvp)
			vrele(vp);
		else
			vput(vp);
		ZFS_EXIT(zfsvfs);

		return (error);
	}

	/*
	* Vnode operation to set a named attribute.
	*/
	static int
	zfs_setextattr(struct vop_setextattr_args *ap)
	/*
	vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
	};
	*/
	{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t xvp = NULL, vp;
	int error, flags;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	ap->a_cred, ap->a_td, VWRITE);
	if (error != 0)
	return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	sizeof(attrname));
	if (error != 0)
	return (error);

	ZFS_ENTER(zfsvfs);

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	LOOKUP_XATTR \| CREATE_XATTR_DIR);
	if (error != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	flags = FFLAGS(O_WRONLY \| O_CREAT);
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
	xvp, td);
	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
	ZFS_EXIT(zfsvfs);
	return (error);
	}

	VATTR_NULL(&va);
	va.va_size = 0;
	error = VOP_SETATTR(vp, &va, ap->a_cred);
	if (error == 0)
	VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
	}

	/*
	 * Vnode operation to retrieve extended attributes on a vnode.
	 *
	 * Reads the vnode's extended attribute directory one dirent buffer at a
	 * time and selects the entries belonging to the requested namespace.
	 * When a_size is non-NULL only the total size of the list is accumulated;
	 * otherwise, when a_uio is non-NULL, each name is emitted as a one-byte
	 * length followed by the name with the namespace prefix stripped.
	 * A missing attribute directory yields an empty list and 0.
	 */
	static int
	zfs_listextattr(struct vop_listextattr_args *ap)
	/*
	vop_listextattr {
		IN struct vnode *a_vp;
		IN int a_attrnamespace;
		INOUT struct uio *a_uio;
		OUT size_t *a_size;
		IN struct ucred *a_cred;
		IN struct thread *a_td;
	};
	*/
	{
		zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
		struct thread *td = ap->a_td;
		struct nameidata nd;
		char attrprefix[16];
		u_char dirbuf[sizeof(struct dirent)];
		struct dirent *dp;
		struct iovec aiov;
		struct uio auio, *uio = ap->a_uio;
		size_t *sizep = ap->a_size;
		size_t plen;
		vnode_t *xvp = NULL, *vp;
		int done, error, eof, pos;

		error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
		    ap->a_cred, ap->a_td, VREAD);
		if (error != 0)
			return (error);

		/* An empty name yields just the namespace prefix. */
		error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
		    sizeof(attrprefix));
		if (error != 0)
			return (error);
		plen = strlen(attrprefix);

		ZFS_ENTER(zfsvfs);

		if (sizep != NULL)
			*sizep = 0;

		error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
		    LOOKUP_XATTR);
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			/*
			 * ENOATTR means that the EA directory does not yet exist,
			 * i.e. there are no extended attributes there.
			 */
			if (error == ENOATTR)
				error = 0;
			return (error);
		}

		NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
		    UIO_SYSSPACE, ".", xvp, td);
		error = namei(&nd);
		vp = nd.ni_vp;
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_td = td;
		auio.uio_rw = UIO_READ;
		auio.uio_offset = 0;

		do {
			u_char nlen;

			aiov.iov_base = (void *)dirbuf;
			aiov.iov_len = sizeof(dirbuf);
			auio.uio_resid = sizeof(dirbuf);
			error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
			done = sizeof(dirbuf) - auio.uio_resid;
			if (error != 0)
				break;
			for (pos = 0; pos < done;) {
				dp = (struct dirent *)(dirbuf + pos);
				pos += dp->d_reclen;
				/*
				 * XXX: Temporarily we also accept DT_UNKNOWN, as this
				 * is what we get when attribute was created on Solaris.
				 */
				if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
					continue;
				/*
				 * With an empty prefix, skip names that belong to a
				 * "freebsd:" namespace; otherwise keep only names
				 * that start with the requested prefix.
				 */
				if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
					continue;
				else if (strncmp(dp->d_name, attrprefix, plen) != 0)
					continue;
				nlen = dp->d_namlen - plen;
				if (sizep != NULL)
					*sizep += 1 + nlen;
				else if (uio != NULL) {
					/*
					 * Format of extattr name entry is one byte for
					 * length and the rest for name.
					 */
					error = uiomove(&nlen, 1, uio->uio_rw, uio);
					if (error == 0) {
						error = uiomove(dp->d_name + plen, nlen,
						    uio->uio_rw, uio);
					}
					if (error != 0)
						break;
				}
			}
		} while (!eof && error == 0);

		vput(vp);
		ZFS_EXIT(zfsvfs);

		return (error);
	}

	int
	zfs_freebsd_getacl(ap)
	struct vop_getacl_args /* {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
	} / ap;
	{
	int error;
	vsecattr_t vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
	return (EINVAL);

	vsecattr.vsa_mask = VSA_ACE \| VSA_ACECNT;
	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
	return (error);

	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
	if (vsecattr.vsa_aclentp != NULL)
	kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
	}

	int
	zfs_freebsd_setacl(ap)
	struct vop_setacl_args /* {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
	} / ap;
	{
	int error;
	vsecattr_t vsecattr;
	int aclbsize; /* size of acl list in bytes */
	aclent_t *aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
	return (EINVAL);

	if (ap->a_aclp == NULL)
	return (EINVAL);

	if (ap->a_aclp->acl_cnt < 1 \|\| ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
	return (EINVAL);

	/*
	* With NFSv4 ACLs, chmod(2) may need to add additional entries,
	* splitting every entry into two and appending "canonical six"
	* entries at the end. Don't allow for setting an ACL that would
	* cause chmod(2) to run out of ACL entries.
	*/
	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
	return (ENOSPC);

	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
	if (error != 0)
	return (error);

	vsecattr.vsa_mask = VSA_ACE;
	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
	aaclp = vsecattr.vsa_aclentp;
	vsecattr.vsa_aclentsz = aclbsize;

	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
	kmem_free(aaclp, aclbsize);

	return (error);
	}

	int
	zfs_freebsd_aclcheck(ap)
	struct vop_aclcheck_args /* {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
	} / ap;
	{

	return (EOPNOTSUPP);
	}

	static int
	zfs_vptocnp(struct vop_vptocnp_args *ap)
	{
	vnode_t *covered_vp;
	vnode_t *vp = ap->a_vp;;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *zp = VTOZ(vp);
	int ltype;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	* If we are a snapshot mounted under .zfs, run the operation
	* on the covered vnode.
	*/
	if (zp->z_id != zfsvfs->z_root \|\| zfsvfs->z_parent == zfsvfs) {
	char name[MAXNAMLEN + 1];
	znode_t *dzp;
	size_t len;

	error = zfs_znode_parent_and_name(zp, &dzp, name);
	if (error == 0) {
	len = strlen(name);
	if (*ap->a_buflen < len)
	error = SET_ERROR(ENOMEM);
	}
	if (error == 0) {
	*ap->a_buflen -= len;
	bcopy(name, ap->a_buf + *ap->a_buflen, len);
	*ap->a_vpp = ZTOV(dzp);
	}
	ZFS_EXIT(zfsvfs);
	return (error);
	}
	ZFS_EXIT(zfsvfs);

	covered_vp = vp->v_mount->mnt_vnodecovered;
	vhold(covered_vp);
	ltype = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp, 0);
	error = vget(covered_vp, LK_SHARED \| LK_VNHELD, curthread);
	if (error == 0) {
	error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
	ap->a_buf, ap->a_buflen);
	vput(covered_vp);
	}
	vn_lock(vp, ltype \| LK_RETRY);
	if ((vp->v_iflag & VI_DOOMED) != 0)
	error = SET_ERROR(ENOENT);
	return (error);
	}

	#ifdef DIAGNOSTIC
	/*
	 * VOP_LOCK1 entry point, built only under DIAGNOSTIC: take the vnode
	 * lock with vop_stdlock() and, after a successful blocking acquisition
	 * on a live vnode that is not an extended-attribute znode, verify that
	 * the caller does not hold the filesystem's z_teardown_lock.
	 */
	static int
	zfs_lock(ap)
		struct vop_lock1_args /* {
			struct vnode *a_vp;
			int a_flags;
			char *file;
			int line;
		} */ *ap;
	{
		vnode_t *vp;
		znode_t *zp;
		int err;

		err = vop_stdlock(ap);
		if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
			vp = ap->a_vp;
			zp = vp->v_data;
			if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
			    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
				VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
		}
		return (err);
	}
	#endif

	/* Tentative definitions of the vnode operation vectors defined below. */
	struct vop_vector zfs_vnodeops;
	struct vop_vector zfs_fifoops;
	struct vop_vector zfs_shareops;

	/*
	 * Main ZFS vnode operations vector.  Operations not listed here fall
	 * back to default_vnodeops via vop_default.
	 */
	struct vop_vector zfs_vnodeops = {
	.vop_default = &default_vnodeops,
	.vop_inactive = zfs_freebsd_inactive,
	.vop_reclaim = zfs_freebsd_reclaim,
	.vop_access = zfs_freebsd_access,
	/* vop_lookup chooses between the name cache and a direct lookup. */
	.vop_lookup = zfs_cache_lookup,
	.vop_cachedlookup = zfs_freebsd_lookup,
	.vop_getattr = zfs_freebsd_getattr,
	.vop_setattr = zfs_freebsd_setattr,
	.vop_create = zfs_freebsd_create,
	/* mknod shares the create implementation. */
	.vop_mknod = zfs_freebsd_create,
	.vop_mkdir = zfs_freebsd_mkdir,
	.vop_readdir = zfs_freebsd_readdir,
	.vop_fsync = zfs_freebsd_fsync,
	.vop_open = zfs_freebsd_open,
	.vop_close = zfs_freebsd_close,
	.vop_rmdir = zfs_freebsd_rmdir,
	.vop_ioctl = zfs_freebsd_ioctl,
	.vop_link = zfs_freebsd_link,
	.vop_symlink = zfs_freebsd_symlink,
	.vop_readlink = zfs_freebsd_readlink,
	.vop_read = zfs_freebsd_read,
	.vop_write = zfs_freebsd_write,
	.vop_remove = zfs_freebsd_remove,
	.vop_rename = zfs_freebsd_rename,
	.vop_pathconf = zfs_freebsd_pathconf,
	.vop_bmap = zfs_freebsd_bmap,
	.vop_fid = zfs_freebsd_fid,
	.vop_getextattr = zfs_getextattr,
	.vop_deleteextattr = zfs_deleteextattr,
	.vop_setextattr = zfs_setextattr,
	.vop_listextattr = zfs_listextattr,
	.vop_getacl = zfs_freebsd_getacl,
	.vop_setacl = zfs_freebsd_setacl,
	.vop_aclcheck = zfs_freebsd_aclcheck,
	.vop_getpages = zfs_freebsd_getpages,
	.vop_putpages = zfs_freebsd_putpages,
	.vop_vptocnp = zfs_vptocnp,
	/* The lock-checking wrapper is only installed in DIAGNOSTIC builds. */
	#ifdef DIAGNOSTIC
	.vop_lock1 = zfs_lock,
	#endif
	};

	/*
	 * Vnode operations vector for FIFOs.  Anything not listed is delegated
	 * to fifo_specops through .vop_default.
	 */
	struct vop_vector zfs_fifoops = {
		.vop_default =		&fifo_specops,

		/* read and write are wired to VOP_PANIC in this vector. */
		.vop_read =		VOP_PANIC,
		.vop_write =		VOP_PANIC,

		/* Vnode life cycle and flushing. */
		.vop_inactive =		zfs_freebsd_inactive,
		.vop_reclaim =		zfs_freebsd_reclaim,
		.vop_fsync =		zfs_freebsd_fsync,

		/* Attributes and identity. */
		.vop_access =		zfs_freebsd_access,
		.vop_getattr =		zfs_freebsd_getattr,
		.vop_setattr =		zfs_freebsd_setattr,
		.vop_pathconf =		zfs_freebsd_pathconf,
		.vop_fid =		zfs_freebsd_fid,

		/* ACLs. */
		.vop_getacl =		zfs_freebsd_getacl,
		.vop_setacl =		zfs_freebsd_setacl,
		.vop_aclcheck =		zfs_freebsd_aclcheck,
	};

	/*
	 * Vnode operations vector for the special hidden "share" files.  Only
	 * access checking, life-cycle, fid and pathconf handlers are supplied;
	 * everything else goes to default_vnodeops via .vop_default.
	 */
	struct vop_vector zfs_shareops = {
		.vop_default =		&default_vnodeops,

		/* Vnode life cycle. */
		.vop_inactive =		zfs_freebsd_inactive,
		.vop_reclaim =		zfs_freebsd_reclaim,

		/* Queries. */
		.vop_access =		zfs_freebsd_access,
		.vop_pathconf =		zfs_freebsd_pathconf,
		.vop_fid =		zfs_freebsd_fid,
	};
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 332525)
	@@ -1,3249 +1,3249 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/dmu.h>
	#include <sys/zap.h>
	#include <sys/arc.h>
	#include <sys/stat.h>
	#include <sys/resource.h>
	#include <sys/zil.h>
	#include <sys/zil_impl.h>
	#include <sys/dsl_dataset.h>
	#include <sys/vdev_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/dsl_pool.h>
	#include <sys/abd.h>

	/*
	* The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
	* calls that change the file system. Each itx has enough information to
	* be able to replay them after a system crash, power loss, or
	* equivalent failure mode. These are stored in memory until either:
	*
	* 1. they are committed to the pool by the DMU transaction group
	* (txg), at which point they can be discarded; or
	* 2. they are committed to the on-disk ZIL for the dataset being
	* modified (e.g. due to an fsync, O_DSYNC, or other synchronous
	* requirement).
	*
	* In the event of a crash or power loss, the itxs contained by each
	* dataset's on-disk ZIL will be replayed when that dataset is first
	* instantiated (e.g. if the dataset is a normal filesystem, when it is
	* first mounted).
	*
	* As hinted at above, there is one ZIL per dataset (both the in-memory
	* representation, and the on-disk representation). The on-disk format
	* consists of 3 parts:
	*
	* - a single, per-dataset, ZIL header; which points to a chain of
	* - zero or more ZIL blocks; each of which contains
	* - zero or more ZIL records
	*
	* A ZIL record holds the information necessary to replay a single
	* system call transaction. A ZIL block can hold many ZIL records, and
	* the blocks are chained together, similarly to a singly linked list.
	*
	* Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
	* block in the chain, and the ZIL header points to the first block in
	* the chain.
	*
	* Note, there is not a fixed place in the pool to hold these ZIL
	* blocks; they are dynamically allocated and freed as needed from the
	* blocks available on the pool, though they can be preferentially
	* allocated from a dedicated "log" vdev.
	*/

	/*
	* This controls the amount of time that a ZIL block (lwb) will remain
	* "open" when it isn't "full", and it has a thread waiting for it to be
	* committed to stable storage. Please refer to the zil_commit_waiter()
	* function (and the comments within it) for more details.
	*/
	int zfs_commit_timeout_pct = 5;

	/*
	* Disable intent logging replay. This global ZIL switch affects all pools.
	* Exported as the vfs.zfs.zil_replay_disable sysctl.
	*/
	int zil_replay_disable = 0;
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
	&zil_replay_disable, 0, "Disable intent logging replay");

	/*
	* Tunable parameter for debugging or performance analysis. Setting
	* zfs_nocacheflush will cause corruption on power loss if a volatile
	* out-of-order write cache is enabled.
	*
	* NOTE(review): zfs_nocacheflush and zfs_trim_enabled are boolean_t but
	* are exported with SYSCTL_INT; this assumes boolean_t is int-sized --
	* confirm.
	*/
	boolean_t zfs_nocacheflush = B_FALSE;
	SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
	&zfs_nocacheflush, 0, "Disable cache flush");
	/*
	* TRIM switch, enabled by default; exported as "enabled" under the
	* _vfs_zfs_trim sysctl node.
	*/
	boolean_t zfs_trim_enabled = B_TRUE;
	SYSCTL_DECL(_vfs_zfs_trim);
	SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
	"Enable ZFS TRIM");

	/*
	* Limit SLOG write size per commit executed with synchronous priority.
	* Any writes above that will be executed with lower (asynchronous) priority
	* to limit potential SLOG device abuse by single active ZIL writer.
	* The default is 768 KiB.
	*/
	uint64_t zil_slog_bulk = 768 * 1024;
	SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
	&zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");

	/*
	* kmem caches for lwb_t objects (zil_lwb_cache) and, judging by the name,
	* for commit waiter objects (zil_zcw_cache).
	*/
	static kmem_cache_t *zil_lwb_cache;
	static kmem_cache_t *zil_zcw_cache;

	/*
	 * True when the lwb holds no log records, i.e. when the block's logical
	 * size minus the zil_chain_t header equals the space still unused in the
	 * lwb.  The argument is parenthesized so that any pointer expression
	 * (e.g. "p + 1") can be passed safely.
	 */
	#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&(lwb)->lwb_blk) - \
	    sizeof (zil_chain_t)) == ((lwb)->lwb_sz - (lwb)->lwb_nused))

	static int
	zil_bp_compare(const void x1, const void x2)
	{
	const dva_t dva1 = &((zil_bp_node_t )x1)->zn_dva;
	const dva_t dva2 = &((zil_bp_node_t )x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
	return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
	return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
	return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
	return (1);

	return (0);
	}

	/*
	 * Set up the zilog's block pointer AVL tree, keyed by zil_bp_compare()
	 * and linked through zil_bp_node_t.zn_node.
	 */
	static void
	zil_bp_tree_init(zilog_t *zilog)
	{
		avl_tree_t *tree = &zilog->zl_bp_tree;

		avl_create(tree, zil_bp_compare, sizeof (zil_bp_node_t),
		    offsetof(zil_bp_node_t, zn_node));
	}

	/*
	 * Tear down the zilog's block pointer AVL tree: free every node that is
	 * still in it, then destroy the tree itself.
	 */
	static void
	zil_bp_tree_fini(zilog_t *zilog)
	{
		avl_tree_t *tree = &zilog->zl_bp_tree;
		void *walk = NULL;
		zil_bp_node_t *node;

		for (node = avl_destroy_nodes(tree, &walk); node != NULL;
		    node = avl_destroy_nodes(tree, &walk))
			kmem_free(node, sizeof (zil_bp_node_t));

		avl_destroy(tree);
	}

	/*
	 * Record the identity DVA of bp in the zilog's block pointer tree.
	 *
	 * Returns 0 when the DVA was inserted, or when bp is an embedded block
	 * pointer (nothing to track), and EEXIST when the DVA is already present.
	 */
	int
	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
	{
		avl_tree_t *t = &zilog->zl_bp_tree;
		const dva_t *dva;
		zil_bp_node_t *zn;
		avl_index_t where;

		if (BP_IS_EMBEDDED(bp))
			return (0);

		dva = BP_IDENTITY(bp);

		/*
		 * The bare DVA is used as the search key while zil_bp_compare()
		 * casts its arguments to zil_bp_node_t.
		 * NOTE(review): this relies on zn_dva being the first member of
		 * zil_bp_node_t -- confirm against the structure definition.
		 */
		if (avl_find(t, dva, &where) != NULL)
			return (SET_ERROR(EEXIST));

		zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
		zn->zn_dva = *dva;
		avl_insert(t, zn, where);

		return (0);
	}

	/*
	 * Return the zilog's header as a plain (zil_header_t *); presumably this
	 * strips a const qualifier from zl_header so that the caller may modify
	 * the header.
	 */
	static zil_header_t *
	zil_header_in_syncing_context(zilog_t *zilog)
	{
		zil_header_t *hdr = (zil_header_t *)zilog->zl_header;

		return (hdr);
	}

	/*
	 * Initialize the checksum words of the first block pointer of a new log
	 * chain: two random GUID words, the objset id of the zilog's objset, and
	 * a starting sequence number of 1.
	 */
	static void
	zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
	{
		zio_cksum_t *zc = &bp->blk_cksum;

		zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
		zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
		zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
		zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
	}

	/*
	 * Read a log block and make sure it's valid.
	 *
	 * zilog - log the block belongs to
	 * bp    - block pointer of the log block to read
	 * nbp   - out: block pointer of the next block in the chain
	 * dst   - out: buffer that receives the log records of the block
	 * end   - out: set to one past the last record byte copied into dst
	 *
	 * Returns 0 on success, the arc_read() error if the read failed, or
	 * ECKSUM if the block does not link correctly to its successor or its
	 * recorded "used" size is inconsistent with the block size.  nbp, dst
	 * and end are only written on success.
	 */
	static int
	zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
	    char **end)
	{
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf = NULL;
		zbookmark_phys_t zb;
		int error;

		if (zilog->zl_header->zh_claim_txg == 0)
			zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

		if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
			zio_flags |= ZIO_FLAG_SPECULATIVE;

		SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

		error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

		if (error == 0) {
			zio_cksum_t cksum = bp->blk_cksum;
			uint64_t size = BP_GET_LSIZE(bp);

			/*
			 * Validate the checksummed log block.
			 *
			 * Sequence numbers should be... sequential.  The checksum
			 * verifier for the next block should be bp's checksum plus 1.
			 *
			 * Also check the log chain linkage and size used.
			 */
			cksum.zc_word[ZIL_ZC_SEQ]++;

			if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
				/* The chain header sits at the front of the block. */
				zil_chain_t *zilc = abuf->b_data;
				char *lr = (char *)(zilc + 1);

				/*
				 * zc_nused includes the header here.  Reject values
				 * smaller than the header (the subtraction below
				 * would wrap) or larger than the block, so that a
				 * malformed block cannot make bcopy() overrun dst.
				 */
				if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
				    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
				    zilc->zc_nused < sizeof (*zilc) ||
				    zilc->zc_nused > size) {
					error = SET_ERROR(ECKSUM);
				} else {
					uint64_t len = zilc->zc_nused -
					    sizeof (zil_chain_t);

					ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
					bcopy(lr, dst, len);
					*end = (char *)dst + len;
					*nbp = zilc->zc_next_blk;
				}
			} else {
				/* The chain trailer sits at the end of the block. */
				char *lr = abuf->b_data;
				zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

				if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
				    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
				    (zilc->zc_nused > (size - sizeof (*zilc)))) {
					error = SET_ERROR(ECKSUM);
				} else {
					ASSERT3U(zilc->zc_nused, <=,
					    SPA_OLD_MAXBLOCKSIZE);
					bcopy(lr, dst, zilc->zc_nused);
					*end = (char *)dst + zilc->zc_nused;
					*nbp = zilc->zc_next_blk;
				}
			}

			arc_buf_destroy(abuf, &abuf);
		}

		return (error);
	}

	/*
	 * Read a TX_WRITE log data block.
	 *
	 * zilog - log the record belongs to
	 * lr    - TX_WRITE record whose lr_blkptr names the data block
	 * wbuf  - destination for the data, or NULL to only test readability
	 *
	 * A hole block pointer reads as zeros: wbuf (if any) is cleared for the
	 * larger of the block's logical size and lr_length.  Returns 0 on
	 * success or the arc_read() error.
	 */
	static int
	zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
	{
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		const blkptr_t *bp = &lr->lr_blkptr;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf = NULL;
		zbookmark_phys_t zb;
		int error;

		if (BP_IS_HOLE(bp)) {
			if (wbuf != NULL)
				bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
			return (0);
		}

		if (zilog->zl_header->zh_claim_txg == 0)
			zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

		if (error == 0) {
			if (wbuf != NULL)
				bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
			arc_buf_destroy(abuf, &abuf);
		}

		return (error);
	}

	/*
	 * Parse the intent log, and call parse_func for each valid record within.
	 *
	 * zilog          - log to walk, starting at zl_header->zh_log
	 * parse_blk_func - called for every block pointer in the chain
	 * parse_lr_func  - called for every record in every valid block
	 * arg, txg       - passed through unchanged to both callbacks
	 *
	 * The walk results (error, highest block/record sequence numbers and
	 * the block/record counts) are stored in the zilog's zl_parse_* fields.
	 * Returns 0 or the first error reported by a callback or by
	 * zil_read_log_block().
	 */
	int
	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
	    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
	{
		const zil_header_t *zh = zilog->zl_header;
		boolean_t claimed = !!zh->zh_claim_txg;
		uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
		uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
		uint64_t max_blk_seq = 0;
		uint64_t max_lr_seq = 0;
		uint64_t blk_count = 0;
		uint64_t lr_count = 0;
		blkptr_t blk, next_blk;
		char *lrbuf, *lrp;
		int error = 0;

		/*
		 * Old logs didn't record the maximum zh_claim_lr_seq.
		 */
		if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
			claim_lr_seq = UINT64_MAX;

		/*
		 * Starting at the block pointed to by zh_log we read the log chain.
		 * For each block in the chain we strongly check that block to
		 * ensure its validity.  We stop when an invalid block is found.
		 * For each block pointer in the chain we call parse_blk_func().
		 * For each record in each valid block we call parse_lr_func().
		 * If the log has been claimed, stop if we encounter a sequence
		 * number greater than the highest claimed sequence number.
		 */
		lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
		zil_bp_tree_init(zilog);

		for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
			uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
			int reclen;
			char *end;

			if (blk_seq > claim_blk_seq)
				break;
			if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
				break;
			ASSERT3U(max_blk_seq, <, blk_seq);
			max_blk_seq = blk_seq;
			blk_count++;

			/* Everything that was claimed has been seen; stop. */
			if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
				break;

			error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
			if (error != 0)
				break;

			/* Records are packed back to back; step by lrc_reclen. */
			for (lrp = lrbuf; lrp < end; lrp += reclen) {
				lr_t *lr = (lr_t *)lrp;
				reclen = lr->lrc_reclen;
				ASSERT3U(reclen, >=, sizeof (lr_t));
				if (lr->lrc_seq > claim_lr_seq)
					goto done;
				if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
					goto done;
				ASSERT3U(max_lr_seq, <, lr->lrc_seq);
				max_lr_seq = lr->lrc_seq;
				lr_count++;
			}
		}
	done:
		zilog->zl_parse_error = error;
		zilog->zl_parse_blk_seq = max_blk_seq;
		zilog->zl_parse_lr_seq = max_lr_seq;
		zilog->zl_parse_blk_count = blk_count;
		zilog->zl_parse_lr_count = lr_count;

		ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
		    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

		zil_bp_tree_fini(zilog);
		zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

		return (error);
	}

	/*
	 * zil_parse() block callback used while claiming a log.
	 *
	 * Returns 0 without doing anything for holes, for blocks born before
	 * first_txg, and for blocks already present in the zilog's bp tree;
	 * otherwise returns the result of waiting on zio_claim().
	 */
	static int
	zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
	{
		/*
		 * Claim log block if not already committed and not already claimed.
		 * If tx == NULL, just verify that the block is claimable.
		 */
		if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
		    zil_bp_tree_add(zilog, bp) != 0)
			return (0);

		return (zio_wait(zio_claim(NULL, zilog->zl_spa,
		    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
	}

	/*
	 * zil_parse() record callback used while claiming a log.  Only TX_WRITE
	 * records are of interest; for those the data block referenced by the
	 * record is claimed via zil_claim_log_block().
	 *
	 * Returns 0 for other record types, the read error if the data block is
	 * unreadable, or the result of zil_claim_log_block().
	 */
	static int
	zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
	{
		lr_write_t *lr = (lr_write_t *)lrc;
		int error;

		if (lrc->lrc_txtype != TX_WRITE)
			return (0);

		/*
		 * If the block is not readable, don't claim it.  This can happen
		 * in normal operation when a log block is written to disk before
		 * some of the dmu_sync() blocks it points to.  In this case, the
		 * transaction cannot have been committed to anyone (we would have
		 * waited for all writes to be stable first), so it is semantically
		 * correct to declare this the end of the log.
		 */
		if (lr->lr_blkptr.blk_birth >= first_txg &&
		    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
			return (error);
		return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
	}

	/*
	 * zil_parse() block callback used while destroying a log: free the log
	 * block in the txg of the transaction passed as tx.  claim_txg is
	 * unused.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
	{
		zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

		return (0);
	}

	/*
	 * zil_parse() record callback used while destroying a log: free the
	 * data block of a TX_WRITE record if it had been claimed.  Always
	 * returns 0.
	 */
	static int
	zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
	{
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		/*
		 * If we previously claimed it, we need to free it.  The bp tree
		 * insertion makes sure each block is only freed once per parse.
		 */
		if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
		    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
		    !BP_IS_HOLE(bp))
			zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

		return (0);
	}

	/*
	 * Comparator for zil_vdev_node_t entries: orders nodes by their zv_vdev
	 * id.  Returns -1, 0 or 1.
	 */
	static int
	zil_lwb_vdev_compare(const void *x1, const void *x2)
	{
		const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
		const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

		if (v1 < v2)
			return (-1);
		if (v1 > v2)
			return (1);

		return (0);
	}

	/*
	 * Allocate and initialize a log write block (lwb) for the on-disk block
	 * described by bp, and append it to the zilog's lwb list.
	 *
	 * zilog - owning log
	 * bp    - block pointer of the log block; copied into the lwb
	 * slog  - stored in lwb_slog
	 * txg   - initial value of lwb_max_txg
	 *
	 * The lwb starts in LWB_STATE_CLOSED with a data buffer of the block's
	 * logical size.  Returns the new lwb.
	 */
	static lwb_t *
	zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
	{
		lwb_t *lwb;

		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = *bp;
		lwb->lwb_slog = slog;
		lwb->lwb_state = LWB_STATE_CLOSED;
		lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
		lwb->lwb_max_txg = txg;
		lwb->lwb_write_zio = NULL;
		lwb->lwb_root_zio = NULL;
		lwb->lwb_tx = NULL;
		lwb->lwb_issued_timestamp = 0;
		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			/* The chain header occupies the front of the buffer. */
			lwb->lwb_nused = sizeof (zil_chain_t);
			lwb->lwb_sz = BP_GET_LSIZE(bp);
		} else {
			/* Room for the chain trailer is reserved at the end. */
			lwb->lwb_nused = 0;
			lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
		}

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);

		ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
		VERIFY(list_is_empty(&lwb->lwb_waiters));

		return (lwb);
	}

	/*
	 * Return an lwb to zil_lwb_cache.  The caller must hold zl_lock; the lwb
	 * must be CLOSED or DONE, with no waiters, no vdev tree nodes and no
	 * zios attached.  The lwb's data buffer is not freed here.
	 */
	static void
	zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
	{
		ASSERT(MUTEX_HELD(&zilog->zl_lock));
		ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
		VERIFY(list_is_empty(&lwb->lwb_waiters));
		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
		ASSERT3P(lwb->lwb_write_zio, ==, NULL);
		ASSERT3P(lwb->lwb_root_zio, ==, NULL);
		ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
		ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
		    lwb->lwb_state == LWB_STATE_DONE);

		/*
		 * Clear the zilog's field to indicate this lwb is no longer
		 * valid, and prevent use-after-free errors.
		 */
		if (zilog->zl_last_lwb_opened == lwb)
			zilog->zl_last_lwb_opened = NULL;

		kmem_cache_free(zil_lwb_cache, lwb);
	}

	/*
	 * Mark the zilog dirty in the given txg.  Called when in-memory log
	 * transactions are created so that the itxs get cleaned up at the end
	 * of spa_sync().  Dirtying the zilog of a snapshot panics.
	 */
	void
	zilog_dirty(zilog_t *zilog, uint64_t txg)
	{
		dsl_pool_t *pool = zilog->zl_dmu_pool;
		dsl_dataset_t *dataset = dmu_objset_ds(zilog->zl_os);

		ASSERT(spa_writeable(zilog->zl_spa));

		if (dataset->ds_is_snapshot)
			panic("dirtying snapshot!");

		/* Nothing more to do unless txg_list_add() reports success. */
		if (!txg_list_add(&pool->dp_dirty_zilogs, zilog, txg))
			return;

		/* up the hold count until we can be written out */
		dmu_buf_add_ref(dataset->ds_dbuf, zilog);

		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
	}

	/*
	 * Report whether the zil is dirty in the specified txg.  A caller that
	 * needs the answer to stay valid must hold the itxg_lock for that txg;
	 * with the lock held the zil can neither be dirtied (zil_itx_assign)
	 * nor cleaned (zil_clean) while its state is examined.
	 */
	boolean_t
	zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
	{
		dsl_pool_t *pool = zilog->zl_dmu_pool;

		return (txg_list_member(&pool->dp_dirty_zilogs, zilog,
		    txg & TXG_MASK) ? B_TRUE : B_FALSE);
	}

	/*
	 * Report whether the zil is dirty in any txg slot, i.e. whether it
	 * still has pending itx records that zil_clean() has not cleaned.
	 */
	boolean_t
	zilog_is_dirty(zilog_t *zilog)
	{
		dsl_pool_t *pool = zilog->zl_dmu_pool;
		int slot = 0;

		while (slot < TXG_SIZE) {
			if (txg_list_member(&pool->dp_dirty_zilogs, zilog, slot))
				return (B_TRUE);
			slot++;
		}
		return (B_FALSE);
	}

	/*
	 * Create an on-disk intent log.
	 *
	 * Reuses the first log block named by the header when there is one with
	 * the host's byte order; otherwise allocates a fresh block in a new
	 * transaction and waits for that txg to sync.
	 *
	 * Returns the lwb for the first log block, or NULL if allocating the
	 * block failed.
	 */
	static lwb_t *
	zil_create(zilog_t *zilog)
	{
		const zil_header_t *zh = zilog->zl_header;
		lwb_t *lwb = NULL;
		uint64_t txg = 0;
		dmu_tx_t *tx = NULL;
		blkptr_t blk;
		int error = 0;
		boolean_t slog = B_FALSE;

		/*
		 * Wait for any previous destroy to complete.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(zh->zh_replay_seq == 0);

		blk = zh->zh_log;

		/*
		 * Allocate an initial log block if:
		 *    - there isn't one already
		 *    - the existing block is the wrong endianness
		 */
		if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
			tx = dmu_tx_create(zilog->zl_os);
			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
			txg = dmu_tx_get_txg(tx);

			/* Release a wrong-endian block before replacing it. */
			if (!BP_IS_HOLE(&blk)) {
				zio_free_zil(zilog->zl_spa, txg, &blk);
				BP_ZERO(&blk);
			}

			error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
			    ZIL_MIN_BLKSZ, &slog);

			if (error == 0)
				zil_init_log_chain(zilog, &blk);
		}

		/*
		 * Allocate a log write block (lwb) for the first log block.
		 */
		if (error == 0)
			lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

		/*
		 * If we just allocated the first log block, commit our transaction
		 * and wait for zil_sync() to stuff the block pointer into zh_log.
		 * (zh is part of the MOS, so we cannot modify it in open context.)
		 */
		if (tx != NULL) {
			dmu_tx_commit(tx);
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		}

		ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

		return (lwb);
	}

	/*
	 * In one tx, free all log blocks and clear the log header.  If keep_first
	 * is set, then we're replaying a log with no content.  We want to keep the
	 * first block, however, so that the first synchronous transaction doesn't
	 * require a txg_wait_synced() in zil_create().  We don't need to
	 * txg_wait_synced() here either when keep_first is set, because both
	 * zil_create() and zil_destroy() will wait for any in-progress destroys
	 * to complete.
	 *
	 * Does nothing (beyond saving a copy of the header) when the header has
	 * no log block.
	 */
	void
	zil_destroy(zilog_t *zilog, boolean_t keep_first)
	{
		const zil_header_t *zh = zilog->zl_header;
		lwb_t *lwb;
		dmu_tx_t *tx;
		uint64_t txg;

		/*
		 * Wait for any previous destroy to complete.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

		zilog->zl_old_header = *zh;		/* debugging aid */

		if (BP_IS_HOLE(&zh->zh_log))
			return;

		tx = dmu_tx_create(zilog->zl_os);
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		mutex_enter(&zilog->zl_lock);

		ASSERT3U(zilog->zl_destroy_txg, <, txg);
		zilog->zl_destroy_txg = txg;
		zilog->zl_keep_first = keep_first;

		if (!list_is_empty(&zilog->zl_lwb_list)) {
			/* In-memory lwbs exist: free them and their blocks. */
			ASSERT(zh->zh_claim_txg == 0);
			VERIFY(!keep_first);
			while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
				list_remove(&zilog->zl_lwb_list, lwb);
				if (lwb->lwb_buf != NULL)
					zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
				zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
				zil_free_lwb(zilog, lwb);
			}
		} else if (!keep_first) {
			/* No lwbs in memory: walk the on-disk chain instead. */
			zil_destroy_sync(zilog, tx);
		}
		mutex_exit(&zilog->zl_lock);

		dmu_tx_commit(tx);
	}

	/*
	 * Free the on-disk log by parsing it with the zil_free_log_block() and
	 * zil_free_log_record() callbacks, using tx for the frees.  The zilog
	 * must have no in-memory lwbs.  The zil_parse() result is ignored.
	 */
	void
	zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
	{
		ASSERT(list_is_empty(&zilog->zl_lwb_list));
		(void) zil_parse(zilog, zil_free_log_block,
		    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
	}

	/*
	 * Claim the intent log of dataset ds in the txg of the transaction
	 * passed as txarg.
	 *
	 * When the pool's log state is SPA_LOG_CLEAR the log is freed and the
	 * header's log pointer zeroed instead.  Otherwise an unclaimed,
	 * non-empty log is parsed, its blocks are claimed, and the claim txg and
	 * highest claimed sequence numbers are recorded in the header.
	 *
	 * Always returns 0; failure to own the objset is only logged (and not
	 * even that for EBUSY).
	 */
	int
	zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
	{
		dmu_tx_t *tx = txarg;
		uint64_t first_txg = dmu_tx_get_txg(tx);
		zilog_t *zilog;
		zil_header_t *zh;
		objset_t *os;
		int error;

		error = dmu_objset_own_obj(dp, ds->ds_object,
		    DMU_OST_ANY, B_FALSE, FTAG, &os);
		if (error != 0) {
			/*
			 * EBUSY indicates that the objset is inconsistent, in which
			 * case it can not have a ZIL.
			 */
			if (error != EBUSY) {
				cmn_err(CE_WARN, "can't open objset for %llu, error %u",
				    (unsigned long long)ds->ds_object, error);
			}
			return (0);
		}

		zilog = dmu_objset_zil(os);
		zh = zil_header_in_syncing_context(zilog);

		if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
			if (!BP_IS_HOLE(&zh->zh_log))
				zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
			BP_ZERO(&zh->zh_log);
			dsl_dataset_dirty(dmu_objset_ds(os), tx);
			dmu_objset_disown(os, FTAG);
			return (0);
		}

		/*
		 * Claim all log blocks if we haven't already done so, and remember
		 * the highest claimed sequence number.  This ensures that if we can
		 * read only part of the log now (e.g. due to a missing device),
		 * but we can read the entire log later, we will not try to replay
		 * or destroy beyond the last block we successfully claimed.
		 */
		ASSERT3U(zh->zh_claim_txg, <=, first_txg);
		if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
			(void) zil_parse(zilog, zil_claim_log_block,
			    zil_claim_log_record, tx, first_txg);
			zh->zh_claim_txg = first_txg;
			zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
			zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
			/* Any record, or more than one block, requires replay. */
			if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
				zh->zh_flags |= ZIL_REPLAY_NEEDED;
			zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
			dsl_dataset_dirty(dmu_objset_ds(os), tx);
		}

		ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * Check the log by walking the log chain.
	 * Checksum errors are ok as they indicate the end of the chain.
	 * Any other error (no device or read failure) returns an error.
	 *
	 * tx must be NULL.  Returns 0 when the chain is usable (ECKSUM and
	 * ENOENT from the walk are mapped to 0), when the objset cannot be
	 * opened (a warning is logged), or when the first block lives on a dead
	 * log device whose log state is reported as not valid.
	 */
	/* ARGSUSED */
	int
	zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
	{
		zilog_t *zilog;
		objset_t *os;
		blkptr_t *bp;
		int error;

		ASSERT(tx == NULL);

		error = dmu_objset_from_ds(ds, &os);
		if (error != 0) {
			cmn_err(CE_WARN, "can't open objset %llu, error %d",
			    (unsigned long long)ds->ds_object, error);
			return (0);
		}

		zilog = dmu_objset_zil(os);
		bp = (blkptr_t *)&zilog->zl_header->zh_log;

		/*
		 * Check the first block and determine if it's on a log device
		 * which may have been removed or faulted prior to loading this
		 * pool.  If so, there's no point in checking the rest of the log
		 * as its content should have already been synced to the pool.
		 */
		if (!BP_IS_HOLE(bp)) {
			vdev_t *vd;
			boolean_t valid = B_TRUE;

			spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
			/*
			 * NOTE(review): vd is dereferenced without a NULL check,
			 * unlike the vdev_lookup_top() use in zil_lwb_write_done()
			 * -- confirm the lookup cannot fail here.
			 */
			vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
			if (vd->vdev_islog && vdev_is_dead(vd))
				valid = vdev_log_state_valid(vd);
			spa_config_exit(os->os_spa, SCL_STATE, FTAG);

			if (!valid)
				return (0);
		}

		/*
		 * Because tx == NULL, zil_claim_log_block() will not actually claim
		 * any blocks, but just determine whether it is possible to do so.
		 * In addition to checking the log chain, zil_claim_log_block()
		 * will invoke zio_claim() with a done func of spa_claim_notify(),
		 * which will update spa_max_claim_txg.  See spa_load() for details.
		 */
		error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
		    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

		return ((error == ECKSUM || error == ENOENT) ? 0 : error);
	}

	/*
	* When an itx is "skipped", this function is used to properly mark the
	* waiter as "done", and signal any thread(s) waiting on it. An itx can
	* be skipped (and not committed to an lwb) for a variety of reasons,
	* one of them being that the itx was committed via spa_sync(), prior to
	* it being committed to an lwb; this can happen if a thread calling
	* zil_commit() is racing with spa_sync().
	*
	* The waiter must not already be marked done.
	*/
	static void
	zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
	{
	/* zcw_done is set and the cv is broadcast under zcw_lock. */
	mutex_enter(&zcw->zcw_lock);
	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
	zcw->zcw_done = B_TRUE;
	cv_broadcast(&zcw->zcw_cv);
	mutex_exit(&zcw->zcw_lock);
	}

	/*
	 * This function is used when the given waiter is to be linked into an
	 * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
	 * At this point, the waiter will no longer be referenced by the itx,
	 * and instead, will be referenced by the lwb.
	 *
	 * The waiter must not be linked to any list or lwb yet, and the lwb
	 * must be in the OPENED or ISSUED state.
	 */
	static void
	zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
	{
		/*
		 * The lwb_waiters field of the lwb is protected by the zilog's
		 * zl_lock, thus it must be held when calling this function.
		 */
		ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));

		mutex_enter(&zcw->zcw_lock);
		ASSERT(!list_link_active(&zcw->zcw_node));
		ASSERT3P(zcw->zcw_lwb, ==, NULL);
		ASSERT3P(lwb, !=, NULL);
		ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
		    lwb->lwb_state == LWB_STATE_ISSUED);

		list_insert_tail(&lwb->lwb_waiters, zcw);
		zcw->zcw_lwb = lwb;
		mutex_exit(&zcw->zcw_lock);
	}

	/*
	 * This function is used when zio_alloc_zil() fails to allocate a ZIL
	 * block, and the given waiter must be linked to the "nolwb waiters"
	 * list inside of zil_process_commit_list().
	 *
	 * The waiter is appended to the nolwb list under its own zcw_lock; it
	 * must not be linked to any list or lwb yet.
	 */
	static void
	zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
	{
		mutex_enter(&zcw->zcw_lock);
		ASSERT(!list_link_active(&zcw->zcw_node));
		ASSERT3P(zcw->zcw_lwb, ==, NULL);
		list_insert_tail(nolwb, zcw);
		mutex_exit(&zcw->zcw_lock);
	}

	/*
	 * Remember, in the lwb's vdev tree, every vdev referenced by the DVAs of
	 * bp; each vdev is recorded at most once.  zil_lwb_write_done() later
	 * issues a flush to every vdev recorded here.
	 *
	 * Nothing is recorded when zfs_nocacheflush is set.
	 */
	void
	zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
	{
		avl_tree_t *t = &lwb->lwb_vdev_tree;
		avl_index_t where;
		zil_vdev_node_t *zv, zvsearch;
		int ndvas = BP_GET_NDVAS(bp);
		int i;

		if (zfs_nocacheflush)
			return;

		mutex_enter(&lwb->lwb_vdev_lock);
		for (i = 0; i < ndvas; i++) {
			zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
			if (avl_find(t, &zvsearch, &where) == NULL) {
				zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
				zv->zv_vdev = zvsearch.zv_vdev;
				avl_insert(t, zv, where);
			}
		}
		mutex_exit(&lwb->lwb_vdev_lock);
	}

	/*
	 * Raise the lwb's lwb_max_txg to txg if txg is larger; otherwise leave
	 * it alone.
	 */
	void
	zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
	{
		if (lwb->lwb_max_txg < txg)
			lwb->lwb_max_txg = txg;
	}

	/*
	* This function is called after all VDEVs associated with a given lwb
	* write have completed their DKIOCFLUSHWRITECACHE command; or as soon
	* as the lwb write completes, if "zfs_nocacheflush" is set.
	*
	* The intention is for this function to be called as soon as the
	* contents of an lwb are considered "stable" on disk, and will survive
	* any sudden loss of power. At this point, any threads waiting for the
	* lwb to reach this state are signalled, and the "waiter" structures
	* are marked "done".
	*
	* zio is the completed zio; its io_private is the lwb and its io_error
	* is handed to every waiter.
	*/
	static void
	zil_lwb_flush_vdevs_done(zio_t *zio)
	{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	/* Saved now because lwb_tx is cleared below, before the commit. */
	dmu_tx_t *tx = lwb->lwb_tx;
	zil_commit_waiter_t *zcw;

	/* Drop the SCL_STATE config hold that was taken with the lwb as tag. */
	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);

	mutex_enter(&zilog->zl_lock);

	/*
	* Ensure the lwb buffer pointer is cleared before releasing the
	* txg. If we have had an allocation failure and the txg is
	* waiting to sync then we want zil_sync() to remove the lwb so
	* that it's not picked up as the next new one in
	* zil_process_commit_list(). zil_sync() will only remove the
	* lwb if lwb_buf is null.
	*/
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;

	/* Record how long this lwb took from issue to completion. */
	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;

	lwb->lwb_root_zio = NULL;
	lwb->lwb_state = LWB_STATE_DONE;

	if (zilog->zl_last_lwb_opened == lwb) {
	/*
	* Remember the highest committed log sequence number
	* for ztest. We only update this value when all the log
	* writes succeeded, because ztest wants to ASSERT that
	* it got the whole log chain.
	*/
	zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
	}

	/*
	* Detach every waiter from the lwb, hand it the zio's error, mark
	* it done and wake it up; each waiter is updated under its own lock.
	*/
	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
	mutex_enter(&zcw->zcw_lock);

	ASSERT(list_link_active(&zcw->zcw_node));
	list_remove(&lwb->lwb_waiters, zcw);

	ASSERT3P(zcw->zcw_lwb, ==, lwb);
	zcw->zcw_lwb = NULL;

	zcw->zcw_zio_error = zio->io_error;

	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
	zcw->zcw_done = B_TRUE;
	cv_broadcast(&zcw->zcw_cv);

	mutex_exit(&zcw->zcw_lock);
	}

	mutex_exit(&zilog->zl_lock);

	/*
	* Now that we've written this log block, we have a stable pointer
	* to the next block in the chain, so it's OK to let the txg in
	* which we allocated the next block sync.
	*/
	dmu_tx_commit(tx);
	}

	/*
	* This is called when an lwb write completes. This means, this specific
	* lwb was written to disk, and all dependent lwb have also been
	* written to disk.
	*
	* At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
	* the VDEVs involved in writing out this specific lwb. The lwb will be
	* "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
	* zio completion callback for the lwb's root zio.
	*
	* zio is the completed write zio; its io_private is the lwb.
	*/
	static void
	zil_lwb_write_done(zio_t *zio)
	{
	lwb_t *lwb = zio->io_private;
	spa_t *spa = zio->io_spa;
	zilog_t *zilog = lwb->lwb_zilog;
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;

	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);

	/* Sanity-check the block pointer that was just written. */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	/* Release the abd attached to the write zio. */
	abd_put(zio->io_abd);

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);

	/* lwb_write_zio is cleared under zl_lock. */
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_write_zio = NULL;
	mutex_exit(&zilog->zl_lock);

	/* No vdevs were recorded for this lwb, so there is nothing to flush. */
	if (avl_numnodes(t) == 0)
	return;

	/*
	* If there was an IO error, we're not going to call zio_flush()
	* on these vdevs, so we simply empty the tree and free the
	* nodes. We avoid calling zio_flush() since there isn't any
	* good reason for doing so, after the lwb block failed to be
	* written out.
	*/
	if (zio->io_error != 0) {
	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
	kmem_free(zv, sizeof (*zv));
	return;
	}

	/*
	* Empty the tree, issuing a flush as a child of the lwb's root zio
	* for every recorded vdev that can still be looked up.
	*/
	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
	vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
	if (vd != NULL)
	zio_flush(lwb->lwb_root_zio, vd);
	kmem_free(zv, sizeof (*zv));
	}
	}

	/*
	 * This function's purpose is to "open" an lwb such that it is ready to
	 * accept new itxs being committed to it. To do this, the lwb's zio
	 * structures are created, and linked to the lwb. This function is
	 * idempotent; if the passed in lwb has already been opened, this
	 * function is essentially a no-op.
	 *
	 * zilog - the ZIL the lwb belongs to; its zl_issuer_lock must be held.
	 * lwb   - the log write block to open; must be CLOSED or OPENED.
	 */
	static void
	zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
	{
		zbookmark_phys_t zb;
		zio_priority_t prio;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
		ASSERT3P(lwb, !=, NULL);
		EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
		EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);

		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

		if (lwb->lwb_root_zio == NULL) {
			abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
			    BP_GET_LSIZE(&lwb->lwb_blk));

			/*
			 * Blocks that are not on a slog, or that arrive while
			 * zl_cur_used is still within zil_slog_bulk, are
			 * written at sync priority; the rest at async priority.
			 */
			if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
				prio = ZIO_PRIORITY_SYNC_WRITE;
			else
				prio = ZIO_PRIORITY_ASYNC_WRITE;

			lwb->lwb_root_zio = zio_root(zilog->zl_spa,
			    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
			ASSERT3P(lwb->lwb_root_zio, !=, NULL);

			lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
			    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
			    BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
			    prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
			ASSERT3P(lwb->lwb_write_zio, !=, NULL);

			lwb->lwb_state = LWB_STATE_OPENED;

			mutex_enter(&zilog->zl_lock);

			/*
			 * The zilog's "zl_last_lwb_opened" field is used to
			 * build the lwb/zio dependency chain, which is used to
			 * preserve the ordering of lwb completions that is
			 * required by the semantics of the ZIL. Each new lwb
			 * zio becomes a parent of the "previous" lwb zio, such
			 * that the new lwb's zio cannot complete until the
			 * "previous" lwb's zio completes.
			 *
			 * This is required by the semantics of zil_commit();
			 * the commit waiters attached to the lwbs will be woken
			 * in the lwb zio's completion callback, so this zio
			 * dependency graph ensures the waiters are woken in the
			 * correct order (the same order the lwbs were created).
			 */
			lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
			if (last_lwb_opened != NULL &&
			    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
				ASSERT(last_lwb_opened->lwb_state ==
				    LWB_STATE_OPENED ||
				    last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
				ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
				zio_add_child(lwb->lwb_root_zio,
				    last_lwb_opened->lwb_root_zio);
			}
			zilog->zl_last_lwb_opened = lwb;

			mutex_exit(&zilog->zl_lock);
		}

		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
	}

	/*
	 * Define a limited set of intent log block sizes.
	 *
	 * These must be a multiple of 4KB. Note only the amount used (again
	 * aligned to 4KB) actually gets written. However, we can't always just
	 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
	 *
	 * The table is terminated by UINT64_MAX so that a linear search for the
	 * first bucket that is >= a requested size always stops.
	 */
	uint64_t zil_block_buckets[] = {
		4096,			/* non TX_WRITE */
		8192 + 4096,		/* data base */
		32 * 1024 + 4096,	/* NFS writes */
		UINT64_MAX
	};

	/*
	 * Start a log block write and advance to the next log block.
	 * Calls are serialized.
	 *
	 * zilog - the ZIL being written; its zl_issuer_lock must be held.
	 * lwb   - the OPENED log write block to issue.
	 *
	 * Returns the newly allocated "next" lwb, or NULL if the on-disk
	 * allocation of the next block failed (lwb is issued either way).
	 */
	static lwb_t *
	zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
	{
		lwb_t *nlwb = NULL;
		zil_chain_t *zilc;
		spa_t *spa = zilog->zl_spa;
		blkptr_t *bp;
		dmu_tx_t *tx;
		uint64_t txg;
		uint64_t zil_blksz, wsz;
		int i, error;
		boolean_t slog;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);

		/*
		 * The chain header lives at the start of the buffer for
		 * ZILOG2 blocks, and at offset lwb_sz otherwise.
		 */
		if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
			zilc = (zil_chain_t *)lwb->lwb_buf;
		else
			zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;

		ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

		/*
		 * Allocate the next block and save its address in this block
		 * before writing it in order to establish the log chain.
		 * Note that if the allocation of nlwb synced before we wrote
		 * the block that points at it (lwb), we'd leak it if we crashed.
		 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
		 * We dirty the dataset to ensure that zil_sync() will be called
		 * to clean up in the event of allocation failure or I/O failure.
		 */

		tx = dmu_tx_create(zilog->zl_os);

		/*
		 * Since we are not going to create any new dirty data, and we
		 * can even help with clearing the existing dirty data, we
		 * should not be subject to the dirty data based delays. We
		 * use TXG_NOTHROTTLE to bypass the delay mechanism.
		 */
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));

		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		lwb->lwb_tx = tx;

		/*
		 * Log blocks are pre-allocated. Here we select the size of the next
		 * block, based on size used in the last block.
		 * - first find the smallest bucket that will fit the block from a
		 *   limited set of block sizes. This is because it's faster to write
		 *   blocks allocated from the same metaslab as they are adjacent or
		 *   close.
		 * - next find the maximum from the new suggested size and an array of
		 *   previous sizes. This lessens a picket fence effect of wrongly
		 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
		 *   requests.
		 *
		 * Note we only write what is used, but we can't just allocate
		 * the maximum block size because we can exhaust the available
		 * pool log space.
		 */
		zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
		for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
			continue;
		zil_blksz = zil_block_buckets[i];
		if (zil_blksz == UINT64_MAX)
			zil_blksz = SPA_OLD_MAXBLOCKSIZE;
		zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
		for (i = 0; i < ZIL_PREV_BLKS; i++)
			zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
		/* NOTE(review): the mask assumes ZIL_PREV_BLKS is a power of two */
		zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

		BP_ZERO(bp);

		/* pass the old blkptr in order to spread log blocks across devs */
		error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
		if (error == 0) {
			ASSERT3U(bp->blk_birth, ==, txg);
			bp->blk_cksum = lwb->lwb_blk.blk_cksum;
			bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

			/*
			 * Allocate a new log write block (lwb).
			 */
			nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
		}

		if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
			/* For Slim ZIL only write what is used. */
			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
			ASSERT3U(wsz, <=, lwb->lwb_sz);
			zio_shrink(lwb->lwb_write_zio, wsz);

		} else {
			wsz = lwb->lwb_sz;
		}

		zilc->zc_pad = 0;
		zilc->zc_nused = lwb->lwb_nused;
		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

		/*
		 * clear unused data for security
		 */
		bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

		spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);

		zil_lwb_add_block(lwb, &lwb->lwb_blk);
		lwb->lwb_issued_timestamp = gethrtime();
		lwb->lwb_state = LWB_STATE_ISSUED;

		zio_nowait(lwb->lwb_root_zio);
		zio_nowait(lwb->lwb_write_zio);

		/*
		 * If there was an allocation failure then nlwb will be null which
		 * forces a txg_wait_synced().
		 */
		return (nlwb);
	}

	/*
	 * Commit a single itx to the given lwb.
	 *
	 * A TX_COMMIT itx only has its waiter linked onto the lwb. Any other
	 * itx has its log record copied into the lwb's buffer; when the record
	 * does not fit, the current lwb is issued and the copy continues in the
	 * next one. WR_NEED_COPY writes may be split across several lwbs.
	 *
	 * zilog - the ZIL being written; its zl_issuer_lock must be held.
	 * itx   - the in-memory transaction to commit.
	 * lwb   - the lwb to commit into (opened here if necessary).
	 *
	 * Returns the lwb that subsequent itxs should be committed to, or NULL
	 * if zil_lwb_write_issue() could not provide a next lwb.
	 */
	static lwb_t *
	zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
	{
		lr_t *lrcb, *lrc;
		lr_write_t *lrwb, *lrw;
		char *lr_buf;
		uint64_t dlen, dnow, lwb_sp, reclen, txg;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
		ASSERT3P(lwb, !=, NULL);
		ASSERT3P(lwb->lwb_buf, !=, NULL);

		zil_lwb_write_open(zilog, lwb);

		lrc = &itx->itx_lr;
		lrw = (lr_write_t *)lrc;

		/*
		 * A commit itx doesn't represent any on-disk state; instead
		 * it's simply used as a place holder on the commit list, and
		 * provides a mechanism for attaching a "commit waiter" onto the
		 * correct lwb (such that the waiter can be signalled upon
		 * completion of that lwb). Thus, we don't process this itx's
		 * log record if it's a commit itx (these itx's don't have log
		 * records), and instead link the itx's waiter onto the lwb's
		 * list of waiters.
		 *
		 * For more details, see the comment above zil_commit().
		 */
		if (lrc->lrc_txtype == TX_COMMIT) {
			mutex_enter(&zilog->zl_lock);
			zil_commit_waiter_link_lwb(itx->itx_private, lwb);
			itx->itx_private = NULL;
			mutex_exit(&zilog->zl_lock);
			return (lwb);
		}

		/* Only WR_NEED_COPY writes carry data that is copied into the lwb. */
		if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
			dlen = P2ROUNDUP_TYPED(
			    lrw->lr_length, sizeof (uint64_t), uint64_t);
		} else {
			dlen = 0;
		}
		reclen = lrc->lrc_reclen;
		zilog->zl_cur_used += (reclen + dlen);
		txg = lrc->lrc_txg;

		ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));

	cont:
		/*
		 * If this record won't fit in the current log block, start a new one.
		 * For WR_NEED_COPY optimize layout for minimal number of chunks.
		 */
		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
		if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
		    lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
		    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
			lwb = zil_lwb_write_issue(zilog, lwb);
			if (lwb == NULL)
				return (NULL);
			zil_lwb_write_open(zilog, lwb);
			ASSERT(LWB_EMPTY(lwb));
			lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
			ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
		}

		/* dnow is the amount of data that fits into this lwb. */
		dnow = MIN(dlen, lwb_sp - reclen);
		lr_buf = lwb->lwb_buf + lwb->lwb_nused;
		bcopy(lrc, lr_buf, reclen);
		lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
		lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */

		/*
		 * If it's a write, fetch the data or get its blkptr as appropriate.
		 */
		if (lrc->lrc_txtype == TX_WRITE) {
			if (txg > spa_freeze_txg(zilog->zl_spa))
				txg_wait_synced(zilog->zl_dmu_pool, txg);
			if (itx->itx_wr_state != WR_COPIED) {
				char *dbuf;
				int error;

				if (itx->itx_wr_state == WR_NEED_COPY) {
					/*
					 * Trim the in-lwb record to the chunk
					 * written now, and advance the itx's own
					 * record past it for the next pass.
					 */
					dbuf = lr_buf + reclen;
					lrcb->lrc_reclen += dnow;
					if (lrwb->lr_length > dnow)
						lrwb->lr_length = dnow;
					lrw->lr_offset += dnow;
					lrw->lr_length -= dnow;
				} else {
					ASSERT(itx->itx_wr_state == WR_INDIRECT);
					dbuf = NULL;
				}

				/*
				 * We pass in the "lwb_write_zio" rather than
				 * "lwb_root_zio" so that the "lwb_write_zio"
				 * becomes the parent of any zio's created by
				 * the "zl_get_data" callback. The vdevs are
				 * flushed after the "lwb_write_zio" completes,
				 * so we want to make sure that completion
				 * callback waits for these additional zio's,
				 * such that the vdevs used by those zio's will
				 * be included in the lwb's vdev tree, and those
				 * vdevs will be properly flushed. If we passed
				 * in "lwb_root_zio" here, then these additional
				 * vdevs may not be flushed; e.g. if these zio's
				 * completed after "lwb_write_zio" completed.
				 */
				error = zilog->zl_get_data(itx->itx_private,
				    lrwb, dbuf, lwb, lwb->lwb_write_zio);

				if (error == EIO) {
					txg_wait_synced(zilog->zl_dmu_pool, txg);
					return (lwb);
				}
				if (error != 0) {
					ASSERT(error == ENOENT || error == EEXIST ||
					    error == EALREADY);
					return (lwb);
				}
			}
		}

		/*
		 * We're actually making an entry, so update lrc_seq to be the
		 * log record sequence number. Note that this is generally not
		 * equal to the itx sequence number because not all transactions
		 * are synchronous, and sometimes spa_sync() gets there first.
		 */
		lrcb->lrc_seq = ++zilog->zl_lr_seq;
		lwb->lwb_nused += reclen + dnow;

		zil_lwb_add_txg(lwb, txg);

		ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
		ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));

		/* More data left: account for another record header and repeat. */
		dlen -= dnow;
		if (dlen > 0) {
			zilog->zl_cur_used += reclen;
			goto cont;
		}

		return (lwb);
	}

	/*
	 * Allocate an itx whose embedded log record has room for lrsize bytes
	 * (rounded up to a multiple of sizeof (uint64_t)).
	 *
	 * Only the record's type, length and sequence number and the itx's
	 * sync flag are initialized here.
	 */
	itx_t *
	zil_itx_create(uint64_t txtype, size_t lrsize)
	{
		itx_t *new_itx;
		lr_t *lr;

		lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
		new_itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);

		lr = &new_itx->itx_lr;
		lr->lrc_txtype = txtype;
		lr->lrc_reclen = lrsize;
		lr->lrc_seq = 0;		/* defensive */

		new_itx->itx_sync = B_TRUE;	/* default is synchronous */

		return (new_itx);
	}

	/*
	 * Free an itx. The size freed is derived from the record length stored
	 * in the itx's log record header.
	 */
	void
	zil_itx_destroy(itx_t *itx)
	{
		size_t itx_size = offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen;

		kmem_free(itx, itx_size);
	}

	/*
	 * Destroy every sync and async itx held by the given itxs_t, and then
	 * the itxs_t itself. The itxs_t has already been detached so no locks
	 * are needed.
	 */
	static void
	zil_itxg_clean(itxs_t *itxs)
	{
		list_t *sync_list = &itxs->i_sync_list;
		avl_tree_t *async_tree = &itxs->i_async_tree;
		itx_async_node_t *node;
		void *cookie = NULL;
		itx_t *itx;

		for (itx = list_head(sync_list); itx != NULL;
		    itx = list_head(sync_list)) {
			/*
			 * Commit itxs are normally consumed by the commit
			 * path (committed to an lwb via zil_lwb_commit() and
			 * then destroyed), so they are not usually found here.
			 * They can still show up due to the following race:
			 *
			 * - a thread calls zil_commit() which assigns the
			 *   commit itx to a per-txg i_sync_list
			 * - zil_itxg_clean() is called (e.g. via spa_sync())
			 *   while the waiter is still on the i_sync_list
			 *
			 * There's nothing to prevent syncing the txg while the
			 * waiter is on the i_sync_list. This normally doesn't
			 * happen because spa_sync() is slower than zil_commit(),
			 * but if zil_commit() calls txg_wait_synced() (e.g.
			 * because zil_create() or zil_commit_writer_stall() is
			 * called) we will hit this case.
			 */
			if (itx->itx_lr.lrc_txtype == TX_COMMIT)
				zil_commit_waiter_skip(itx->itx_private);

			list_remove(sync_list, itx);
			zil_itx_destroy(itx);
		}

		while ((node = avl_destroy_nodes(async_tree, &cookie)) != NULL) {
			list_t *async_list = &node->ia_list;

			for (itx = list_head(async_list); itx != NULL;
			    itx = list_head(async_list)) {
				list_remove(async_list, itx);
				/* commit itxs should never be on the async lists. */
				ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
				zil_itx_destroy(itx);
			}
			list_destroy(async_list);
			kmem_free(node, sizeof (itx_async_node_t));
		}
		avl_destroy(async_tree);

		kmem_free(itxs, sizeof (itxs_t));
	}

	static int
	zil_aitx_compare(const void x1, const void x2)
	{
	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;

	if (o1 < o2)
	return (-1);
	if (o1 > o2)
	return (1);

	return (0);
	}

	/*
	 * Remove all async itx with the given oid.
	 *
	 * The matching lists are first gathered from each candidate txg under
	 * that txg's itxg_lock, and the itxs are destroyed afterwards with no
	 * lock held.
	 */
	static void
	zil_remove_async(zilog_t *zilog, uint64_t oid)
	{
		uint64_t first_txg, txg;
		list_t doomed;
		itx_t *itx;

		ASSERT(oid != 0);
		list_create(&doomed, sizeof (itx_t), offsetof(itx_t, itx_node));

		/* ziltest support: a frozen pool is scanned from ZILTEST_TXG */
		first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
		    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;

		for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
		    txg++) {
			itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

			mutex_enter(&itxg->itxg_lock);
			if (itxg->itxg_txg == txg) {
				/*
				 * Locate the object node and append its list.
				 */
				avl_tree_t *tree = &itxg->itxg_itxs->i_async_tree;
				avl_index_t where;
				itx_async_node_t *ian = avl_find(tree, &oid, &where);

				if (ian != NULL)
					list_move_tail(&doomed, &ian->ia_list);
			}
			mutex_exit(&itxg->itxg_lock);
		}

		for (itx = list_head(&doomed); itx != NULL;
		    itx = list_head(&doomed)) {
			list_remove(&doomed, itx);
			/* commit itxs should never be on the async lists. */
			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
			zil_itx_destroy(itx);
		}
		list_destroy(&doomed);
	}

	/*
	 * Queue an itx on the in-memory itxg that corresponds to the txg of
	 * "tx" (or to ZILTEST_TXG when the pool is frozen), and dirty the ZIL
	 * in the real txg of "tx".
	 *
	 * zilog - the ZIL the itx belongs to.
	 * itx   - the transaction to queue; sync itxs go on the itxg's sync
	 *         list, async ones on the per-object list in the async tree.
	 * tx    - the DMU transaction the itx was created under.
	 */
	void
	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
	{
		uint64_t txg;
		itxg_t *itxg;
		itxs_t *itxs, *clean = NULL;

		/*
		 * Object ids can be re-instantiated in the next txg so
		 * remove any async transactions to avoid future leaks.
		 * This can happen if a fsync occurs on the re-instantiated
		 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
		 * the new file data and flushes a write record for the old object.
		 */
		if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
			zil_remove_async(zilog, itx->itx_oid);

		/*
		 * Ensure the data of a renamed file is committed before the rename.
		 */
		if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
			zil_async_to_sync(zilog, itx->itx_oid);

		if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
			txg = ZILTEST_TXG;
		else
			txg = dmu_tx_get_txg(tx);

		itxg = &zilog->zl_itxg[txg & TXG_MASK];
		mutex_enter(&itxg->itxg_lock);
		itxs = itxg->itxg_itxs;
		if (itxg->itxg_txg != txg) {
			if (itxs != NULL) {
				/*
				 * The zil_clean callback hasn't got around to cleaning
				 * this itxg. Save the itxs for release below.
				 * This should be rare.
				 */
				zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
				    "txg %llu", (unsigned long long)itxg->itxg_txg);
				clean = itxg->itxg_itxs;
			}
			itxg->itxg_txg = txg;
			itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);

			list_create(&itxs->i_sync_list, sizeof (itx_t),
			    offsetof(itx_t, itx_node));
			avl_create(&itxs->i_async_tree, zil_aitx_compare,
			    sizeof (itx_async_node_t),
			    offsetof(itx_async_node_t, ia_node));
		}
		if (itx->itx_sync) {
			list_insert_tail(&itxs->i_sync_list, itx);
		} else {
			avl_tree_t *t = &itxs->i_async_tree;
			uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
			itx_async_node_t *ian;
			avl_index_t where;

			/* Find the per-object list, creating it on first use. */
			ian = avl_find(t, &foid, &where);
			if (ian == NULL) {
				ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
				list_create(&ian->ia_list, sizeof (itx_t),
				    offsetof(itx_t, itx_node));
				ian->ia_foid = foid;
				avl_insert(t, ian, where);
			}
			list_insert_tail(&ian->ia_list, itx);
		}

		itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);

		/*
		 * We don't want to dirty the ZIL using ZILTEST_TXG, because
		 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
		 * need to be careful to always dirty the ZIL using the "real"
		 * TXG (not itxg_txg) even when the SPA is frozen.
		 */
		zilog_dirty(zilog, dmu_tx_get_txg(tx));
		mutex_exit(&itxg->itxg_lock);

		/* Release the old itxs now we've dropped the lock */
		if (clean != NULL)
			zil_itxg_clean(clean);
	}

	/*
	 * If there are any in-memory intent log transactions which have now been
	 * synced then start up a taskq to free them. We should only do this after
	 * we have written out the uberblocks (i.e. txg has been committed) so that
	 * we don't inadvertently clean out in-memory log records that would be
	 * required by zil_commit().
	 *
	 * zilog      - the ZIL whose itxs are to be cleaned.
	 * synced_txg - the txg that has been synced; must be below ZILTEST_TXG.
	 */
	void
	zil_clean(zilog_t *zilog, uint64_t synced_txg)
	{
		itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
		itxs_t *clean_me;

		ASSERT3U(synced_txg, <, ZILTEST_TXG);

		mutex_enter(&itxg->itxg_lock);
		if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
			mutex_exit(&itxg->itxg_lock);
			return;
		}
		ASSERT3U(itxg->itxg_txg, <=, synced_txg);
		ASSERT3U(itxg->itxg_txg, !=, 0);
		/* Detach the itxs under the lock; they are freed after it's dropped. */
		clean_me = itxg->itxg_itxs;
		itxg->itxg_itxs = NULL;
		itxg->itxg_txg = 0;
		mutex_exit(&itxg->itxg_lock);
		/*
		 * Preferably start a task queue to free up the old itxs but
		 * if taskq_dispatch can't allocate resources to do that then
		 * free it in-line. This should be rare. Note, using TQ_SLEEP
		 * created a bad performance problem.
		 */
		ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
		ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
		if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
		    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
			zil_itxg_clean(clean_me);
	}

	/*
	 * This function will traverse the queue of itxs that need to be
	 * committed, and move them onto the ZIL's zl_itx_commit_list.
	 *
	 * The caller must hold the zilog's zl_issuer_lock.
	 */
	static void
	zil_get_commit_list(zilog_t *zilog)
	{
		uint64_t otxg, txg;
		list_t *commit_list = &zilog->zl_itx_commit_list;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

		if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
			otxg = ZILTEST_TXG;
		else
			otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

		/*
		 * This is inherently racy, since there is nothing to prevent
		 * the last synced txg from changing. That's okay since we'll
		 * only commit things in the future.
		 */
		for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
			itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

			mutex_enter(&itxg->itxg_lock);
			if (itxg->itxg_txg != txg) {
				mutex_exit(&itxg->itxg_lock);
				continue;
			}

			/*
			 * If we're adding itx records to the zl_itx_commit_list,
			 * then the zil better be dirty in this "txg". We can assert
			 * that here since we're holding the itxg_lock which will
			 * prevent spa_sync from cleaning it. Once we add the itxs
			 * to the zl_itx_commit_list we must commit it to disk even
			 * if it's unnecessary (i.e. the txg was synced).
			 */
			ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
			    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
			list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);

			mutex_exit(&itxg->itxg_lock);
		}
	}

	/*
	 * Move async itxs onto the sync lists so that they get committed:
	 * those of one object when foid is non-zero, or those of every object
	 * when foid is zero.
	 */
	void
	zil_async_to_sync(zilog_t *zilog, uint64_t foid)
	{
		uint64_t first_txg, txg;

		/* ziltest support: a frozen pool is scanned from ZILTEST_TXG */
		first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
		    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;

		/*
		 * This is inherently racy, since there is nothing to prevent
		 * the last synced txg from changing.
		 */
		for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
		    txg++) {
			itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
			itx_async_node_t *ian;
			avl_tree_t *tree;
			list_t *sync_list;

			mutex_enter(&itxg->itxg_lock);
			if (itxg->itxg_txg != txg) {
				mutex_exit(&itxg->itxg_lock);
				continue;
			}

			tree = &itxg->itxg_itxs->i_async_tree;
			sync_list = &itxg->itxg_itxs->i_sync_list;

			/*
			 * The lists are appended to the end of the sync list
			 * rather than the beginning to ensure the create has
			 * happened.
			 */
			if (foid == 0) {
				/* No object given: drain the whole tree. */
				void *cookie = NULL;

				while ((ian = avl_destroy_nodes(tree, &cookie)) !=
				    NULL) {
					list_move_tail(sync_list, &ian->ia_list);
					list_destroy(&ian->ia_list);
					kmem_free(ian, sizeof (itx_async_node_t));
				}
			} else {
				/* Only the node of the given object, if any. */
				avl_index_t where;

				ian = avl_find(tree, &foid, &where);
				if (ian != NULL)
					list_move_tail(sync_list, &ian->ia_list);
			}
			mutex_exit(&itxg->itxg_lock);
		}
	}

	/*
	 * This function will prune commit itxs that are at the head of the
	 * commit list (it won't prune past the first non-commit itx), and
	 * either: a) attach them to the last lwb that's still pending
	 * completion, or b) skip them altogether.
	 *
	 * This is used as a performance optimization to prevent commit itxs
	 * from generating new lwbs when it's unnecessary to do so.
	 *
	 * The caller must hold the zilog's zl_issuer_lock.
	 */
	static void
	zil_prune_commit_list(zilog_t *zilog)
	{
		itx_t *itx;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

		while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
			lr_t *lrc = &itx->itx_lr;
			if (lrc->lrc_txtype != TX_COMMIT)
				break;

			mutex_enter(&zilog->zl_lock);

			lwb_t *last_lwb = zilog->zl_last_lwb_opened;
			if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
				/*
				 * All of the itxs this waiter was waiting on
				 * must have already completed (or there were
				 * never any itx's for it to wait on), so it's
				 * safe to skip this waiter and mark it done.
				 */
				zil_commit_waiter_skip(itx->itx_private);
			} else {
				zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
				itx->itx_private = NULL;
			}

			mutex_exit(&zilog->zl_lock);

			list_remove(&zilog->zl_itx_commit_list, itx);
			zil_itx_destroy(itx);
		}

		/* Whatever is now at the head of the list is not a commit itx. */
		IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
	}

	/*
	 * Stall the ZIL write pipeline until every lwb on the zilog's
	 * zl_lwb_list has been synced and freed.
	 *
	 * When zio_alloc_zil() fails to allocate the next lwb block on
	 * disk, we must call txg_wait_synced() to ensure all of the
	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
	 * to zil_process_commit_list()) will have to call zil_create(),
	 * and start a new ZIL chain.
	 *
	 * Since zio_alloc_zil() failed, the lwb that was previously
	 * issued does not have a pointer to the "next" lwb on disk.
	 * Thus, if another ZIL writer thread was to allocate the "next"
	 * on-disk lwb, that block could be leaked in the event of a
	 * crash (because the previous lwb on-disk would not point to
	 * it).
	 */
	static void
	zil_commit_writer_stall(zilog_t *zilog)
	{
		/*
		 * The zl_issuer_lock must be held across the wait, so that no
		 * new threads enter zil_process_commit_list() until all lwb's
		 * in the zl_lwb_list have been synced and freed.
		 */
		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

		txg_wait_synced(zilog->zl_dmu_pool, 0);

		ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
	}

	/*
	 * This function will traverse the commit list, creating new lwbs as
	 * needed, and committing the itxs from the commit list to these newly
	 * created lwbs. Additionally, as a new lwb is created, the previous
	 * lwb will be issued to the zio layer to be written to disk.
	 *
	 * The caller must hold the zilog's zl_issuer_lock. Every itx on the
	 * commit list is removed and destroyed, whether or not it was
	 * committed to an lwb.
	 */
	static void
	zil_process_commit_list(zilog_t *zilog)
	{
		spa_t *spa = zilog->zl_spa;
		list_t nolwb_waiters;
		lwb_t *lwb;
		itx_t *itx;

		ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

		/*
		 * Return if there's nothing to commit before we dirty the fs by
		 * calling zil_create().
		 */
		if (list_head(&zilog->zl_itx_commit_list) == NULL)
			return;

		list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
		    offsetof(zil_commit_waiter_t, zcw_node));

		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			lwb = zil_create(zilog);
		} else {
			ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
			ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
		}

		while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
			lr_t *lrc = &itx->itx_lr;
			uint64_t txg = lrc->lrc_txg;

			ASSERT3U(txg, !=, 0);

			if (lrc->lrc_txtype == TX_COMMIT) {
				DTRACE_PROBE2(zil__process__commit__itx,
				    zilog_t *, zilog, itx_t *, itx);
			} else {
				DTRACE_PROBE2(zil__process__normal__itx,
				    zilog_t *, zilog, itx_t *, itx);
			}

			boolean_t synced = txg <= spa_last_synced_txg(spa);
			boolean_t frozen = txg > spa_freeze_txg(spa);

			/*
			 * If the txg of this itx has already been synced out, then
			 * we don't need to commit this itx to an lwb. This is
			 * because the data of this itx will have already been
			 * written to the main pool. This is inherently racy, and
			 * it's still ok to commit an itx whose txg has already
			 * been synced; this will result in a write that's
			 * unnecessary, but will do no harm.
			 *
			 * With that said, we always want to commit TX_COMMIT itxs
			 * to an lwb, regardless of whether or not that itx's txg
			 * has been synced out. We do this to ensure any OPENED lwb
			 * will always have at least one zil_commit_waiter_t linked
			 * to the lwb.
			 *
			 * As a counter-example, if we skipped TX_COMMIT itx's
			 * whose txg had already been synced, the following
			 * situation could occur if we happened to be racing with
			 * spa_sync:
			 *
			 * 1. we commit a non-TX_COMMIT itx to an lwb, where the
			 *    itx's txg is 10 and the last synced txg is 9.
			 * 2. spa_sync finishes syncing out txg 10.
			 * 3. we move to the next itx in the list, it's a TX_COMMIT
			 *    whose txg is 10, so we skip it rather than committing
			 *    it to the lwb used in (1).
			 *
			 * If the itx that is skipped in (3) is the last TX_COMMIT
			 * itx in the commit list, then it's possible for the lwb
			 * used in (1) to remain in the OPENED state indefinitely.
			 *
			 * To prevent the above scenario from occurring, ensuring
			 * that once an lwb is OPENED it will transition to ISSUED
			 * and eventually DONE, we always commit TX_COMMIT itx's to
			 * an lwb here, even if that itx's txg has already been
			 * synced.
			 *
			 * Finally, if the pool is frozen, we _always_ commit the
			 * itx. The point of freezing the pool is to prevent data
			 * from being written to the main pool via spa_sync, and
			 * instead rely solely on the ZIL to persistently store the
			 * data; i.e. when the pool is frozen, the last synced txg
			 * value can't be trusted.
			 */
			if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
				if (lwb != NULL) {
					lwb = zil_lwb_commit(zilog, itx, lwb);
				} else if (lrc->lrc_txtype == TX_COMMIT) {
					/*
					 * There is no lwb to attach to; park the
					 * waiter so it can be completed below.
					 */
					ASSERT3P(lwb, ==, NULL);
					zil_commit_waiter_link_nolwb(
					    itx->itx_private, &nolwb_waiters);
				}
			}

			list_remove(&zilog->zl_itx_commit_list, itx);
			zil_itx_destroy(itx);
		}

		if (lwb == NULL) {
			/*
			 * This indicates zio_alloc_zil() failed to allocate the
			 * "next" lwb on-disk. When this happens, we must stall
			 * the ZIL write pipeline; see the comment within
			 * zil_commit_writer_stall() for more details.
			 */
			zil_commit_writer_stall(zilog);

			/*
			 * Additionally, we have to signal and mark the "nolwb"
			 * waiters as "done" here, since without an lwb, we
			 * can't do this via zil_lwb_flush_vdevs_done() like
			 * normal.
			 *
			 * Each waiter is unlinked from our local list before it
			 * is marked done: once it is done, the thread waiting on
			 * it may proceed (and presumably free the waiter --
			 * confirm against zil_commit()), so it must not be
			 * touched afterwards.
			 */
			zil_commit_waiter_t *zcw;
			while ((zcw = list_head(&nolwb_waiters)) != NULL) {
				list_remove(&nolwb_waiters, zcw);
				zil_commit_waiter_skip(zcw);
			}
		} else {
			ASSERT(list_is_empty(&nolwb_waiters));
			ASSERT3P(lwb, !=, NULL);
			ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
			ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);

			/*
			 * At this point, the ZIL block pointed at by the "lwb"
			 * variable is in one of the following states: "closed"
			 * or "open".
			 *
			 * If it's "closed", then no itxs have been committed to
			 * it, so there's no point in issuing its zio (i.e.
			 * it's "empty").
			 *
			 * If it's in the "open" state, then it contains one or
			 * more itxs that eventually need to be committed to
			 * stable storage. In this case we intentionally do not
			 * issue the lwb's zio to disk yet, and instead rely on
			 * one of the following two mechanisms for issuing the
			 * zio:
			 *
			 * 1. Ideally, there will be more ZIL activity occurring
			 * on the system, such that this function will be
			 * immediately called again (not necessarily by the same
			 * thread) and this lwb's zio will be issued via
			 * zil_lwb_commit(). This way, the lwb is guaranteed to
			 * be "full" when it is issued to disk, and we'll make
			 * use of the lwb's size the best we can.
			 *
			 * 2. If there isn't sufficient ZIL activity occurring on
			 * the system, such that this lwb's zio isn't issued via
			 * zil_lwb_commit(), zil_commit_waiter() will issue the
			 * lwb's zio. If this occurs, the lwb is not guaranteed
			 * to be "full" by the time its zio is issued, and means
			 * the size of the lwb was "too large" given the amount
			 * of ZIL activity occurring on the system at that time.
			 *
			 * We do this for a couple of reasons:
			 *
			 * 1. To try and reduce the number of IOPs needed to
			 * write the same number of itxs. If an lwb has space
			 * available in its buffer for more itxs, and more itxs
			 * will be committed relatively soon (relative to the
			 * latency of performing a write), then it's beneficial
			 * to wait for these "next" itxs. This way, more itxs
			 * can be committed to stable storage with fewer writes.
			 *
			 * 2. To try and use the largest lwb block size that the
			 * incoming rate of itxs can support. Again, this is to
			 * try and pack as many itxs into as few lwbs as
			 * possible, without significantly impacting the latency
			 * of each individual itx.
			 */
		}
	}

	/*
	 * This function is responsible for ensuring the passed in commit waiter
	 * (and associated commit itx) is committed to an lwb. If the waiter is
	 * not already committed to an lwb, all itxs in the zilog's queue of
	 * itxs will be processed. The assumption is the passed in waiter's
	 * commit itx will be found in the queue just like the other non-commit
	 * itxs, such that when the entire queue is processed, the waiter will
	 * have been committed to an lwb.
	 *
	 * The lwb associated with the passed in waiter is not guaranteed to
	 * have been issued by the time this function completes. If the lwb is
	 * not issued, we rely on future calls to zil_commit_writer() to issue
	 * the lwb, or the timeout mechanism found in zil_commit_waiter().
	 *
	 * zilog - the ZIL to process; its zl_lock must not be held on entry.
	 * zcw   - the commit waiter that has to end up attached to an lwb.
	 */
	static void
	zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
	{
		ASSERT(!MUTEX_HELD(&zilog->zl_lock));
		ASSERT(spa_writeable(zilog->zl_spa));

		mutex_enter(&zilog->zl_issuer_lock);

		if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
			/*
			 * It's possible that, while we were waiting to acquire
			 * the "zl_issuer_lock", another thread committed this
			 * waiter to an lwb. If that occurs, we bail out early,
			 * without processing any of the zilog's queue of itxs.
			 *
			 * On certain workloads and system configurations, the
			 * "zl_issuer_lock" can become highly contended. In an
			 * attempt to reduce this contention, we immediately drop
			 * the lock if the waiter has already been processed.
			 *
			 * We've measured this optimization to reduce CPU spent
			 * contending on this lock by up to 5%, using a system
			 * with 32 CPUs, low latency storage (~50 usec writes),
			 * and 1024 threads performing sync writes.
			 */
			goto out;
		}

		zil_get_commit_list(zilog);
		zil_prune_commit_list(zilog);
		zil_process_commit_list(zilog);

	out:
		mutex_exit(&zilog->zl_issuer_lock);
	}

	/*
	 * Issue the lwb that the given commit waiter is attached to, if that
	 * lwb is still in the OPENED state.
	 *
	 * The caller must hold the waiter's "zcw_lock" and must not hold the
	 * zilog's "zl_issuer_lock"; the waiter must not yet be marked done and
	 * must already be attached to an lwb. The "zcw_lock" is dropped and
	 * re-acquired internally, and is held again when this function
	 * returns.
	 */
	static void
	zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
	{
		ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
		ASSERT3B(zcw->zcw_done, ==, B_FALSE);

		lwb_t *lwb = zcw->zcw_lwb;
		ASSERT3P(lwb, !=, NULL);
		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);

		/*
		 * If the lwb has already been issued by another thread, we can
		 * immediately return since there's no work to be done (the
		 * point of this function is to issue the lwb). Additionally, we
		 * do this prior to acquiring the zl_issuer_lock, to avoid
		 * acquiring it when it's not necessary to do so.
		 */
		if (lwb->lwb_state == LWB_STATE_ISSUED ||
		    lwb->lwb_state == LWB_STATE_DONE)
			return;

		/*
		 * In order to call zil_lwb_write_issue() we must hold the
		 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
		 * since we're already holding the commit waiter's "zcw_lock",
		 * and those two locks are acquired in the opposite order
		 * elsewhere.
		 */
		mutex_exit(&zcw->zcw_lock);
		mutex_enter(&zilog->zl_issuer_lock);
		mutex_enter(&zcw->zcw_lock);

		/*
		 * Since we just dropped and re-acquired the commit waiter's
		 * lock, we have to re-check to see if the waiter was marked
		 * "done" during that process. If the waiter was marked "done",
		 * the "lwb" pointer is no longer valid (it can be free'd after
		 * the waiter is marked "done"), so without this check we could
		 * wind up with a use-after-free error below.
		 */
		if (zcw->zcw_done)
			goto out;

		ASSERT3P(lwb, ==, zcw->zcw_lwb);

		/*
		 * We've already checked this above, but since we hadn't acquired
		 * the zilog's zl_issuer_lock, we have to perform this check a
		 * second time while holding the lock.
		 *
		 * We don't need to hold the zl_lock since the lwb cannot
		 * transition from OPENED to ISSUED while we hold the
		 * zl_issuer_lock. The lwb _can_ transition from ISSUED to DONE,
		 * but it's OK to race with that transition since we treat the
		 * lwb the same, whether it's in the ISSUED or DONE states.
		 *
		 * The important thing is that we treat the lwb differently
		 * depending on if it's ISSUED or OPENED, and block any other
		 * threads that might attempt to issue this lwb. For that reason
		 * we hold the zl_issuer_lock when checking the lwb_state; we
		 * must not call zil_lwb_write_issue() if the lwb had already
		 * been issued.
		 *
		 * See the comment above the lwb_state_t structure definition for
		 * more details on the lwb states, and locking requirements.
		 */
		if (lwb->lwb_state == LWB_STATE_ISSUED ||
		    lwb->lwb_state == LWB_STATE_DONE)
			goto out;

		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);

		/*
		 * As described in the comments above zil_commit_waiter() and
		 * zil_process_commit_list(), we need to issue this lwb's zio
		 * since we've reached the commit waiter's timeout and it still
		 * hasn't been issued.
		 */
		lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);

		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);

		/*
		 * Since the lwb's zio hadn't been issued by the time this thread
		 * reached its timeout, we reset the zilog's "zl_cur_used" field
		 * to influence the zil block size selection algorithm.
		 *
		 * By having to issue the lwb's zio here, it means the size of the
		 * lwb was too large, given the incoming throughput of itxs. By
		 * setting "zl_cur_used" to zero, we communicate this fact to the
		 * block size selection algorithm, so it can take this information
		 * into account, and potentially select a smaller size for the
		 * next lwb block that is allocated.
		 */
		zilog->zl_cur_used = 0;

		if (nlwb == NULL) {
			/*
			 * When zil_lwb_write_issue() returns NULL, this
			 * indicates zio_alloc_zil() failed to allocate the
			 * "next" lwb on-disk. When this occurs, the ZIL write
			 * pipeline must be stalled; see the comment within the
			 * zil_commit_writer_stall() function for more details.
			 *
			 * We must drop the commit waiter's lock prior to
			 * calling zil_commit_writer_stall() or else we can wind
			 * up with the following deadlock:
			 *
			 * - This thread is waiting for the txg to sync while
			 *   holding the waiter's lock; txg_wait_synced() is
			 *   used within zil_commit_writer_stall().
			 *
			 * - The txg can't sync because it is waiting for this
			 *   lwb's zio callback to call dmu_tx_commit().
			 *
			 * - The lwb's zio callback can't call dmu_tx_commit()
			 *   because it's blocked trying to acquire the waiter's
			 *   lock, which occurs prior to calling dmu_tx_commit()
			 */
			mutex_exit(&zcw->zcw_lock);
			zil_commit_writer_stall(zilog);
			mutex_enter(&zcw->zcw_lock);
		}

	out:
		mutex_exit(&zilog->zl_issuer_lock);
		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
	}

	/*
	 * This function is responsible for performing the following two tasks:
	 *
	 * 1. its primary responsibility is to block until the given "commit
	 *    waiter" is considered "done".
	 *
	 * 2. its secondary responsibility is to issue the zio for the lwb that
	 *    the given "commit waiter" is waiting on, if this function has
	 *    waited "long enough" and the lwb is still in the "open" state.
	 *
	 * Given a sufficient amount of itxs being generated and written using
	 * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
	 * function. If this does not occur, this secondary responsibility will
	 * ensure the lwb is issued even if there is no other synchronous
	 * activity on the system.
	 *
	 * For more details, see zil_process_commit_list(); more specifically,
	 * the comment at the bottom of that function.
	 *
	 * The caller must hold neither "zl_lock" nor "zl_issuer_lock"; the
	 * waiter's "zcw_lock" is acquired and released by this function.
	 */
	static void
	zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
	{
		ASSERT(!MUTEX_HELD(&zilog->zl_lock));
		ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
		ASSERT(spa_writeable(zilog->zl_spa));

		mutex_enter(&zcw->zcw_lock);

		/*
		 * The timeout is scaled based on the lwb latency to avoid
		 * significantly impacting the latency of each individual itx.
		 * For more details, see the comment at the bottom of the
		 * zil_process_commit_list() function.
		 */
		int pct = MAX(zfs_commit_timeout_pct, 1);
	#if defined(illumos) || !defined(_KERNEL)
		hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
		hrtime_t wakeup = gethrtime() + sleep;
	#else
		sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
		sbintime_t wakeup = getsbinuptime() + sleep;
	#endif
		boolean_t timedout = B_FALSE;

		while (!zcw->zcw_done) {
			ASSERT(MUTEX_HELD(&zcw->zcw_lock));

			lwb_t *lwb = zcw->zcw_lwb;

			/*
			 * Usually, the waiter will have a non-NULL lwb field here,
			 * but it's possible for it to be NULL as a result of
			 * zil_commit() racing with spa_sync().
			 *
			 * When zil_clean() is called, it's possible for the itxg
			 * list (which may be cleaned via a taskq) to contain
			 * commit itxs. When this occurs, the commit waiters linked
			 * off of these commit itxs will not be committed to an
			 * lwb. Additionally, these commit waiters will not be
			 * marked done until zil_commit_waiter_skip() is called via
			 * zil_itxg_clean().
			 *
			 * Thus, it's possible for this commit waiter (i.e. the
			 * "zcw" variable) to be found in this "in between" state;
			 * where its "zcw_lwb" field is NULL, and it hasn't yet
			 * been skipped, so its "zcw_done" field is still B_FALSE.
			 */
			IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);

			if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
				ASSERT3B(timedout, ==, B_FALSE);

				/*
				 * If the lwb hasn't been issued yet, then we
				 * need to wait with a timeout, in case this
				 * function needs to issue the lwb after the
				 * timeout is reached; responsibility (2) from
				 * the comment above this function.
				 */
	#if defined(illumos) || !defined(_KERNEL)
				clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
				    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
				    CALLOUT_FLAG_ABSOLUTE);

				if (timeleft >= 0 || zcw->zcw_done)
					continue;
	#else
				int wait_err = cv_timedwait_sbt(&zcw->zcw_cv,
				    &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE);
				if (wait_err != EWOULDBLOCK || zcw->zcw_done)
					continue;
	#endif

				/* The wait timed out and the waiter isn't done. */
				timedout = B_TRUE;
				zil_commit_waiter_timeout(zilog, zcw);

				if (!zcw->zcw_done) {
					/*
					 * If the commit waiter has already been
					 * marked "done", it's possible for the
					 * waiter's lwb structure to have already
					 * been freed. Thus, we can only reliably
					 * make these assertions if the waiter
					 * isn't done.
					 */
					ASSERT3P(lwb, ==, zcw->zcw_lwb);
					ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
				}
			} else {
				/*
				 * If the lwb isn't open, then it must have already
				 * been issued. In that case, there's no need to
				 * use a timeout when waiting for the lwb to
				 * complete.
				 *
				 * Additionally, if the lwb is NULL, the waiter
				 * will soon be signalled and marked done via
				 * zil_clean() and zil_itxg_clean(), so no timeout
				 * is required.
				 */

				IMPLY(lwb != NULL,
				    lwb->lwb_state == LWB_STATE_ISSUED ||
				    lwb->lwb_state == LWB_STATE_DONE);
				cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
			}
		}

		mutex_exit(&zcw->zcw_lock);
	}

	/*
	 * Allocate a commit waiter from zil_zcw_cache and initialize it: the
	 * condition variable and mutex are set up, the list linkage is
	 * initialized, and the waiter starts out attached to no lwb, not done,
	 * and with no zio error recorded.
	 *
	 * The allocation uses KM_SLEEP. Release the waiter with
	 * zil_free_commit_waiter().
	 */
	static zil_commit_waiter_t *
	zil_alloc_commit_waiter(void)
	{
		zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);

		cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
		list_link_init(&zcw->zcw_node);
		zcw->zcw_lwb = NULL;
		zcw->zcw_done = B_FALSE;
		zcw->zcw_zio_error = 0;

		return (zcw);
	}

	/*
	 * Tear down a commit waiter and return it to zil_zcw_cache.
	 *
	 * The waiter is expected to be off any waiter list, detached from its
	 * lwb, and already marked done.
	 */
	static void
	zil_free_commit_waiter(zil_commit_waiter_t *zcw)
	{
		/* Sanity-check that nothing still references this waiter. */
		ASSERT(!list_link_active(&zcw->zcw_node));
		ASSERT3P(zcw->zcw_lwb, ==, NULL);
		ASSERT3B(zcw->zcw_done, ==, B_TRUE);

		/* Undo the initialization done by zil_alloc_commit_waiter(). */
		cv_destroy(&zcw->zcw_cv);
		mutex_destroy(&zcw->zcw_lock);

		kmem_cache_free(zil_zcw_cache, zcw);
	}

	/*
	 * This function is used to create a TX_COMMIT itx and assign it. This
	 * way, it will be linked into the ZIL's list of synchronous itxs, and
	 * then later committed to an lwb (or skipped) when
	 * zil_process_commit_list() is called.
	 *
	 * The given commit waiter is stored in the itx's "itx_private" field.
	 */
	static void
	zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
	{
		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

		/* A commit itx carries only the common log record header. */
		itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
		itx->itx_sync = B_TRUE;
		itx->itx_private = zcw;

		zil_itx_assign(zilog, itx, tx);

		dmu_tx_commit(tx);
	}

	/*
	* Commit ZFS Intent Log transactions (itxs) to stable storage.
	*
	* When writing ZIL transactions to the on-disk representation of the
	* ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
	* itxs can be committed to a single lwb. Once a lwb is written and
	* committed to stable storage (i.e. the lwb is written, and vdevs have
	* been flushed), each itx that was committed to that lwb is also
	* considered to be committed to stable storage.
	*
	* When an itx is committed to an lwb, the log record (lr_t) contained
	* by the itx is copied into the lwb's zio buffer, and once this buffer
	* is written to disk, it becomes an on-disk ZIL block.
	*
	* As itxs are generated, they're inserted into the ZIL's queue of
	* uncommitted itxs. The semantics of zil_commit() are such that it will
	* block until all itxs that were in the queue when it was called, are
	* committed to stable storage.
	*
	* If "foid" is zero, this means all "synchronous" and "asynchronous"
	* itxs, for all objects in the dataset, will be committed to stable
	* storage prior to zil_commit() returning. If "foid" is non-zero, all
	* "synchronous" itxs for all objects, but only "asynchronous" itxs
	* that correspond to the foid passed in, will be committed to stable
	* storage prior to zil_commit() returning.
	*
	* Generally speaking, when zil_commit() is called, the consumer doesn't
	* actually care about _all_ of the uncommitted itxs. Instead, they're
	* simply trying to wait for a specific itx to be committed to disk,
	* but the interface(s) for interacting with the ZIL don't allow such
	* fine-grained communication. A better interface would allow a consumer
	* to create and assign an itx, and then pass a reference to this itx to
	* zil_commit(); such that zil_commit() would return as soon as that
	* specific itx was committed to disk (instead of waiting for _all_
	* itxs to be committed).
	*
	* When a thread calls zil_commit() a special "commit itx" will be
	* generated, along with a corresponding "waiter" for this commit itx.
	* zil_commit() will wait on this waiter's CV, such that when the waiter
	* is marked done, and signalled, zil_commit() will return.
	*
	* This commit itx is inserted into the queue of uncommitted itxs. This
	* provides an easy mechanism for determining which itxs were in the
	* queue prior to zil_commit() having been called, and which itxs were
	* added after zil_commit() was called.
	*
	* The commit itx is special; it doesn't have any on-disk representation.
	* When a commit itx is "committed" to an lwb, the waiter associated
	* with it is linked onto the lwb's list of waiters. Then, when that lwb
	* completes, each waiter on the lwb's list is marked done and signalled
	* -- allowing the thread waiting on the waiter to return from zil_commit().
	*
	* It's important to point out a few critical factors that allow us
	* to make use of the commit itxs, commit waiters, per-lwb lists of
	* commit waiters, and zio completion callbacks like we're doing:
	*
	* 1. The list of waiters for each lwb is traversed, and each commit
	* waiter is marked "done" and signalled, in the zio completion
	* callback of the lwb's zio[*].
	*
	* * Actually, the waiters are signalled in the zio completion
	* callback of the root zio for the DKIOCFLUSHWRITECACHE commands
	* that are sent to the vdevs upon completion of the lwb zio.
	*
	* 2. When the itxs are inserted into the ZIL's queue of uncommitted
	* itxs, the order in which they are inserted is preserved[*]; as
	* itxs are added to the queue, they are added to the tail of
	* in-memory linked lists.
	*
	* When committing the itxs to lwbs (to be written to disk), they
	* are committed in the same order in which the itxs were added to
	* the uncommitted queue's linked list(s); i.e. the linked list of
	* itxs to commit is traversed from head to tail, and each itx is
	* committed to an lwb in that order.
	*
	* * To clarify:
	*
	* - the order of "sync" itxs is preserved w.r.t. other
	* "sync" itxs, regardless of the corresponding objects.
	* - the order of "async" itxs is preserved w.r.t. other
	* "async" itxs corresponding to the same object.
	* - the order of "async" itxs is not preserved w.r.t. other
	* "async" itxs corresponding to different objects.
	* - the order of "sync" itxs w.r.t. "async" itxs (or vice
	* versa) is not preserved, even for itxs that correspond
	* to the same object.
	*
	* For more details, see: zil_itx_assign(), zil_async_to_sync(),
	* zil_get_commit_list(), and zil_process_commit_list().
	*
	* 3. The lwbs represent a linked list of blocks on disk. Thus, any
	* lwb cannot be considered committed to stable storage, until its
	* "previous" lwb is also committed to stable storage. This fact,
	* coupled with the fact described above, means that itxs are
	* committed in (roughly) the order in which they were generated.
	* This is essential because itxs are dependent on prior itxs.
	* Thus, we must not deem an itx as being committed to stable
	* storage, until all prior itxs have also been committed to
	* stable storage.
	*
	* To enforce this ordering of lwb zio's, while still leveraging as
	* much of the underlying storage performance as possible, we rely
	* on two fundamental concepts:
	*
	* 1. The creation and issuance of lwb zio's is protected by
	* the zilog's "zl_issuer_lock", which ensures only a single
	* thread is creating and/or issuing lwb's at a time
	* 2. The "previous" lwb is a child of the "current" lwb
	* (leveraging the zio parent-child dependency graph)
	*
	* By relying on this parent-child zio relationship, we can have
	* many lwb zio's concurrently issued to the underlying storage,
	* but the order in which they complete will be the same order in
	* which they were created.
	*/
	void
	zil_commit(zilog_t *zilog, uint64_t foid)
	{
		/*
		 * zil_commit() must never be called on a snapshot:
		 *
		 * 1. A snapshot may never be modified, so it cannot have any
		 *    in-flight itxs that would have modified the dataset.
		 *
		 * 2. By design, calling zil_commit() assigns a commit itx to
		 *    this zilog, which dirties the zilog. The zilog of a
		 *    snapshot must not be dirtied; there are checks in the
		 *    code that enforce this invariant and will cause a panic
		 *    if it's not upheld.
		 */
		ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);

		/* Nothing to do when sync semantics are disabled. */
		if (zilog->zl_sync == ZFS_SYNC_DISABLED)
			return;

		if (!spa_writeable(zilog->zl_spa)) {
			/*
			 * If the SPA is not writable, there should never be
			 * any pending itxs waiting to be committed to disk.
			 * If that weren't true, we'd skip writing those itxs
			 * out, and would break the semantics of zil_commit();
			 * thus, we verify that truth before returning to the
			 * caller.
			 */
			ASSERT(list_is_empty(&zilog->zl_lwb_list));
			ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
			for (int t = 0; t < TXG_SIZE; t++) {
				ASSERT3P(zilog->zl_itxg[t].itxg_itxs, ==, NULL);
			}
			return;
		}

		/*
		 * If the ZIL is suspended, we don't want to dirty it by calling
		 * zil_commit_itx_assign(), nor can we write out lwbs like would
		 * be done in zil_commit_writer(). Thus, we simply rely on
		 * txg_wait_synced() to maintain the necessary semantics, and
		 * avoid calling those functions altogether.
		 */
		if (zilog->zl_suspend > 0) {
			txg_wait_synced(zilog->zl_dmu_pool, 0);
			return;
		}

		zil_commit_impl(zilog, foid);
	}

	/*
	 * Do the actual work of zil_commit(): queue a commit itx for this
	 * caller, drive the commit list, and block until the caller's commit
	 * waiter is marked done. None of the early-exit checks performed by
	 * zil_commit() (sync disabled, pool not writeable, ZIL suspended) are
	 * repeated here.
	 */
	void
	zil_commit_impl(zilog_t *zilog, uint64_t foid)
	{
		/*
		 * Move the "async" itxs for the specified foid to the "sync"
		 * queues, such that they will be later committed (or skipped)
		 * to an lwb when zil_process_commit_list() is called.
		 *
		 * Since these "async" itxs must be committed prior to this
		 * call to zil_commit returning, we must perform this operation
		 * before we call zil_commit_itx_assign().
		 */
		zil_async_to_sync(zilog, foid);

		/*
		 * We allocate a new "waiter" structure which will initially be
		 * linked to the commit itx using the itx's "itx_private" field.
		 * Since the commit itx doesn't represent any on-disk state,
		 * when it's committed to an lwb, rather than copying its
		 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
		 * added to the lwb's list of waiters. Then, when the lwb is
		 * committed to stable storage, each waiter in the lwb's list of
		 * waiters will be marked "done", and signalled.
		 *
		 * We must create the waiter and assign the commit itx prior to
		 * calling zil_commit_writer(), or else our specific commit itx
		 * is not guaranteed to be committed to an lwb prior to calling
		 * zil_commit_waiter().
		 */
		zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
		zil_commit_itx_assign(zilog, zcw);

		zil_commit_writer(zilog, zcw);
		zil_commit_waiter(zilog, zcw);

		if (zcw->zcw_zio_error != 0) {
			/*
			 * If there was an error writing out the ZIL blocks that
			 * this thread is waiting on, then we fallback to
			 * relying on spa_sync() to write out the data this
			 * thread is waiting on. Obviously this has performance
			 * implications, but the expectation is for this to be
			 * an exceptional case, and shouldn't occur often.
			 */
			DTRACE_PROBE2(zil__commit__io__error,
			    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
			txg_wait_synced(zilog->zl_dmu_pool, 0);
		}

		zil_free_commit_waiter(zcw);
	}

	/*
	 * Called in syncing context to free committed log blocks and update log
	 * header.
	 *
	 * Only the first sync pass does any work. The replay sequence number
	 * recorded for this txg is folded into the header, the header is reset
	 * if the log is being destroyed in this txg, and lwbs at the head of
	 * the list are freed until one is found that still has a buffer or
	 * whose lwb_max_txg is beyond this txg.
	 */
	void
	zil_sync(zilog_t *zilog, dmu_tx_t *tx)
	{
		zil_header_t *zh = zil_header_in_syncing_context(zilog);
		uint64_t txg = dmu_tx_get_txg(tx);
		spa_t *spa = zilog->zl_spa;
		uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
		lwb_t *lwb;

		/*
		 * We don't zero out zl_destroy_txg, so make sure we don't try
		 * to destroy it twice.
		 */
		if (spa_sync_pass(spa) != 1)
			return;

		mutex_enter(&zilog->zl_lock);

		ASSERT(zilog->zl_stop_sync == 0);

		if (*replayed_seq != 0) {
			ASSERT(zh->zh_replay_seq < *replayed_seq);
			zh->zh_replay_seq = *replayed_seq;
			*replayed_seq = 0;
		}

		if (zilog->zl_destroy_txg == txg) {
			blkptr_t blk = zh->zh_log;

			ASSERT(list_head(&zilog->zl_lwb_list) == NULL);

			bzero(zh, sizeof (zil_header_t));
			bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));

			if (zilog->zl_keep_first) {
				/*
				 * If this block was part of log chain that couldn't
				 * be claimed because a device was missing during
				 * zil_claim(), but that device later returns,
				 * then this block could erroneously appear valid.
				 * To guard against this, assign a new GUID to the new
				 * log chain so it doesn't matter what blk points to.
				 */
				zil_init_log_chain(zilog, &blk);
				zh->zh_log = blk;
			}
		}

		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			zh->zh_log = lwb->lwb_blk;
			if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
				break;
			list_remove(&zilog->zl_lwb_list, lwb);
			zio_free(spa, txg, &lwb->lwb_blk);
			zil_free_lwb(zilog, lwb);

			/*
			 * If we don't have anything left in the lwb list then
			 * we've had an allocation failure and we need to zero
			 * out the zil_header blkptr so that we don't end
			 * up freeing the same block twice.
			 */
			if (list_head(&zilog->zl_lwb_list) == NULL)
				BP_ZERO(&zh->zh_log);
		}
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * Constructor registered for zil_lwb_cache in zil_init(): sets up the
	 * lwb's list of commit waiters, its AVL tree of vdev nodes, and the
	 * mutex "lwb_vdev_lock". "unused" and "kmflag" are ignored. Always
	 * returns 0.
	 */
	/* ARGSUSED */
	static int
	zil_lwb_cons(void *vbuf, void *unused, int kmflag)
	{
		lwb_t *lwb = vbuf;
		list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
		    offsetof(zil_commit_waiter_t, zcw_node));
		avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
		    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
		mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
		return (0);
	}

	/*
	 * Destructor registered for zil_lwb_cache in zil_init(): destroys what
	 * zil_lwb_cons() created, in reverse order. "unused" is ignored.
	 */
	/* ARGSUSED */
	static void
	zil_lwb_dest(void *vbuf, void *unused)
	{
		lwb_t *lwb = vbuf;
		mutex_destroy(&lwb->lwb_vdev_lock);
		avl_destroy(&lwb->lwb_vdev_tree);
		list_destroy(&lwb->lwb_waiters);
	}

	/*
	 * Create the two kmem caches used by the ZIL: one for lwb_t objects
	 * (with constructor and destructor callbacks) and one for commit
	 * waiters (with no callbacks).
	 */
	void
	zil_init(void)
	{
		zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
		    sizeof (lwb_t), 0,
		    zil_lwb_cons, zil_lwb_dest, NULL,
		    NULL, NULL, 0);

		zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
		    sizeof (zil_commit_waiter_t), 0,
		    NULL, NULL, NULL,
		    NULL, NULL, 0);
	}

	/*
	 * Destroy the kmem caches created by zil_init(), in the reverse of the
	 * order in which they were created.
	 */
	void
	zil_fini(void)
	{
		kmem_cache_destroy(zil_zcw_cache);	/* commit waiters */
		kmem_cache_destroy(zil_lwb_cache);	/* log write blocks */
	}

	/*
	 * Record a new value in this zilog's "zl_sync" field. No locking is
	 * performed here.
	 */
	void
	zil_set_sync(zilog_t *zilog, uint64_t sync)
	{
		zilog->zl_sync = sync;
	}

	/*
	 * Record a new value in this zilog's "zl_logbias" field. No locking is
	 * performed here.
	 */
	void
	zil_set_logbias(zilog_t *zilog, uint64_t logbias)
	{
		zilog->zl_logbias = logbias;
	}

	/*
	 * Allocate and initialize the in-memory zilog for the given objset.
	 *
	 * "zh_phys" is stored as the zilog's header pointer. The logbias and
	 * sync settings are read from the objset. The returned zilog is
	 * zero-filled apart from the fields set here, and is released with
	 * zil_free().
	 */
	zilog_t *
	zil_alloc(objset_t *os, zil_header_t *zh_phys)
	{
		zilog_t *zilog;

		zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);

		zilog->zl_header = zh_phys;
		zilog->zl_os = os;
		zilog->zl_spa = dmu_objset_spa(os);
		zilog->zl_dmu_pool = dmu_objset_pool(os);
		zilog->zl_destroy_txg = TXG_INITIAL - 1;
		zilog->zl_logbias = dmu_objset_logbias(os);
		zilog->zl_sync = dmu_objset_syncprop(os);
		zilog->zl_dirty_max_txg = 0;
		zilog->zl_last_lwb_opened = NULL;
		zilog->zl_last_lwb_latency = 0;

		mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);

		/* One itxg lock per in-flight txg slot. */
		for (int i = 0; i < TXG_SIZE; i++) {
			mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}

		list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
		    offsetof(lwb_t, lwb_node));

		list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
		    offsetof(itx_t, itx_node));

		cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);

		return (zilog);
	}

	/*
	 * Tear down and free a zilog allocated by zil_alloc().
	 *
	 * The zilog must not be suspended or suspending, and both its lwb list
	 * and its itx commit list must already be empty.
	 */
	void
	zil_free(zilog_t *zilog)
	{
		zilog->zl_stop_sync = 1;

		ASSERT0(zilog->zl_suspend);
		ASSERT0(zilog->zl_suspending);

		ASSERT(list_is_empty(&zilog->zl_lwb_list));
		list_destroy(&zilog->zl_lwb_list);

		ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
		list_destroy(&zilog->zl_itx_commit_list);

		for (int t = 0; t < TXG_SIZE; t++) {
			/*
			 * An itx can be generated without dirtying a txg (e.g.
			 * ztest TX_TRUNCATE), in which case no zil_clean()
			 * callback ever removes the entry; clean those up
			 * here. This also frees up the ziltest itxs.
			 */
			if (zilog->zl_itxg[t].itxg_itxs != NULL) {
				zil_itxg_clean(zilog->zl_itxg[t].itxg_itxs);
			}
			mutex_destroy(&zilog->zl_itxg[t].itxg_lock);
		}

		mutex_destroy(&zilog->zl_issuer_lock);
		mutex_destroy(&zilog->zl_lock);

		cv_destroy(&zilog->zl_cv_suspend);

		kmem_free(zilog, sizeof (zilog_t));
	}

	/*
	 * Open an intent log.
	 *
	 * Looks up the zilog of the given objset, installs "get_data" as its
	 * "zl_get_data" callback, and returns the zilog. The zilog is expected
	 * to have no callback installed, no lwb opened, and an empty lwb list.
	 */
	zilog_t *
	zil_open(objset_t *os, zil_get_data_t *get_data)
	{
		zilog_t *zilog = dmu_objset_zil(os);

		ASSERT3P(zilog->zl_get_data, ==, NULL);
		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
		ASSERT(list_is_empty(&zilog->zl_lwb_list));

		zilog->zl_get_data = get_data;

		return (zilog);
	}

	/*
	 * Close an intent log.
	 *
	 * For a non-snapshot dataset, all outstanding itxs are committed first.
	 * The function then waits for the relevant txg to sync, verifies the
	 * zilog is clean, clears the "zl_get_data" callback, and frees the one
	 * lwb that may remain on the list.
	 */
	void
	zil_close(zilog_t *zilog)
	{
		lwb_t *lwb;
		uint64_t txg;

		if (!dmu_objset_is_snapshot(zilog->zl_os)) {
			zil_commit(zilog, 0);
		} else {
			ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
			ASSERT0(zilog->zl_dirty_max_txg);
			ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
		}

		mutex_enter(&zilog->zl_lock);
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL)
			txg = zilog->zl_dirty_max_txg;
		else
			txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
		mutex_exit(&zilog->zl_lock);

		/*
		 * We need to use txg_wait_synced() to wait long enough for the
		 * ZIL to be clean, and to wait for all pending lwbs to be
		 * written out.
		 */
		if (txg != 0)
			txg_wait_synced(zilog->zl_dmu_pool, txg);

		/*
		 * Log the offending zilog before the VERIFY below fires. The
		 * txg is cast explicitly so the argument matches "%llu"
		 * regardless of how uint64_t is defined on the platform.
		 */
		if (zilog_is_dirty(zilog))
			zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog,
			    (u_longlong_t)txg);
		VERIFY(!zilog_is_dirty(zilog));

		zilog->zl_get_data = NULL;

		/*
		 * We should have only one lwb left on the list; remove it now.
		 */
		mutex_enter(&zilog->zl_lock);
		lwb = list_head(&zilog->zl_lwb_list);
		if (lwb != NULL) {
			ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
			ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
			list_remove(&zilog->zl_lwb_list, lwb);
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zil_free_lwb(zilog, lwb);
		}
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * Tag passed to the objset/dataset hold and release calls made by
	 * zil_suspend() and zil_resume().
	 */
	static char *suspend_tag = "zil suspending";

	/*
	 * Suspend an intent log. While in suspended mode, we still honor
	 * synchronous semantics, but we rely on txg_wait_synced() to do it.
	 * On old version pools, we suspend the log briefly when taking a
	 * snapshot so that it will have an empty intent log.
	 *
	 * Long holds are not really intended to be used the way we do here --
	 * held for such a short time. A concurrent caller of
	 * dsl_dataset_long_held() could fail. Therefore we take pains to only
	 * put a long hold if it is actually necessary. Fortunately, it will
	 * only be necessary if the objset is currently mounted (or the ZVOL
	 * equivalent). In that case it will already have a long hold, so we
	 * are not really making things any worse.
	 *
	 * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t
	 * or zvol_state_t), and use their mechanism to prevent their hold from
	 * being dropped (e.g. VFS_HOLD()). However, that would be even more
	 * pain for very little gain.
	 *
	 * If cookiep == NULL, this does both the suspend & resume.
	 * Otherwise, it returns with the dataset "long held", and the cookie
	 * should be passed into zil_resume().
	 *
	 * Returns 0 on success, the error from dmu_objset_hold() if the objset
	 * cannot be held, or EBUSY if the log still needs to be replayed.
	 */
	int
	zil_suspend(const char *osname, void **cookiep)
	{
		objset_t *os;
		zilog_t *zilog;
		const zil_header_t *zh;
		int error;

		error = dmu_objset_hold(osname, suspend_tag, &os);
		if (error != 0)
			return (error);
		zilog = dmu_objset_zil(os);

		mutex_enter(&zilog->zl_lock);
		zh = zilog->zl_header;

		if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
			mutex_exit(&zilog->zl_lock);
			dmu_objset_rele(os, suspend_tag);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * Don't put a long hold in the cases where we can avoid it. This
		 * is when there is no cookie so we are doing a suspend & resume
		 * (i.e. called from zil_vdev_offline()), and there's nothing to do
		 * for the suspend because it's already suspended, or there's no
		 * ZIL.
		 */
		if (cookiep == NULL && !zilog->zl_suspending &&
		    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
			mutex_exit(&zilog->zl_lock);
			dmu_objset_rele(os, suspend_tag);
			return (0);
		}

		dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
		dsl_pool_rele(dmu_objset_pool(os), suspend_tag);

		zilog->zl_suspend++;

		if (zilog->zl_suspend > 1) {
			/*
			 * Someone else is already suspending it.
			 * Just wait for them to finish.
			 */

			while (zilog->zl_suspending)
				cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
			mutex_exit(&zilog->zl_lock);

			if (cookiep == NULL)
				zil_resume(os);
			else
				*cookiep = os;
			return (0);
		}

		/*
		 * If there is no pointer to an on-disk block, this ZIL must not
		 * be active (e.g. filesystem not mounted), so there's nothing
		 * to clean up.
		 */
		if (BP_IS_HOLE(&zh->zh_log)) {
			ASSERT(cookiep != NULL); /* fast path already handled */

			*cookiep = os;
			mutex_exit(&zilog->zl_lock);
			return (0);
		}

		zilog->zl_suspending = B_TRUE;
		mutex_exit(&zilog->zl_lock);

		/*
		 * We need to use zil_commit_impl to ensure we wait for all
		 * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
		 * to disk before proceeding. If we used zil_commit instead, it
		 * would just call txg_wait_synced(), because zl_suspend is set.
		 * txg_wait_synced() doesn't wait for these lwb's to be
		 * LWB_STATE_DONE before returning.
		 */
		zil_commit_impl(zilog, 0);

		/*
		 * Now that we've ensured all lwb's are LWB_STATE_DONE, we use
		 * txg_wait_synced() to ensure the data from the zilog has
		 * migrated to the main pool before calling zil_destroy().
		 */
		txg_wait_synced(zilog->zl_dmu_pool, 0);

		zil_destroy(zilog, B_FALSE);

		/* Wake any threads waiting for this suspend to complete. */
		mutex_enter(&zilog->zl_lock);
		zilog->zl_suspending = B_FALSE;
		cv_broadcast(&zilog->zl_cv_suspend);
		mutex_exit(&zilog->zl_lock);

		if (cookiep == NULL)
			zil_resume(os);
		else
			*cookiep = os;
		return (0);
	}

	/*
	 * Undo one zil_suspend(): decrement the zilog's suspend count under
	 * "zl_lock", then drop the long hold and the dataset hold taken with
	 * suspend_tag.
	 *
	 * "cookie" is the objset pointer that zil_suspend() handed back
	 * through its cookiep argument.
	 */
	void
	zil_resume(void *cookie)
	{
		objset_t *os = cookie;
		zilog_t *zilog = dmu_objset_zil(os);

		/* The suspend count is protected by zl_lock. */
		mutex_enter(&zilog->zl_lock);
		ASSERT(zilog->zl_suspend != 0);
		zilog->zl_suspend--;
		mutex_exit(&zilog->zl_lock);

		/* Release the holds only after the count has been updated. */
		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
	}

	/*
	 * State handed to zil_replay_log_record() through its opaque "zra"
	 * argument.
	 */
	typedef struct zil_replay_arg {
	/* table of replay functions, indexed by transaction type */
	zil_replay_func_t **zr_replay;
	/* opaque argument passed as the first parameter of each replay function */
	void *zr_arg;
	/* whether the log record being replayed was byteswapped */
	boolean_t zr_byteswap;
	/* scratch buffer each log record is copied into before it is replayed */
	char *zr_lr;
	} zil_replay_arg_t;

	static int
	zil_replay_error(zilog_t zilog, lr_t lr, int error)
	{
	char name[ZFS_MAX_DATASET_NAME_LEN];

	zilog->zl_replaying_seq--; /* didn't actually replay this one */

	dmu_objset_name(zilog->zl_os, name);

	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	"dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
	(u_longlong_t)lr->lrc_seq,
	(u_longlong_t)(lr->lrc_txtype & ~TX_CI),
	(lr->lrc_txtype & TX_CI) ? "CI" : "");

	return (error);
	}

	/*
	 * Replay a single log record.
	 *
	 * "zra" is a zil_replay_arg_t. Records that were already replayed
	 * (sequence number not beyond the header's zh_replay_seq) or already
	 * committed (lrc_txg older than claim_txg) are skipped and reported as
	 * success. Returns 0 on success or skip; otherwise the value returned
	 * by zil_replay_error().
	 */
	static int
	zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
	{
		zil_replay_arg_t *zr = zra;
		const zil_header_t *zh = zilog->zl_header;
		uint64_t reclen = lr->lrc_reclen;
		uint64_t txtype = lr->lrc_txtype;
		int error = 0;

		zilog->zl_replaying_seq = lr->lrc_seq;

		if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
			return (0);

		if (lr->lrc_txg < claim_txg)		/* already committed */
			return (0);

		/* Strip case-insensitive bit, still present in log record */
		txtype &= ~TX_CI;

		if (txtype == 0 || txtype >= TX_MAX_TYPE)
			return (zil_replay_error(zilog, lr, EINVAL));

		/*
		 * If this record type can be logged out of order, the object
		 * (lr_foid) may no longer exist. That's legitimate, not an error.
		 */
		if (TX_OOO(txtype)) {
			error = dmu_object_info(zilog->zl_os,
			    ((lr_ooo_t *)lr)->lr_foid, NULL);
			if (error == ENOENT || error == EEXIST)
				return (0);
		}

		/*
		 * Make a copy of the data so we can revise and extend it.
		 */
		bcopy(lr, zr->zr_lr, reclen);

		/*
		 * If this is a TX_WRITE with a blkptr, suck in the data.
		 */
		if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
			error = zil_read_log_data(zilog, (lr_write_t *)lr,
			    zr->zr_lr + reclen);
			if (error != 0)
				return (zil_replay_error(zilog, lr, error));
		}

		/*
		 * The log block containing this lr may have been byteswapped
		 * so that we can easily examine common fields like lrc_txtype.
		 * However, the log is a mix of different record types, and only
		 * the replay vectors know how to byteswap their records.
		 * Therefore, if the lr was byteswapped, undo it before invoking
		 * the replay vector.
		 */
		if (zr->zr_byteswap)
			byteswap_uint64_array(zr->zr_lr, reclen);

		/*
		 * We must now do two things atomically: replay this log record,
		 * and update the log header sequence number to reflect the fact
		 * that we did so. At the end of each replay function the sequence
		 * number is updated if we are in replay mode.
		 */
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
		if (error != 0) {
			/*
			 * The DMU's dnode layer doesn't see removes until the txg
			 * commits, so a subsequent claim can spuriously fail with
			 * EEXIST. So if we receive any error we try syncing out
			 * any removes then retry the transaction. Note that we
			 * specify B_FALSE for byteswap now, so we don't do it
			 * twice.
			 */
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
			if (error != 0)
				return (zil_replay_error(zilog, lr, error));
		}
		return (0);
	}

	/*
	 * Log-block callback that zil_replay() passes to zil_parse(): counts each
	 * block visited in zl_replay_blks.  Always returns 0; bp, arg and
	 * claim_txg are unused.
	 */
	/* ARGSUSED */
	static int
	zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
	{
		zilog->zl_replay_blks++;

		return (0);
	}

	/*
	 * If this dataset has a non-empty intent log, replay it and destroy it.
	 *
	 * os          - objset whose ZIL is to be replayed
	 * arg         - opaque value passed as first argument to every vector
	 * replay_func - replay vectors, indexed by transaction type
	 *
	 * When the header does not carry ZIL_REPLAY_NEEDED the log is simply
	 * destroyed.  Otherwise every record is fed to zil_replay_log_record(),
	 * the log is destroyed, and we wait for the destroy txg to sync before
	 * clearing zl_replay.
	 */
	void
	zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
	{
		zilog_t *zilog = dmu_objset_zil(os);
		const zil_header_t *zh = zilog->zl_header;
		zil_replay_arg_t zr;

		if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
			zil_destroy(zilog, B_TRUE);
			return;
		}

		zr.zr_replay = replay_func;
		zr.zr_arg = arg;
		zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
		/* Room for a record copy plus the data read in behind it. */
		zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

		/*
		 * Wait for in-progress removes to sync before starting replay.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, 0);

		zilog->zl_replay = B_TRUE;
		zilog->zl_replay_time = ddi_get_lbolt();
		ASSERT(zilog->zl_replay_blks == 0);
		(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
		    zh->zh_claim_txg);
		kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);

		zil_destroy(zilog, B_FALSE);
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
		zilog->zl_replay = B_FALSE;
	}

	/*
	 * Returns B_TRUE when the log has sync disabled (ZFS_SYNC_DISABLED) or is
	 * currently being replayed, B_FALSE otherwise.
	 *
	 * In the replay case the dataset is also dirtied in 'tx' and the sequence
	 * number being replayed is recorded in the zl_replayed_seq[] slot for
	 * tx's txg.
	 */
	boolean_t
	zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
	{
		if (zilog->zl_sync == ZFS_SYNC_DISABLED)
			return (B_TRUE);

		if (zilog->zl_replay) {
			dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
			zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
			    zilog->zl_replaying_seq;
			return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Suspend the intent log of dataset 'osname' without keeping a cookie.
	 * Presumably a NULL cookie makes zil_suspend() resume the log again
	 * before returning -- confirm against zil_suspend().
	 *
	 * Returns 0 on success; any zil_suspend() failure is reported as EEXIST.
	 * 'arg' is unused.
	 */
	/* ARGSUSED */
	int
	-zil_vdev_offline(const char osname, void arg)
	+zil_reset(const char osname, void arg)
	{
	int error;

	error = zil_suspend(osname, NULL);
	if (error != 0)
	return (SET_ERROR(EEXIST));
	return (0);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 332525)
	@@ -1,4151 +1,4183 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
	* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	*/

	#include <sys/sysmacros.h>
	#include <sys/zfs_context.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/spa_impl.h>
	#include <sys/vdev_impl.h>
	#include <sys/zio_impl.h>
	#include <sys/zio_compress.h>
	#include <sys/zio_checksum.h>
	#include <sys/dmu_objset.h>
	#include <sys/arc.h>
	#include <sys/ddt.h>
	#include <sys/trim_map.h>
	#include <sys/blkptr.h>
	#include <sys/zfeature.h>
	#include <sys/metaslab_impl.h>
	#include <sys/abd.h>

	/* Tunables for this file live under the vfs.zfs.zio sysctl node. */
	SYSCTL_DECL(_vfs_zfs);
	SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
	/*
	 * When nonzero, zio_init() creates per-size kmem caches and the
	 * zio_*buf_alloc()/free() routines use them; otherwise plain
	 * kmem_alloc()/kmem_free() is used.  Enabled by default on amd64 only.
	 */
	#if defined(__amd64__)
	static int zio_use_uma = 1;
	#else
	static int zio_use_uma = 0;
	#endif
	SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
	"Use uma(9) for ZIO allocations");
	/*
	 * When nonzero, metadata buffers are allocated with the NODEBUG flag
	 * (KMC_NODEBUG for caches, KM_NODEBUG for kmem_alloc).
	 */
	static int zio_exclude_metadata = 0;
	SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
	"Exclude metadata buffers from dumps as well");

	/*
	 * Named kstat counters for TRIM requests.  zio_init() publishes this
	 * table as the virtual kstat "zfs:0:zio_trim".
	 */
	zio_trim_stats_t zio_trim_stats = {
	{ "bytes", KSTAT_DATA_UINT64,
	"Number of bytes successfully TRIMmed" },
	{ "success", KSTAT_DATA_UINT64,
	"Number of successful TRIM requests" },
	{ "unsupported", KSTAT_DATA_UINT64,
	"Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed", KSTAT_DATA_UINT64,
	"Number of TRIM requests that failed for reasons other than not supported" },
	};

	/* kstat handle created in zio_init() and deleted in zio_fini(). */
	static kstat_t *zio_trim_ksp;

	/*
	* ==========================================================================
	* I/O type descriptions
	* ==========================================================================
	*/
	const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
	};

	boolean_t zio_dva_throttle_enabled = B_TRUE;
	SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
	&zio_dva_throttle_enabled, 0, "");

	/*
	* ==========================================================================
	* I/O kmem caches
	* ==========================================================================
	*/
	kmem_cache_t *zio_cache;
	kmem_cache_t *zio_link_cache;
	kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
	kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

	#ifdef _KERNEL
	extern vmem_t *zio_alloc_arena;
	#endif

	#define ZIO_PIPELINE_CONTINUE 0x100
	#define ZIO_PIPELINE_STOP 0x101

	#define BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
	#define COMPARE_META_LEVEL 0x80000000ul
	/*
	* The following actions directly affect the spa's sync-to-convergence logic.
	* The values below define the sync pass when we start performing the action.
	* Care should be taken when changing these values as they directly impact
	* spa_sync() performance. Tuning these values may introduce subtle performance
	* pathologies and should only be done in the context of performance analysis.
	* These tunables will eventually be removed and replaced with #defines once
	* enough analysis has been done to determine optimal values.
	*
	* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
	* regular blocks are not deferred.
	*/
	int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
	SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
	&zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
	int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
	SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
	&zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
	int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
	SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
	&zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

	/*
	* An allocating zio is one that either currently has the DVA allocate
	* stage set or will have it later in its lifetime.
	*/
	#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

	boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

	#ifdef illumos
	#ifdef ZFS_DEBUG
	int zio_buf_debug_limit = 16384;
	#else
	int zio_buf_debug_limit = 0;
	#endif
	#endif

	static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);

	/*
	 * One-time setup for the zio layer.
	 *
	 * Creates the zio and zio_link caches, and (only when zio_use_uma is set)
	 * the per-size metadata and data buffer caches.  Then initializes fault
	 * injection and installs the "zio_trim" kstat.
	 */
	void
	zio_init(void)
	{
		size_t c;
		zio_cache = kmem_cache_create("zio_cache",
		    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
		zio_link_cache = kmem_cache_create("zio_link_cache",
		    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
		if (!zio_use_uma)
			goto out;

		/*
		 * For small buffers, we want a cache for each multiple of
		 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
		 * for each quarter-power of 2.
		 */
		for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
			size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
			size_t p2 = size;
			size_t align = 0;
			int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;

			/* Clear low bits until only the highest set bit remains. */
			while (!ISP2(p2))
				p2 &= p2 - 1;

	#ifdef illumos
	#ifndef _KERNEL
			/*
			 * If we are using watchpoints, put each buffer on its own page,
			 * to eliminate the performance overhead of trapping to the
			 * kernel when modifying a non-watched buffer that shares the
			 * page with a watched buffer.
			 */
			if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
				continue;
	#endif
	#endif /* illumos */
			if (size <= 4 * SPA_MINBLOCKSIZE) {
				align = SPA_MINBLOCKSIZE;
			} else if (IS_P2ALIGNED(size, p2 >> 2)) {
				align = MIN(p2 >> 2, PAGESIZE);
			}

			/* align stays 0 for sizes that do not get their own cache. */
			if (align != 0) {
				char name[36];
				(void) snprintf(name, sizeof (name), "zio_buf_%lu",
				    (ulong_t)size);
				zio_buf_cache[c] = kmem_cache_create(name, size,
				    align, NULL, NULL, NULL, NULL, NULL, cflags);

				/*
				 * Since zio_data bufs do not appear in crash dumps, we
				 * pass KMC_NOTOUCH so that no allocator metadata is
				 * stored with the buffers.
				 */
				(void) snprintf(name, sizeof (name),
				    "zio_data_buf_%lu", (ulong_t)size);
				zio_data_buf_cache[c] = kmem_cache_create(name, size,
				    align, NULL, NULL, NULL, NULL, NULL,
				    cflags | KMC_NOTOUCH | KMC_NODEBUG);
			}
		}

		/*
		 * Walk back down from the largest size and point every slot that
		 * has no cache of its own at the next larger cache.
		 */
		while (--c != 0) {
			ASSERT(zio_buf_cache[c] != NULL);
			if (zio_buf_cache[c - 1] == NULL)
				zio_buf_cache[c - 1] = zio_buf_cache[c];

			ASSERT(zio_data_buf_cache[c] != NULL);
			if (zio_data_buf_cache[c - 1] == NULL)
				zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
		}
	out:

		zio_inject_init();

		zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
		    KSTAT_TYPE_NAMED,
		    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
		    KSTAT_FLAG_VIRTUAL);

		/* The kstat is optional; zio_fini() copes with a NULL handle. */
		if (zio_trim_ksp != NULL) {
			zio_trim_ksp->ks_data = &zio_trim_stats;
			kstat_install(zio_trim_ksp);
		}
	}

	/*
	 * Tear down what zio_init() set up: the buffer caches, the zio and
	 * zio_link caches, fault injection, and the TRIM kstat.
	 */
	void
	zio_fini(void)
	{
		const size_t ncaches = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
		kmem_cache_t *prev_meta = NULL;
		kmem_cache_t *prev_data = NULL;

		/*
		 * Several consecutive slots may refer to the same cache, so only
		 * destroy a cache when the slot differs from the one before it.
		 */
		for (size_t idx = 0; idx < ncaches; idx++) {
			kmem_cache_t *meta = zio_buf_cache[idx];
			kmem_cache_t *data = zio_data_buf_cache[idx];

			if (meta != prev_meta) {
				prev_meta = meta;
				kmem_cache_destroy(meta);
			}
			zio_buf_cache[idx] = NULL;

			if (data != prev_data) {
				prev_data = data;
				kmem_cache_destroy(data);
			}
			zio_data_buf_cache[idx] = NULL;
		}

		kmem_cache_destroy(zio_link_cache);
		kmem_cache_destroy(zio_cache);

		zio_inject_fini();

		if (zio_trim_ksp == NULL)
			return;

		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}

	/*
	* ==========================================================================
	* Allocate and free I/O buffers
	* ==========================================================================
	*/

	/*
	 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
	 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
	 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
	 * excess / transient data in-core during a crashdump.
	 *
	 * 'size' selects the cache slot as (size - 1) >> SPA_MINBLOCKSHIFT and
	 * must map inside the cache table (VERIFY'd).  Without zio_use_uma the
	 * buffer comes from kmem_alloc(), with KM_NODEBUG added when
	 * zio_exclude_metadata is set.
	 */
	void *
	zio_buf_alloc(size_t size)
	{
		size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
		int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

		VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

		if (zio_use_uma)
			return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
		else
			return (kmem_alloc(size, KM_SLEEP | flags));
	}

	/*
	 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
	 * crashdump if the kernel panics.  This exists so that we will limit the
	 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
	 * the amount of kernel heap dumped to disk when the kernel panics)
	 *
	 * 'size' selects the cache slot as (size - 1) >> SPA_MINBLOCKSHIFT and
	 * must map inside the cache table (VERIFY'd).
	 */
	void *
	zio_data_buf_alloc(size_t size)
	{
		size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

		VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

		if (zio_use_uma)
			return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
		else
			return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
	}

	/*
	 * Free a metadata buffer.  'size' picks the cache slot (or the
	 * kmem_free() length), so it has to map to the same slot that was used
	 * when the buffer was allocated.
	 */
	void
	zio_buf_free(void *buf, size_t size)
	{
		size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

		VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

		if (!zio_use_uma) {
			kmem_free(buf, size);
			return;
		}

		kmem_cache_free(zio_buf_cache[c], buf);
	}

	/*
	 * Free a data buffer.  'size' picks the data cache slot (or the
	 * kmem_free() length), so it has to map to the same slot that was used
	 * when the buffer was allocated.
	 */
	void
	zio_data_buf_free(void *buf, size_t size)
	{
		size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

		VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

		if (!zio_use_uma) {
			kmem_free(buf, size);
			return;
		}

		kmem_cache_free(zio_data_buf_cache[c], buf);
	}

	/*
	* ==========================================================================
	* Push and pop I/O transform buffers
	* ==========================================================================
	*/
	/*
	 * Install 'data'/'size' as the zio's current buffer, remembering the
	 * previous buffer and size on the zio's transform stack.
	 *
	 * bufsize   - nonzero if zio_pop_transforms() should abd_free() 'data'
	 * transform - optional callback run by zio_pop_transforms() with the
	 *             saved original buffer and size; may be NULL
	 */
	void
	zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
	    zio_transform_func_t *transform)
	{
		zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

		/*
		 * Ensure that anyone expecting this zio to contain a linear ABD isn't
		 * going to get a nasty surprise when they try to access the data.
		 */
	#ifdef illumos
		IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
	#else
		IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd),
		    abd_is_linear(data));
	#endif

		zt->zt_orig_abd = zio->io_abd;
		zt->zt_orig_size = zio->io_size;
		zt->zt_bufsize = bufsize;
		zt->zt_transform = transform;

		/* Push onto the head of the singly linked stack. */
		zt->zt_next = zio->io_transform_stack;
		zio->io_transform_stack = zt;

		zio->io_abd = data;
		zio->io_size = size;
	}

	/*
	 * Unwind the zio's whole transform stack, newest entry first.  For each
	 * entry: run its callback (if any) against the saved original buffer,
	 * free the current buffer when the entry recorded a nonzero bufsize, and
	 * restore the saved buffer and size.
	 */
	void
	zio_pop_transforms(zio_t *zio)
	{
		for (zio_transform_t *top = zio->io_transform_stack; top != NULL;
		    top = zio->io_transform_stack) {
			if (top->zt_transform != NULL) {
				top->zt_transform(zio,
				    top->zt_orig_abd, top->zt_orig_size);
			}

			if (top->zt_bufsize != 0)
				abd_free(zio->io_abd);

			zio->io_abd = top->zt_orig_abd;
			zio->io_size = top->zt_orig_size;
			zio->io_transform_stack = top->zt_next;

			kmem_free(top, sizeof (zio_transform_t));
		}
	}

	/*
	* ==========================================================================
	* I/O transform callbacks for subblocks and decompression
	* ==========================================================================
	*/
	/*
	 * Transform callback: for reads, copy the first 'size' bytes of the zio's
	 * current (larger) buffer into 'data'.  Other I/O types are left alone.
	 */
	static void
	zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
	{
		ASSERT(zio->io_size > size);

		if (zio->io_type == ZIO_TYPE_READ)
			abd_copy(data, zio->io_abd, size);
	}

	/*
	 * Transform callback: decompress the zio's current buffer into 'data'
	 * ('size' bytes) using the compression function recorded in the block
	 * pointer.  Does nothing if the zio already carries an error; a
	 * decompression failure sets io_error to EIO.
	 */
	static void
	zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
	{
		if (zio->io_error == 0) {
			void *tmp = abd_borrow_buf(data, size);
			int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
			    zio->io_abd, tmp, zio->io_size, size);
			abd_return_buf_copy(data, tmp, size);

			if (ret != 0)
				zio->io_error = SET_ERROR(EIO);
		}
	}

	/*
	* ==========================================================================
	* I/O parent/child relationships and pipeline interlocks
	* ==========================================================================
	*/
	/*
	 * Iterate over the parents of 'cio'.
	 *
	 * '*zl' is the iterator state: start with it set to NULL, and pass the
	 * same variable back in to get the next parent.  Returns the parent zio,
	 * or NULL once the list is exhausted.
	 */
	zio_t *
	zio_walk_parents(zio_t *cio, zio_link_t **zl)
	{
		list_t *pl = &cio->io_parent_list;

		*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
		if (*zl == NULL)
			return (NULL);

		ASSERT((*zl)->zl_child == cio);
		return ((*zl)->zl_parent);
	}

	/*
	 * Iterate over the children of 'pio'.
	 *
	 * '*zl' is the iterator state: start with it set to NULL, and pass the
	 * same variable back in to get the next child.  Returns the child zio,
	 * or NULL once the list is exhausted.
	 */
	zio_t *
	zio_walk_children(zio_t *pio, zio_link_t **zl)
	{
		list_t *cl = &pio->io_child_list;

		*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
		if (*zl == NULL)
			return (NULL);

		ASSERT((*zl)->zl_parent == pio);
		return ((*zl)->zl_child);
	}

	/*
	 * Return the first parent of 'cio' (NULL if it has none) and VERIFY that
	 * there is no second one.
	 */
	zio_t *
	zio_unique_parent(zio_t *cio)
	{
		zio_link_t *zl = NULL;
		zio_t *pio;

		pio = zio_walk_parents(cio, &zl);

		/* A second step of the walk must come back empty. */
		VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);

		return (pio);
	}

	/*
	 * Link 'cio' as a child of 'pio'.
	 *
	 * Allocates a zio_link_t, inserts it on both zios' lists, bumps both
	 * counts, and for every wait type that the child has not yet reached
	 * adds one to the parent's outstanding-children counter.  Both io_locks
	 * are held for the update, child first, then parent.
	 */
	void
	zio_add_child(zio_t *pio, zio_t *cio)
	{
		zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

		/*
		 * Logical I/Os can have logical, gang, or vdev children.
		 * Gang I/Os can have gang or vdev children.
		 * Vdev I/Os can only have vdev children.
		 * The following ASSERT captures all of these constraints.
		 */
		ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

		zl->zl_parent = pio;
		zl->zl_child = cio;

		mutex_enter(&cio->io_lock);
		mutex_enter(&pio->io_lock);

		ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

		list_insert_head(&pio->io_child_list, zl);
		list_insert_head(&cio->io_parent_list, zl);

		pio->io_child_count++;
		cio->io_parent_count++;

		mutex_exit(&pio->io_lock);
		mutex_exit(&cio->io_lock);
	}

	/*
	 * Unlink 'cio' from 'pio' and free the link 'zl' that joined them.
	 * Uses the same lock order as zio_add_child(): child first, then parent.
	 */
	static void
	zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
	{
		ASSERT(zl->zl_parent == pio);
		ASSERT(zl->zl_child == cio);

		mutex_enter(&cio->io_lock);
		mutex_enter(&pio->io_lock);

		list_remove(&pio->io_child_list, zl);
		list_remove(&cio->io_parent_list, zl);

		pio->io_child_count--;
		cio->io_parent_count--;

		mutex_exit(&pio->io_lock);
		mutex_exit(&cio->io_lock);

		kmem_cache_free(zio_link_cache, zl);
	}

	/*
	 * Check whether any child type selected by 'childbits' still has
	 * children that have not reached 'wait'.
	 *
	 * If so, the zio's stage is shifted right by one bit, io_stall is pointed
	 * at the nonzero counter, and B_TRUE is returned.  Otherwise B_FALSE is
	 * returned and the zio is untouched.
	 */
	static boolean_t
	zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
	{
		boolean_t stalled = B_FALSE;

		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stall == NULL);
		for (int type = 0; type < ZIO_CHILD_TYPES && !stalled; type++) {
			if (ZIO_CHILD_BIT_IS_SET(childbits, type)) {
				uint64_t *pending = &zio->io_children[type][wait];

				/* Only the first outstanding child type is recorded. */
				if (*pending != 0) {
					zio->io_stage >>= 1;
					ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
					zio->io_stall = pending;
					stalled = B_TRUE;
				}
			}
		}
		mutex_exit(&zio->io_lock);

		return (stalled);
	}

	/*
	 * Tell parent 'pio' that child 'zio' has reached 'wait'.
	 *
	 * Folds the child's error (unless ZIO_FLAG_DONT_PROPAGATE is set) and
	 * reexecute bits into the parent and decrements the matching
	 * outstanding-children counter.  If that counter reaches zero and is the
	 * one the parent is stalled on, the parent is dispatched to a taskq.
	 */
	static void
	zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
	{
		uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
		int *errorp = &pio->io_child_error[zio->io_child_type];

		mutex_enter(&pio->io_lock);
		/* Store through errorp so the parent's slot is actually updated. */
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
			*errorp = zio_worst_error(*errorp, zio->io_error);
		pio->io_reexecute |= zio->io_reexecute;
		ASSERT3U(*countp, >, 0);

		(*countp)--;

		if (*countp == 0 && pio->io_stall == countp) {
			zio_taskq_type_t type =
			    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
			    ZIO_TASKQ_INTERRUPT;
			pio->io_stall = NULL;
			mutex_exit(&pio->io_lock);
			/*
			 * Dispatch the parent zio in its own taskq so that
			 * the child can continue to make progress.  This also
			 * prevents overflowing the stack when we have deeply nested
			 * parent-child relationships.
			 */
			zio_taskq_dispatch(pio, type, B_FALSE);
		} else {
			mutex_exit(&pio->io_lock);
		}
	}

	/*
	 * Adopt the recorded error of child type 'c' as the zio's own error,
	 * but only if the zio has no error of its own yet.
	 */
	static void
	zio_inherit_child_errors(zio_t *zio, enum zio_child c)
	{
		int child_error = zio->io_child_error[c];

		/* An error already set on the zio takes precedence. */
		if (zio->io_error != 0)
			return;

		if (child_error != 0)
			zio->io_error = child_error;
	}

	/*
	 * Three-way comparison of two zios (passed as const void *) by
	 * bookmark: objset, then object, then level, then blkid.  Zios with
	 * identical bookmarks are ordered by address, so 0 is returned only
	 * when x1 and x2 are the same zio.
	 */
	int
	zio_bookmark_compare(const void *x1, const void *x2)
	{
		const zio_t *z1 = x1;
		const zio_t *z2 = x2;

		if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
			return (-1);
		if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
			return (1);

		if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
			return (-1);
		if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
			return (1);

		if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
			return (-1);
		if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
			return (1);

		if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
			return (-1);
		if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
			return (1);

		/* Same bookmark: fall back to pointer identity as the tiebreak. */
		if (z1 < z2)
			return (-1);
		if (z1 > z2)
			return (1);

		return (0);
	}

	/*
	* ==========================================================================
	* Create the various types of I/O (read, write, free, etc)
	* ==========================================================================
	*/
	/*
	 * Allocate and initialize a zio; common constructor behind zio_read(),
	 * zio_write(), zio_null() and friends.
	 *
	 * pio           - parent to attach to, or NULL
	 * bp            - block pointer, or NULL; copied into the zio
	 * data          - buffer, or NULL
	 * lsize/psize   - logical and physical size; they may differ only for
	 *                 ZIO_FLAG_RAW (IMPLY'd)
	 * done/private  - completion callback and its argument; may be NULL
	 * vd/offset     - target vdev and offset; a non-NULL vd makes this a
	 *                 ZIO_CHILD_VDEV zio
	 * zb            - bookmark to copy, or NULL
	 * stage/pipeline - initial stage and pipeline stages
	 *
	 * Returns the new zio, zeroed apart from the fields set here.
	 */
	static zio_t *
	zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
	    abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
	    void *private, zio_type_t type, zio_priority_t priority,
	    enum zio_flag flags, vdev_t *vd, uint64_t offset,
	    const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
	{
		zio_t *zio;

		ASSERT3U(type == ZIO_TYPE_FREE || psize, <=, SPA_MAXBLOCKSIZE);
		ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
		ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

		ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
		ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
		ASSERT(vd || stage == ZIO_STAGE_OPEN);

		IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);

		zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
		bzero(zio, sizeof (zio_t));

		mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

		list_create(&zio->io_parent_list, sizeof (zio_link_t),
		    offsetof(zio_link_t, zl_parent_node));
		list_create(&zio->io_child_list, sizeof (zio_link_t),
		    offsetof(zio_link_t, zl_child_node));
		metaslab_trace_init(&zio->io_alloc_list);

		/* Child type: vdev wins, then the gang / DDT flags, else logical. */
		if (vd != NULL)
			zio->io_child_type = ZIO_CHILD_VDEV;
		else if (flags & ZIO_FLAG_GANG_CHILD)
			zio->io_child_type = ZIO_CHILD_GANG;
		else if (flags & ZIO_FLAG_DDT_CHILD)
			zio->io_child_type = ZIO_CHILD_DDT;
		else
			zio->io_child_type = ZIO_CHILD_LOGICAL;

		if (bp != NULL) {
			zio->io_bp = (blkptr_t *)bp;
			zio->io_bp_copy = *bp;
			zio->io_bp_orig = *bp;
			if (type != ZIO_TYPE_WRITE ||
			    zio->io_child_type == ZIO_CHILD_DDT)
				zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
			if (zio->io_child_type == ZIO_CHILD_LOGICAL)
				zio->io_logical = zio;
			if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
				pipeline |= ZIO_GANG_STAGES;
		}

		zio->io_spa = spa;
		zio->io_txg = txg;
		zio->io_done = done;
		zio->io_private = private;
		zio->io_type = type;
		zio->io_priority = priority;
		zio->io_vd = vd;
		zio->io_offset = offset;
		zio->io_orig_abd = zio->io_abd = data;
		zio->io_orig_size = zio->io_size = psize;
		zio->io_lsize = lsize;
		zio->io_orig_flags = zio->io_flags = flags;
		zio->io_orig_stage = zio->io_stage = stage;
		zio->io_orig_pipeline = zio->io_pipeline = pipeline;
		zio->io_pipeline_trace = ZIO_STAGE_OPEN;

		/* A zio created past READY/DONE counts as already in that state. */
		zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
		zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

		if (zb != NULL)
			zio->io_bookmark = *zb;

		if (pio != NULL) {
			if (zio->io_logical == NULL)
				zio->io_logical = pio->io_logical;
			if (zio->io_child_type == ZIO_CHILD_GANG)
				zio->io_gang_leader = pio->io_gang_leader;
			zio_add_child(pio, zio);
		}

		return (zio);
	}

	/*
	 * Release everything zio_create() set up for 'zio' -- the allocation
	 * trace list, both link lists, the lock and the condition variable --
	 * and return the zio to zio_cache.
	 */
	static void
	zio_destroy(zio_t *zio)
	{
	metaslab_trace_fini(&zio->io_alloc_list);
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
	}

	/*
	 * Create a ZIO_TYPE_NULL zio: no block pointer, no data, zero size,
	 * running the interlock pipeline.  'pio', 'vd', 'done' and 'private'
	 * may each be NULL.
	 */
	zio_t *
	zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
	    void *private, enum zio_flag flags)
	{
		zio_t *zio;

		zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
		    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

		return (zio);
	}

	/*
	 * Create a parentless, vdev-less null zio; shorthand for
	 * zio_null(NULL, spa, NULL, done, private, flags).
	 */
	zio_t *
	zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
	{
		return (zio_null(NULL, spa, NULL, done, private, flags));
	}

	/*
	 * Sanity-check a block pointer before it is used.
	 *
	 * Checks the type, checksum and compression fields, the logical and
	 * physical sizes, the embedded type (for embedded bps), and for every
	 * DVA that its vdev exists in 'spa', is not a hole, and that
	 * offset + asize fits inside the vdev.  Every violation is reported
	 * through zfs_panic_recover(); nothing is returned.
	 */
	void
	zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
	{
		if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
			zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
			    bp, (longlong_t)BP_GET_TYPE(bp));
		}
		if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
		    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
			zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
			    bp, (longlong_t)BP_GET_CHECKSUM(bp));
		}
		if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
		    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
			zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
			    bp, (longlong_t)BP_GET_COMPRESS(bp));
		}
		if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
			zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
			    bp, (longlong_t)BP_GET_LSIZE(bp));
		}
		if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
			zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
			    bp, (longlong_t)BP_GET_PSIZE(bp));
		}

		if (BP_IS_EMBEDDED(bp)) {
			if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
				zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
				    bp, (longlong_t)BPE_GET_ETYPE(bp));
			}
		}

		/*
		 * Pool-specific checks.
		 *
		 * Note: it would be nice to verify that the blk_birth and
		 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
		 * allows the birth time of log blocks (and dmu_sync()-ed blocks
		 * that are in the log) to be arbitrarily large.
		 */
		for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
			uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
			if (vdevid >= spa->spa_root_vdev->vdev_children) {
				zfs_panic_recover("blkptr at %p DVA %u has invalid "
				    "VDEV %llu",
				    bp, i, (longlong_t)vdevid);
				continue;
			}
			vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
			if (vd == NULL) {
				zfs_panic_recover("blkptr at %p DVA %u has invalid "
				    "VDEV %llu",
				    bp, i, (longlong_t)vdevid);
				continue;
			}
			if (vd->vdev_ops == &vdev_hole_ops) {
				zfs_panic_recover("blkptr at %p DVA %u has hole "
				    "VDEV %llu",
				    bp, i, (longlong_t)vdevid);
				continue;
			}
			if (vd->vdev_ops == &vdev_missing_ops) {
				/*
				 * "missing" vdevs are valid during import, but we
				 * don't have their detailed info (e.g. asize), so
				 * we can't perform any more checks on them.
				 */
				continue;
			}
			uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
			uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
			/* For gang bps, check the gang block's own allocated size. */
			if (BP_IS_GANG(bp))
				asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
			if (offset + asize > vd->vdev_asize) {
				zfs_panic_recover("blkptr at %p DVA %u has invalid "
				    "OFFSET %llu",
				    bp, i, (longlong_t)offset);
			}
		}
	}

	/*
	 * Create a logical read of block 'bp' into 'data' ('size' bytes, used as
	 * both logical and physical size).
	 *
	 * The block pointer is verified first.  The zio's txg is the bp's
	 * physical birth txg, and ZIO_FLAG_DDT_CHILD selects the DDT child read
	 * pipeline instead of the normal read pipeline.  Returns the new zio.
	 */
	zio_t *
	zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
	    abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
	    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
	{
		zio_t *zio;

		zfs_blkptr_verify(spa, bp);

		zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
		    data, size, size, done, private,
		    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
		    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
		    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

		return (zio);
	}

	/*
	 * Create a logical write of 'data' that will fill in 'bp'.
	 *
	 * zp             - write properties; copied into the zio
	 * ready          - stored in io_ready
	 * children_ready - stored in io_children_ready
	 * physdone       - stored in io_physdone
	 * done/private   - completion callback and its argument
	 *
	 * ZIO_FLAG_DDT_CHILD selects the DDT child write pipeline instead of the
	 * normal write pipeline.  Returns the new zio.
	 */
	zio_t *
	zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
	    abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
	    zio_done_func_t *ready, zio_done_func_t *children_ready,
	    zio_done_func_t *physdone, zio_done_func_t *done,
	    void *private, zio_priority_t priority, enum zio_flag flags,
	    const zbookmark_phys_t *zb)
	{
		zio_t *zio;

		ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
		    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
		    zp->zp_compress >= ZIO_COMPRESS_OFF &&
		    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
		    DMU_OT_IS_VALID(zp->zp_type) &&
		    zp->zp_level < 32 &&
		    zp->zp_copies > 0 &&
		    zp->zp_copies <= spa_max_replication(spa));

		zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
		    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
		    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
		    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

		zio->io_ready = ready;
		zio->io_children_ready = children_ready;
		zio->io_physdone = physdone;
		zio->io_prop = *zp;

		/*
		 * Data can be NULL if we are going to call zio_write_override() to
		 * provide the already-allocated BP.  But we may need the data to
		 * verify a dedup hit (if requested).  In this case, don't try to
		 * dedup (just take the already-allocated BP verbatim).
		 */
		if (data == NULL && zio->io_prop.zp_dedup_verify) {
			zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
		}

		return (zio);
	}

	/*
	 * Create a write zio for block 'bp' that runs the rewrite pipeline;
	 * ZIO_FLAG_IO_REWRITE is always added to 'flags'.  'size' is used as
	 * both logical and physical size.  Returns the new zio.
	 */
	zio_t *
	zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
	    uint64_t size, zio_done_func_t *done, void *private,
	    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
	{
		zio_t *zio;

		zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
		    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
		    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

		return (zio);
	}

	/*
	 * Supply an already-allocated block pointer for a logical write that is
	 * still in ZIO_STAGE_OPEN and belongs to the currently syncing txg.
	 *
	 * bp       - block pointer to record in io_bp_override
	 * copies   - value for zp_copies
	 * nopwrite - value for zp_nopwrite; when set, dedup is switched off
	 */
	void
	zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
	{
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
		ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

		/*
		 * We must reset the io_prop to match the values that existed
		 * when the bp was first written by dmu_sync() keeping in mind
		 * that nopwrite and dedup are mutually exclusive.
		 */
		zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
		zio->io_prop.zp_nopwrite = nopwrite;
		zio->io_prop.zp_copies = copies;
		zio->io_bp_override = bp;
	}

	/*
	 * Free block 'bp' in transaction group 'txg'.
	 *
	 * The block pointer is verified first.  Embedded block pointers are
	 * ignored.  Otherwise the free is either appended to the spa's per-txg
	 * free bplist for later processing, or issued synchronously through
	 * zio_free_sync() (which must succeed -- VERIFY0).
	 */
	void
	zio_free(spa_t spa, uint64_t txg, const blkptr_t bp)
	{

	+ zfs_blkptr_verify(spa, bp);
	+
	/*
	* The check for EMBEDDED is a performance optimization. We
	* process the free here (by ignoring it) rather than
	* putting it on the list and then processing it in zio_free_sync().
	*/
	if (BP_IS_EMBEDDED(bp))
	return;
	metaslab_check_free(spa, bp);

	/*
	* Frees that are for the currently-syncing txg, are not going to be
	* deferred, and which will not need to do a read (i.e. not GANG or
	* DEDUP), can be processed immediately. Otherwise, put them on the
	* in-memory list for later processing.
	*/
	/* Note: with zfs_trim_enabled set, every free takes the list path. */
	if (zfs_trim_enabled \|\| BP_IS_GANG(bp) \|\| BP_GET_DEDUP(bp) \|\|
	txg != spa->spa_syncing_txg \|\|
	spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
	VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
	BP_GET_PSIZE(bp), 0)));
	}
	}

	/*
	 * Create a zio that frees block 'bp' in the currently syncing txg.
	 *
	 * Must be called for the syncing txg, before the deferred-free sync
	 * pass, and never for a hole (all asserted).  Embedded block pointers
	 * yield a null zio.  ZIO_FLAG_DONT_QUEUE is always added to 'flags'.
	 * Returns the new zio; the caller issues it.
	 */
	zio_t *
	zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
	    uint64_t size, enum zio_flag flags)
	{
		zio_t *zio;
		/* Despite its name this holds the pipeline handed to zio_create(). */
		enum zio_stage stage = ZIO_FREE_PIPELINE;

		ASSERT(!BP_IS_HOLE(bp));
		ASSERT(spa_syncing_txg(spa) == txg);
		ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

		if (BP_IS_EMBEDDED(bp))
			return (zio_null(pio, spa, NULL, NULL, NULL, 0));

		metaslab_check_free(spa, bp);
		arc_freed(spa, bp);

		if (zfs_trim_enabled)
			stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
			    ZIO_STAGE_VDEV_IO_ASSESS;
		/*
		 * GANG and DEDUP blocks can induce a read (for the gang block header,
		 * or the DDT), so issue them asynchronously so that this thread is
		 * not tied up.
		 */
		else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
			stage |= ZIO_STAGE_ISSUE_ASYNC;

		flags |= ZIO_FLAG_DONT_QUEUE;

		zio = zio_create(pio, spa, txg, bp, NULL, size,
		    size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
		    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

		return (zio);
	}

	/*
	 * Create a zio that claims block 'bp' in 'txg'.
	 *
	 * 'txg' must be the spa's first txg, or 0 to only verify that the block
	 * is claimable (asserted).  Embedded block pointers yield a null zio.
	 * Returns the new zio; the caller issues it.
	 */
	zio_t *
	zio_claim(zio_t pio, spa_t spa, uint64_t txg, const blkptr_t *bp,
	zio_done_func_t done, void private, enum zio_flag flags)
	{
	zio_t *zio;

	- dprintf_bp(bp, "claiming in txg %llu", txg);
	+ zfs_blkptr_verify(spa, bp);

	if (BP_IS_EMBEDDED(bp))
	return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	* A claim is an allocation of a specific block. Claims are needed
	* to support immediate writes in the intent log. The issue is that
	* immediate writes contain committed data, but in a txg that was
	* not committed. Upon opening the pool after an unclean shutdown,
	* the intent log claims all blocks that contain immediate write data
	* so that the SPA knows they're in use.
	*
	* All claims must be resolved in the first txg -- before the SPA
	* starts allocating blocks -- so that nothing is allocated twice.
	* If txg == 0 we just verify that the block is claimable.
	*/
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) \|\| txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) \|\| !spa_writeable(spa)); /* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
	flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
	ASSERT0(zio->io_queued_timestamp);

	return (zio);
	}

	/*
	 * Build an ioctl zio for vd.
	 *
	 * If vd has no children, a single ZIO_TYPE_IOCTL zio carrying cmd is
	 * created; it is always created with ZIO_PRIORITY_NOW and zero
	 * offset/size, whatever the caller passed.  Otherwise a null zio is
	 * created and one ioctl zio per child vdev is issued (zio_nowait)
	 * beneath it, recursively.  The caller issues the returned zio.
	 */
	zio_t *
	zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
	    uint64_t size, zio_done_func_t *done, void *private,
	    zio_priority_t priority, enum zio_flag flags)
	{
		zio_t *zio;
		int c;

		if (vd->vdev_children == 0) {
			zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done,
			    private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags,
			    vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

			zio->io_cmd = cmd;
		} else {
			zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

			for (c = 0; c < vd->vdev_children; c++)
				zio_nowait(zio_ioctl(zio, spa,
				    vd->vdev_child[c], cmd, offset, size,
				    done, private, priority, flags));
		}

		return (zio);
	}

	/*
	 * Build (but do not issue) a physical read of 'size' bytes at 'offset'
	 * on the childless vdev vd into 'data'.
	 *
	 *	checksum  stored in the zio's io_prop.zp_checksum
	 *	labels	  when set, the range is asserted to lie within the
	 *		  leading VDEV_LABEL_START_SIZE or trailing
	 *		  VDEV_LABEL_END_SIZE bytes of the vdev
	 *
	 * ZIO_FLAG_PHYSICAL is always added to the caller's flags.
	 */
	zio_t *
	zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
	    abd_t *data, int checksum, zio_done_func_t *done, void *private,
	    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
	{
		zio_t *zio;

		ASSERT(vd->vdev_children == 0);
		ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
		ASSERT3U(offset + size, <=, vd->vdev_psize);

		zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
		    done, private, ZIO_TYPE_READ, priority,
		    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

		zio->io_prop.zp_checksum = checksum;

		return (zio);
	}

	/*
	 * Build (but do not issue) a physical write of 'size' bytes from
	 * 'data' to 'offset' on the childless vdev vd.
	 *
	 *	checksum  stored in the zio's io_prop.zp_checksum
	 *	labels	  when set, the range is asserted to lie within the
	 *		  leading VDEV_LABEL_START_SIZE or trailing
	 *		  VDEV_LABEL_END_SIZE bytes of the vdev
	 *
	 * ZIO_FLAG_PHYSICAL is always added to the caller's flags.  For
	 * checksums flagged ZCHECKSUM_FLAG_EMBEDDED the data is first copied
	 * into a private buffer that is pushed as a transform.
	 */
	zio_t *
	zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
	    abd_t *data, int checksum, zio_done_func_t *done, void *private,
	    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
	{
		zio_t *zio;

		ASSERT(vd->vdev_children == 0);
		ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
		ASSERT3U(offset + size, <=, vd->vdev_psize);

		zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
		    done, private, ZIO_TYPE_WRITE, priority,
		    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

		zio->io_prop.zp_checksum = checksum;

		if (zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_EMBEDDED) {
			/*
			 * zec checksums are necessarily destructive -- they
			 * modify the end of the write buffer to hold the
			 * verifier/checksum.  Therefore, we must make a local
			 * copy in case the data is being written to multiple
			 * places in parallel.
			 */
			abd_t *wbuf = abd_alloc_sametype(data, size);
			abd_copy(wbuf, data, size);

			zio_push_transform(zio, wbuf, size, size, NULL);
		}

		return (zio);
	}

	/*
	 * Create a child I/O to do some work for us.
	 *
	 * The child is created beneath pio, on pio's spa and txg and with
	 * pio's bookmark, starting just before ZIO_STAGE_VDEV_IO_START.  For
	 * a leaf vdev the offset is advanced by VDEV_LABEL_START_SIZE.  The
	 * caller issues the returned zio.
	 */
	zio_t *
	zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	    abd_t *data, uint64_t size, int type, zio_priority_t priority,
	    enum zio_flag flags, zio_done_func_t *done, void *private)
	{
		enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
		zio_t *zio;

	-	ASSERT(vd->vdev_parent ==
	-	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
	+	/*
	+	 * vdev child I/Os do not propagate their error to the parent.
	+	 * Therefore, for correct operation the caller must check for
	+	 * and handle the error in the child i/o's done callback.
	+	 * The only exceptions are i/os that we don't care about
	+	 * (OPTIONAL or REPAIR).
	+	 */
	+	ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
	+	    done != NULL);

	+	/*
	+	 * In the common case, where the parent zio was to a normal vdev,
	+	 * the child zio must be to a child vdev of that vdev.  Otherwise,
	+	 * the child zio must be to a top-level vdev.
	+	 */
	+	if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
	+		ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
	+	} else {
	+		ASSERT3P(vd, ==, vd->vdev_top);
	+	}
	+
		if (type == ZIO_TYPE_READ && bp != NULL) {
			/*
			 * If we have the bp, then the child should perform the
			 * checksum and the parent need not.  This pushes error
			 * detection as close to the leaves as possible and
			 * eliminates redundant checksums in the interior nodes.
			 */
			pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
			pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
		}

		/* Not all IO types require vdev io done stage e.g. free */
		if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
			pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	-	if (vd->vdev_children == 0)
	+	if (vd->vdev_ops->vdev_op_leaf) {
	+		ASSERT0(vd->vdev_children);
			offset += VDEV_LABEL_START_SIZE;
	+	}

	-	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
	+	flags |= ZIO_VDEV_CHILD_FLAGS(pio);

		/*
		 * If we've decided to do a repair, the write is not speculative --
		 * even if the original read was.
		 */
		if (flags & ZIO_FLAG_IO_REPAIR)
			flags &= ~ZIO_FLAG_SPECULATIVE;

		/*
		 * If we're creating a child I/O that is not associated with a
		 * top-level vdev, then the child zio is not an allocating I/O.
		 * If this is a retried I/O then we ignore it since we will
		 * have already processed the original allocating I/O.
		 */
		if (flags & ZIO_FLAG_IO_ALLOCATING &&
		    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
			metaslab_class_t *mc = spa_normal_class(pio->io_spa);

			ASSERT(mc->mc_alloc_throttle_enabled);
			ASSERT(type == ZIO_TYPE_WRITE);
			ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
			ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
			ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
			    pio->io_child_type == ZIO_CHILD_GANG);

			flags &= ~ZIO_FLAG_IO_ALLOCATING;
		}

		zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
		    size, done, private, type, priority, flags, vd, offset,
		    &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);

		zio->io_physdone = pio->io_physdone;
		if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
			zio->io_logical->io_phys_children++;

		return (zio);
	}

	/*
	 * Build (but do not issue) a parentless zio against the leaf vdev vd.
	 * ZIO_FLAG_CANFAIL, ZIO_FLAG_DONT_RETRY and ZIO_FLAG_DELEGATED are
	 * always added to the caller's flags, and the zio starts just before
	 * ZIO_STAGE_VDEV_IO_START with the vdev child pipeline.
	 */
	zio_t *
	zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data,
	    uint64_t size, int type, zio_priority_t priority,
	    enum zio_flag flags, zio_done_func_t *done, void *private)
	{
		zio_t *zio;

		ASSERT(vd->vdev_ops->vdev_op_leaf);

		zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
		    data, size, size, done, private, type, priority,
		    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
		    ZIO_FLAG_DELEGATED,
		    vd, offset, NULL,
		    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

		return (zio);
	}

	/*
	 * Issue (zio_nowait) a DKIOCFLUSHWRITECACHE ioctl to vd as a child of
	 * zio.  The ioctl is created with ZIO_FLAG_CANFAIL,
	 * ZIO_FLAG_DONT_PROPAGATE and ZIO_FLAG_DONT_RETRY and has no done
	 * callback.
	 */
	void
	zio_flush(zio_t *zio, vdev_t *vd)
	{
		zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
		    0, 0, NULL, NULL, ZIO_PRIORITY_NOW,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_DONT_RETRY));
	}

	/*
	 * Build (but do not issue) a ZIO_TYPE_FREE zio covering 'size' bytes
	 * at 'offset' on the leaf vdev vd, using ZIO_PRIORITY_TRIM and the
	 * ZIO_FREE_PHYS_PIPELINE.  The zio is a child of 'zio' and carries no
	 * bp, data or done callback.
	 */
	zio_t *
	zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
	    uint64_t size)
	{

		ASSERT(vd->vdev_ops->vdev_op_leaf);

		return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL,
		    NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM,
		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
		    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
	}

	/*
	 * Reduce the size of a not-yet-executing zio to 'size' bytes.
	 *
	 * The zio must have no executor, must still have its original size,
	 * and 'size' must not exceed the current size (all asserted).  The
	 * sizes are left untouched when BP_IS_RAIDZ() holds for the zio's bp.
	 */
	void
	zio_shrink(zio_t *zio, uint64_t size)
	{
		ASSERT3P(zio->io_executor, ==, NULL);
		/* These are integer sizes, not pointers: compare unsigned. */
		ASSERT3U(zio->io_orig_size, ==, zio->io_size);
		ASSERT3U(size, <=, zio->io_size);

		/*
		 * We don't shrink for raidz because of problems with the
		 * reconstruction when reading back less than the block size.
		 * Note, BP_IS_RAIDZ() assumes no compression.
		 */
		ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
		if (!BP_IS_RAIDZ(zio->io_bp)) {
			/* we are not doing a raw write */
			ASSERT3U(zio->io_size, ==, zio->io_lsize);
			zio->io_orig_size = zio->io_size = zio->io_lsize = size;
		}
	}

	/*
	* ==========================================================================
	* Prepare to read and write logical blocks
	* ==========================================================================
	*/

	/*
	 * Pipeline stage: prepare a read of the zio's bp.
	 *
	 * - For a compressed bp read by a logical, non-RAW zio, pushes a
	 *   psize-sized transform buffer with zio_decompress as its callback.
	 * - For an embedded data bp, decodes the payload straight into the
	 *   zio's buffer and switches to ZIO_INTERLOCK_PIPELINE.
	 * - Sets ZIO_FLAG_DONT_CACHE for level-0 non-metadata and for
	 *   DMU_OT_DDT_ZAP blocks.
	 * - Logical reads of dedup bps switch to ZIO_DDT_READ_PIPELINE.
	 *
	 * Always returns ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_read_bp_init(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;

	+	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
	+
		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
		    zio->io_child_type == ZIO_CHILD_LOGICAL &&
		    !(zio->io_flags & ZIO_FLAG_RAW)) {
			uint64_t psize =
			    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) :
			    BP_GET_PSIZE(bp);
			zio_push_transform(zio,
			    abd_alloc_sametype(zio->io_abd, psize),
			    psize, psize, zio_decompress);
		}

		if (BP_IS_EMBEDDED(bp) &&
		    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

			int psize = BPE_GET_PSIZE(bp);
			void *data = abd_borrow_buf(zio->io_abd, psize);
			decode_embedded_bp_compressed(bp, data);
			abd_return_buf_copy(zio->io_abd, data, psize);
		} else {
			ASSERT(!BP_IS_EMBEDDED(bp));
	+		ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
		}

		if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
			zio->io_flags |= ZIO_FLAG_DONT_CACHE;

		if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
			zio->io_flags |= ZIO_FLAG_DONT_CACHE;

		if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage: handle an override bp on an allocating write.
	 *
	 * Non-allocating zios, and zios without io_bp_override, pass through
	 * untouched.  Otherwise the override bp is copied into the zio's bp
	 * and the pipeline is cut down to ZIO_INTERLOCK_PIPELINE, with the
	 * nopwrite and dedup cases handled below.  If the override cannot be
	 * used, the original bp and pipeline are restored.
	 *
	 * Always returns ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_write_bp_init(zio_t *zio)
	{
		if (!IO_IS_ALLOCATING(zio))
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

		if (zio->io_bp_override) {
			blkptr_t *bp = zio->io_bp;
			zio_prop_t *zp = &zio->io_prop;

			ASSERT(bp->blk_birth != zio->io_txg);
			ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

			/*
			 * Copy the override's contents into the zio's own bp;
			 * bp must keep pointing at zio->io_bp so the restore
			 * at the bottom writes to the right place.
			 */
			*bp = *zio->io_bp_override;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

			if (BP_IS_EMBEDDED(bp))
				return (ZIO_PIPELINE_CONTINUE);

			/*
			 * If we've been overridden and nopwrite is set then
			 * set the flag accordingly to indicate that a nopwrite
			 * has already occurred.
			 */
			if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
				ASSERT(!zp->zp_dedup);
				ASSERT3U(BP_GET_CHECKSUM(bp), ==,
				    zp->zp_checksum);
				zio->io_flags |= ZIO_FLAG_NOPWRITE;
				return (ZIO_PIPELINE_CONTINUE);
			}

			ASSERT(!zp->zp_nopwrite);

			if (BP_IS_HOLE(bp) || !zp->zp_dedup)
				return (ZIO_PIPELINE_CONTINUE);

			ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
			    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);

			if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
				BP_SET_DEDUP(bp, 1);
				zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
				return (ZIO_PIPELINE_CONTINUE);
			}

			/*
			 * We were unable to handle this as an override bp, treat
			 * it as a regular write I/O.
			 */
			zio->io_bp_override = NULL;
			*bp = zio->io_bp_orig;
			zio->io_pipeline = zio->io_orig_pipeline;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage: compress the data of an allocating write and fill
	 * in the bp's properties.
	 *
	 * Returns ZIO_PIPELINE_STOP while logical or gang children have not
	 * reached the ready state, and ZIO_PIPELINE_CONTINUE otherwise.
	 * Depending on the outcome the zio continues on the rewrite, write,
	 * DDT-write or interlock pipeline (the latter for embedded data and
	 * for psize == 0).
	 */
	static int
	zio_write_compress(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		zio_prop_t *zp = &zio->io_prop;
		enum zio_compress compress = zp->zp_compress;
		blkptr_t *bp = zio->io_bp;
		uint64_t lsize = zio->io_lsize;
		uint64_t psize = zio->io_size;
		int pass = 1;

		EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);

		/*
		 * If our children haven't all reached the ready stage,
		 * wait for them and then repeat this pipeline stage.
		 */
		if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
		    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
			return (ZIO_PIPELINE_STOP);
		}

		if (!IO_IS_ALLOCATING(zio))
			return (ZIO_PIPELINE_CONTINUE);

		if (zio->io_children_ready != NULL) {
			/*
			 * Now that all our children are ready, run the callback
			 * associated with this zio in case it wants to modify the
			 * data to be written.
			 */
			ASSERT3U(zp->zp_level, >, 0);
			zio->io_children_ready(zio);
		}

		ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
		ASSERT(zio->io_bp_override == NULL);

		if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
			/*
			 * We're rewriting an existing block, which means we're
			 * working on behalf of spa_sync().  For spa_sync() to
			 * converge, it must eventually be the case that we don't
			 * have to allocate new blocks.  But compression changes
			 * the blocksize, which forces a reallocate, and makes
			 * convergence take longer.  Therefore, after the first
			 * few passes, stop compressing to ensure convergence.
			 */
			pass = spa_sync_pass(spa);

			ASSERT(zio->io_txg == spa_syncing_txg(spa));
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!BP_GET_DEDUP(bp));

			if (pass >= zfs_sync_pass_dont_compress)
				compress = ZIO_COMPRESS_OFF;

			/* Make sure someone doesn't change their mind on overwrites */
			ASSERT(BP_IS_EMBEDDED(bp) ||
			    MIN(zp->zp_copies + BP_IS_GANG(bp),
			    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
		}

		/* If it's a compressed write that is not raw, compress the buffer. */
		if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
			void *cbuf = zio_buf_alloc(lsize);
			psize = zio_compress_data(compress, zio->io_abd, cbuf,
			    lsize);
			if (psize == 0 || psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
			    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
			    spa_feature_is_enabled(spa,
			    SPA_FEATURE_EMBEDDED_DATA)) {
				encode_embedded_bp_compressed(bp,
				    cbuf, compress, lsize, psize);
				BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
				BP_SET_TYPE(bp, zio->io_prop.zp_type);
				BP_SET_LEVEL(bp, zio->io_prop.zp_level);
				zio_buf_free(cbuf, lsize);
				bp->blk_birth = zio->io_txg;
				zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
				ASSERT(spa_feature_is_active(spa,
				    SPA_FEATURE_EMBEDDED_DATA));
				return (ZIO_PIPELINE_CONTINUE);
			} else {
				/*
				 * Round up compressed size up to the ashift
				 * of the smallest-ashift device, and zero the tail.
				 * This ensures that the compressed size of the BP
				 * (and thus compressratio property) are correct,
				 * in that we charge for the padding used to fill out
				 * the last sector.
				 */
				ASSERT3U(spa->spa_min_ashift, >=,
				    SPA_MINBLOCKSHIFT);
				size_t rounded = (size_t)P2ROUNDUP(psize,
				    1ULL << spa->spa_min_ashift);
				if (rounded >= lsize) {
					compress = ZIO_COMPRESS_OFF;
					zio_buf_free(cbuf, lsize);
					psize = lsize;
				} else {
					abd_t *cdata =
					    abd_get_from_buf(cbuf, lsize);
					abd_take_ownership_of_buf(cdata, B_TRUE);
					abd_zero_off(cdata, psize,
					    rounded - psize);
					psize = rounded;
					zio_push_transform(zio, cdata,
					    psize, lsize, NULL);
				}
			}

			/*
			 * We were unable to handle this as an override bp, treat
			 * it as a regular write I/O.
			 */
			zio->io_bp_override = NULL;
			*bp = zio->io_bp_orig;
			zio->io_pipeline = zio->io_orig_pipeline;
		} else {
			ASSERT3U(psize, !=, 0);
		}

		/*
		 * The final pass of spa_sync() must be all rewrites, but the first
		 * few passes offer a trade-off: allocating blocks defers convergence,
		 * but newly allocated blocks are sequential, so they can be written
		 * to disk faster.  Therefore, we allow the first few passes of
		 * spa_sync() to allocate new blocks, but force rewrites after that.
		 * There should only be a handful of blocks after pass 1 in any case.
		 */
		if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
		    BP_GET_PSIZE(bp) == psize &&
		    pass >= zfs_sync_pass_rewrite) {
			ASSERT(psize != 0);
			enum zio_stage gang_stages =
			    zio->io_pipeline & ZIO_GANG_STAGES;
			zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
			zio->io_flags |= ZIO_FLAG_IO_REWRITE;
		} else {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
		}

		if (psize == 0) {
			if (zio->io_bp_orig.blk_birth != 0 &&
			    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
				BP_SET_LSIZE(bp, lsize);
				BP_SET_TYPE(bp, zp->zp_type);
				BP_SET_LEVEL(bp, zp->zp_level);
				BP_SET_BIRTH(bp, zio->io_txg, 0);
			}
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		} else {
			ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_PSIZE(bp, psize);
			BP_SET_COMPRESS(bp, compress);
			BP_SET_CHECKSUM(bp, zp->zp_checksum);
			BP_SET_DEDUP(bp, zp->zp_dedup);
			BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
			if (zp->zp_dedup) {
				ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
				ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
				zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
			}
			if (zp->zp_nopwrite) {
				ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
				ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
				zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
			}
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage: prepare a free.  A logical zio whose bp has the
	 * dedup bit set is switched to ZIO_DDT_FREE_PIPELINE; everything else
	 * keeps its pipeline.  Always returns ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_free_bp_init(zio_t *zio)
	{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
	if (BP_GET_DEDUP(bp))
	zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
	+
	return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	* ==========================================================================
	* Execute the I/O pipeline
	* ==========================================================================
	*/

	/*
	 * Hand zio to one of its pool's zio taskqs, where zio_execute() will
	 * run it.
	 *
	 *	q		ZIO_TASKQ_ISSUE or ZIO_TASKQ_INTERRUPT (asserted)
	 *	cutinline	when set, the entry is dispatched with TQ_FRONT
	 *
	 * The taskq is normally selected by the zio's type, with the
	 * overrides described below.
	 */
	static void
	zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
	{
		spa_t *spa = zio->io_spa;
		zio_type_t t = zio->io_type;
		int flags = (cutinline ? TQ_FRONT : 0);

		ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

		/*
		 * If we're a config writer or a probe, the normal issue and
		 * interrupt threads may all be blocked waiting for the config lock.
		 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
		 */
		if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
			t = ZIO_TYPE_NULL;

		/*
		 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
		 */
		if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
			t = ZIO_TYPE_NULL;

		/*
		 * If this is a high priority I/O, then use the high priority taskq if
		 * available.
		 */
		if (zio->io_priority == ZIO_PRIORITY_NOW &&
		    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
			q++;

		ASSERT3U(q, <, ZIO_TASKQ_TYPES);

		/*
		 * NB: We are assuming that the zio can only be dispatched
		 * to a single taskq at a time.  It would be a grievous error
		 * to dispatch the zio to another taskq at the same time.
		 */
	#if defined(illumos) || !defined(_KERNEL)
		ASSERT(zio->io_tqent.tqent_next == NULL);
	#else
		ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
	#endif
		spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
		    flags, &zio->io_tqent);
	}

	/*
	 * Report whether the thread recorded in zio->io_executor belongs to
	 * any of the pool's taskqs of kind q, checking every zio type in turn.
	 * Returns B_TRUE on the first match, B_FALSE if none is found.
	 */
	static boolean_t
	zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
	{
		spa_t *pool = zio->io_spa;
		kthread_t *thr = zio->io_executor;

		for (zio_type_t type = 0; type < ZIO_TYPES; type++) {
			spa_taskqs_t *set = &pool->spa_zio_taskq[type][q];

			for (uint_t n = 0; n < set->stqs_count; n++) {
				if (!taskq_member(set->stqs_taskq[n], thr))
					continue;
				return (B_TRUE);
			}
		}

		return (B_FALSE);
	}

	/*
	 * Pipeline stage: re-dispatch the zio to an issue taskq (without
	 * cutting in line) and stop the pipeline in the current thread.
	 */
	static int
	zio_issue_async(zio_t *zio)
	{
		const boolean_t cutinline = B_FALSE;

		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cutinline);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * Dispatch the zio to an interrupt taskq (without cutting in line).
	 */
	void
	zio_interrupt(zio_t *zio)
	{
		const boolean_t cutinline = B_FALSE;

		zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, cutinline);
	}

	/*
	 * Deliver the zio to zio_interrupt(), optionally after a delay.
	 *
	 * In kernel builds, a non-zero io_target_timestamp that still lies in
	 * the future schedules zio_interrupt() through timeout_generic() for
	 * the remaining time; a timestamp already reached calls
	 * zio_interrupt() at once.  With no timestamp, or in userspace
	 * builds, zio_interrupt() is called directly.
	 */
	void
	zio_delay_interrupt(zio_t *zio)
	{
		/*
		 * The timeout_generic() function isn't defined in userspace, so
		 * rather than trying to implement the function, the zio delay
		 * functionality has been disabled for userspace builds.
		 */

	#ifdef _KERNEL
		/*
		 * If io_target_timestamp is zero, then no delay has been registered
		 * for this IO, thus jump to the end of this function and "skip" the
		 * delay; issuing it directly to the zio layer.
		 */
		if (zio->io_target_timestamp != 0) {
			hrtime_t now = gethrtime();

			if (now >= zio->io_target_timestamp) {
				/*
				 * This IO has already taken longer than the target
				 * delay to complete, so we don't want to delay it
				 * any longer; we "miss" the delay and issue it
				 * directly to the zio layer.  This is likely due to
				 * the target latency being set to a value less than
				 * the underlying hardware can satisfy (e.g. delay
				 * set to 1ms, but the disks take 10ms to complete an
				 * IO request).
				 */

				DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
				    hrtime_t, now);

				zio_interrupt(zio);
			} else {
				hrtime_t diff = zio->io_target_timestamp - now;

				DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
				    hrtime_t, now, hrtime_t, diff);

				(void) timeout_generic(CALLOUT_NORMAL,
				    (void (*)(void *))zio_interrupt, zio, diff,
				    1, 0);
			}

			return;
		}
	#endif

		DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
		zio_interrupt(zio);
	}

	/*
	* Execute the I/O pipeline until one of the following occurs:
	*
	* (1) the I/O completes
	* (2) the pipeline stalls waiting for dependent child I/Os
	* (3) the I/O issues, so we're waiting for an I/O completion interrupt
	* (4) the I/O is delegated by vdev-level caching or aggregation
	* (5) the I/O is deferred due to vdev-level queueing
	* (6) the I/O is handed off to another thread.
	*
	* In all cases, the pipeline stops whenever there's no CPU work; it never
	* burns a thread in cv_wait().
	*
	* There's no locking on io_stage because there's no legitimate way
	* for multiple threads to be attempting to process the same I/O.
	*/
	/* Table of pipeline stage functions; only declared here. */
	static zio_pipe_stage_t *zio_pipeline[];

	/*
	 * Run the zio's pipeline in the calling thread, one stage at a time,
	 * until ZIO_STAGE_DONE is reached, a stage returns ZIO_PIPELINE_STOP,
	 * or the zio is re-dispatched to an issue taskq.
	 */
	void
	zio_execute(zio_t *zio)
	{
		zio->io_executor = curthread;

		ASSERT3U(zio->io_queued_timestamp, >, 0);

		while (zio->io_stage < ZIO_STAGE_DONE) {
			enum zio_stage pipeline = zio->io_pipeline;
			enum zio_stage stage = zio->io_stage;
			int rv;

			ASSERT(!MUTEX_HELD(&zio->io_lock));
			ASSERT(ISP2(stage));
			ASSERT(zio->io_stall == NULL);

			/* Advance to the next stage bit present in the pipeline. */
			do {
				stage <<= 1;
			} while ((stage & pipeline) == 0);

			ASSERT(stage <= ZIO_STAGE_DONE);

			/*
			 * If we are in interrupt context and this pipeline stage
			 * will grab a config lock that is held across I/O,
			 * or may wait for an I/O that needs an interrupt thread
			 * to complete, issue async to avoid deadlock.
			 *
			 * For VDEV_IO_START, we cut in line so that the io will
			 * be sent to disk promptly.
			 */
			if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
			    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
				boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
				    zio_requeue_io_start_cut_in_line : B_FALSE;
				zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
				return;
			}

			zio->io_stage = stage;
			zio->io_pipeline_trace |= zio->io_stage;
			/* The stage bit selects the handler in the table. */
			rv = zio_pipeline[highbit64(stage) - 1](zio);

			if (rv == ZIO_PIPELINE_STOP)
				return;

			ASSERT(rv == ZIO_PIPELINE_CONTINUE);
		}
	}

	/*
	* ==========================================================================
	* Initiate I/O, either sync or async
	* ==========================================================================
	*/
	/*
	 * Execute the zio and block until it has no executor any more, then
	 * destroy it.  Returns the zio's io_error.
	 *
	 * The zio must still be in ZIO_STAGE_OPEN, with no executor and no
	 * queued timestamp (all asserted).
	 */
	int
	zio_wait(zio_t *zio)
	{
		int error;

		/* io_stage is an enum, not a pointer: compare as an integer. */
		ASSERT3U(zio->io_stage, ==, ZIO_STAGE_OPEN);
		ASSERT3P(zio->io_executor, ==, NULL);

		zio->io_waiter = curthread;
		ASSERT0(zio->io_queued_timestamp);
		zio->io_queued_timestamp = gethrtime();

		zio_execute(zio);

		mutex_enter(&zio->io_lock);
		while (zio->io_executor != NULL)
			cv_wait(&zio->io_cv, &zio->io_lock);
		mutex_exit(&zio->io_lock);

		/* Read the result before the zio is torn down. */
		error = zio->io_error;
		zio_destroy(zio);

		return (error);
	}

	/*
	 * Start executing the zio without waiting for it.  The zio must have
	 * no executor and no queued timestamp yet (asserted).
	 */
	void
	zio_nowait(zio_t *zio)
	{
		ASSERT3P(zio->io_executor, ==, NULL);

		boolean_t orphan = (zio->io_child_type == ZIO_CHILD_LOGICAL &&
		    zio_unique_parent(zio) == NULL);

		if (orphan) {
			/*
			 * A logical async zio that nobody waits for: attach it
			 * to the pool's per-CPU async root ("Godfather") zio,
			 * which makes sure such zios complete before the pool
			 * is unloaded.
			 */
			zio_t *godfather =
			    zio->io_spa->spa_async_zio_root[CPU_SEQID];

			zio_add_child(godfather, zio);
		}

		ASSERT0(zio->io_queued_timestamp);
		zio->io_queued_timestamp = gethrtime();
		zio_execute(zio);
	}

	/*
	* ==========================================================================
	* Reexecute, cancel, or suspend/resume failed I/O
	* ==========================================================================
	*/

	/*
	 * Reset pio to its original flags, stage and pipeline, clear its error
	 * and wait state, then recursively do the same for each of its
	 * children before executing pio again.  A zio flagged
	 * ZIO_FLAG_GODFATHER is reset but not executed here.
	 */
	static void
	zio_reexecute(zio_t *pio)
	{
		zio_t *cio, *cio_next;

		ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
		ASSERT(pio->io_gang_leader == NULL);
		ASSERT(pio->io_gang_tree == NULL);

		pio->io_flags = pio->io_orig_flags;
		pio->io_stage = pio->io_orig_stage;
		pio->io_pipeline = pio->io_orig_pipeline;
		pio->io_reexecute = 0;
		pio->io_flags |= ZIO_FLAG_REEXECUTED;
		pio->io_pipeline_trace = 0;
		pio->io_error = 0;
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_state[w] = 0;
		for (int c = 0; c < ZIO_CHILD_TYPES; c++)
			pio->io_child_error[c] = 0;

		if (IO_IS_ALLOCATING(pio))
			BP_ZERO(pio->io_bp);

		/*
		 * As we reexecute pio's children, new children could be created.
		 * New children go to the head of pio's io_child_list, however,
		 * so we will (correctly) not reexecute them.  The key is that
		 * the remainder of pio's io_child_list, from 'cio_next' onward,
		 * cannot be affected by any side effects of reexecuting 'cio'.
		 */
		zio_link_t *zl = NULL;
		for (cio = zio_walk_children(pio, &zl); cio != NULL;
		    cio = cio_next) {
			cio_next = zio_walk_children(pio, &zl);
			mutex_enter(&pio->io_lock);
			for (int w = 0; w < ZIO_WAIT_TYPES; w++)
				pio->io_children[cio->io_child_type][w]++;
			mutex_exit(&pio->io_lock);
			zio_reexecute(cio);
		}

		/*
		 * Now that all children have been reexecuted, execute the parent.
		 * We don't reexecute "The Godfather" I/O here as it's the
		 * responsibility of the caller to wait on it.
		 */
		if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
			pio->io_queued_timestamp = gethrtime();
			zio_execute(pio);
		}
	}

	/*
	 * Mark the pool as suspended after an I/O failure.
	 *
	 * Panics if the pool's failure mode is ZIO_FAILURE_MODE_PANIC.
	 * Otherwise posts an FM_EREPORT_ZFS_IO_FAILURE ereport, creates the
	 * pool's suspend root zio if it does not exist yet, and, when zio is
	 * not NULL, attaches zio beneath that root.
	 */
	void
	zio_suspend(spa_t *spa, zio_t *zio)
	{
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
			fm_panic("Pool '%s' has encountered an uncorrectable I/O "
			    "failure and the failure mode property for this pool "
			    "is set to panic.", spa_name(spa));

		zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

		mutex_enter(&spa->spa_suspend_lock);

		if (spa->spa_suspend_zio_root == NULL)
			spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
			    ZIO_FLAG_GODFATHER);

		spa->spa_suspended = B_TRUE;

		if (zio != NULL) {
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			ASSERT(zio != spa->spa_suspend_zio_root);
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(zio_unique_parent(zio) == NULL);
			ASSERT(zio->io_stage == ZIO_STAGE_DONE);
			zio_add_child(spa->spa_suspend_zio_root, zio);
		}

		mutex_exit(&spa->spa_suspend_lock);
	}

	/*
	 * Clear the pool's suspended state, wake every thread waiting on
	 * spa_suspend_cv, and reexecute whatever was parked under the suspend
	 * root zio.  Returns 0 when nothing was parked, otherwise the result
	 * of waiting on the suspend root.
	 */
	int
	zio_resume(spa_t *spa)
	{
		zio_t *suspended;

		/* Detach the suspend root while holding the suspend lock. */
		mutex_enter(&spa->spa_suspend_lock);
		spa->spa_suspended = B_FALSE;
		cv_broadcast(&spa->spa_suspend_cv);
		suspended = spa->spa_suspend_zio_root;
		spa->spa_suspend_zio_root = NULL;
		mutex_exit(&spa->spa_suspend_lock);

		if (suspended != NULL) {
			zio_reexecute(suspended);
			return (zio_wait(suspended));
		}

		return (0);
	}

	/*
	 * Block the caller on spa_suspend_cv until spa_suspended() reports
	 * that the pool is no longer suspended.
	 */
	void
	zio_resume_wait(spa_t *spa)
	{
		mutex_enter(&spa->spa_suspend_lock);
		for (;;) {
			if (!spa_suspended(spa))
				break;
			cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
		}
		mutex_exit(&spa->spa_suspend_lock);
	}

	/*
	* ==========================================================================
	* Gang blocks.
	*
	* A gang block is a collection of small blocks that looks to the DMU
	* like one large block. When zio_dva_allocate() cannot find a block
	* of the requested size, due to either severe fragmentation or the pool
	* being nearly full, it calls zio_write_gang_block() to construct the
	* block from smaller fragments.
	*
	* A gang block consists of a gang header (zio_gbh_phys_t) and up to
	* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
	* an indirect block: it's an array of block pointers. It consumes
	* only one sector and hence is allocatable regardless of fragmentation.
	* The gang header's bps point to its gang members, which hold the data.
	*
	* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
	* as the verifier to ensure uniqueness of the SHA256 checksum.
	* Critically, the gang block bp's blk_cksum is the checksum of the data,
	* not the gang header. This ensures that data block signatures (needed for
	* deduplication) are independent of how the block is physically stored.
	*
	* Gang blocks can be nested: a gang member may itself be a gang block.
	* Thus every gang block is a tree in which root and all interior nodes are
	* gang headers, and the leaves are normal blocks that contain user data.
	* The root of the gang tree is called the gang leader.
	*
	* To perform any operation (read, rewrite, free, claim) on a gang block,
	* zio_gang_assemble() first assembles the gang tree (minus data leaves)
	* in the io_gang_tree field of the original logical i/o by recursively
	* reading the gang leader and all gang headers below it. This yields
	* an in-core tree containing the contents of every gang header and the
	* bps for every constituent of the gang block.
	*
	* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
	* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
	* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
	* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
	* zio_read_gang() is a wrapper around zio_read() that omits reading gang
	* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
	* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
	* of the gang header plus zio_checksum_compute() of the data to update the
	* gang header's blk_cksum as described above.
	*
	* The two-phase assemble/issue model solves the problem of partial failure --
	* what if you'd freed part of a gang block but then couldn't read the
	* gang header for another part? Assembling the entire gang tree first
	* ensures that all the necessary gang header I/O has succeeded before
	* starting the actual work of free, claim, or write. Once the gang tree
	* is assembled, free and claim are in-memory operations that cannot fail.
	*
	* In the event that a gang write fails, zio_dva_unallocate() walks the
	* gang tree to immediately free (i.e. insert back into the space map)
	* everything we've allocated. This ensures that we don't get ENOSPC
	* errors during repeated suspend/resume cycles due to a flaky device.
	*
	* Gang rewrites only happen during sync-to-convergence. If we can't assemble
	* the gang tree, we won't modify the block, so we can safely defer the free
	* (knowing that the block is still intact). If we can assemble the gang
	* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
	* each constituent bp and we can allocate a new block on the next sync pass.
	*
	* In all cases, the gang tree allows complete recovery from partial failure.
	* ==========================================================================
	*/

	/*
	 * Done callback for gang child zios: releases the zio's abd with
	 * abd_put().
	 */
	static void
	zio_gang_issue_func_done(zio_t *zio)
	{
		abd_t *view = zio->io_abd;

		abd_put(view);
	}

	/*
	 * Gang issue callback for reads.  When a gang node is supplied, no
	 * read is created and pio itself is returned.  Otherwise a child read
	 * of bp is created into the part of 'data' that starts at 'offset';
	 * that abd view is released by zio_gang_issue_func_done().
	 */
	static zio_t *
	zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
	    uint64_t offset)
	{
		if (gn != NULL)
			return (pio);

		return (zio_read(pio, pio->io_spa, bp,
		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
		    zio_gang_issue_func_done, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
	}

	/*
	 * Gang issue callback for rewrites.  With a gang node, the node's gang
	 * header buffer is rewritten; without one, the part of 'data' that
	 * starts at 'offset' is rewritten.  Returns the child zio, which the
	 * caller issues.
	 */
	static zio_t *
	zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn,
	    abd_t *data, uint64_t offset)
	{
		zio_t *zio;

		if (gn != NULL) {
			abd_t *gbh_abd =
			    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
			zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
			    gbh_abd, SPA_GANGBLOCKSIZE,
			    zio_gang_issue_func_done, NULL,
			    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
			    &pio->io_bookmark);
			/*
			 * As we rewrite each gang header, the pipeline will compute
			 * a new gang block header checksum for it; but no one will
			 * compute a new data checksum, so we do that here.  The one
			 * exception is the gang leader: the pipeline already computed
			 * its data checksum because that stage precedes gang assembly.
			 * (Presently, nothing actually uses interior data checksums;
			 * this is just good hygiene.)
			 */
			if (gn != pio->io_gang_leader->io_gang_tree) {
				abd_t *buf = abd_get_offset(data, offset);

				zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
				    buf, BP_GET_PSIZE(bp));

				abd_put(buf);
			}
			/*
			 * If we are here to damage data for testing purposes,
			 * leave the GBH alone so that we can detect the damage.
			 */
			if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
				zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		} else {
			zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
			    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
			    zio_gang_issue_func_done, NULL, pio->io_priority,
			    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		}

		return (zio);
	}

	/*
	 * Gang issue function for frees.  'gn', 'data' and 'offset' are unused;
	 * the size passed to zio_free_sync() is the gang header size for a gang
	 * bp and the bp's physical size otherwise.
	 */
	/* ARGSUSED */
	static zio_t *
	zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
	    uint64_t offset)
	{
		return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
		    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
		    ZIO_GANG_CHILD_FLAGS(pio)));
	}

	/*
	 * Gang issue function for claims.  'gn', 'data' and 'offset' are unused;
	 * the child claim is created with no done callback and no private data.
	 */
	/* ARGSUSED */
	static zio_t *
	zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
	    uint64_t offset)
	{
		return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
		    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
	}

	/*
	 * Per-I/O-type gang issue functions.  zio_gang_tree_issue() indexes this
	 * table with the gang leader's io_type, so the entry order must match
	 * the numbering of the I/O type enumeration.  The first and last slots
	 * are NULL: no gang issue function exists for those types.
	 */
	static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
	};

	static void zio_gang_tree_assemble_done(zio_t *zio);

	/*
	 * Allocate a zeroed gang node together with a buffer for its gang block
	 * header, publish it through *gnpp (which must currently be NULL), and
	 * return it.
	 */
	static zio_gang_node_t *
	zio_gang_node_alloc(zio_gang_node_t **gnpp)
	{
		zio_gang_node_t *node;

		ASSERT(*gnpp == NULL);

		node = kmem_zalloc(sizeof (zio_gang_node_t), KM_SLEEP);
		node->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);

		*gnpp = node;
		return (node);
	}

	/*
	 * Free a single gang node and its gang block header buffer, then clear
	 * the caller's pointer.  All children must already have been freed.
	 */
	static void
	zio_gang_node_free(zio_gang_node_t **gnpp)
	{
		zio_gang_node_t *gn = *gnpp;

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
			ASSERT(gn->gn_child[g] == NULL);

		zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
		kmem_free(gn, sizeof (*gn));
		*gnpp = NULL;
	}

	/*
	 * Recursively free the gang tree rooted at *gnpp (children first) and
	 * leave *gnpp NULL.  A NULL root is a no-op.
	 */
	static void
	zio_gang_tree_free(zio_gang_node_t **gnpp)
	{
		zio_gang_node_t *gn = *gnpp;

		if (gn == NULL)
			return;

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
			zio_gang_tree_free(&gn->gn_child[g]);

		zio_gang_node_free(gnpp);
	}

	/*
	 * Allocate a gang node at *gnpp and start an asynchronous read of the
	 * gang block header named by 'bp' into it.  The read is a child of the
	 * gang leader 'gio'; zio_gang_tree_assemble_done() continues the descent
	 * when it completes.
	 */
	static void
	zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
	{
		zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
		abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);

		ASSERT(gio->io_gang_leader == gio);
		ASSERT(BP_IS_GANG(bp));

		zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
		    zio_gang_tree_assemble_done, gn, gio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
	}

	/*
	 * Done callback for the gang header read started by
	 * zio_gang_tree_assemble().  On success the header is byteswapped if
	 * needed, and a further assemble is started for every child block
	 * pointer that is itself a gang block.
	 *
	 * NOTE(review): the error path returns before abd_put(zio->io_abd);
	 * confirm the wrapper ABD is released elsewhere in that case.
	 */
	static void
	zio_gang_tree_assemble_done(zio_t *zio)
	{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
	return;

	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
	if (BP_SHOULD_BYTESWAP(bp))
	byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);

	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	abd_put(zio->io_abd);

	/* Descend only into children that are themselves gang blocks. */
	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
	blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
	if (!BP_IS_GANG(gbp))
	continue;
	zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
	}

	/*
	 * Walk the gang tree rooted at 'gn' and issue the gang leader's I/O type
	 * against every block in it.  'data' is the leader's payload and
	 * 'offset' is the position within it that corresponds to 'bp'; the
	 * offset is advanced by each non-hole child's physical size.
	 */
	static void
	zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
	    uint64_t offset)
	{
		zio_t *gio = pio->io_gang_leader;
		zio_t *zio;

		ASSERT(BP_IS_GANG(bp) == !!gn);
		ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
		ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

		/*
		 * If you're a gang header, your data is in gn->gn_gbh.
		 * If you're a gang member, your data is in 'data' and gn == NULL.
		 */
		zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);

		if (gn != NULL) {
			ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

			for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
				blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
				if (BP_IS_HOLE(gbp))
					continue;
				zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
				    offset);
				offset += BP_GET_PSIZE(gbp);
			}
		}

		if (gn == gio->io_gang_tree && gio->io_abd != NULL)
			ASSERT3U(gio->io_size, ==, offset);

		/* An issue function may hand back pio itself; don't reissue it. */
		if (zio != pio)
			zio_nowait(zio);
	}

	/*
	 * Pipeline stage: make this zio its own gang leader and start assembling
	 * the gang tree for its (gang) block pointer.  Always continues the
	 * pipeline.
	 */
	static int
	zio_gang_assemble(zio_t *zio)
	{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	/* The header reads are issued asynchronously as children of zio. */
	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage: after all gang children are done, either issue the
	 * leader's I/O across the gang tree or, when a gang child reported an
	 * error, free the tree.  The pipeline is then reduced to the interlock
	 * stages.
	 */
	static int
	zio_gang_issue(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;

		if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE))
			return (ZIO_PIPELINE_STOP);

		ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

		if (zio->io_child_error[ZIO_CHILD_GANG] != 0) {
			zio_gang_tree_free(&zio->io_gang_tree);
		} else {
			zio_gang_tree_issue(zio, zio->io_gang_tree, bp,
			    zio->io_abd, 0);
		}

		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Ready callback for a gang member write: under the parent's io_lock,
	 * add each of this child's DVA allocated sizes to the matching DVA of
	 * the parent's block pointer.  Nothing is done when the child's bp is a
	 * hole.
	 */
	static void
	zio_write_gang_member_ready(zio_t *zio)
	{
		zio_t *pio = zio_unique_parent(zio);
		zio_t *gio = zio->io_gang_leader;
		dva_t *cdva = zio->io_bp->blk_dva;
		dva_t *pdva = pio->io_bp->blk_dva;

		if (BP_IS_HOLE(zio->io_bp))
			return;

		ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

		ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
		ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
		ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
		ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
		ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

		mutex_enter(&pio->io_lock);
		for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
			uint64_t asize;

			ASSERT(DVA_GET_GANG(&pdva[d]));
			asize = DVA_GET_ASIZE(&pdva[d]) + DVA_GET_ASIZE(&cdva[d]);
			DVA_SET_ASIZE(&pdva[d], asize);
		}
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Done callback for the gang header and gang member writes created by
	 * zio_write_gang_block(): hands the zio's io_abd to abd_put().
	 */
	static void
	zio_write_gang_done(zio_t *zio)
	{
		abd_t *done_abd = zio->io_abd;

		abd_put(done_abd);
	}

	/*
	 * Write pio's data as a gang block: allocate a gang block header, then
	 * split the payload across child writes whose block pointers live in
	 * that header.  On header allocation failure pio->io_error is set.
	 * Always returns ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_write_gang_block(zio_t *pio)
	{
		spa_t *spa = pio->io_spa;
		metaslab_class_t *mc = spa_normal_class(spa);
		blkptr_t *bp = pio->io_bp;
		zio_t *gio = pio->io_gang_leader;
		zio_t *zio;
		zio_gang_node_t *gn, **gnpp;
		zio_gbh_phys_t *gbh;
		abd_t *gbh_abd;
		uint64_t txg = pio->io_txg;
		uint64_t resid = pio->io_size;
		uint64_t lsize;
		int copies = gio->io_prop.zp_copies;
		int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
		zio_prop_t zp;
		int error;

		int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));

			flags |= METASLAB_ASYNC_ALLOC;
			VERIFY(refcount_held(&mc->mc_alloc_slots, pio));

			/*
			 * The logical zio has already placed a reservation for
			 * 'copies' allocation slots but gang blocks may require
			 * additional copies. These additional copies
			 * (i.e. gbh_copies - copies) are guaranteed to succeed
			 * since metaslab_class_throttle_reserve() always allows
			 * additional reservations for gang blocks.
			 */
			VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
			    pio, flags));
		}

		error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
		    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
		    &pio->io_alloc_list, pio);
		if (error) {
			if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
				ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
				ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));

				/*
				 * If we failed to allocate the gang block header then
				 * we remove any additional allocation reservations that
				 * we placed here. The original reservation will
				 * be removed when the logical I/O goes to the ready
				 * stage.
				 */
				metaslab_class_throttle_unreserve(mc,
				    gbh_copies - copies, pio);
			}
			pio->io_error = error;
			return (ZIO_PIPELINE_CONTINUE);
		}

		/*
		 * The gang leader anchors the tree in io_gang_tree; any other
		 * pio is itself a gang member whose io_private names its slot
		 * in the parent's node.
		 */
		if (pio == gio) {
			gnpp = &gio->io_gang_tree;
		} else {
			gnpp = pio->io_private;
			ASSERT(pio->io_ready == zio_write_gang_member_ready);
		}

		gn = zio_gang_node_alloc(gnpp);
		gbh = gn->gn_gbh;
		bzero(gbh, SPA_GANGBLOCKSIZE);
		gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);

		/*
		 * Create the gang header.
		 */
		zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
		    zio_write_gang_done, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

		/*
		 * Create and nowait the gang children.
		 */
		for (int g = 0; resid != 0; resid -= lsize, g++) {
			/* Spread what is left evenly over the remaining slots. */
			lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
			    SPA_MINBLOCKSIZE);
			ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

			zp.zp_checksum = gio->io_prop.zp_checksum;
			zp.zp_compress = ZIO_COMPRESS_OFF;
			zp.zp_type = DMU_OT_NONE;
			zp.zp_level = 0;
			zp.zp_copies = gio->io_prop.zp_copies;
			zp.zp_dedup = B_FALSE;
			zp.zp_dedup_verify = B_FALSE;
			zp.zp_nopwrite = B_FALSE;

			zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
			    abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
			    lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
			    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
			    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

			if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
				ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
				ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));

				/*
				 * Gang children won't throttle but we should
				 * account for their work, so reserve an allocation
				 * slot for them here.
				 */
				VERIFY(metaslab_class_throttle_reserve(mc,
				    zp.zp_copies, cio, flags));
			}
			zio_nowait(cio);
		}

		/*
		 * Set pio's pipeline to just wait for zio to finish.
		 */
		pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		zio_nowait(zio);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	* The zio_nop_write stage in the pipeline determines if allocating a
	* new bp is necessary. The nopwrite feature can handle writes in
	* either syncing or open context (i.e. zil writes) and as a result is
	* mutually exclusive with dedup.
	*
	* By leveraging a cryptographically secure checksum, such as SHA256, we
	* can compare the checksums of the new data and the old to determine if
	* allocating a new block is required. Note that our requirements for
	* cryptographic strength are fairly weak: there can't be any accidental
	* hash collisions, but we don't need to be secure against intentional
	* (malicious) collisions. To trigger a nopwrite, you have to be able
	* to write the file to begin with, and triggering an incorrect (hash
	* collision) nopwrite is no worse than simply writing to the file.
	* That said, there are no known attacks against the checksum algorithms
	* used for nopwrite, assuming that the salt and the checksums
	* themselves remain secret.
	*/
	static int
	zio_nop_write(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;
		blkptr_t *bp_orig = &zio->io_bp_orig;
		zio_prop_t *zp = &zio->io_prop;

		ASSERT(BP_GET_LEVEL(bp) == 0);
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
		ASSERT(zp->zp_nopwrite);
		ASSERT(!zp->zp_dedup);
		ASSERT(zio->io_bp_override == NULL);
		ASSERT(IO_IS_ALLOCATING(zio));

		/*
		 * Check to see if the original bp and the new bp have matching
		 * characteristics (i.e. same checksum, compression algorithms, etc).
		 * If they don't then just continue with the pipeline which will
		 * allocate a new bp.
		 */
		if (BP_IS_HOLE(bp_orig) ||
		    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
		    ZCHECKSUM_FLAG_NOPWRITE) ||
		    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
		    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
		    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
		    zp->zp_copies != BP_GET_NDVAS(bp_orig))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If the checksums match then reset the pipeline so that we
		 * avoid allocating a new bp and issuing any I/O.
		 */
		if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
			ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
			    ZCHECKSUM_FLAG_NOPWRITE);
			ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
			ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
			ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
			ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
			    sizeof (uint64_t)) == 0);

			/*
			 * Copy the original block pointer's contents over the
			 * new one; reassigning the local pointer would have no
			 * effect on the zio.
			 */
			*bp = *bp_orig;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	* ==========================================================================
	* Dedup
	* ==========================================================================
	*/
	/*
	 * Done callback for the per-copy reads issued by zio_ddt_read_start().
	 * Under the parent's io_lock: a successful read clears its ddt_phys entry
	 * and, if no repair buffer has been kept yet, donates its ABD as the
	 * repair buffer; in every other case the ABD is freed.
	 */
	static void
	zio_ddt_child_read_done(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;
		ddt_entry_t *dde = zio->io_private;
		zio_t *pio = zio_unique_parent(zio);
		ddt_phys_t *ddp;

		mutex_enter(&pio->io_lock);
		ddp = ddt_phys_select(dde, bp);
		if (zio->io_error != 0) {
			abd_free(zio->io_abd);
		} else {
			ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
			if (dde->dde_repair_abd == NULL)
				dde->dde_repair_abd = zio->io_abd;
			else
				abd_free(zio->io_abd);
		}
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Pipeline stage that starts a read of a dedup'd block.
	 *
	 * Normally a single DDT child read of the bp is issued.  If a DDT child
	 * error is already recorded on this zio, a repair entry is started and
	 * every other populated physical copy in the entry is read instead.
	 */
	static int
	zio_ddt_read_start(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;

		ASSERT(BP_GET_DEDUP(bp));
		ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (zio->io_child_error[ZIO_CHILD_DDT]) {
			ddt_t *ddt = ddt_select(zio->io_spa, bp);
			ddt_entry_t *dde = ddt_repair_start(ddt, bp);
			ddt_phys_t *ddp = dde->dde_phys;
			ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
			blkptr_t blk;

			ASSERT(zio->io_vsd == NULL);
			zio->io_vsd = dde;

			if (ddp_self == NULL)
				return (ZIO_PIPELINE_CONTINUE);

			for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
				/* Skip unpopulated copies and the one that failed. */
				if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
					continue;
				ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
				    &blk);
				zio_nowait(zio_read(zio, zio->io_spa, &blk,
				    abd_alloc_for_io(zio->io_size, B_TRUE),
				    zio->io_size, zio_ddt_child_read_done, dde,
				    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
				    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
			}
			return (ZIO_PIPELINE_CONTINUE);
		}

		zio_nowait(zio_read(zio, zio->io_spa, bp,
		    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage that completes a dedup read once all DDT children are
	 * done.  If a DDT child failed and no repair entry is attached yet, the
	 * zio is sent back through zio_ddt_read_start(); if a repair buffer was
	 * collected, it is copied into the zio and the DDT child error is
	 * cleared.
	 */
	static int
	zio_ddt_read_done(zio_t *zio)
	{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
	return (ZIO_PIPELINE_STOP);
	}

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_vsd;
	if (ddt == NULL) {
	ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
	return (ZIO_PIPELINE_CONTINUE);
	}
	if (dde == NULL) {
	/*
	 * No repair entry yet: rewind the stage so the next stage
	 * executed is DDT_READ_START, and redispatch to the issue taskq.
	 */
	zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
	return (ZIO_PIPELINE_STOP);
	}
	if (dde->dde_repair_abd != NULL) {
	abd_copy(zio->io_abd, dde->dde_repair_abd,
	zio->io_size);
	zio->io_child_error[ZIO_CHILD_DDT] = 0;
	}
	ddt_repair_done(ddt, dde);
	zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Return B_TRUE if the data being written by 'zio' differs from the data
	 * already associated with dedup entry 'dde' (or if that data could not
	 * be read back for comparison), B_FALSE otherwise.
	 *
	 * When the comparison has to read an existing copy, the ddt lock is
	 * dropped around the read (ddt_exit/ddt_enter) and is held again on
	 * return.
	 */
	static boolean_t
	zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
	{
		spa_t *spa = zio->io_spa;
		boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);

		/* We should never get a raw, override zio */
		ASSERT(!(zio->io_bp_override && do_raw));

		/*
		 * Note: we compare the original data, not the transformed data,
		 * because when zio->io_bp is an override bp, we will not have
		 * pushed the I/O transforms.  That's an important optimization
		 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
		 */
		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
			zio_t *lio = dde->dde_lead_zio[p];

			if (lio != NULL) {
				return (lio->io_orig_size != zio->io_orig_size ||
				    abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
				    zio->io_orig_size) != 0);
			}
		}

		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
			ddt_phys_t *ddp = &dde->dde_phys[p];

			if (ddp->ddp_phys_birth != 0) {
				arc_buf_t *abuf = NULL;
				arc_flags_t aflags = ARC_FLAG_WAIT;
				int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
				blkptr_t blk = *zio->io_bp;
				int error;

				ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

				ddt_exit(ddt);

				/*
				 * Intuitively, it would make more sense to compare
				 * io_abd than io_orig_abd in the raw case since you
				 * don't want to look at any transformations that have
				 * happened to the data. However, for raw I/Os the
				 * data will actually be the same in io_abd and
				 * io_orig_abd, so all we have to do is issue this as
				 * a raw ARC read.
				 */
				if (do_raw) {
					zio_flags |= ZIO_FLAG_RAW;
					ASSERT3U(zio->io_size, ==, zio->io_orig_size);
					ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
					    zio->io_size));
					ASSERT3P(zio->io_transform_stack, ==, NULL);
				}

				error = arc_read(NULL, spa, &blk,
				    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
				    zio_flags, &aflags, &zio->io_bookmark);

				if (error == 0) {
					if (arc_buf_size(abuf) != zio->io_orig_size ||
					    abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
					    zio->io_orig_size) != 0)
						error = SET_ERROR(EEXIST);
					arc_buf_destroy(abuf, &abuf);
				}

				ddt_enter(ddt);
				return (error != 0);
			}
		}

		return (B_FALSE);
	}

	/*
	 * Ready callback for the lead dedup child write.  On success, and under
	 * the ddt lock, record the child's block pointer in the entry's phys
	 * slot and fill in the block pointer of every parent waiting on this
	 * child.
	 */
	static void
	zio_ddt_child_write_ready(zio_t *zio)
	{
		int p = zio->io_prop.zp_copies;
		ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
		ddt_entry_t *dde = zio->io_private;
		ddt_phys_t *ddp = &dde->dde_phys[p];
		zio_link_t *zl = NULL;
		zio_t *pio;

		if (zio->io_error)
			return;

		ddt_enter(ddt);

		ASSERT(dde->dde_lead_zio[p] == zio);

		ddt_phys_fill(ddp, zio->io_bp);

		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
		    pio = zio_walk_parents(zio, &zl))
			ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

		ddt_exit(ddt);
	}

	/*
	 * Done callback for the lead dedup child write.  Under the ddt lock the
	 * lead-zio slot is released; a failed write clears the phys entry, a
	 * successful one takes one reference per parent of this zio.
	 */
	static void
	zio_ddt_child_write_done(zio_t *zio)
	{
		int p = zio->io_prop.zp_copies;
		ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
		ddt_entry_t *dde = zio->io_private;
		ddt_phys_t *ddp = &dde->dde_phys[p];

		ddt_enter(ddt);

		ASSERT(ddp->ddp_refcnt == 0);
		ASSERT(dde->dde_lead_zio[p] == zio);
		dde->dde_lead_zio[p] = NULL;

		if (zio->io_error != 0) {
			ddt_phys_clear(ddp);
		} else {
			zio_link_t *zl = NULL;

			for (;;) {
				if (zio_walk_parents(zio, &zl) == NULL)
					break;
				ddt_phys_addref(ddp);
			}
		}

		ddt_exit(ddt);
	}

	/*
	 * Done callback for a ditto-copy write.  Under the ddt lock the ditto
	 * lead-zio slot is released; on success any previously recorded ditto
	 * block is freed and the slot is refilled from the new block pointer.
	 */
	static void
	zio_ddt_ditto_write_done(zio_t *zio)
	{
		int p = DDT_PHYS_DITTO;
		zio_prop_t *zp = &zio->io_prop;
		blkptr_t *bp = zio->io_bp;
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_private;
		ddt_phys_t *ddp = &dde->dde_phys[p];
		ddt_key_t *ddk = &dde->dde_key;

		ddt_enter(ddt);

		ASSERT(ddp->ddp_refcnt == 0);
		ASSERT(dde->dde_lead_zio[p] == zio);
		dde->dde_lead_zio[p] = NULL;

		if (zio->io_error != 0) {
			ddt_exit(ddt);
			return;
		}

		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);

		ddt_exit(ddt);
	}

	/*
	 * Pipeline stage for a dedup write.
	 *
	 * Looks up (or creates) the dedup entry for the block.  On a verified
	 * collision the write is converted to a stronger-checksum or non-dedup
	 * write.  Otherwise the zio either joins an existing copy (taking a
	 * reference or becoming a parent of the in-flight lead write), adopts
	 * its override bp, or launches the lead child write itself.  A ditto
	 * child write is launched when more ditto copies are needed than exist.
	 */
	static int
	zio_ddt_write(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		blkptr_t *bp = zio->io_bp;
		uint64_t txg = zio->io_txg;
		zio_prop_t *zp = &zio->io_prop;
		int p = zp->zp_copies;
		int ditto_copies;
		zio_t *cio = NULL;
		zio_t *dio = NULL;
		ddt_t *ddt = ddt_select(spa, bp);
		ddt_entry_t *dde;
		ddt_phys_t *ddp;

		ASSERT(BP_GET_DEDUP(bp));
		ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
		ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
		ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));

		ddt_enter(ddt);
		dde = ddt_lookup(ddt, bp, B_TRUE);
		ddp = &dde->dde_phys[p];

		if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
			/*
			 * If we're using a weak checksum, upgrade to a strong checksum
			 * and try again.  If we're already using a strong checksum,
			 * we can't resolve it, so just convert to an ordinary write.
			 * (And automatically e-mail a paper to Nature?)
			 */
			if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
			    ZCHECKSUM_FLAG_DEDUP)) {
				zp->zp_checksum = spa_dedup_checksum(spa);
				zio_pop_transforms(zio);
				zio->io_stage = ZIO_STAGE_OPEN;
				BP_ZERO(bp);
			} else {
				zp->zp_dedup = B_FALSE;
				BP_SET_DEDUP(bp, B_FALSE);
			}
			ASSERT(!BP_GET_DEDUP(bp));
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
		ASSERT(ditto_copies < SPA_DVAS_PER_BP);

		if (ditto_copies > ddt_ditto_copies_present(dde) &&
		    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
			zio_prop_t czp = *zp;

			czp.zp_copies = ditto_copies;

			/*
			 * If we arrived here with an override bp, we won't have run
			 * the transform stack, so we won't have the data we need to
			 * generate a child i/o.  So, toss the override bp and restart.
			 * This is safe, because using the override bp is just an
			 * optimization; and it's rare, so the cost doesn't matter.
			 */
			if (zio->io_bp_override) {
				zio_pop_transforms(zio);
				zio->io_stage = ZIO_STAGE_OPEN;
				zio->io_pipeline = ZIO_WRITE_PIPELINE;
				zio->io_bp_override = NULL;
				BP_ZERO(bp);
				ddt_exit(ddt);
				return (ZIO_PIPELINE_CONTINUE);
			}

			dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
			    zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
			    NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

			zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
			dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
		}

		if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
			if (ddp->ddp_phys_birth != 0)
				ddt_bp_fill(ddp, bp, txg);
			if (dde->dde_lead_zio[p] != NULL)
				zio_add_child(zio, dde->dde_lead_zio[p]);
			else
				ddt_phys_addref(ddp);
		} else if (zio->io_bp_override) {
			ASSERT(bp->blk_birth == txg);
			ASSERT(BP_EQUAL(bp, zio->io_bp_override));
			ddt_phys_fill(ddp, bp);
			ddt_phys_addref(ddp);
		} else {
			cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
			    zio->io_orig_size, zio->io_orig_size, zp,
			    zio_ddt_child_write_ready, NULL, NULL,
			    zio_ddt_child_write_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

			zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
			dde->dde_lead_zio[p] = cio;
		}

		ddt_exit(ddt);

		/* Issue the children only after the ddt lock has been dropped. */
		if (cio)
			zio_nowait(cio);
		if (dio)
			zio_nowait(dio);

		return (ZIO_PIPELINE_CONTINUE);
	}

	ddt_entry_t freedde; / for debugging */

	static int
	zio_ddt_free(zio_t *zio)
	{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	* ==========================================================================
	* Allocate and free blocks
	* ==========================================================================
	*/

	/*
	 * With spa_alloc_lock held, pick the first zio in the allocation tree
	 * and try to reserve its allocation slots.  Returns that zio (removed
	 * from the tree) on success, or NULL if the tree is empty or the
	 * reservation was refused.
	 */
	static zio_t *
	zio_io_to_allocate(spa_t *spa)
	{
		metaslab_class_t *normal;
		zio_t *zio;

		ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));

		zio = avl_first(&spa->spa_alloc_tree);
		if (zio == NULL)
			return (NULL);

		ASSERT(IO_IS_ALLOCATING(zio));

		/*
		 * Try to place a reservation for this zio. If we're unable to
		 * reserve then we throttle.
		 */
		normal = spa_normal_class(spa);
		if (!metaslab_class_throttle_reserve(normal, zio->io_prop.zp_copies,
		    zio, 0))
			return (NULL);

		avl_remove(&spa->spa_alloc_tree, zio);
		ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);

		return (zio);
	}

	/*
	 * Pipeline stage that applies the allocation throttle.
	 *
	 * Sync writes, gang children, NODATA zios, and any zio when the throttle
	 * is disabled pass straight through.  Otherwise the zio is queued on the
	 * spa's allocation tree; it continues only if it is the one selected by
	 * zio_io_to_allocate().  If a different zio was selected, that one is
	 * dispatched to a taskq and this zio stops.
	 */
	static int
	zio_dva_throttle(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		zio_t *nio;

		if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
		    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
		    zio->io_child_type == ZIO_CHILD_GANG ||
		    zio->io_flags & ZIO_FLAG_NODATA) {
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

		ASSERT3U(zio->io_queued_timestamp, >, 0);
		ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);

		mutex_enter(&spa->spa_alloc_lock);

		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		avl_add(&spa->spa_alloc_tree, zio);

		nio = zio_io_to_allocate(zio->io_spa);
		mutex_exit(&spa->spa_alloc_lock);

		if (nio == zio)
			return (ZIO_PIPELINE_CONTINUE);

		if (nio != NULL) {
			ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
			/*
			 * We are passing control to a new zio so make sure that
			 * it is processed by a different thread. We do this to
			 * avoid stack overflows that can occur when parents are
			 * throttled and children are making progress. We allow
			 * it to go to the head of the taskq since it's already
			 * been waiting.
			 */
			zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
		}
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * Select the next throttled zio that can now reserve its allocation
	 * slots, if any, and dispatch it to the issue taskq (at the front).
	 */
	void
	zio_allocate_dispatch(spa_t *spa)
	{
		zio_t *zio;

		mutex_enter(&spa->spa_alloc_lock);
		zio = zio_io_to_allocate(spa);
		mutex_exit(&spa->spa_alloc_lock);

		if (zio != NULL) {
			ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
			ASSERT0(zio->io_error);
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
		}
	}

	/*
	 * Pipeline stage that allocates DVAs for a write.
	 *
	 * A zio with no gang leader becomes its own leader.  If the allocation
	 * fails with ENOSPC and the block is larger than SPA_MINBLOCKSIZE, the
	 * write falls back to a gang block; any other failure is recorded in
	 * zio->io_error.
	 */
	static int
	zio_dva_allocate(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		metaslab_class_t *mc = spa_normal_class(spa);
		blkptr_t *bp = zio->io_bp;
		int error;
		int flags = 0;

		if (zio->io_gang_leader == NULL) {
			ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
			zio->io_gang_leader = zio;
		}

		ASSERT(BP_IS_HOLE(bp));
		ASSERT0(BP_GET_NDVAS(bp));
		ASSERT3U(zio->io_prop.zp_copies, >, 0);
		ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

		if (zio->io_flags & ZIO_FLAG_NODATA) {
			flags |= METASLAB_DONT_THROTTLE;
		}
		if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
			flags |= METASLAB_GANG_CHILD;
		}
		if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
			flags |= METASLAB_ASYNC_ALLOC;
		}

		error = metaslab_alloc(spa, mc, zio->io_size, bp,
		    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
		    &zio->io_alloc_list, zio);

		if (error != 0) {
			spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
			    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
			    error);
			if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
				return (zio_write_gang_block(zio));
			zio->io_error = error;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage that releases the zio's block via metaslab_free()
	 * (last argument B_FALSE) and continues the pipeline.
	 */
	static int
	zio_dva_free(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		blkptr_t *freed_bp = zio->io_bp;

		metaslab_free(spa, freed_bp, zio->io_txg, B_FALSE);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage that claims the zio's block via metaslab_claim().  A
	 * non-zero result is stored in zio->io_error; the pipeline continues
	 * either way.
	 */
	static int
	zio_dva_claim(zio_t *zio)
	{
		int claim_err = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);

		if (claim_err != 0)
			zio->io_error = claim_err;

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Undo an allocation.  This is used by zio_done() when an I/O fails
	 * and we want to give back the block we just allocated.
	 * This handles both normal blocks and gang blocks: for a gang node the
	 * function recurses over every child slot of the gang header.
	 */
	static void
	zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
	{
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
		ASSERT(zio->io_bp_override == NULL);

		if (!BP_IS_HOLE(bp))
			metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

		if (gn != NULL) {
			for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
				zio_dva_unallocate(zio, gn->gn_child[g],
				    &gn->gn_gbh->zg_blkptr[g]);
			}
		}
	}

	/*
	 * Try to allocate an intent log block.  Return 0 on success, errno on
	 * failure.
	 *
	 * The log class is tried first and the normal class second.  On success
	 * *slog is set to TRUE when the block came from the log class and FALSE
	 * when it came from the normal class, and new_bp's properties are filled
	 * in.  On failure *slog is not written.
	 */
	int
	zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
	    uint64_t size, boolean_t *slog)
	{
		int error;
		zio_alloc_list_t io_alloc_list;

		ASSERT(txg > spa_syncing_txg(spa));

		metaslab_trace_init(&io_alloc_list);
		error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
		    txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
		if (error == 0) {
			*slog = TRUE;
		} else {
			error = metaslab_alloc(spa, spa_normal_class(spa), size,
			    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
			    &io_alloc_list, NULL);
			if (error == 0)
				*slog = FALSE;
		}
		metaslab_trace_fini(&io_alloc_list);

		if (error == 0) {
			BP_SET_LSIZE(new_bp, size);
			BP_SET_PSIZE(new_bp, size);
			BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
			BP_SET_CHECKSUM(new_bp,
			    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
			    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
			BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
			BP_SET_LEVEL(new_bp, 0);
			BP_SET_DEDUP(new_bp, 0);
			BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
		} else {
			zfs_dbgmsg("%s: zil block allocation failure: "
			    "size %llu, error %d", spa_name(spa), size, error);
		}

		return (error);
	}

	/*
	 * Free an intent log block.  The block pointer must be of type
	 * DMU_OT_INTENT_LOG and must not be a gang block.
	 */
	void
	zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
	{
		ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
		ASSERT(!BP_IS_GANG(bp));

		zio_free(spa, txg, bp);
	}

	/*
	* ==========================================================================
	* Read, write and delete to physical devices
	* ==========================================================================
	*/


	/*
	* Issue an I/O to the underlying vdev. Typically the issue pipeline
	* stops after this stage and will resume upon I/O completion.
	* However, there are instances where the vdev layer may need to
	* continue the pipeline when an I/O was not issued. Since the I/O
	* that was sent to the vdev layer might be different than the one
	* currently active in the pipeline (see vdev_queue_io()), we explicitly
	* force the underlying vdev layers to call either zio_execute() or
	* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
	*/
	static int
	zio_vdev_io_start(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		uint64_t align;
		spa_t *spa = zio->io_spa;

		ASSERT(zio->io_error == 0);
		ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

		if (vd == NULL) {
			if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
				spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

			/*
			 * The mirror_ops handle multiple DVAs in a single BP.
			 */
			vdev_mirror_ops.vdev_op_io_start(zio);
			return (ZIO_PIPELINE_STOP);
		}

		/* Immediate-priority frees on a leaf only update the TRIM map. */
		if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
		    zio->io_priority == ZIO_PRIORITY_NOW) {
			trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT3P(zio->io_logical, !=, zio);
		if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
			ASSERT(zio->io_flags &
			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
			    ZIO_FLAG_INDUCE_DAMAGE));
		}

		/*
		 * We keep track of time-sensitive I/Os so that the scan thread
		 * can quickly react to certain workloads.  In particular, we care
		 * about non-scrubbing, top-level reads and writes with the following
		 * characteristics:
		 *	- synchronous writes of user data to non-slog devices
		 *	- any reads of user data
		 * When these conditions are met, adjust the timestamp of spa_last_io
		 * which allows the scan thread to adjust its workload accordingly.
		 */
		if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
		    vd == vd->vdev_top && !vd->vdev_islog &&
		    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
		    zio->io_txg != spa_syncing_txg(spa)) {
			uint64_t old = spa->spa_last_io;
			uint64_t new = ddi_get_lbolt64();
			if (old != new)
				(void) atomic_cas_64(&spa->spa_last_io, old, new);
		}

		align = 1ULL << vd->vdev_top->vdev_ashift;

		if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
		    P2PHASE(zio->io_size, align) != 0) {
			/* Transform logical writes to be a full physical block size. */
			uint64_t asize = P2ROUNDUP(zio->io_size, align);
			abd_t *abuf = NULL;
			if (zio->io_type == ZIO_TYPE_READ ||
			    zio->io_type == ZIO_TYPE_WRITE)
				abuf = abd_alloc_sametype(zio->io_abd, asize);
			ASSERT(vd == vd->vdev_top);
			if (zio->io_type == ZIO_TYPE_WRITE) {
				abd_copy(abuf, zio->io_abd, zio->io_size);
				abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
			}
			zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
			    zio_subblock);
		}

		/*
		 * If this is not a physical io, make sure that it is properly aligned
		 * before proceeding.
		 */
		if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
			ASSERT0(P2PHASE(zio->io_offset, align));
			ASSERT0(P2PHASE(zio->io_size, align));
		} else {
			/*
			 * For the physical io we allow alignment
			 * to a logical block size.
			 */
			uint64_t log_align =
			    1ULL << vd->vdev_top->vdev_logical_ashift;
			ASSERT0(P2PHASE(zio->io_offset, log_align));
			ASSERT0(P2PHASE(zio->io_size, log_align));
		}

		VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

		/*
		 * If this is a repair I/O, and there's no self-healing involved --
		 * that is, we're just resilvering what we expect to resilver --
		 * then don't do the I/O unless zio's txg is actually in vd's DTL.
		 * This prevents spurious resilvering with nested replication.
		 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
		 * A is out of date, we'll read from C+D, then use the data to
		 * resilver A+B -- but we don't actually want to resilver B, just A.
		 * The top-level mirror has no way to know this, so instead we just
		 * discard unnecessary repairs as we work our way down the vdev tree.
		 * The same logic applies to any form of nested replication:
		 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
		 */
		if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
		    zio->io_txg != 0 &&	/* not a delegated i/o */
		    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
			ASSERT(zio->io_type == ZIO_TYPE_WRITE);
			zio_vdev_io_bypass(zio);
			return (ZIO_PIPELINE_CONTINUE);
		}

		if (vd->vdev_ops->vdev_op_leaf) {
			switch (zio->io_type) {
			case ZIO_TYPE_READ:
				if (vdev_cache_read(zio))
					return (ZIO_PIPELINE_CONTINUE);
				/* FALLTHROUGH */
			case ZIO_TYPE_WRITE:
			case ZIO_TYPE_FREE:
				/* vdev_queue_io() may hand back a different zio. */
				if ((zio = vdev_queue_io(zio)) == NULL)
					return (ZIO_PIPELINE_STOP);

				if (!vdev_accessible(vd, zio)) {
					zio->io_error = SET_ERROR(ENXIO);
					zio_interrupt(zio);
					return (ZIO_PIPELINE_STOP);
				}
				break;
			}
			/*
			 * Note that we ignore repair writes for TRIM because they can
			 * conflict with normal writes. This isn't an issue because, by
			 * definition, we only repair blocks that aren't freed.
			 */
			if (zio->io_type == ZIO_TYPE_WRITE &&
			    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
			    !trim_map_write_start(zio))
				return (ZIO_PIPELINE_STOP);
		}

		vd->vdev_ops->vdev_op_io_start(zio);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * Pipeline stage that runs after the vdev I/O itself has finished.
	 *
	 * Once all vdev children are done, a leaf-vdev zio has its TRIM-map,
	 * queue and cache bookkeeping completed, injected device/label faults
	 * applied, and any resulting error classified.  The zio is then handed
	 * to the vdev's io_done method (vdev_mirror_ops when there is no vdev).
	 *
	 * Returns ZIO_PIPELINE_STOP while vdev children are outstanding and
	 * ZIO_PIPELINE_CONTINUE otherwise.
	 */
	static int
	zio_vdev_io_done(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;
		vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
		boolean_t unexpected_error = B_FALSE;

		if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
			return (ZIO_PIPELINE_STOP);
		}

		ASSERT(zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

		if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
		    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
		    zio->io_type == ZIO_TYPE_FREE)) {

			/* Repair writes never entered the TRIM map (see io_start). */
			if (zio->io_type == ZIO_TYPE_WRITE &&
			    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
				trim_map_write_done(zio);

			vdev_queue_io_done(zio);

			if (zio->io_type == ZIO_TYPE_WRITE)
				vdev_cache_write(zio);

			/* Fault injection only applies to otherwise clean I/O. */
			if (zio_injection_enabled && zio->io_error == 0)
				zio->io_error = zio_handle_device_injection(vd,
				    zio, EIO);

			if (zio_injection_enabled && zio->io_error == 0)
				zio->io_error = zio_handle_label_injection(zio, EIO);

			if (zio->io_error) {
				if (zio->io_error == ENOTSUP &&
				    zio->io_type == ZIO_TYPE_FREE) {
					/* Not all devices support TRIM. */
				} else if (!vdev_accessible(vd, zio)) {
					zio->io_error = SET_ERROR(ENXIO);
				} else {
					unexpected_error = B_TRUE;
				}
			}
		}

		ops->vdev_op_io_done(zio);

		/* Probe only after io_done has run on the failed zio. */
		if (unexpected_error)
			VERIFY(vdev_probe(vd, zio) == NULL);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Default checksum-report finisher, used for non-raidz zios.  The bad
	 * data read from disk was copied aside into zcr_cbdata, so finishing the
	 * ereport only requires passing that copy along with the good data.
	 */
	static void
	zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const void *good_buf)
	{
		/* The saved copy is used as-is; nothing to post-process. */
		zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata,
		    B_FALSE);
	}

	/ARGSUSED/
	void
	zio_vsd_default_cksum_report(zio_t zio, zio_cksum_report_t zcr, void *ignored)
	{
	void *buf = zio_buf_alloc(zio->io_size);

	abd_copy_to_buf(buf, zio->io_abd, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
	}

	/*
	 * Pipeline stage that assesses the result of the vdev I/O.
	 *
	 * After all vdev children are done it drops the SCL_ZIO config lock for
	 * vdev-less zios (unless they are config writers), frees any
	 * vdev-specific data, applies fault injection, updates TRIM statistics,
	 * and then either requeues a failed vdev-less zio for one retry or
	 * records the consequences of the error on the vdev.
	 *
	 * Returns ZIO_PIPELINE_STOP when waiting for children or when the zio
	 * was requeued for retry; ZIO_PIPELINE_CONTINUE otherwise.
	 */
	static int
	zio_vdev_io_assess(zio_t *zio)
	{
		vdev_t *vd = zio->io_vd;

		if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
			return (ZIO_PIPELINE_STOP);
		}

		if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_exit(zio->io_spa, SCL_ZIO, zio);

		if (zio->io_vsd != NULL) {
			zio->io_vsd_ops->vsd_free(zio);
			zio->io_vsd = NULL;
		}

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_fault_injection(zio, EIO);

		/* Account for TRIM (free) requests that were actually issued. */
		if (zio->io_type == ZIO_TYPE_FREE &&
		    zio->io_priority != ZIO_PRIORITY_NOW) {
			switch (zio->io_error) {
			case 0:
				ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
				ZIO_TRIM_STAT_BUMP(success);
				break;
			case EOPNOTSUPP:
				ZIO_TRIM_STAT_BUMP(unsupported);
				break;
			default:
				ZIO_TRIM_STAT_BUMP(failed);
				break;
			}
		}

		/*
		 * If the I/O failed, determine whether we should attempt to retry it.
		 *
		 * On retry, we cut in line in the issue queue, since we don't want
		 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
		 */
		if (zio->io_error && vd == NULL &&
		    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
			ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
			zio->io_error = 0;
			zio->io_flags |= ZIO_FLAG_IO_RETRY |
			    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
			zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
			    zio_requeue_io_start_cut_in_line);
			return (ZIO_PIPELINE_STOP);
		}

		/*
		 * If we got an error on a leaf device, convert it to ENXIO
		 * if the device is not accessible at all.
		 */
		if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
		    !vdev_accessible(vd, zio))
			zio->io_error = SET_ERROR(ENXIO);

		/*
		 * If we can't write to an interior vdev (mirror or RAID-Z),
		 * set vdev_cant_write so that we stop trying to allocate from it.
		 */
		if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
		    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
			vd->vdev_cant_write = B_TRUE;
		}

		/*
		 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
		 * attempts will ever succeed. In this case we set a persistent bit so
		 * that we don't bother with it in the future.
		 */
		if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
		    zio->io_type == ZIO_TYPE_IOCTL &&
		    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
			vd->vdev_nowritecache = B_TRUE;

		/* A failed zio runs only the interlock stages from here on. */
		if (zio->io_error)
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
		    zio->io_physdone != NULL) {
			ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
			ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
			zio->io_physdone(zio->io_logical);
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Step io_stage back by one stage bit so that the VDEV_IO_START stage
	 * is executed again.  The zio must currently be in
	 * ZIO_STAGE_VDEV_IO_START and must not carry an error.
	 */
	void
	zio_vdev_io_reissue(zio_t *zio)
	{
		ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
		ASSERT(zio->io_error == 0);

		zio->io_stage = zio->io_stage >> 1;
	}

	/*
	 * Step io_stage back by one stage bit so that the VDEV_IO_DONE stage is
	 * executed again.  The zio must currently be in ZIO_STAGE_VDEV_IO_DONE.
	 */
	void
	zio_vdev_io_redone(zio_t *zio)
	{
		ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

		zio->io_stage = zio->io_stage >> 1;
	}

	/*
	 * Skip the vdev I/O for this zio: flag it ZIO_FLAG_IO_BYPASS and set
	 * io_stage to the stage bit immediately below ZIO_STAGE_VDEV_IO_ASSESS.
	 * The zio must currently be in ZIO_STAGE_VDEV_IO_START and must not
	 * carry an error.
	 */
	void
	zio_vdev_io_bypass(zio_t *zio)
	{
		ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
		ASSERT(zio->io_error == 0);

		zio->io_flags |= ZIO_FLAG_IO_BYPASS;
		zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
	}

	/*
	* ==========================================================================
	* Generate and verify checksums
	* ==========================================================================
	*/
	/*
	 * Pipeline stage that computes the checksum for an outgoing zio.
	 *
	 * The algorithm comes from the zio's properties when there is no block
	 * pointer, is the gang-header checksum for gang children of a gang
	 * block, and is otherwise read from the block pointer.  Always returns
	 * ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_checksum_generate(zio_t *zio)
	{
		blkptr_t *blk = zio->io_bp;
		enum zio_checksum alg;

		if (blk == NULL) {
			/*
			 * This is zio_write_phys().
			 * We're either generating a label checksum, or none at all.
			 */
			alg = zio->io_prop.zp_checksum;

			if (alg == ZIO_CHECKSUM_OFF)
				return (ZIO_PIPELINE_CONTINUE);

			ASSERT(alg == ZIO_CHECKSUM_LABEL);
		} else if (BP_IS_GANG(blk) &&
		    zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			alg = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			alg = BP_GET_CHECKSUM(blk);
		}

		zio_checksum_compute(zio, alg, zio->io_abd, zio->io_size);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Pipeline stage that verifies the checksum of data that was read.
	 *
	 * Any error from zio_checksum_error() is stored in io_error; an ECKSUM
	 * on a non-speculative zio additionally starts a checksum ereport.
	 * Always returns ZIO_PIPELINE_CONTINUE.
	 */
	static int
	zio_checksum_verify(zio_t *zio)
	{
		zio_bad_cksum_t bad;
		blkptr_t *blk = zio->io_bp;
		int err;

		ASSERT(zio->io_vd != NULL);

		if (blk == NULL) {
			/*
			 * This is zio_read_phys().
			 * We're either verifying a label checksum, or nothing at all.
			 */
			if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
				return (ZIO_PIPELINE_CONTINUE);

			ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
		}

		err = zio_checksum_error(zio, &bad);
		if (err == 0)
			return (ZIO_PIPELINE_CONTINUE);

		zio->io_error = err;

		/* Only genuine checksum mismatches on real reads are reported. */
		if (err != ECKSUM || (zio->io_flags & ZIO_FLAG_SPECULATIVE))
			return (ZIO_PIPELINE_CONTINUE);

		zfs_ereport_start_checksum(zio->io_spa,
		    zio->io_vd, zio, zio->io_offset,
		    zio->io_size, NULL, &bad);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Remove the checksum-verify stage from this zio's pipeline.  RAID-Z
	 * calls this so that the checksum is not computed a second time.
	 */
	void
	zio_checksum_verified(zio_t *zio)
	{
		zio->io_pipeline = zio->io_pipeline & ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/*
	* ==========================================================================
	* Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
	* An error of 0 indicates success. ENXIO indicates whole-device failure,
	* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
	* indicate errors that are specific to one I/O, and most likely permanent.
	* Any other error is presumed to be worse because we weren't expecting it.
	* ==========================================================================
	*/
	int
	zio_worst_error(int e1, int e2)
	{
		static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
		int nranks = sizeof (zio_error_rank) / sizeof (zio_error_rank[0]);
		int rank1 = 0;
		int rank2 = 0;

		/*
		 * An error's rank is its index in the table; errors not in the
		 * table all get rank 'nranks', i.e. worse than any listed error.
		 */
		while (rank1 < nranks && zio_error_rank[rank1] != e1)
			rank1++;

		while (rank2 < nranks && zio_error_rank[rank2] != e2)
			rank2++;

		/* On equal rank the second argument is returned. */
		if (rank1 > rank2)
			return (e1);
		return (e2);
	}

	/*
	* ==========================================================================
	* I/O completion
	* ==========================================================================
	*/
	/*
	 * Pipeline stage that marks a zio as ready.
	 *
	 * After gang and DDT children are ready it runs the io_ready callback,
	 * snapshots the block pointer into io_bp_copy, handles a failed
	 * allocation, sets io_state[ZIO_WAIT_READY] and notifies the parents
	 * that were present at that moment.
	 *
	 * Returns ZIO_PIPELINE_STOP while waiting for children and
	 * ZIO_PIPELINE_CONTINUE otherwise.
	 */
	static int
	zio_ready(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;
		zio_t *pio, *pio_next;
		zio_link_t *zl = NULL;

		if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
		    ZIO_WAIT_READY)) {
			return (ZIO_PIPELINE_STOP);
		}

		if (zio->io_ready) {
			ASSERT(IO_IS_ALLOCATING(zio));
			ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
			    (zio->io_flags & ZIO_FLAG_NOPWRITE));
			ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

			zio->io_ready(zio);
		}

		if (bp != NULL && bp != &zio->io_bp_copy)
			zio->io_bp_copy = *bp;

		if (zio->io_error != 0) {
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

			if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
				ASSERT(IO_IS_ALLOCATING(zio));
				ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
				/*
				 * We were unable to allocate anything, unreserve and
				 * issue the next I/O to allocate.
				 */
				metaslab_class_throttle_unreserve(
				    spa_normal_class(zio->io_spa),
				    zio->io_prop.zp_copies, zio);
				zio_allocate_dispatch(zio->io_spa);
			}
		}

		/* Publish readiness and fetch the first parent under io_lock. */
		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_READY] = 1;
		pio = zio_walk_parents(zio, &zl);
		mutex_exit(&zio->io_lock);

		/*
		 * As we notify zio's parents, new parents could be added.
		 * New parents go to the head of zio's io_parent_list, however,
		 * so we will (correctly) not notify them.  The remainder of zio's
		 * io_parent_list, from 'pio_next' onward, cannot change because
		 * all parents must wait for us to be done before they can be done.
		 */
		for (; pio != NULL; pio = pio_next) {
			pio_next = zio_walk_parents(zio, &zl);
			zio_notify_parent(pio, zio, ZIO_WAIT_READY);
		}

		if (zio->io_flags & ZIO_FLAG_NODATA) {
			if (BP_IS_GANG(bp)) {
				zio->io_flags &= ~ZIO_FLAG_NODATA;
			} else {
				ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
				zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
			}
		}

		if (zio_injection_enabled &&
		    zio->io_spa->spa_syncing_txg == zio->io_txg)
			zio_handle_ignored_writes(zio);

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Update the allocation throttle accounting for a completed child write.
	 *
	 * 'zio' is a top-level-vdev child of an allocating async write.  The
	 * zio that performed the allocation is located (the unique parent, or
	 * the grandparent when the parent is a rewrite gang child), its
	 * metaslab-group allocation count and class reservation are released,
	 * and the allocation queue is given a chance to issue more work.
	 */
	static void
	zio_dva_throttle_done(zio_t *zio)
	{
		zio_t *lio = zio->io_logical;
		zio_t *pio = zio_unique_parent(zio);
		vdev_t *vd = zio->io_vd;
		int flags = METASLAB_ASYNC_ALLOC;

		ASSERT3P(zio->io_bp, !=, NULL);
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
		ASSERT(vd != NULL);
		ASSERT3P(vd, ==, vd->vdev_top);
		ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
		ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
		ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
		ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));

		/*
		 * Parents of gang children can have two flavors -- ones that
		 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
		 * and ones that allocated the constituent blocks.  The allocation
		 * throttle needs to know the allocating parent zio so we must find
		 * it here.
		 */
		if (pio->io_child_type == ZIO_CHILD_GANG) {
			/*
			 * If our parent is a rewrite gang child then our grandparent
			 * would have been the one that performed the allocation.
			 */
			if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
				pio = zio_unique_parent(pio);
			flags |= METASLAB_GANG_CHILD;
		}

		ASSERT(IO_IS_ALLOCATING(pio));
		ASSERT3P(zio, !=, zio->io_logical);
		ASSERT(zio->io_logical != NULL);
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
		ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);

		mutex_enter(&pio->io_lock);
		metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
		mutex_exit(&pio->io_lock);

		metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
		    1, pio);

		/*
		 * Call into the pipeline to see if there is more work that
		 * needs to be done.  If there is work to be done it will be
		 * dispatched to another taskq thread.
		 */
		zio_allocate_dispatch(zio->io_spa);
	}

	/*
	 * Final pipeline stage of every zio.
	 *
	 * Once all children of every type are done, this stage:
	 *   - updates and verifies allocation-throttle accounting;
	 *   - inherits vdev/gang/ddt child errors, finishes pending checksum
	 *     reports, pops transforms and updates vdev statistics;
	 *   - posts ereports and decides whether a failed logical zio must be
	 *     reexecuted or suspended;
	 *   - either hands the zio off for reexecution (notify the parent,
	 *     suspend, or dispatch zio_reexecute), or completes it: run the
	 *     io_done callback, detach from and notify all parents, then wake
	 *     the waiter or destroy the zio.
	 *
	 * Every return path yields ZIO_PIPELINE_STOP.
	 */
	static int
	zio_done(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		zio_t *lio = zio->io_logical;
		blkptr_t *bp = zio->io_bp;
		vdev_t *vd = zio->io_vd;
		uint64_t psize = zio->io_size;
		zio_t *pio, *pio_next;
		metaslab_class_t *mc = spa_normal_class(spa);
		zio_link_t *zl = NULL;

		/*
		 * If our children haven't all completed,
		 * wait for them and then repeat this pipeline stage.
		 */
		if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
			return (ZIO_PIPELINE_STOP);
		}

		/*
		 * If the allocation throttle is enabled, then update the accounting.
		 * We only track child I/Os that are part of an allocating async
		 * write.  We must do this since the allocation is performed
		 * by the logical I/O but the actual write is done by child I/Os.
		 */
		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
		    zio->io_child_type == ZIO_CHILD_VDEV) {
			ASSERT(mc->mc_alloc_throttle_enabled);
			zio_dva_throttle_done(zio);
		}

		/*
		 * If the allocation throttle is enabled, verify that
		 * we have decremented the refcounts for every I/O that was throttled.
		 */
		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
			ASSERT(zio->io_type == ZIO_TYPE_WRITE);
			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
			ASSERT(bp != NULL);
			metaslab_group_alloc_verify(spa, zio->io_bp, zio);
			VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
		}

		for (int c = 0; c < ZIO_CHILD_TYPES; c++)
			for (int w = 0; w < ZIO_WAIT_TYPES; w++)
				ASSERT(zio->io_children[c][w] == 0);

		if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
			ASSERT(bp->blk_pad[0] == 0);
			ASSERT(bp->blk_pad[1] == 0);
			ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
			    (bp == zio_unique_parent(zio)->io_bp));
			if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
			    zio->io_bp_override == NULL &&
			    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
				ASSERT(!BP_SHOULD_BYTESWAP(bp));
				ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
				ASSERT(BP_COUNT_GANG(bp) == 0 ||
				    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
			}
			if (zio->io_flags & ZIO_FLAG_NOPWRITE)
				VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
		}

		/*
		 * If there were child vdev/gang/ddt errors, they apply to us now.
		 */
		zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
		zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
		zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

		/*
		 * If the I/O on the transformed data was successful, generate any
		 * checksum reports now while we still have the transformed data.
		 */
		if (zio->io_error == 0) {
			while (zio->io_cksum_report != NULL) {
				zio_cksum_report_t *zcr = zio->io_cksum_report;
				uint64_t align = zcr->zcr_align;
				uint64_t asize = P2ROUNDUP(psize, align);
				char *abuf = NULL;
				abd_t *adata = zio->io_abd;

				/* Pad the data with zeroes up to the report's alignment. */
				if (asize != psize) {
					adata = abd_alloc_linear(asize, B_TRUE);
					abd_copy(adata, zio->io_abd, psize);
					abd_zero_off(adata, psize, asize - psize);
				}

				if (adata != NULL)
					abuf = abd_borrow_buf_copy(adata, asize);

				zio->io_cksum_report = zcr->zcr_next;
				zcr->zcr_next = NULL;
				zcr->zcr_finish(zcr, abuf);
				zfs_ereport_free_checksum(zcr);

				if (adata != NULL)
					abd_return_buf(adata, abuf, asize);

				if (asize != psize)
					abd_free(adata);
			}
		}

		zio_pop_transforms(zio);	/* note: may set zio->io_error */

		vdev_stat_update(zio, psize);

		if (zio->io_error) {
			/*
			 * If this I/O is attached to a particular vdev,
			 * generate an error message describing the I/O failure
			 * at the block level.  We ignore these errors if the
			 * device is currently unavailable.
			 */
			if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
				zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

			if ((zio->io_error == EIO || !(zio->io_flags &
			    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
			    zio == lio) {
				/*
				 * For logical I/O requests, tell the SPA to log the
				 * error and generate a logical data ereport.
				 */
				spa_log_error(spa, zio);
				zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
				    0, 0);
			}
		}

		if (zio->io_error && zio == lio) {
			/*
			 * Determine whether zio should be reexecuted.  This will
			 * propagate all the way to the root via zio_notify_parent().
			 */
			ASSERT(vd == NULL && bp != NULL);
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

			if (IO_IS_ALLOCATING(zio) &&
			    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
				if (zio->io_error != ENOSPC)
					zio->io_reexecute |= ZIO_REEXECUTE_NOW;
				else
					zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
			}

			if ((zio->io_type == ZIO_TYPE_READ ||
			    zio->io_type == ZIO_TYPE_FREE) &&
			    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
			    zio->io_error == ENXIO &&
			    spa_load_state(spa) == SPA_LOAD_NONE &&
			    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

			if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

			/*
			 * Here is a possibly good place to attempt to do
			 * either combinatorial reconstruction or error correction
			 * based on checksums.  It also might be a good place
			 * to send out preliminary ereports before we suspend
			 * processing.
			 */
		}

		/*
		 * If there were logical child errors, they apply to us now.
		 * We defer this until now to avoid conflating logical child
		 * errors with errors that happened to the zio itself when
		 * updating vdev stats and reporting FMA events above.
		 */
		zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

		if ((zio->io_error || zio->io_reexecute) &&
		    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
		    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
			zio_dva_unallocate(zio, zio->io_gang_tree, bp);

		zio_gang_tree_free(&zio->io_gang_tree);

		/*
		 * Godfather I/Os should never suspend.
		 */
		if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
		    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
			zio->io_reexecute = 0;

		if (zio->io_reexecute) {
			/*
			 * This is a logical I/O that wants to reexecute.
			 *
			 * Reexecute is top-down.  When an i/o fails, if it's not
			 * the root, it simply notifies its parent and sticks around.
			 * The parent, seeing that it still has children in zio_done(),
			 * does the same.  This percolates all the way up to the root.
			 * The root i/o will reexecute or suspend the entire tree.
			 *
			 * This approach ensures that zio_reexecute() honors
			 * all the original i/o dependency relationships, e.g.
			 * parents not executing until children are ready.
			 */
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

			zio->io_gang_leader = NULL;

			mutex_enter(&zio->io_lock);
			zio->io_state[ZIO_WAIT_DONE] = 1;
			mutex_exit(&zio->io_lock);

			/*
			 * "The Godfather" I/O monitors its children but is
			 * not a true parent to them.  It will track them through
			 * the pipeline but severs its ties whenever they get into
			 * trouble (e.g. suspended).  This allows "The Godfather"
			 * I/O to return status without blocking.
			 */
			zl = NULL;
			for (pio = zio_walk_parents(zio, &zl); pio != NULL;
			    pio = pio_next) {
				/* Remember this parent's link before advancing. */
				zio_link_t *remove_zl = zl;
				pio_next = zio_walk_parents(zio, &zl);

				if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
				    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
					zio_remove_child(pio, zio, remove_zl);
					zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
				}
			}

			if ((pio = zio_unique_parent(zio)) != NULL) {
				/*
				 * We're not a root i/o, so there's nothing to do
				 * but notify our parent.  Don't propagate errors
				 * upward since we haven't permanently failed yet.
				 */
				ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
				zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
				/*
				 * We'd fail again if we reexecuted now, so suspend
				 * until conditions improve (e.g. device comes online).
				 */
				zio_suspend(spa, zio);
			} else {
				/*
				 * Reexecution is potentially a huge amount of work.
				 * Hand it off to the otherwise-unused claim taskq.
				 */
	#if defined(illumos) || !defined(_KERNEL)
				ASSERT(zio->io_tqent.tqent_next == NULL);
	#else
				ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
	#endif
				spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
				    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
				    0, &zio->io_tqent);
			}
			return (ZIO_PIPELINE_STOP);
		}

		ASSERT(zio->io_child_count == 0);
		ASSERT(zio->io_reexecute == 0);
		ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

		/*
		 * Report any checksum errors, since the I/O is complete.
		 */
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, NULL);
			zfs_ereport_free_checksum(zcr);
		}

		/*
		 * It is the responsibility of the done callback to ensure that this
		 * particular zio is no longer discoverable for adoption, and as
		 * such, cannot acquire any new parents.
		 */
		if (zio->io_done)
			zio->io_done(zio);

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/* Detach from every parent and tell each one that we are done. */
		zl = NULL;
		for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
			zio_link_t *remove_zl = zl;
			pio_next = zio_walk_parents(zio, &zl);
			zio_remove_child(pio, zio, remove_zl);
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		}

		/* With a waiter, wake it and keep the zio; otherwise destroy it. */
		if (zio->io_waiter != NULL) {
			mutex_enter(&zio->io_lock);
			zio->io_executor = NULL;
			cv_broadcast(&zio->io_cv);
			mutex_exit(&zio->io_lock);
		} else {
			zio_destroy(zio);
		}

		return (ZIO_PIPELINE_STOP);
	}

	/*
	* ==========================================================================
	* I/O pipeline definition
	* ==========================================================================
	*/
	/*
	 * Table of pipeline stage handlers.
	 * NOTE(review): the order presumably has to match the bit order of the
	 * ZIO_STAGE_* constants, with the leading NULL standing for a stage
	 * that has no handler -- confirm against the stage enum.
	 */
	static zio_pipe_stage_t *zio_pipeline[] = {
		NULL,

		/* block-pointer setup and issue */
		zio_read_bp_init,
		zio_write_bp_init,
		zio_free_bp_init,
		zio_issue_async,

		/* write preparation */
		zio_write_compress,
		zio_checksum_generate,
		zio_nop_write,

		/* dedup */
		zio_ddt_read_start,
		zio_ddt_read_done,
		zio_ddt_write,
		zio_ddt_free,

		/* gang blocks */
		zio_gang_assemble,
		zio_gang_issue,

		/* DVA handling */
		zio_dva_throttle,
		zio_dva_allocate,
		zio_dva_free,
		zio_dva_claim,

		zio_ready,

		/* vdev I/O */
		zio_vdev_io_start,
		zio_vdev_io_done,
		zio_vdev_io_assess,

		zio_checksum_verify,
		zio_done,
	};




	/*
	* Compare two zbookmark_phys_t's to see which we would reach first in a
	* pre-order traversal of the object tree.
	*
	* This is simple in every case aside from the meta-dnode object. For all other
	* objects, we traverse them in order (object 1 before object 2, and so on).
	* However, all of these objects are traversed while traversing object 0, since
	* the data it points to is the list of objects. Thus, we need to convert to a
	* canonical representation so we can compare meta-dnode bookmarks to
	* non-meta-dnode bookmarks.
	*
	* We do this by calculating "equivalents" for each field of the zbookmark.
	* zbookmarks outside of the meta-dnode use their own object and level, and
	* calculate the level 0 equivalent (the first L0 blkid that is contained in the
	* blocks this bookmark refers to) by multiplying their blkid by their span
	* (the number of L0 blocks contained within one block at their level).
	* zbookmarks inside the meta-dnode calculate their object equivalent
	* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
	* level + 1<<31 (any value larger than a level could ever be) for their level.
	* This causes them to always compare before a bookmark in their object
	* equivalent, compare appropriately to bookmarks in other objects, and to
	* compare appropriately to other bookmarks in the meta-dnode.
	*/
	int
	zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
	const zbookmark_phys_t zb1, const zbookmark_phys_t zb2)
	{
	/*
	* These variables represent the "equivalent" values for the zbookmark,
	* after converting zbookmarks inside the meta dnode to their
	* normal-object equivalents.
	*/
	uint64_t zb1obj, zb2obj;
	uint64_t zb1L0, zb2L0;
	uint64_t zb1level, zb2level;

	if (zb1->zb_object == zb2->zb_object &&
	zb1->zb_level == zb2->zb_level &&
	zb1->zb_blkid == zb2->zb_blkid)
	return (0);

	/*
	* BP_SPANB calculates the span in blocks.
	*/
	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
	zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
	zb1L0 = 0;
	zb1level = zb1->zb_level + COMPARE_META_LEVEL;
	} else {
	zb1obj = zb1->zb_object;
	zb1level = zb1->zb_level;
	}

	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
	zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
	zb2L0 = 0;
	zb2level = zb2->zb_level + COMPARE_META_LEVEL;
	} else {
	zb2obj = zb2->zb_object;
	zb2level = zb2->zb_level;
	}

	/* Now that we have a canonical representation, do the comparison. */
	if (zb1obj != zb2obj)
	return (zb1obj < zb2obj ? -1 : 1);
	else if (zb1L0 != zb2L0)
	return (zb1L0 < zb2L0 ? -1 : 1);
	else if (zb1level != zb2level)
	return (zb1level > zb2level ? -1 : 1);
	/*
	* This can (theoretically) happen if the bookmarks have the same object
	* and level, but different blkids, if the block sizes are not the same.
	* There is presently no way to change the indirect block sizes
	*/
	return (0);
	}

	/*
	* This function checks the following: given that last_block is the place that
	* our traversal stopped last time, does that guarantee that we've visited
	* every node under subtree_root? Therefore, we can't just use the raw output
	* of zbookmark_compare. We have to pass in a modified version of
	* subtree_root; by incrementing the block id, and then checking whether
	* last_block is before or equal to that, we can tell whether or not having
	* visited last_block implies that all of subtree_root's children have been
	* visited.
	*/
	boolean_t
	zbookmark_subtree_completed(const dnode_phys_t *dnp,
	const zbookmark_phys_t subtree_root, const zbookmark_phys_t last_block)
	{
	zbookmark_phys_t mod_zb = *subtree_root;
	mod_zb.zb_blkid++;
	ASSERT(last_block->zb_level == 0);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
	return (B_FALSE);

	/*
	* We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
	* data block size in sectors, because that variable is only used if
	* the bookmark refers to a block in the meta-dnode. Since we don't
	* know without examining it what object it refers to, and there's no
	* harm in passing in this value in other cases, we always pass it in.
	*
	* We pass in 0 for the indirect block size shift because zb2 must be
	* level 0. The indirect block size is only used to calculate the span
	* of the bookmark, but since the bookmark must be level 0, the span is
	* always 1, so the math works out.
	*
	* If you make changes to how the zbookmark_compare code works, be sure
	* to make sure that this code still works afterwards.
	*/
	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
	1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
	last_block) <= 0);
	}
	Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
	===================================================================
	--- stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 332524)
	+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 332525)
	@@ -1,1066 +1,1096 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
	* Copyright (c) 2014 Integros [integros.com]
	* Copyright 2017 Joyent, Inc.
	* Copyright (c) 2017 Datto Inc.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_FS_ZFS_H
	#define _SYS_FS_ZFS_H

	#include <sys/types.h>
	#include <sys/ioccom.h>
	#include <sys/time.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Types and constants shared between userland and the kernel.
	*/

	/*
	* Each dataset can be one of the following types. These constants can be
	* combined into masks that can be passed to various functions.
	*/
	typedef enum {
		/* One distinct bit per dataset type so values can be OR-ed. */
		ZFS_TYPE_FILESYSTEM	= 0x01,
		ZFS_TYPE_SNAPSHOT	= 0x02,
		ZFS_TYPE_VOLUME		= 0x04,
		ZFS_TYPE_POOL		= 0x08,
		ZFS_TYPE_BOOKMARK	= 0x10
	} zfs_type_t;

	/*
	* NB: lzc_dataset_type should be updated whenever a new objset type is added,
	* if it represents a real type of a dataset that can be created from userland.
	*/
	typedef enum dmu_objset_type {
		DMU_OST_NONE		= 0,
		DMU_OST_META		= 1,
		DMU_OST_ZFS		= 2,
		DMU_OST_ZVOL		= 3,
		DMU_OST_OTHER		= 4,	/* For testing only! */
		DMU_OST_ANY		= 5,	/* Be careful! */
		DMU_OST_NUMTYPES	= 6	/* count of the types above */
	} dmu_objset_type_t;

	/* Mask of the zfs_type_t bits for filesystems, volumes and snapshots. */
	#define	ZFS_TYPE_DATASET	\
		(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)

	/*
	* All of these include the terminating NUL byte.
	*/
	#define	ZAP_MAXNAMELEN			256
	#define	ZAP_MAXVALUELEN			(8 * 1024)
	#define	ZAP_OLDMAXVALUELEN		1024
	#define	ZFS_MAX_DATASET_NAME_LEN	256

	/*
	* Dataset properties are identified by these constants and must be added to
	* the end of this list to ensure that external consumers are not affected
	* by the change. If you make any changes to this list, be sure to update
	* the property table in usr/src/common/zfs/zfs_prop.c.
	*/
	typedef enum {
	/* Negative sentinel values that sort below the first real property. */
	ZPROP_CONT = -2,
	ZPROP_INVAL = -1,
	/* Real properties are numbered consecutively starting at 0. */
	ZFS_PROP_TYPE = 0,
	ZFS_PROP_CREATION,
	ZFS_PROP_USED,
	ZFS_PROP_AVAILABLE,
	ZFS_PROP_REFERENCED,
	ZFS_PROP_COMPRESSRATIO,
	ZFS_PROP_MOUNTED,
	ZFS_PROP_ORIGIN,
	ZFS_PROP_QUOTA,
	ZFS_PROP_RESERVATION,
	ZFS_PROP_VOLSIZE,
	ZFS_PROP_VOLBLOCKSIZE,
	ZFS_PROP_RECORDSIZE,
	ZFS_PROP_MOUNTPOINT,
	ZFS_PROP_SHARENFS,
	ZFS_PROP_CHECKSUM,
	ZFS_PROP_COMPRESSION,
	ZFS_PROP_ATIME,
	ZFS_PROP_DEVICES,
	ZFS_PROP_EXEC,
	ZFS_PROP_SETUID,
	ZFS_PROP_READONLY,
	ZFS_PROP_ZONED,
	ZFS_PROP_SNAPDIR,
	ZFS_PROP_ACLMODE,
	ZFS_PROP_ACLINHERIT,
	ZFS_PROP_CREATETXG, /* not exposed to the user */
	ZFS_PROP_NAME, /* not exposed to the user */
	ZFS_PROP_CANMOUNT,
	ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
	ZFS_PROP_XATTR,
	ZFS_PROP_NUMCLONES, /* not exposed to the user */
	ZFS_PROP_COPIES,
	ZFS_PROP_VERSION,
	ZFS_PROP_UTF8ONLY,
	ZFS_PROP_NORMALIZE,
	ZFS_PROP_CASE,
	ZFS_PROP_VSCAN,
	ZFS_PROP_NBMAND,
	ZFS_PROP_SHARESMB,
	ZFS_PROP_REFQUOTA,
	ZFS_PROP_REFRESERVATION,
	ZFS_PROP_GUID,
	ZFS_PROP_PRIMARYCACHE,
	ZFS_PROP_SECONDARYCACHE,
	ZFS_PROP_USEDSNAP,
	ZFS_PROP_USEDDS,
	ZFS_PROP_USEDCHILD,
	ZFS_PROP_USEDREFRESERV,
	ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
	ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
	ZFS_PROP_DEFER_DESTROY,
	ZFS_PROP_USERREFS,
	ZFS_PROP_LOGBIAS,
	ZFS_PROP_UNIQUE, /* not exposed to the user */
	ZFS_PROP_OBJSETID, /* not exposed to the user */
	ZFS_PROP_DEDUP,
	ZFS_PROP_MLSLABEL,
	ZFS_PROP_SYNC,
	ZFS_PROP_REFRATIO,
	ZFS_PROP_WRITTEN,
	ZFS_PROP_CLONES,
	ZFS_PROP_LOGICALUSED,
	ZFS_PROP_LOGICALREFERENCED,
	ZFS_PROP_INCONSISTENT, /* not exposed to the user */
	ZFS_PROP_VOLMODE,
	ZFS_PROP_FILESYSTEM_LIMIT,
	ZFS_PROP_SNAPSHOT_LIMIT,
	ZFS_PROP_FILESYSTEM_COUNT,
	ZFS_PROP_SNAPSHOT_COUNT,
	ZFS_PROP_REDUNDANT_METADATA,
	ZFS_PROP_PREV_SNAP,
	ZFS_PROP_RECEIVE_RESUME_TOKEN,
	+ ZFS_PROP_REMAPTXG, /* not exposed to the user */
	/* Follows the last property, so it equals the number of properties. */
	ZFS_NUM_PROPS
	} zfs_prop_t;

	/*
	* Indices of the user/group space properties. ZFS_NUM_USERQUOTA_PROPS is
	* the entry count and sizes zfs_userquota_prop_prefixes[].
	*/
	typedef enum {
	ZFS_PROP_USERUSED = 0,
	ZFS_PROP_USERQUOTA = 1,
	ZFS_PROP_GROUPUSED = 2,
	ZFS_PROP_GROUPQUOTA = 3,
	ZFS_NUM_USERQUOTA_PROPS = 4
	} zfs_userquota_prop_t;

	extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];

	/*
	* Pool properties are identified by these constants and must be added to the
	* end of this list to ensure that external consumers are not affected
	* by the change. If you make any changes to this list, be sure to update
	* the property table in usr/src/common/zfs/zpool_prop.c.
	*/
	/* Values are spelled out so that the numbering is visible at a glance. */
	typedef enum {
	ZPOOL_PROP_INVAL = -1,
	ZPOOL_PROP_NAME = 0,
	ZPOOL_PROP_SIZE = 1,
	ZPOOL_PROP_CAPACITY = 2,
	ZPOOL_PROP_ALTROOT = 3,
	ZPOOL_PROP_HEALTH = 4,
	ZPOOL_PROP_GUID = 5,
	ZPOOL_PROP_VERSION = 6,
	ZPOOL_PROP_BOOTFS = 7,
	ZPOOL_PROP_DELEGATION = 8,
	ZPOOL_PROP_AUTOREPLACE = 9,
	ZPOOL_PROP_CACHEFILE = 10,
	ZPOOL_PROP_FAILUREMODE = 11,
	ZPOOL_PROP_LISTSNAPS = 12,
	ZPOOL_PROP_AUTOEXPAND = 13,
	ZPOOL_PROP_DEDUPDITTO = 14,
	ZPOOL_PROP_DEDUPRATIO = 15,
	ZPOOL_PROP_FREE = 16,
	ZPOOL_PROP_ALLOCATED = 17,
	ZPOOL_PROP_READONLY = 18,
	ZPOOL_PROP_COMMENT = 19,
	ZPOOL_PROP_EXPANDSZ = 20,
	ZPOOL_PROP_FREEING = 21,
	ZPOOL_PROP_FRAGMENTATION = 22,
	ZPOOL_PROP_LEAKED = 23,
	ZPOOL_PROP_MAXBLOCKSIZE = 24,
	ZPOOL_PROP_BOOTSIZE = 25,
	ZPOOL_NUM_PROPS = 26 /* count of the properties above */
	} zpool_prop_t;

	/* Small enough to not hog a whole line of printout in zpool(1M). */
	#define ZPROP_MAX_COMMENT 32

	#define ZPROP_VALUE "value"
	#define ZPROP_SOURCE "source"

	/* Property sources. Each one occupies its own bit. */
	typedef enum {
	ZPROP_SRC_NONE = 1 << 0,
	ZPROP_SRC_DEFAULT = 1 << 1,
	ZPROP_SRC_TEMPORARY = 1 << 2,
	ZPROP_SRC_LOCAL = 1 << 3,
	ZPROP_SRC_INHERITED = 1 << 4,
	ZPROP_SRC_RECEIVED = 1 << 5
	} zprop_source_t;

	/* All six source bits set. */
	#define ZPROP_SRC_ALL 0x3f

	#define ZPROP_SOURCE_VAL_RECVD "$recvd"
	#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
	/*
	* Dataset flag implemented as a special entry in the props zap object
	* indicating that the dataset has received properties on or after
	* SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
	* just as it did in earlier versions, and thereafter, local properties are
	* preserved.
	*/
	#define ZPROP_HAS_RECVD "$hasrecvd"

	typedef enum {
	/* failure to clear existing props */
	ZPROP_ERR_NOCLEAR = 1 << 0,
	/* failure to restore props on error */
	ZPROP_ERR_NORESTORE = 1 << 1
	} zprop_errflags_t;

	/*
	* Callback type: takes an int (presumably a property id -- confirm against
	* the callers) and an opaque caller-supplied pointer, and returns an int.
	*/
	typedef int (*zprop_func)(int, void *);

	/*
	* Properties to be set on the root file system of a new pool
	* are stuffed into their own nvlist, which is then included in
	* the properties nvlist with the pool properties.
	*/
	#define ZPOOL_ROOTFS_PROPS "root-props-nvl"

	/*
	* Length of 'written@' and 'written#'
	*/
	#define ZFS_WRITTEN_PROP_PREFIX_LEN 8

	/*
	* Dataset property functions shared between libzfs and kernel.
	*
	* zfs_prop_index_to_string() and zfs_prop_string_to_index() presumably
	* hand their result back through the trailing pointer argument, with the
	* int return acting as a status -- confirm in zfs_prop.c.
	*/
	const char *zfs_prop_default_string(zfs_prop_t);
	uint64_t zfs_prop_default_numeric(zfs_prop_t);
	boolean_t zfs_prop_readonly(zfs_prop_t);
	boolean_t zfs_prop_visible(zfs_prop_t prop);
	boolean_t zfs_prop_inheritable(zfs_prop_t);
	boolean_t zfs_prop_setonce(zfs_prop_t);
	const char *zfs_prop_to_name(zfs_prop_t);
	zfs_prop_t zfs_name_to_prop(const char *);
	boolean_t zfs_prop_user(const char *);
	boolean_t zfs_prop_userquota(const char *);
	int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
	int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
	uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
	boolean_t zfs_prop_valid_for_type(int, zfs_type_t);

	/*
	* Pool property functions shared between libzfs and kernel.
	*
	* zpool_prop_index_to_string() and zpool_prop_string_to_index() presumably
	* hand their result back through the trailing pointer argument, with the
	* int return acting as a status -- confirm in zpool_prop.c.
	*/
	zpool_prop_t zpool_name_to_prop(const char *);
	const char *zpool_prop_to_name(zpool_prop_t);
	const char *zpool_prop_default_string(zpool_prop_t);
	uint64_t zpool_prop_default_numeric(zpool_prop_t);
	boolean_t zpool_prop_readonly(zpool_prop_t);
	boolean_t zpool_prop_feature(const char *);
	boolean_t zpool_prop_unsupported(const char *name);
	int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
	int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
	uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);

	/*
	* Definitions for the Delegation.
	*/
	/*
	* Who-type codes. Each is an ASCII letter, and every *_SETS variant uses
	* the upper-case form of its base type's letter.
	*/
	typedef enum {
	ZFS_DELEG_WHO_UNKNOWN = '\0',
	ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U',
	ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G',
	ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E',
	ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C',
	ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S'
	} zfs_deleg_who_type_t;

	/* LOCALDESCENDENT has the same value as LOCAL and DESCENDENT OR-ed together. */
	typedef enum {
	ZFS_DELEG_NONE = 0,
	ZFS_DELEG_PERM_LOCAL = 1 << 0,
	ZFS_DELEG_PERM_DESCENDENT = 1 << 1,
	ZFS_DELEG_PERM_LOCALDESCENDENT =
	ZFS_DELEG_PERM_LOCAL | ZFS_DELEG_PERM_DESCENDENT,
	ZFS_DELEG_PERM_CREATE = 1 << 2
	} zfs_deleg_inherit_t;

	#define ZFS_DELEG_PERM_UID "uid"
	#define ZFS_DELEG_PERM_GID "gid"
	#define ZFS_DELEG_PERM_GROUPS "groups"

	#define ZFS_MLSLABEL_DEFAULT "none"

	#define ZFS_SMB_ACL_SRC "src"
	#define ZFS_SMB_ACL_TARGET "target"

	/* Numbered consecutively from 0. */
	typedef enum {
	ZFS_CANMOUNT_OFF = 0,
	ZFS_CANMOUNT_ON, /* 1 */
	ZFS_CANMOUNT_NOAUTO /* 2 */
	} zfs_canmount_type_t;

	/* Numbered consecutively from 0. */
	typedef enum {
	ZFS_LOGBIAS_LATENCY = 0,
	ZFS_LOGBIAS_THROUGHPUT /* 1 */
	} zfs_logbias_op_t;

	/* Share/unshare operations, numbered consecutively from 0. */
	typedef enum zfs_share_op {
	ZFS_SHARE_NFS = 0,
	ZFS_UNSHARE_NFS, /* 1 */
	ZFS_SHARE_SMB, /* 2 */
	ZFS_UNSHARE_SMB /* 3 */
	} zfs_share_op_t;

	/* SMB ACL operations, with the numbering written out. */
	typedef enum zfs_smb_acl_op {
	ZFS_SMB_ACL_ADD = 0,
	ZFS_SMB_ACL_REMOVE = 1,
	ZFS_SMB_ACL_RENAME = 2,
	ZFS_SMB_ACL_PURGE = 3
	} zfs_smb_acl_op_t;

	/* Numbered consecutively from 0. */
	typedef enum zfs_cache_type {
	ZFS_CACHE_NONE = 0,
	ZFS_CACHE_METADATA, /* 1 */
	ZFS_CACHE_ALL /* 2 */
	} zfs_cache_type_t;

	/* Numbered consecutively from 0. */
	typedef enum {
	ZFS_SYNC_STANDARD = 0,
	ZFS_SYNC_ALWAYS, /* 1 */
	ZFS_SYNC_DISABLED /* 2 */
	} zfs_sync_type_t;

	/* Numbered consecutively from 0. */
	typedef enum {
	ZFS_VOLMODE_DEFAULT = 0,
	ZFS_VOLMODE_GEOM, /* 1 */
	ZFS_VOLMODE_DEV, /* 2 */
	ZFS_VOLMODE_NONE /* 3 */
	} zfs_volmode_t;

	/* Numbering written out explicitly. */
	typedef enum {
	ZFS_REDUNDANT_METADATA_ALL = 0,
	ZFS_REDUNDANT_METADATA_MOST = 1
	} zfs_redundant_metadata_type_t;

	/*
	* On-disk version number.
	*/
	#define SPA_VERSION_1 1ULL
	#define SPA_VERSION_2 2ULL
	#define SPA_VERSION_3 3ULL
	#define SPA_VERSION_4 4ULL
	#define SPA_VERSION_5 5ULL
	#define SPA_VERSION_6 6ULL
	#define SPA_VERSION_7 7ULL
	#define SPA_VERSION_8 8ULL
	#define SPA_VERSION_9 9ULL
	#define SPA_VERSION_10 10ULL
	#define SPA_VERSION_11 11ULL
	#define SPA_VERSION_12 12ULL
	#define SPA_VERSION_13 13ULL
	#define SPA_VERSION_14 14ULL
	#define SPA_VERSION_15 15ULL
	#define SPA_VERSION_16 16ULL
	#define SPA_VERSION_17 17ULL
	#define SPA_VERSION_18 18ULL
	#define SPA_VERSION_19 19ULL
	#define SPA_VERSION_20 20ULL
	#define SPA_VERSION_21 21ULL
	#define SPA_VERSION_22 22ULL
	#define SPA_VERSION_23 23ULL
	#define SPA_VERSION_24 24ULL
	#define SPA_VERSION_25 25ULL
	#define SPA_VERSION_26 26ULL
	#define SPA_VERSION_27 27ULL
	#define SPA_VERSION_28 28ULL
	#define SPA_VERSION_5000 5000ULL

	/*
	* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
	* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
	* and do the appropriate changes. Also bump the version number in
	* usr/src/grub/capability.
	*/
	#define SPA_VERSION SPA_VERSION_5000
	#define SPA_VERSION_STRING "5000"

	/*
	* Symbolic names for the changes that caused a SPA_VERSION switch.
	* Used in the code when checking for presence or absence of a feature.
	* Feel free to define multiple symbolic names for each version if there
	* were multiple changes to on-disk structures during that version.
	*
	* NOTE: When checking the current SPA_VERSION in your code, be sure
	* to use spa_version() since it reports the version of the
	* last synced uberblock. Checking the in-flight version can
	* be dangerous in some cases.
	*/
	#define SPA_VERSION_INITIAL SPA_VERSION_1
	#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
	#define SPA_VERSION_SPARES SPA_VERSION_3
	#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
	#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
	#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
	#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
	#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
	#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
	#define SPA_VERSION_BOOTFS SPA_VERSION_6
	#define SPA_VERSION_SLOGS SPA_VERSION_7
	#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
	#define SPA_VERSION_FUID SPA_VERSION_9
	#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
	#define SPA_VERSION_REFQUOTA SPA_VERSION_9
	#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
	#define SPA_VERSION_L2CACHE SPA_VERSION_10
	#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
	#define SPA_VERSION_ORIGIN SPA_VERSION_11
	#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
	#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
	#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
	#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
	#define SPA_VERSION_USERSPACE SPA_VERSION_15
	#define SPA_VERSION_STMF_PROP SPA_VERSION_16
	#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
	#define SPA_VERSION_USERREFS SPA_VERSION_18
	#define SPA_VERSION_HOLES SPA_VERSION_19
	#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
	#define SPA_VERSION_DEDUP SPA_VERSION_21
	#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
	#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
	#define SPA_VERSION_SA SPA_VERSION_24
	#define SPA_VERSION_SCAN SPA_VERSION_25
	#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
	#define SPA_VERSION_DEADLISTS SPA_VERSION_26
	#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
	#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
	#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
	#define SPA_VERSION_FEATURES SPA_VERSION_5000

	/*
	* True if v is a supported on-disk version: either a legacy version in
	* SPA_VERSION_INITIAL..SPA_VERSION_BEFORE_FEATURES, or a feature-flags
	* version in SPA_VERSION_FEATURES..SPA_VERSION. Note that v is evaluated
	* more than once.
	*/
	#define SPA_VERSION_IS_SUPPORTED(v) \
	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))

	/*
	* ZPL version - rev'd whenever an incompatible on-disk format change
	* occurs. This is independent of SPA/DMU/ZAP versioning. You must
	* also update the version_table[] and help message in zfs_prop.c.
	*
	* When changing, be sure to teach GRUB how to read the new format!
	* See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
	*/
	#define ZPL_VERSION_1 1ULL
	#define ZPL_VERSION_2 2ULL
	#define ZPL_VERSION_3 3ULL
	#define ZPL_VERSION_4 4ULL
	#define ZPL_VERSION_5 5ULL
	#define ZPL_VERSION ZPL_VERSION_5
	#define ZPL_VERSION_STRING "5"

	#define ZPL_VERSION_INITIAL ZPL_VERSION_1
	#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
	#define ZPL_VERSION_FUID ZPL_VERSION_3
	#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
	#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
	#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
	#define ZPL_VERSION_SA ZPL_VERSION_5

	/* Rewind request bits (one bit per request) and the masks built from them */
	#define ZPOOL_NO_REWIND 0x01 /* No policy - default behavior */
	#define ZPOOL_NEVER_REWIND 0x02 /* Do not search for best txg or rewind */
	#define ZPOOL_TRY_REWIND 0x04 /* Search for best txg, but do not rewind */
	#define ZPOOL_DO_REWIND 0x08 /* Rewind to best txg w/in deferred frees */
	#define ZPOOL_EXTREME_REWIND 0x10 /* Allow extreme measures to find best txg */
	#define ZPOOL_REWIND_MASK 0x1c /* TRY | DO | EXTREME: all the rewind bits */
	#define ZPOOL_REWIND_POLICIES 0x1f /* All five policy bits */

	/*
	* Rewind policy. zrp_request presumably carries the ZPOOL_*_REWIND request
	* bits -- confirm against the code that fills it in.
	*/
	typedef struct zpool_rewind_policy {
	uint32_t zrp_request; /* rewind behavior requested */
	uint64_t zrp_maxmeta; /* max acceptable meta-data errors */
	uint64_t zrp_maxdata; /* max acceptable data errors */
	uint64_t zrp_txg; /* specific txg to load */
	} zpool_rewind_policy_t;

	/*
	* The following are configuration names used in the nvlist describing a pool's
	- * configuration.
	+ * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
	+ * (e.g. "org.open-zfs:") to avoid conflicting names being developed
	+ * independently.
	*/
	#define ZPOOL_CONFIG_VERSION "version"
	#define ZPOOL_CONFIG_POOL_NAME "name"
	#define ZPOOL_CONFIG_POOL_STATE "state"
	#define ZPOOL_CONFIG_POOL_TXG "txg"
	#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
	#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
	#define ZPOOL_CONFIG_TOP_GUID "top_guid"
	#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
	#define ZPOOL_CONFIG_TYPE "type"
	#define ZPOOL_CONFIG_CHILDREN "children"
	#define ZPOOL_CONFIG_ID "id"
	#define ZPOOL_CONFIG_GUID "guid"
	+#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
	+#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
	+#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
	#define ZPOOL_CONFIG_PATH "path"
	#define ZPOOL_CONFIG_DEVID "devid"
	#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
	#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
	#define ZPOOL_CONFIG_ASHIFT "ashift"
	#define ZPOOL_CONFIG_ASIZE "asize"
	#define ZPOOL_CONFIG_DTL "DTL"
	#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
	+#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
	#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
	+#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
	#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
	#define ZPOOL_CONFIG_ERRCOUNT "error_count"
	#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
	#define ZPOOL_CONFIG_SPARES "spares"
	#define ZPOOL_CONFIG_IS_SPARE "is_spare"
	#define ZPOOL_CONFIG_NPARITY "nparity"
	#define ZPOOL_CONFIG_HOSTID "hostid"
	#define ZPOOL_CONFIG_HOSTNAME "hostname"
	#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
	#define ZPOOL_CONFIG_UNSPARE "unspare"
	#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
	#define ZPOOL_CONFIG_IS_LOG "is_log"
	#define ZPOOL_CONFIG_L2CACHE "l2cache"
	#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
	#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
	#define ZPOOL_CONFIG_IS_HOLE "is_hole"
	#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
	#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
	#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
	#define ZPOOL_CONFIG_SPLIT "splitcfg"
	#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
	#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
	#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
	#define ZPOOL_CONFIG_REMOVING "removing"
	#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
	#define ZPOOL_CONFIG_COMMENT "comment"
	#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
	#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
	#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
	#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
	#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
	#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
	#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
	#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
	#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
	#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
	#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
	#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
	#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
	#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
	/*
	* The persistent vdev state is stored as separate values rather than a single
	* 'vdev_state' entry. This is because a device can be in multiple states, such
	* as offline and degraded.
	*/
	#define ZPOOL_CONFIG_OFFLINE "offline"
	#define ZPOOL_CONFIG_FAULTED "faulted"
	#define ZPOOL_CONFIG_DEGRADED "degraded"
	#define ZPOOL_CONFIG_REMOVED "removed"
	#define ZPOOL_CONFIG_FRU "fru"
	#define ZPOOL_CONFIG_AUX_STATE "aux_state"

	/* Rewind policy parameters */
	#define ZPOOL_REWIND_POLICY "rewind-policy"
	#define ZPOOL_REWIND_REQUEST "rewind-request"
	#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg"
	#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh"
	#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh"

	/* Rewind data discovered */
	#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
	#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
	#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"

	#define VDEV_TYPE_ROOT "root"
	#define VDEV_TYPE_MIRROR "mirror"
	#define VDEV_TYPE_REPLACING "replacing"
	#define VDEV_TYPE_RAIDZ "raidz"
	#define VDEV_TYPE_DISK "disk"
	#define VDEV_TYPE_FILE "file"
	#define VDEV_TYPE_MISSING "missing"
	#define VDEV_TYPE_HOLE "hole"
	#define VDEV_TYPE_SPARE "spare"
	#define VDEV_TYPE_LOG "log"
	#define VDEV_TYPE_L2CACHE "l2cache"
	+#define VDEV_TYPE_INDIRECT "indirect"

	+/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
	+#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
	+ "com.delphix:indirect_obsolete_sm"
	+#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
	+ "com.delphix:obsolete_counts_are_precise"
	+
	/*
	* This is needed in userland to report the minimum necessary device size.
	*
	* Note that the zfs test suite uses 64MB vdevs.
	*/
	#define SPA_MINDEVSIZE (64ULL << 20)

	/*
	* Set if the fragmentation has not yet been calculated. This can happen
	* because the space maps have not been upgraded or the histogram feature
	* is not enabled.
	*/
	#define ZFS_FRAG_INVALID UINT64_MAX

	/*
	* The location of the pool configuration repository, shared between kernel and
	* userland.
	*/
	#define ZPOOL_CACHE "/boot/zfs/zpool.cache"

	/*
	* vdev states are ordered from least to most healthy.
	* A vdev that's CANT_OPEN or below is considered unusable.
	*/
	/* Numbering written out; a larger value means a healthier vdev. */
	typedef enum vdev_state {
	VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
	VDEV_STATE_CLOSED = 1, /* Not currently open */
	VDEV_STATE_OFFLINE = 2, /* Not allowed to open */
	VDEV_STATE_REMOVED = 3, /* Explicitly removed from system */
	VDEV_STATE_CANT_OPEN = 4, /* Tried to open, but failed */
	VDEV_STATE_FAULTED = 5, /* External request to fault device */
	VDEV_STATE_DEGRADED = 6, /* Replicated vdev with unhealthy kids */
	VDEV_STATE_HEALTHY = 7 /* Presumed good */
	} vdev_state_t;

	#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY

	/*
	* vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
	* of the vdev stats structure uses these constants to distinguish why.
	*/
	typedef enum vdev_aux {
	VDEV_AUX_NONE, /* no error */
	VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
	VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
	VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
	VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
	VDEV_AUX_TOO_SMALL, /* vdev size is too small */
	VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
	VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
	VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
	VDEV_AUX_UNSUP_FEAT, /* unsupported features */
	VDEV_AUX_SPARED, /* hot spare used in another pool */
	VDEV_AUX_ERR_EXCEEDED, /* too many errors */
	VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
	VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
	VDEV_AUX_EXTERNAL, /* external diagnosis */
	VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
	VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */
	} vdev_aux_t;

	/*
	* pool state. The following states are written to disk as part of the normal
	* SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
	* states are software abstractions used at various levels to communicate
	* pool state.
	*/
	/* Numbering written out explicitly. */
	typedef enum pool_state {
	POOL_STATE_ACTIVE = 0, /* In active use */
	POOL_STATE_EXPORTED = 1, /* Explicitly exported */
	POOL_STATE_DESTROYED = 2, /* Explicitly destroyed */
	POOL_STATE_SPARE = 3, /* Reserved for hot spare use */
	POOL_STATE_L2CACHE = 4, /* Level 2 ARC device */
	POOL_STATE_UNINITIALIZED = 5, /* Internal spa_t state */
	POOL_STATE_UNAVAIL = 6, /* Internal libzfs state */
	POOL_STATE_POTENTIALLY_ACTIVE = 7 /* Internal libzfs state */
	} pool_state_t;

	/*
	* Scan Functions.
	*/
	/* POOL_SCAN_FUNCS is the number of entries that precede it. */
	typedef enum pool_scan_func {
	POOL_SCAN_NONE = 0,
	POOL_SCAN_SCRUB = 1,
	POOL_SCAN_RESILVER = 2,
	POOL_SCAN_FUNCS = 3
	} pool_scan_func_t;

	/*
	* Used to control scrub pause and resume.
	*/
	/* POOL_SCRUB_FLAGS_END is the number of commands that precede it. */
	typedef enum pool_scrub_cmd {
	POOL_SCRUB_NORMAL = 0,
	POOL_SCRUB_PAUSE = 1,
	POOL_SCRUB_FLAGS_END = 2
	} pool_scrub_cmd_t;


	/*
	* ZIO types. Needed to interpret vdev statistics below.
	*/
	/* ZIO_TYPES is the number of I/O types that precede it. */
	typedef enum zio_type {
	ZIO_TYPE_NULL = 0,
	ZIO_TYPE_READ = 1,
	ZIO_TYPE_WRITE = 2,
	ZIO_TYPE_FREE = 3,
	ZIO_TYPE_CLAIM = 4,
	ZIO_TYPE_IOCTL = 5,
	ZIO_TYPES = 6
	} zio_type_t;

	/*
	* Pool statistics. Note: all fields should be 64-bit because this
	* is passed between kernel and userland as an nvlist uint64 array.
	*/
	typedef struct pool_scan_stat {
	/* values stored on disk */
	uint64_t pss_func; /* pool_scan_func_t */
	uint64_t pss_state; /* dsl_scan_state_t */
	uint64_t pss_start_time; /* scan start time */
	uint64_t pss_end_time; /* scan end time */
	uint64_t pss_to_examine; /* total bytes to scan */
	uint64_t pss_examined; /* total examined bytes */
	uint64_t pss_to_process; /* total bytes to process */
	uint64_t pss_processed; /* total processed bytes */
	uint64_t pss_errors; /* scan errors */

	/* values not stored on disk */
	uint64_t pss_pass_exam; /* examined bytes per scan pass */
	uint64_t pss_pass_start; /* start time of a scan pass */
	uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */
	/* cumulative time scrub spent paused, needed for rate calculation */
	uint64_t pss_pass_scrub_spent_paused;
	} pool_scan_stat_t;

	+/*
	+ * Pool removal statistics. Every member is a uint64_t; presumably this
	+ * is exported as a uint64 array under ZPOOL_CONFIG_REMOVAL_STATS, like
	+ * the other *_stat_t structures in this header -- confirm.
	+ */
	+typedef struct pool_removal_stat {
	+ uint64_t prs_state; /* dsl_scan_state_t */
	+ uint64_t prs_removing_vdev;
	+ uint64_t prs_start_time;
	+ uint64_t prs_end_time;
	+ uint64_t prs_to_copy; /* bytes that need to be copied */
	+ uint64_t prs_copied; /* bytes copied so far */
	+ /*
	+ * bytes of memory used for indirect mappings.
	+ * This includes all removed vdevs.
	+ */
	+ uint64_t prs_mapping_memory;
	+} pool_removal_stat_t;
	+
	/* DSS_NUM_STATES is the number of states that precede it. */
	typedef enum dsl_scan_state {
	DSS_NONE = 0,
	DSS_SCANNING = 1,
	DSS_FINISHED = 2,
	DSS_CANCELED = 3,
	DSS_NUM_STATES = 4
	} dsl_scan_state_t;


	/*
	* Vdev statistics. Note: all fields should be 64-bit because this
	* is passed between kernel and userland as an nvlist uint64 array.
	*/
	typedef struct vdev_stat {
	/*
	* NOTE(review): the only member not declared uint64_t; presumably
	* hrtime_t is 64 bits wide so the uint64-array layout holds -- confirm.
	*/
	hrtime_t vs_timestamp; /* time since vdev load */
	uint64_t vs_state; /* vdev state */
	uint64_t vs_aux; /* see vdev_aux_t */
	uint64_t vs_alloc; /* space allocated */
	uint64_t vs_space; /* total capacity */
	uint64_t vs_dspace; /* deflated capacity */
	uint64_t vs_rsize; /* replaceable dev size */
	uint64_t vs_esize; /* expandable dev size */
	/* The two arrays below hold one slot per zio_type_t value. */
	uint64_t vs_ops[ZIO_TYPES]; /* operation count */
	uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
	uint64_t vs_read_errors; /* read errors */
	uint64_t vs_write_errors; /* write errors */
	uint64_t vs_checksum_errors; /* checksum errors */
	uint64_t vs_self_healed; /* self-healed bytes */
	uint64_t vs_scan_removing; /* removing? */
	uint64_t vs_scan_processed; /* scan processed bytes */
	uint64_t vs_configured_ashift; /* TLV vdev_ashift */
	uint64_t vs_logical_ashift; /* vdev_logical_ashift */
	uint64_t vs_physical_ashift; /* vdev_physical_ashift */
	uint64_t vs_fragmentation; /* device fragmentation */
	} vdev_stat_t;
	/*
	* True when an array of uint64_t_field_count 64-bit words is long enough
	* to hold vdev_stat_t member `field` in full. The count argument is
	* parenthesized so that an expression such as `a + b` is scaled as a whole.
	*/
	#define VDEV_STAT_VALID(field, uint64_t_field_count) \
	(((uint64_t_field_count) * sizeof(uint64_t)) >= \
	(offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))

	/*
	* DDT statistics. Note: all fields should be 64-bit because this
	* is passed between kernel and userland as an nvlist uint64 array.
	*/
	typedef struct ddt_object {
	uint64_t ddo_count; /* number of elements in ddt */
	uint64_t ddo_dspace; /* size of ddt on disk */
	uint64_t ddo_mspace; /* size of ddt in-core */
	} ddt_object_t;

	typedef struct ddt_stat {
	uint64_t dds_blocks; /* blocks */
	uint64_t dds_lsize; /* logical size */
	uint64_t dds_psize; /* physical size */
	uint64_t dds_dsize; /* deflated allocated size */
	uint64_t dds_ref_blocks; /* referenced blocks */
	uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
	uint64_t dds_ref_psize; /* referenced psize * refcnt */
	uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
	} ddt_stat_t;

	typedef struct ddt_histogram {
	ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
	} ddt_histogram_t;

	#define ZVOL_DRIVER "zvol"
	#define ZFS_DRIVER "zfs"
	#define ZFS_DEV_NAME "zfs"
	#define ZFS_DEV "/dev/" ZFS_DEV_NAME
	#define ZFS_DISK_ROOT "/dev/dsk"
	#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/"
	#define ZFS_RDISK_ROOT "/dev/rdsk"
	#define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/"

	/* general zvol path */
	#define ZVOL_DIR "/dev/zvol"
	/* expansion */
	#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
	/* for dump and swap */
	#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/"
	#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/"

	#define ZVOL_PROP_NAME "name"
	#define ZVOL_DEFAULT_BLOCKSIZE 8192

	/*
	* /dev/zfs ioctl numbers.
	*/
	typedef enum zfs_ioc {
	ZFS_IOC_FIRST = 0,
	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* shares the value of ZFS_IOC_FIRST */
	ZFS_IOC_POOL_DESTROY,
	ZFS_IOC_POOL_IMPORT,
	ZFS_IOC_POOL_EXPORT,
	ZFS_IOC_POOL_CONFIGS,
	ZFS_IOC_POOL_STATS,
	ZFS_IOC_POOL_TRYIMPORT,
	ZFS_IOC_POOL_SCAN,
	ZFS_IOC_POOL_FREEZE,
	ZFS_IOC_POOL_UPGRADE,
	ZFS_IOC_POOL_GET_HISTORY,
	ZFS_IOC_VDEV_ADD,
	ZFS_IOC_VDEV_REMOVE,
	ZFS_IOC_VDEV_SET_STATE,
	ZFS_IOC_VDEV_ATTACH,
	ZFS_IOC_VDEV_DETACH,
	ZFS_IOC_VDEV_SETPATH,
	ZFS_IOC_VDEV_SETFRU,
	ZFS_IOC_OBJSET_STATS,
	ZFS_IOC_OBJSET_ZPLPROPS,
	ZFS_IOC_DATASET_LIST_NEXT,
	ZFS_IOC_SNAPSHOT_LIST_NEXT,
	ZFS_IOC_SET_PROP,
	ZFS_IOC_CREATE,
	ZFS_IOC_DESTROY,
	ZFS_IOC_ROLLBACK,
	ZFS_IOC_RENAME,
	ZFS_IOC_RECV,
	ZFS_IOC_SEND,
	ZFS_IOC_INJECT_FAULT,
	ZFS_IOC_CLEAR_FAULT,
	ZFS_IOC_INJECT_LIST_NEXT,
	ZFS_IOC_ERROR_LOG,
	ZFS_IOC_CLEAR,
	ZFS_IOC_PROMOTE,
	ZFS_IOC_DESTROY_SNAPS,
	ZFS_IOC_SNAPSHOT,
	ZFS_IOC_DSOBJ_TO_DSNAME,
	ZFS_IOC_OBJ_TO_PATH,
	ZFS_IOC_POOL_SET_PROPS,
	ZFS_IOC_POOL_GET_PROPS,
	ZFS_IOC_SET_FSACL,
	ZFS_IOC_GET_FSACL,
	ZFS_IOC_SHARE,
	ZFS_IOC_INHERIT_PROP,
	ZFS_IOC_SMB_ACL,
	ZFS_IOC_USERSPACE_ONE,
	ZFS_IOC_USERSPACE_MANY,
	ZFS_IOC_USERSPACE_UPGRADE,
	ZFS_IOC_HOLD,
	ZFS_IOC_RELEASE,
	ZFS_IOC_GET_HOLDS,
	ZFS_IOC_OBJSET_RECVD_PROPS,
	ZFS_IOC_VDEV_SPLIT,
	ZFS_IOC_NEXT_OBJ,
	ZFS_IOC_DIFF,
	ZFS_IOC_TMP_SNAPSHOT,
	ZFS_IOC_OBJ_TO_STATS,
	ZFS_IOC_JAIL,
	ZFS_IOC_UNJAIL,
	ZFS_IOC_POOL_REGUID,
	ZFS_IOC_SPACE_WRITTEN,
	ZFS_IOC_SPACE_SNAPS,
	ZFS_IOC_SEND_PROGRESS,
	ZFS_IOC_POOL_REOPEN,
	ZFS_IOC_LOG_HISTORY,
	ZFS_IOC_SEND_NEW,
	ZFS_IOC_SEND_SPACE,
	ZFS_IOC_CLONE,
	ZFS_IOC_BOOKMARK,
	ZFS_IOC_GET_BOOKMARKS,
	ZFS_IOC_DESTROY_BOOKMARKS,
	/*
	* Compiled in only when __FreeBSD__ is defined. The enumerators after it
	* are implicitly numbered, so their values are one higher in that case.
	*/
	#ifdef __FreeBSD__
	ZFS_IOC_NEXTBOOT,
	#endif
	ZFS_IOC_CHANNEL_PROGRAM,
	+ ZFS_IOC_REMAP,
	ZFS_IOC_LAST /* one past the highest ioctl number; must stay the final enumerator */
	} zfs_ioc_t;

	/*
	* Internal SPA load state. Used by FMA diagnosis engine.
	*/
	/* Numbering written out explicitly. */
	typedef enum {
	SPA_LOAD_NONE = 0, /* no load in progress */
	SPA_LOAD_OPEN = 1, /* normal open */
	SPA_LOAD_IMPORT = 2, /* import in progress */
	SPA_LOAD_TRYIMPORT = 3, /* tryimport in progress */
	SPA_LOAD_RECOVER = 4, /* recovery requested */
	SPA_LOAD_ERROR = 5, /* load failed */
	SPA_LOAD_CREATE = 6 /* creation in progress */
	} spa_load_state_t;

	/*
	* Bookmark name values.
	*/
	#define ZPOOL_ERR_LIST "error list"
	#define ZPOOL_ERR_DATASET "dataset"
	#define ZPOOL_ERR_OBJECT "object"

	#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)

	/*
	* The following are names used in the nvlist describing
	* the pool's history log.
	*/
	#define ZPOOL_HIST_RECORD "history record"
	#define ZPOOL_HIST_TIME "history time"
	#define ZPOOL_HIST_CMD "history command"
	#define ZPOOL_HIST_WHO "history who"
	#define ZPOOL_HIST_ZONE "history zone"
	#define ZPOOL_HIST_HOST "history hostname"
	#define ZPOOL_HIST_TXG "history txg"
	#define ZPOOL_HIST_INT_EVENT "history internal event"
	#define ZPOOL_HIST_INT_STR "history internal str"
	#define ZPOOL_HIST_INT_NAME "internal_name"
	#define ZPOOL_HIST_IOCTL "ioctl"
	#define ZPOOL_HIST_INPUT_NVL "in_nvl"
	#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
	#define ZPOOL_HIST_DSNAME "dsname"
	#define ZPOOL_HIST_DSID "dsid"
	#define ZPOOL_HIST_ERRNO "errno"

	/*
	* Flags for ZFS_IOC_VDEV_SET_STATE
	*/
	#define ZFS_ONLINE_CHECKREMOVE 0x1
	#define ZFS_ONLINE_UNSPARE 0x2
	#define ZFS_ONLINE_FORCEFAULT 0x4
	#define ZFS_ONLINE_EXPAND 0x8
	#define ZFS_OFFLINE_TEMPORARY 0x1

	/*
	* Flags for ZFS_IOC_POOL_IMPORT
	*/
	#define ZFS_IMPORT_NORMAL 0x0
	#define ZFS_IMPORT_VERBATIM 0x1
	#define ZFS_IMPORT_ANY_HOST 0x2
	#define ZFS_IMPORT_MISSING_LOG 0x4
	#define ZFS_IMPORT_ONLY 0x8

	/*
	* Channel program argument/return nvlist keys and defaults.
	*/
	#define ZCP_ARG_PROGRAM "program"
	#define ZCP_ARG_ARGLIST "arg"
	#define ZCP_ARG_SYNC "sync"
	#define ZCP_ARG_INSTRLIMIT "instrlimit"
	#define ZCP_ARG_MEMLIMIT "memlimit"

	#define ZCP_ARG_CLIARGV "argv"

	#define ZCP_RET_ERROR "error"
	#define ZCP_RET_RETURN "return"

	#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000)
	#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT)
	#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024)
	#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT)

	/*
	* Sysevent payload members. ZFS will generate the following sysevents with the
	* given payloads:
	*
	* ESC_ZFS_RESILVER_START
	* ESC_ZFS_RESILVER_END
	* ESC_ZFS_POOL_DESTROY
	* ESC_ZFS_POOL_REGUID
	*
	* ZFS_EV_POOL_NAME DATA_TYPE_STRING
	* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
	*
	* ESC_ZFS_VDEV_REMOVE
	* ESC_ZFS_VDEV_CLEAR
	* ESC_ZFS_VDEV_CHECK
	*
	* ZFS_EV_POOL_NAME DATA_TYPE_STRING
	* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
	* ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional)
	* ZFS_EV_VDEV_GUID DATA_TYPE_UINT64
	*
	* ESC_ZFS_HISTORY_EVENT
	*
	* ZFS_EV_POOL_NAME DATA_TYPE_STRING
	* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
	* ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional)
	* ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional)
	* ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional)
	* ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional)
	* ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional)
	* ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional)
	*
	* The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
	* history log nvlist. The keynames will be free of any spaces or other
	* characters that could be potentially unexpected to consumers of the
	* sysevents.
	*/
	#define ZFS_EV_POOL_NAME "pool_name"
	#define ZFS_EV_POOL_GUID "pool_guid"
	#define ZFS_EV_VDEV_PATH "vdev_path"
	#define ZFS_EV_VDEV_GUID "vdev_guid"
	#define ZFS_EV_HIST_TIME "history_time"
	#define ZFS_EV_HIST_CMD "history_command"
	#define ZFS_EV_HIST_WHO "history_who"
	#define ZFS_EV_HIST_ZONE "history_zone"
	#define ZFS_EV_HIST_HOST "history_hostname"
	#define ZFS_EV_HIST_TXG "history_txg"
	#define ZFS_EV_HIST_INT_EVENT "history_internal_event"
	#define ZFS_EV_HIST_INT_STR "history_internal_str"
	#define ZFS_EV_HIST_INT_NAME "history_internal_name"
	#define ZFS_EV_HIST_IOCTL "history_ioctl"
	#define ZFS_EV_HIST_DSNAME "history_dsname"
	#define ZFS_EV_HIST_DSID "history_dsid"

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_FS_ZFS_H */
	Index: stable/11/sys/conf/files
	===================================================================
	--- stable/11/sys/conf/files (revision 332524)
	+++ stable/11/sys/conf/files (revision 332525)
	@@ -1,4743 +1,4747 @@
	# $FreeBSD$
	#
	# The long compile-with and dependency lines are required because of
	# limitations in config: backslash-newline doesn't work in strings, and
	# dependency lines other than the first are silently ignored.
	#
	acpi_quirks.h optional acpi \
	dependency "$S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \
	compile-with "${AWK} -f $S/tools/acpi_quirks2h.awk $S/dev/acpica/acpi_quirks" \
	no-obj no-implicit-rule before-depend \
	clean "acpi_quirks.h"
	bhnd_nvram_map.h optional bhnd \
	dependency "$S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/tools/nvram_map_gen.awk $S/dev/bhnd/nvram/nvram_map" \
	compile-with "sh $S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/nvram/nvram_map -h" \
	no-obj no-implicit-rule before-depend \
	clean "bhnd_nvram_map.h"
	bhnd_nvram_map_data.h optional bhnd \
	dependency "$S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/tools/nvram_map_gen.awk $S/dev/bhnd/nvram/nvram_map" \
	compile-with "sh $S/dev/bhnd/tools/nvram_map_gen.sh $S/dev/bhnd/nvram/nvram_map -d" \
	no-obj no-implicit-rule before-depend \
	clean "bhnd_nvram_map_data.h"
	#
	# The 'fdt_dtb_file' target covers an actual DTB file name, which is derived
	# from the specified source (DTS) file: <platform>.dts -> <platform>.dtb
	#
	fdt_dtb_file optional fdt fdt_dtb_static \
	compile-with "sh -c 'MACHINE=${MACHINE} $S/tools/fdt/make_dtb.sh $S ${FDT_DTS_FILE} ${.CURDIR}'" \
	no-obj no-implicit-rule before-depend \
	clean "${FDT_DTS_FILE:R}.dtb"
	fdt_static_dtb.h optional fdt fdt_dtb_static \
	compile-with "sh -c 'MACHINE=${MACHINE} $S/tools/fdt/make_dtbh.sh ${FDT_DTS_FILE} ${.CURDIR}'" \
	dependency "fdt_dtb_file" \
	no-obj no-implicit-rule before-depend \
	clean "fdt_static_dtb.h"
	feeder_eq_gen.h optional sound \
	dependency "$S/tools/sound/feeder_eq_mkfilter.awk" \
	compile-with "${AWK} -f $S/tools/sound/feeder_eq_mkfilter.awk -- ${FEEDER_EQ_PRESETS} > feeder_eq_gen.h" \
	no-obj no-implicit-rule before-depend \
	clean "feeder_eq_gen.h"
	feeder_rate_gen.h optional sound \
	dependency "$S/tools/sound/feeder_rate_mkfilter.awk" \
	compile-with "${AWK} -f $S/tools/sound/feeder_rate_mkfilter.awk -- ${FEEDER_RATE_PRESETS} > feeder_rate_gen.h" \
	no-obj no-implicit-rule before-depend \
	clean "feeder_rate_gen.h"
	snd_fxdiv_gen.h optional sound \
	dependency "$S/tools/sound/snd_fxdiv_gen.awk" \
	compile-with "${AWK} -f $S/tools/sound/snd_fxdiv_gen.awk -- > snd_fxdiv_gen.h" \
	no-obj no-implicit-rule before-depend \
	clean "snd_fxdiv_gen.h"
	miidevs.h optional miibus \| mii \
	dependency "$S/tools/miidevs2h.awk $S/dev/mii/miidevs" \
	compile-with "${AWK} -f $S/tools/miidevs2h.awk $S/dev/mii/miidevs" \
	no-obj no-implicit-rule before-depend \
	clean "miidevs.h"
	pccarddevs.h standard \
	dependency "$S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \
	compile-with "${AWK} -f $S/tools/pccarddevs2h.awk $S/dev/pccard/pccarddevs" \
	no-obj no-implicit-rule before-depend \
	clean "pccarddevs.h"
	kbdmuxmap.h optional kbdmux_dflt_keymap \
	compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${KBDMUX_DFLT_KEYMAP} \| sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > kbdmuxmap.h" \
	no-obj no-implicit-rule before-depend \
	clean "kbdmuxmap.h"
	teken_state.h optional sc \| vt \
	dependency "$S/teken/gensequences $S/teken/sequences" \
	compile-with "${AWK} -f $S/teken/gensequences $S/teken/sequences > teken_state.h" \
	no-obj no-implicit-rule before-depend \
	clean "teken_state.h"
	usbdevs.h optional usb \
	dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \
	compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -h" \
	no-obj no-implicit-rule before-depend \
	clean "usbdevs.h"
	usbdevs_data.h optional usb \
	dependency "$S/tools/usbdevs2h.awk $S/dev/usb/usbdevs" \
	compile-with "${AWK} -f $S/tools/usbdevs2h.awk $S/dev/usb/usbdevs -d" \
	no-obj no-implicit-rule before-depend \
	clean "usbdevs_data.h"
	cam/cam.c optional scbus
	cam/cam_compat.c optional scbus
	cam/cam_iosched.c optional scbus
	cam/cam_periph.c optional scbus
	cam/cam_queue.c optional scbus
	cam/cam_sim.c optional scbus
	cam/cam_xpt.c optional scbus
	cam/ata/ata_all.c optional scbus
	cam/ata/ata_xpt.c optional scbus
	cam/ata/ata_pmp.c optional scbus
	cam/nvme/nvme_all.c optional scbus nvme
	cam/nvme/nvme_da.c optional scbus nvme da
	cam/nvme/nvme_xpt.c optional scbus nvme
	cam/scsi/scsi_xpt.c optional scbus
	cam/scsi/scsi_all.c optional scbus
	cam/scsi/scsi_cd.c optional cd
	cam/scsi/scsi_ch.c optional ch
	cam/ata/ata_da.c optional ada \| da
	cam/ctl/ctl.c optional ctl
	cam/ctl/ctl_backend.c optional ctl
	cam/ctl/ctl_backend_block.c optional ctl
	cam/ctl/ctl_backend_ramdisk.c optional ctl
	cam/ctl/ctl_cmd_table.c optional ctl
	cam/ctl/ctl_frontend.c optional ctl
	cam/ctl/ctl_frontend_cam_sim.c optional ctl
	cam/ctl/ctl_frontend_ioctl.c optional ctl
	cam/ctl/ctl_frontend_iscsi.c optional ctl cfiscsi
	cam/ctl/ctl_ha.c optional ctl
	cam/ctl/ctl_scsi_all.c optional ctl
	cam/ctl/ctl_tpc.c optional ctl
	cam/ctl/ctl_tpc_local.c optional ctl
	cam/ctl/ctl_error.c optional ctl
	cam/ctl/ctl_util.c optional ctl
	cam/ctl/scsi_ctl.c optional ctl
	cam/scsi/scsi_da.c optional da
	cam/scsi/scsi_low.c optional ct \| ncv \| nsp \| stg
	cam/scsi/scsi_pass.c optional pass
	cam/scsi/scsi_pt.c optional pt
	cam/scsi/scsi_sa.c optional sa
	cam/scsi/scsi_enc.c optional ses
	cam/scsi/scsi_enc_ses.c optional ses
	cam/scsi/scsi_enc_safte.c optional ses
	cam/scsi/scsi_sg.c optional sg
	cam/scsi/scsi_targ_bh.c optional targbh
	cam/scsi/scsi_target.c optional targ
	cam/scsi/smp_all.c optional scbus
	# shared between zfs and dtrace
	cddl/compat/opensolaris/kern/opensolaris.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_cmn_err.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_kmem.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_misc.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_proc.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_sunddi.c optional zfs \| dtrace compile-with "${CDDL_C}"
	cddl/compat/opensolaris/kern/opensolaris_taskq.c optional zfs \| dtrace compile-with "${CDDL_C}"
	# zfs specific
	cddl/compat/opensolaris/kern/opensolaris_acl.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_dtrace.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_kobj.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_kstat.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_lookup.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_policy.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_string.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_sysevent.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_uio.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_vfs.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_vm.c optional zfs compile-with "${ZFS_C}"
	cddl/compat/opensolaris/kern/opensolaris_zone.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/acl/acl_common.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/avl/avl.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfeature_common.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_comutil.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_deleg.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \
	warning "kernel contains CDDL licensed ZFS filesystem"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}"
	+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"
	+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}"
	+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}"
	+cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/os/callb.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/os/fm.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/os/list.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/adler32.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/deflate.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/inffast.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/inflate.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/inftrees.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/trees.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/zmod.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/zmod_subr.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/zmod/zutil.c optional zfs compile-with "${ZFS_C}"
	# zfs lua support
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c optional zfs compile-with "${ZFS_C}"
	cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c optional zfs compile-with "${ZFS_C}"
	# dtrace specific
	cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c optional dtrace compile-with "${DTRACE_C}" \
	warning "kernel contains CDDL licensed DTRACE"
	cddl/dev/dtmalloc/dtmalloc.c optional dtmalloc \| dtraceall compile-with "${CDDL_C}"
	cddl/dev/profile/profile.c optional dtrace_profile \| dtraceall compile-with "${CDDL_C}"
	cddl/dev/sdt/sdt.c optional dtrace_sdt \| dtraceall compile-with "${CDDL_C}"
	cddl/dev/fbt/fbt.c optional dtrace_fbt \| dtraceall compile-with "${FBT_C}"
	cddl/dev/systrace/systrace.c optional dtrace_systrace \| dtraceall compile-with "${CDDL_C}"
	cddl/dev/prototype.c optional dtrace_prototype \| dtraceall compile-with "${CDDL_C}"
	fs/nfsclient/nfs_clkdtrace.c optional dtnfscl nfscl \| dtraceall nfscl compile-with "${CDDL_C}"
	compat/cloudabi/cloudabi_clock.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_errno.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_fd.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_file.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_futex.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_mem.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_proc.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_random.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_sock.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_thread.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi/cloudabi_vdso.c optional compat_cloudabi32 \| compat_cloudabi64
	compat/cloudabi32/cloudabi32_fd.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_module.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_poll.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_sock.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_syscalls.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_sysent.c optional compat_cloudabi32
	compat/cloudabi32/cloudabi32_thread.c optional compat_cloudabi32
	compat/cloudabi64/cloudabi64_fd.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_module.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_poll.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_sock.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_syscalls.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_sysent.c optional compat_cloudabi64
	compat/cloudabi64/cloudabi64_thread.c optional compat_cloudabi64
	compat/freebsd32/freebsd32_capability.c optional compat_freebsd32
	compat/freebsd32/freebsd32_ioctl.c optional compat_freebsd32
	compat/freebsd32/freebsd32_misc.c optional compat_freebsd32
	compat/freebsd32/freebsd32_syscalls.c optional compat_freebsd32
	compat/freebsd32/freebsd32_sysent.c optional compat_freebsd32
	contrib/ck/src/ck_array.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_barrier_centralized.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_barrier_combining.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_barrier_dissemination.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_barrier_mcs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_barrier_tournament.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_epoch.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_hp.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_hs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_ht.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/ck/src/ck_rhs.c standard compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	contrib/dev/acpica/common/ahids.c optional acpi acpi_debug
	contrib/dev/acpica/common/ahuuids.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbcmds.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbconvert.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbdisply.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbexec.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbhistry.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbinput.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbmethod.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbnames.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbobject.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbstats.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbtest.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbutils.c optional acpi acpi_debug
	contrib/dev/acpica/components/debugger/dbxface.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmbuffer.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmcstyle.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmdeferred.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmnames.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmopcode.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmresrc.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmresrcl.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmresrcl2.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmresrcs.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmutils.c optional acpi acpi_debug
	contrib/dev/acpica/components/disassembler/dmwalk.c optional acpi acpi_debug
	contrib/dev/acpica/components/dispatcher/dsargs.c optional acpi
	contrib/dev/acpica/components/dispatcher/dscontrol.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsdebug.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsfield.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsinit.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsmethod.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsmthdat.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsobject.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsopcode.c optional acpi
	contrib/dev/acpica/components/dispatcher/dspkginit.c optional acpi
	contrib/dev/acpica/components/dispatcher/dsutils.c optional acpi
	contrib/dev/acpica/components/dispatcher/dswexec.c optional acpi
	contrib/dev/acpica/components/dispatcher/dswload.c optional acpi
	contrib/dev/acpica/components/dispatcher/dswload2.c optional acpi
	contrib/dev/acpica/components/dispatcher/dswscope.c optional acpi
	contrib/dev/acpica/components/dispatcher/dswstate.c optional acpi
	contrib/dev/acpica/components/events/evevent.c optional acpi
	contrib/dev/acpica/components/events/evglock.c optional acpi
	contrib/dev/acpica/components/events/evgpe.c optional acpi
	contrib/dev/acpica/components/events/evgpeblk.c optional acpi
	contrib/dev/acpica/components/events/evgpeinit.c optional acpi
	contrib/dev/acpica/components/events/evgpeutil.c optional acpi
	contrib/dev/acpica/components/events/evhandler.c optional acpi
	contrib/dev/acpica/components/events/evmisc.c optional acpi
	contrib/dev/acpica/components/events/evregion.c optional acpi
	contrib/dev/acpica/components/events/evrgnini.c optional acpi
	contrib/dev/acpica/components/events/evsci.c optional acpi
	contrib/dev/acpica/components/events/evxface.c optional acpi
	contrib/dev/acpica/components/events/evxfevnt.c optional acpi
	contrib/dev/acpica/components/events/evxfgpe.c optional acpi
	contrib/dev/acpica/components/events/evxfregn.c optional acpi
	contrib/dev/acpica/components/executer/exconcat.c optional acpi
	contrib/dev/acpica/components/executer/exconfig.c optional acpi
	contrib/dev/acpica/components/executer/exconvrt.c optional acpi
	contrib/dev/acpica/components/executer/excreate.c optional acpi
	contrib/dev/acpica/components/executer/exdebug.c optional acpi
	contrib/dev/acpica/components/executer/exdump.c optional acpi
	contrib/dev/acpica/components/executer/exfield.c optional acpi
	contrib/dev/acpica/components/executer/exfldio.c optional acpi
	contrib/dev/acpica/components/executer/exmisc.c optional acpi
	contrib/dev/acpica/components/executer/exmutex.c optional acpi
	contrib/dev/acpica/components/executer/exnames.c optional acpi
	contrib/dev/acpica/components/executer/exoparg1.c optional acpi
	contrib/dev/acpica/components/executer/exoparg2.c optional acpi
	contrib/dev/acpica/components/executer/exoparg3.c optional acpi
	contrib/dev/acpica/components/executer/exoparg6.c optional acpi
	contrib/dev/acpica/components/executer/exprep.c optional acpi
	contrib/dev/acpica/components/executer/exregion.c optional acpi
	contrib/dev/acpica/components/executer/exresnte.c optional acpi
	contrib/dev/acpica/components/executer/exresolv.c optional acpi
	contrib/dev/acpica/components/executer/exresop.c optional acpi
	contrib/dev/acpica/components/executer/exstore.c optional acpi
	contrib/dev/acpica/components/executer/exstoren.c optional acpi
	contrib/dev/acpica/components/executer/exstorob.c optional acpi
	contrib/dev/acpica/components/executer/exsystem.c optional acpi
	contrib/dev/acpica/components/executer/extrace.c optional acpi
	contrib/dev/acpica/components/executer/exutils.c optional acpi
	contrib/dev/acpica/components/hardware/hwacpi.c optional acpi
	contrib/dev/acpica/components/hardware/hwesleep.c optional acpi
	contrib/dev/acpica/components/hardware/hwgpe.c optional acpi
	contrib/dev/acpica/components/hardware/hwpci.c optional acpi
	contrib/dev/acpica/components/hardware/hwregs.c optional acpi
	contrib/dev/acpica/components/hardware/hwsleep.c optional acpi
	contrib/dev/acpica/components/hardware/hwtimer.c optional acpi
	contrib/dev/acpica/components/hardware/hwvalid.c optional acpi
	contrib/dev/acpica/components/hardware/hwxface.c optional acpi
	contrib/dev/acpica/components/hardware/hwxfsleep.c optional acpi
	contrib/dev/acpica/components/namespace/nsaccess.c optional acpi
	contrib/dev/acpica/components/namespace/nsalloc.c optional acpi
	contrib/dev/acpica/components/namespace/nsarguments.c optional acpi
	contrib/dev/acpica/components/namespace/nsconvert.c optional acpi
	contrib/dev/acpica/components/namespace/nsdump.c optional acpi
	contrib/dev/acpica/components/namespace/nseval.c optional acpi
	contrib/dev/acpica/components/namespace/nsinit.c optional acpi
	contrib/dev/acpica/components/namespace/nsload.c optional acpi
	contrib/dev/acpica/components/namespace/nsnames.c optional acpi
	contrib/dev/acpica/components/namespace/nsobject.c optional acpi
	contrib/dev/acpica/components/namespace/nsparse.c optional acpi
	contrib/dev/acpica/components/namespace/nspredef.c optional acpi
	contrib/dev/acpica/components/namespace/nsprepkg.c optional acpi
	contrib/dev/acpica/components/namespace/nsrepair.c optional acpi
	contrib/dev/acpica/components/namespace/nsrepair2.c optional acpi
	contrib/dev/acpica/components/namespace/nssearch.c optional acpi
	contrib/dev/acpica/components/namespace/nsutils.c optional acpi
	contrib/dev/acpica/components/namespace/nswalk.c optional acpi
	contrib/dev/acpica/components/namespace/nsxfeval.c optional acpi
	contrib/dev/acpica/components/namespace/nsxfname.c optional acpi
	contrib/dev/acpica/components/namespace/nsxfobj.c optional acpi
	contrib/dev/acpica/components/parser/psargs.c optional acpi
	contrib/dev/acpica/components/parser/psloop.c optional acpi
	contrib/dev/acpica/components/parser/psobject.c optional acpi
	contrib/dev/acpica/components/parser/psopcode.c optional acpi
	contrib/dev/acpica/components/parser/psopinfo.c optional acpi
	contrib/dev/acpica/components/parser/psparse.c optional acpi
	contrib/dev/acpica/components/parser/psscope.c optional acpi
	contrib/dev/acpica/components/parser/pstree.c optional acpi
	contrib/dev/acpica/components/parser/psutils.c optional acpi
	contrib/dev/acpica/components/parser/pswalk.c optional acpi
	contrib/dev/acpica/components/parser/psxface.c optional acpi
	contrib/dev/acpica/components/resources/rsaddr.c optional acpi
	contrib/dev/acpica/components/resources/rscalc.c optional acpi
	contrib/dev/acpica/components/resources/rscreate.c optional acpi
	contrib/dev/acpica/components/resources/rsdump.c optional acpi acpi_debug
	contrib/dev/acpica/components/resources/rsdumpinfo.c optional acpi
	contrib/dev/acpica/components/resources/rsinfo.c optional acpi
	contrib/dev/acpica/components/resources/rsio.c optional acpi
	contrib/dev/acpica/components/resources/rsirq.c optional acpi
	contrib/dev/acpica/components/resources/rslist.c optional acpi
	contrib/dev/acpica/components/resources/rsmemory.c optional acpi
	contrib/dev/acpica/components/resources/rsmisc.c optional acpi
	contrib/dev/acpica/components/resources/rsserial.c optional acpi
	contrib/dev/acpica/components/resources/rsutils.c optional acpi
	contrib/dev/acpica/components/resources/rsxface.c optional acpi
	contrib/dev/acpica/components/tables/tbdata.c optional acpi
	contrib/dev/acpica/components/tables/tbfadt.c optional acpi
	contrib/dev/acpica/components/tables/tbfind.c optional acpi
	contrib/dev/acpica/components/tables/tbinstal.c optional acpi
	contrib/dev/acpica/components/tables/tbprint.c optional acpi
	contrib/dev/acpica/components/tables/tbutils.c optional acpi
	contrib/dev/acpica/components/tables/tbxface.c optional acpi
	contrib/dev/acpica/components/tables/tbxfload.c optional acpi
	contrib/dev/acpica/components/tables/tbxfroot.c optional acpi
	contrib/dev/acpica/components/utilities/utaddress.c optional acpi
	contrib/dev/acpica/components/utilities/utalloc.c optional acpi
	contrib/dev/acpica/components/utilities/utascii.c optional acpi
	contrib/dev/acpica/components/utilities/utbuffer.c optional acpi
	contrib/dev/acpica/components/utilities/utcache.c optional acpi
	contrib/dev/acpica/components/utilities/utcopy.c optional acpi
	contrib/dev/acpica/components/utilities/utdebug.c optional acpi
	contrib/dev/acpica/components/utilities/utdecode.c optional acpi
	contrib/dev/acpica/components/utilities/utdelete.c optional acpi
	contrib/dev/acpica/components/utilities/uterror.c optional acpi
	contrib/dev/acpica/components/utilities/uteval.c optional acpi
	contrib/dev/acpica/components/utilities/utexcep.c optional acpi
	contrib/dev/acpica/components/utilities/utglobal.c optional acpi
	contrib/dev/acpica/components/utilities/uthex.c optional acpi
	contrib/dev/acpica/components/utilities/utids.c optional acpi
	contrib/dev/acpica/components/utilities/utinit.c optional acpi
	contrib/dev/acpica/components/utilities/utlock.c optional acpi
	contrib/dev/acpica/components/utilities/utmath.c optional acpi
	contrib/dev/acpica/components/utilities/utmisc.c optional acpi
	contrib/dev/acpica/components/utilities/utmutex.c optional acpi
	contrib/dev/acpica/components/utilities/utnonansi.c optional acpi
	contrib/dev/acpica/components/utilities/utobject.c optional acpi
	contrib/dev/acpica/components/utilities/utosi.c optional acpi
	contrib/dev/acpica/components/utilities/utownerid.c optional acpi
	contrib/dev/acpica/components/utilities/utpredef.c optional acpi
	contrib/dev/acpica/components/utilities/utresdecode.c optional acpi acpi_debug
	contrib/dev/acpica/components/utilities/utresrc.c optional acpi
	contrib/dev/acpica/components/utilities/utstate.c optional acpi
	contrib/dev/acpica/components/utilities/utstring.c optional acpi
	contrib/dev/acpica/components/utilities/utstrsuppt.c optional acpi
	contrib/dev/acpica/components/utilities/utstrtoul64.c optional acpi
	contrib/dev/acpica/components/utilities/utuuid.c optional acpi acpi_debug
	contrib/dev/acpica/components/utilities/utxface.c optional acpi
	contrib/dev/acpica/components/utilities/utxferror.c optional acpi
	contrib/dev/acpica/components/utilities/utxfinit.c optional acpi
	contrib/dev/acpica/os_specific/service_layers/osgendbg.c optional acpi acpi_debug
	contrib/ipfilter/netinet/fil.c optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_auth.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_fil_freebsd.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_frag.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_log.c optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_nat.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_proxy.c optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_state.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_lookup.c optional ipfilter inet \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN} -Wno-unused -Wno-error -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_pool.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_htable.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_sync.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/mlfk_ipl.c optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_nat6.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_rules.c optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_scan.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/ip_dstlist.c optional ipfilter inet \
	compile-with "${NORMAL_C} -Wno-unused -I$S/contrib/ipfilter"
	contrib/ipfilter/netinet/radix_ipf.c optional ipfilter inet \
	compile-with "${NORMAL_C} -I$S/contrib/ipfilter"
	contrib/libfdt/fdt.c optional fdt
	contrib/libfdt/fdt_ro.c optional fdt
	contrib/libfdt/fdt_rw.c optional fdt
	contrib/libfdt/fdt_strerror.c optional fdt
	contrib/libfdt/fdt_sw.c optional fdt
	contrib/libfdt/fdt_wip.c optional fdt
	contrib/libnv/dnvlist.c standard
	contrib/libnv/nvlist.c standard
	contrib/libnv/nvpair.c standard
	contrib/ngatm/netnatm/api/cc_conn.c optional ngatm_ccatm \
	compile-with "${NORMAL_C_NOWERROR} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/cc_data.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/cc_dump.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/cc_port.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/cc_sig.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/cc_user.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/api/unisap.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/misc/straddr.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/misc/unimsg_common.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/msg/traffic.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/msg/uni_ie.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/msg/uni_msg.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/saal/saal_sscfu.c optional ngatm_sscfu \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/saal/saal_sscop.c optional ngatm_sscop \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_call.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_coord.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_party.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_print.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_reset.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_uni.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_unimsgcpy.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	contrib/ngatm/netnatm/sig/sig_verify.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	crypto/blowfish/bf_ecb.c optional ipsec \| ipsec_support
	crypto/blowfish/bf_skey.c optional crypto \| ipsec \| ipsec_support
	crypto/camellia/camellia.c optional crypto \| ipsec \| ipsec_support
	crypto/camellia/camellia-api.c optional crypto \| ipsec \| ipsec_support
	crypto/des/des_ecb.c optional crypto \| ipsec \| ipsec_support \| netsmb
	crypto/des/des_setkey.c optional crypto \| ipsec \| ipsec_support \| netsmb
	crypto/rc4/rc4.c optional netgraph_mppc_encryption \| kgssapi
	crypto/rijndael/rijndael-alg-fst.c optional crypto \| geom_bde \| \
	ipsec \| ipsec_support \| random !random_loadable \| wlan_ccmp
	crypto/rijndael/rijndael-api-fst.c optional geom_bde \| random !random_loadable
	crypto/rijndael/rijndael-api.c optional crypto \| ipsec \| ipsec_support \| \
	wlan_ccmp
	crypto/sha1.c optional carp \| crypto \| ipsec \| \
	ipsec_support \| netgraph_mppc_encryption \| sctp
	crypto/sha2/sha256c.c optional crypto \| geom_bde \| ipsec \| \
	ipsec_support \| random !random_loadable \| sctp \| zfs
	crypto/sha2/sha512c.c optional crypto \| geom_bde \| ipsec \| \
	ipsec_support \| zfs
	crypto/skein/skein.c optional crypto \| zfs
	crypto/skein/skein_block.c optional crypto \| zfs
	crypto/siphash/siphash.c optional inet \| inet6
	crypto/siphash/siphash_test.c optional inet \| inet6
	ddb/db_access.c optional ddb
	ddb/db_break.c optional ddb
	ddb/db_capture.c optional ddb
	ddb/db_command.c optional ddb
	ddb/db_examine.c optional ddb
	ddb/db_expr.c optional ddb
	ddb/db_input.c optional ddb
	ddb/db_lex.c optional ddb
	ddb/db_main.c optional ddb
	ddb/db_output.c optional ddb
	ddb/db_print.c optional ddb
	ddb/db_ps.c optional ddb
	ddb/db_run.c optional ddb
	ddb/db_script.c optional ddb
	ddb/db_sym.c optional ddb
	ddb/db_thread.c optional ddb
	ddb/db_textdump.c optional ddb
	ddb/db_variables.c optional ddb
	ddb/db_watch.c optional ddb
	ddb/db_write_cmd.c optional ddb
	dev/aac/aac.c optional aac
	dev/aac/aac_cam.c optional aacp aac
	dev/aac/aac_debug.c optional aac
	dev/aac/aac_disk.c optional aac
	dev/aac/aac_linux.c optional aac compat_linux
	dev/aac/aac_pci.c optional aac pci
	dev/aacraid/aacraid.c optional aacraid
	dev/aacraid/aacraid_cam.c optional aacraid scbus
	dev/aacraid/aacraid_debug.c optional aacraid
	dev/aacraid/aacraid_linux.c optional aacraid compat_linux
	dev/aacraid/aacraid_pci.c optional aacraid pci
	dev/acpi_support/acpi_wmi.c optional acpi_wmi acpi
	dev/acpi_support/acpi_asus.c optional acpi_asus acpi
	dev/acpi_support/acpi_asus_wmi.c optional acpi_asus_wmi acpi
	dev/acpi_support/acpi_fujitsu.c optional acpi_fujitsu acpi
	dev/acpi_support/acpi_hp.c optional acpi_hp acpi
	dev/acpi_support/acpi_ibm.c optional acpi_ibm acpi
	dev/acpi_support/acpi_panasonic.c optional acpi_panasonic acpi
	dev/acpi_support/acpi_sony.c optional acpi_sony acpi
	dev/acpi_support/acpi_toshiba.c optional acpi_toshiba acpi
	dev/acpi_support/atk0110.c optional aibs acpi
	dev/acpica/Osd/OsdDebug.c optional acpi
	dev/acpica/Osd/OsdHardware.c optional acpi
	dev/acpica/Osd/OsdInterrupt.c optional acpi
	dev/acpica/Osd/OsdMemory.c optional acpi
	dev/acpica/Osd/OsdSchedule.c optional acpi
	dev/acpica/Osd/OsdStream.c optional acpi
	dev/acpica/Osd/OsdSynch.c optional acpi
	dev/acpica/Osd/OsdTable.c optional acpi
	dev/acpica/acpi.c optional acpi
	dev/acpica/acpi_acad.c optional acpi
	dev/acpica/acpi_battery.c optional acpi
	dev/acpica/acpi_button.c optional acpi
	dev/acpica/acpi_cmbat.c optional acpi
	dev/acpica/acpi_cpu.c optional acpi
	dev/acpica/acpi_ec.c optional acpi
	dev/acpica/acpi_isab.c optional acpi isa
	dev/acpica/acpi_lid.c optional acpi
	dev/acpica/acpi_package.c optional acpi
	dev/acpica/acpi_pci.c optional acpi pci
	dev/acpica/acpi_pci_link.c optional acpi pci
	dev/acpica/acpi_pcib.c optional acpi pci
	dev/acpica/acpi_pcib_acpi.c optional acpi pci
	dev/acpica/acpi_pcib_pci.c optional acpi pci
	dev/acpica/acpi_perf.c optional acpi
	dev/acpica/acpi_powerres.c optional acpi
	dev/acpica/acpi_quirk.c optional acpi
	dev/acpica/acpi_resource.c optional acpi
	dev/acpica/acpi_container.c optional acpi
	dev/acpica/acpi_smbat.c optional acpi
	dev/acpica/acpi_thermal.c optional acpi
	dev/acpica/acpi_throttle.c optional acpi
	dev/acpica/acpi_timer.c optional acpi
	dev/acpica/acpi_video.c optional acpi_video acpi
	dev/acpica/acpi_dock.c optional acpi_dock acpi
	dev/adlink/adlink.c optional adlink
	dev/advansys/adv_eisa.c optional adv eisa
	dev/advansys/adv_pci.c optional adv pci
	dev/advansys/advansys.c optional adv
	dev/advansys/advlib.c optional adv
	dev/advansys/advmcode.c optional adv
	dev/advansys/adw_pci.c optional adw pci
	dev/advansys/adwcam.c optional adw
	dev/advansys/adwlib.c optional adw
	dev/advansys/adwmcode.c optional adw
	dev/ae/if_ae.c optional ae pci
	dev/age/if_age.c optional age pci
	dev/agp/agp.c optional agp pci
	dev/agp/agp_if.m optional agp pci
	dev/aha/aha.c optional aha
	dev/aha/aha_isa.c optional aha isa
	dev/aha/aha_mca.c optional aha mca
	dev/ahb/ahb.c optional ahb eisa
	dev/ahci/ahci.c optional ahci
	dev/ahci/ahciem.c optional ahci
	dev/ahci/ahci_pci.c optional ahci pci
	dev/aic/aic.c optional aic
	dev/aic/aic_pccard.c optional aic pccard
	dev/aic7xxx/ahc_eisa.c optional ahc eisa
	dev/aic7xxx/ahc_isa.c optional ahc isa
	dev/aic7xxx/ahc_pci.c optional ahc pci \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/aic7xxx/ahd_pci.c optional ahd pci \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/aic7xxx/aic7770.c optional ahc
	dev/aic7xxx/aic79xx.c optional ahd pci
	dev/aic7xxx/aic79xx_osm.c optional ahd pci
	dev/aic7xxx/aic79xx_pci.c optional ahd pci
	dev/aic7xxx/aic79xx_reg_print.c optional ahd pci ahd_reg_pretty_print
	dev/aic7xxx/aic7xxx.c optional ahc
	dev/aic7xxx/aic7xxx_93cx6.c optional ahc
	dev/aic7xxx/aic7xxx_osm.c optional ahc
	dev/aic7xxx/aic7xxx_pci.c optional ahc pci
	dev/aic7xxx/aic7xxx_reg_print.c optional ahc ahc_reg_pretty_print
	dev/alc/if_alc.c optional alc pci
	dev/ale/if_ale.c optional ale pci
	dev/alpm/alpm.c optional alpm pci
	dev/altera/avgen/altera_avgen.c optional altera_avgen
	dev/altera/avgen/altera_avgen_fdt.c optional altera_avgen fdt
	dev/altera/avgen/altera_avgen_nexus.c optional altera_avgen
	dev/altera/sdcard/altera_sdcard.c optional altera_sdcard
	dev/altera/sdcard/altera_sdcard_disk.c optional altera_sdcard
	dev/altera/sdcard/altera_sdcard_io.c optional altera_sdcard
	dev/altera/sdcard/altera_sdcard_fdt.c optional altera_sdcard fdt
	dev/altera/sdcard/altera_sdcard_nexus.c optional altera_sdcard
	dev/altera/pio/pio.c optional altera_pio
	dev/altera/pio/pio_if.m optional altera_pio
	dev/amdpm/amdpm.c optional amdpm pci \| nfpm pci
	dev/amdsmb/amdsmb.c optional amdsmb pci
	dev/amr/amr.c optional amr
	dev/amr/amr_cam.c optional amrp amr
	dev/amr/amr_disk.c optional amr
	dev/amr/amr_linux.c optional amr compat_linux
	dev/amr/amr_pci.c optional amr pci
	dev/an/if_an.c optional an
	dev/an/if_an_isa.c optional an isa
	dev/an/if_an_pccard.c optional an pccard
	dev/an/if_an_pci.c optional an pci
	#
	dev/ata/ata_if.m optional ata \| atacore
	dev/ata/ata-all.c optional ata \| atacore
	dev/ata/ata-dma.c optional ata \| atacore
	dev/ata/ata-lowlevel.c optional ata \| atacore
	dev/ata/ata-sata.c optional ata \| atacore
	dev/ata/ata-card.c optional ata pccard \| atapccard
	dev/ata/ata-cbus.c optional ata pc98 \| atapc98
	dev/ata/ata-isa.c optional ata isa \| ataisa
	dev/ata/ata-pci.c optional ata pci \| atapci
	dev/ata/chipsets/ata-acard.c optional ata pci \| ataacard
	dev/ata/chipsets/ata-acerlabs.c optional ata pci \| ataacerlabs
	dev/ata/chipsets/ata-amd.c optional ata pci \| ataamd
	dev/ata/chipsets/ata-ati.c optional ata pci \| ataati
	dev/ata/chipsets/ata-cenatek.c optional ata pci \| atacenatek
	dev/ata/chipsets/ata-cypress.c optional ata pci \| atacypress
	dev/ata/chipsets/ata-cyrix.c optional ata pci \| atacyrix
	dev/ata/chipsets/ata-highpoint.c optional ata pci \| atahighpoint
	dev/ata/chipsets/ata-intel.c optional ata pci \| ataintel
	dev/ata/chipsets/ata-ite.c optional ata pci \| ataite
	dev/ata/chipsets/ata-jmicron.c optional ata pci \| atajmicron
	dev/ata/chipsets/ata-marvell.c optional ata pci \| atamarvell
	dev/ata/chipsets/ata-micron.c optional ata pci \| atamicron
	dev/ata/chipsets/ata-national.c optional ata pci \| atanational
	dev/ata/chipsets/ata-netcell.c optional ata pci \| atanetcell
	dev/ata/chipsets/ata-nvidia.c optional ata pci \| atanvidia
	dev/ata/chipsets/ata-promise.c optional ata pci \| atapromise
	dev/ata/chipsets/ata-serverworks.c optional ata pci \| ataserverworks
	dev/ata/chipsets/ata-siliconimage.c optional ata pci \| atasiliconimage \| ataati
	dev/ata/chipsets/ata-sis.c optional ata pci \| atasis
	dev/ata/chipsets/ata-via.c optional ata pci \| atavia
	#
	dev/ath/if_ath_pci.c optional ath_pci pci \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	#
	dev/ath/if_ath_ahb.c optional ath_ahb \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	#
	dev/ath/if_ath.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_alq.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_beacon.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_btcoex.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_btcoex_mci.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_debug.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_descdma.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_keycache.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_ioctl.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_led.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_lna_div.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_tx.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_tx_edma.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_tx_ht.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_tdma.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_sysctl.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_rx.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_rx_edma.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/if_ath_spectral.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ah_osdep.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	#
	dev/ath/ath_hal/ah.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_eeprom_v1.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_eeprom_v3.c optional ath_hal \| ath_ar5211 \| ath_ar5212 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_eeprom_v14.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_eeprom_v4k.c \
	optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_eeprom_9287.c \
	optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_hal/ah_regdomain.c optional ath \
	compile-with "${NORMAL_C} ${NO_WSHIFT_COUNT_NEGATIVE} ${NO_WSHIFT_COUNT_OVERFLOW} -I$S/dev/ath"
	# ar5210
	dev/ath/ath_hal/ar5210/ar5210_attach.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_beacon.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_interrupts.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_keycache.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_misc.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_phy.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_power.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_recv.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_reset.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5210/ar5210_xmit.c optional ath_hal \| ath_ar5210 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar5211
	dev/ath/ath_hal/ar5211/ar5211_attach.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_beacon.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_interrupts.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_keycache.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_misc.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_phy.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_power.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_recv.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_reset.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5211/ar5211_xmit.c optional ath_hal \| ath_ar5211 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar5212
	dev/ath/ath_hal/ar5212/ar5212_ani.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_attach.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_beacon.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_eeprom.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_gpio.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_interrupts.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_keycache.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_misc.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_phy.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_power.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_recv.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_reset.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_rfgain.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5212_xmit.c \
	optional ath_hal \| ath_ar5212 \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| \
	ath_ar9285 \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar5416 (depends on ar5212)
	dev/ath/ath_hal/ar5416/ar5416_ani.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_attach.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_beacon.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_btcoex.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_cal.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_cal_iq.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_cal_adcgain.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_cal_adcdc.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_eeprom.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_gpio.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_interrupts.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_keycache.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_misc.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_phy.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_power.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_radar.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_recv.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_reset.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_spectral.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar5416_xmit.c \
	optional ath_hal \| ath_ar5416 \| ath_ar9160 \| ath_ar9280 \| ath_ar9285 \| \
	ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar9130 (depends upon ar5416) - also requires AH_SUPPORT_AR9130
	#
	# Since this is an embedded MAC SoC, there's no need to compile it into the
	# default HAL.
	dev/ath/ath_hal/ar9001/ar9130_attach.c optional ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9001/ar9130_phy.c optional ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9001/ar9130_eeprom.c optional ath_ar9130 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar9160 (depends on ar5416)
	dev/ath/ath_hal/ar9001/ar9160_attach.c optional ath_hal \| ath_ar9160 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar9280 (depends on ar5416)
	dev/ath/ath_hal/ar9002/ar9280_attach.c optional ath_hal \| ath_ar9280 \| \
	ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9280_olc.c optional ath_hal \| ath_ar9280 \| \
	ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar9285 (depends on ar5416 and ar9280)
	dev/ath/ath_hal/ar9002/ar9285_attach.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285_btcoex.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285_reset.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285_cal.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285_phy.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285_diversity.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	# ar9287 (depends on ar5416)
	dev/ath/ath_hal/ar9002/ar9287_attach.c optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9287_reset.c optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9287_cal.c optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9287_olc.c optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"

	# ar9300
	contrib/dev/ath/ath_hal/ar9300/ar9300_ani.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_attach.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_beacon.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_eeprom.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal ${NO_WCONSTANT_CONVERSION}"
	contrib/dev/ath/ath_hal/ar9300/ar9300_freebsd.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_gpio.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_interrupts.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_keycache.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_mci.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_misc.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_paprd.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_phy.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_power.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_radar.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_radio.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_recv.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_recv_ds.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_reset.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal ${NO_WSOMETIMES_UNINITIALIZED} -Wno-unused-function"
	contrib/dev/ath/ath_hal/ar9300/ar9300_stub.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_stub_funcs.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_spectral.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_timer.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_xmit.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"
	contrib/dev/ath/ath_hal/ar9300/ar9300_xmit_ds.c optional ath_hal \| ath_ar9300 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal"

	# rf backends
	dev/ath/ath_hal/ar5212/ar2316.c optional ath_rf2316 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar2317.c optional ath_rf2317 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar2413.c optional ath_hal \| ath_rf2413 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar2425.c optional ath_hal \| ath_rf2425 \| ath_rf2417 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5111.c optional ath_hal \| ath_rf5111 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5112.c optional ath_hal \| ath_rf5112 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5212/ar5413.c optional ath_hal \| ath_rf5413 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar5416/ar2133.c optional ath_hal \| ath_ar5416 \| \
	ath_ar9130 \| ath_ar9160 \| ath_ar9280 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9280.c optional ath_hal \| ath_ar9280 \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9285.c optional ath_hal \| ath_ar9285 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"
	dev/ath/ath_hal/ar9002/ar9287.c optional ath_hal \| ath_ar9287 \
	compile-with "${NORMAL_C} -I$S/dev/ath -I$S/dev/ath/ath_hal"

	# ath rate control algorithms
	dev/ath/ath_rate/amrr/amrr.c optional ath_rate_amrr \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_rate/onoe/onoe.c optional ath_rate_onoe \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	dev/ath/ath_rate/sample/sample.c optional ath_rate_sample \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	# ath DFS modules
	dev/ath/ath_dfs/null/dfs_null.c optional ath \
	compile-with "${NORMAL_C} -I$S/dev/ath"
	#
	dev/bce/if_bce.c optional bce
	dev/bfe/if_bfe.c optional bfe
	dev/bge/if_bge.c optional bge
	dev/bhnd/bhnd.c optional bhnd
	dev/bhnd/bhnd_nexus.c optional bhnd siba_nexus \| \
	bhnd bcma_nexus
	dev/bhnd/bhnd_subr.c optional bhnd
	dev/bhnd/bhnd_bus_if.m optional bhnd
	dev/bhnd/bhndb/bhnd_bhndb.c optional bhndb bhnd
	dev/bhnd/bhndb/bhndb.c optional bhndb bhnd
	dev/bhnd/bhndb/bhndb_bus_if.m optional bhndb bhnd
	dev/bhnd/bhndb/bhndb_hwdata.c optional bhndb bhnd
	dev/bhnd/bhndb/bhndb_if.m optional bhndb bhnd
	dev/bhnd/bhndb/bhndb_pci.c optional bhndb bhnd pci
	dev/bhnd/bhndb/bhndb_pci_hwdata.c optional bhndb bhnd pci
	dev/bhnd/bhndb/bhndb_pci_sprom.c optional bhndb bhnd pci
	dev/bhnd/bhndb/bhndb_subr.c optional bhndb bhnd
	dev/bhnd/bcma/bcma.c optional bcma bhnd
	dev/bhnd/bcma/bcma_bhndb.c optional bcma bhnd bhndb
	dev/bhnd/bcma/bcma_erom.c optional bcma bhnd
	dev/bhnd/bcma/bcma_nexus.c optional bcma_nexus bcma bhnd
	dev/bhnd/bcma/bcma_subr.c optional bcma bhnd
	dev/bhnd/cores/chipc/chipc.c optional bhnd
	dev/bhnd/cores/chipc/chipc_cfi.c optional bhnd cfi
	dev/bhnd/cores/chipc/chipc_slicer.c optional bhnd cfi \| bhnd spibus
	dev/bhnd/cores/chipc/chipc_spi.c optional bhnd spibus
	dev/bhnd/cores/chipc/chipc_subr.c optional bhnd
	dev/bhnd/cores/chipc/bhnd_chipc_if.m optional bhnd
	dev/bhnd/cores/chipc/bhnd_sprom_chipc.c optional bhnd
	dev/bhnd/cores/pci/bhnd_pci.c optional bhnd pci
	dev/bhnd/cores/pci/bhnd_pci_hostb.c optional bhndb bhnd pci
	dev/bhnd/cores/pci/bhnd_pcib.c optional bhnd_pcib bhnd pci
	dev/bhnd/cores/pcie2/bhnd_pcie2.c optional bhnd pci
	dev/bhnd/cores/pcie2/bhnd_pcie2_hostb.c optional bhndb bhnd pci
	dev/bhnd/cores/pcie2/bhnd_pcie2b.c optional bhnd_pcie2b bhnd pci
	dev/bhnd/nvram/bhnd_nvram_if.m optional bhnd
	dev/bhnd/nvram/bhnd_sprom.c optional bhnd
	dev/bhnd/nvram/bhnd_sprom_subr.c optional bhnd
	dev/bhnd/nvram/nvram_subr.c optional bhnd
	dev/bhnd/siba/siba.c optional siba bhnd
	dev/bhnd/siba/siba_bhndb.c optional siba bhnd bhndb
	dev/bhnd/siba/siba_nexus.c optional siba_nexus siba bhnd
	dev/bhnd/siba/siba_subr.c optional siba bhnd
	#
	dev/bktr/bktr_audio.c optional bktr pci
	dev/bktr/bktr_card.c optional bktr pci
	dev/bktr/bktr_core.c optional bktr pci
	dev/bktr/bktr_i2c.c optional bktr pci smbus
	dev/bktr/bktr_os.c optional bktr pci
	dev/bktr/bktr_tuner.c optional bktr pci
	dev/bktr/msp34xx.c optional bktr pci
	dev/bnxt/bnxt_hwrm.c optional bnxt iflib pci
	dev/bnxt/bnxt_sysctl.c optional bnxt iflib pci
	dev/bnxt/bnxt_txrx.c optional bnxt iflib pci
	dev/bnxt/if_bnxt.c optional bnxt iflib pci
	dev/buslogic/bt.c optional bt
	dev/buslogic/bt_eisa.c optional bt eisa
	dev/buslogic/bt_isa.c optional bt isa
	dev/buslogic/bt_mca.c optional bt mca
	dev/buslogic/bt_pci.c optional bt pci
	dev/bwi/bwimac.c optional bwi
	dev/bwi/bwiphy.c optional bwi
	dev/bwi/bwirf.c optional bwi
	dev/bwi/if_bwi.c optional bwi
	dev/bwi/if_bwi_pci.c optional bwi pci
	# XXX Work around clang warnings in the bwn sources below, until the
	# maintainer approves a fix.
	dev/bwn/if_bwn.c optional bwn siba_bwn \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
	dev/bwn/if_bwn_pci.c optional bwn pci bhnd
	dev/bwn/if_bwn_phy_common.c optional bwn siba_bwn
	dev/bwn/if_bwn_phy_g.c optional bwn siba_bwn \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED} ${NO_WCONSTANT_CONVERSION}"
	dev/bwn/if_bwn_phy_lp.c optional bwn siba_bwn \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
	dev/bwn/if_bwn_phy_n.c optional bwn siba_bwn
	dev/bwn/if_bwn_util.c optional bwn siba_bwn
	dev/bwn/bwn_mac.c optional bwn bhnd
	dev/cardbus/cardbus.c optional cardbus
	dev/cardbus/cardbus_cis.c optional cardbus
	dev/cardbus/cardbus_device.c optional cardbus
	dev/cas/if_cas.c optional cas
	dev/cfi/cfi_bus_fdt.c optional cfi fdt
	dev/cfi/cfi_bus_nexus.c optional cfi
	dev/cfi/cfi_core.c optional cfi
	dev/cfi/cfi_dev.c optional cfi
	dev/cfi/cfi_disk.c optional cfid
	dev/chromebook_platform/chromebook_platform.c optional chromebook_platform
	dev/ciss/ciss.c optional ciss
	dev/cm/smc90cx6.c optional cm
	dev/cmx/cmx.c optional cmx
	dev/cmx/cmx_pccard.c optional cmx pccard
	dev/cpufreq/ichss.c optional cpufreq
	dev/cs/if_cs.c optional cs
	dev/cs/if_cs_isa.c optional cs isa
	dev/cs/if_cs_pccard.c optional cs pccard
	dev/cxgb/cxgb_main.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/cxgb_sge.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_mc5.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_vsc7323.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_vsc8211.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_ael1002.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_aq100x.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_mv88e1xxx.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_xgmac.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_t3_hw.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/common/cxgb_tn1010.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/sys/uipc_mvec.c optional cxgb pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw \
	compile-with "${NORMAL_C} -I$S/dev/cxgb"
	dev/cxgbe/t4_if.m optional cxgbe pci
	dev/cxgbe/t4_iov.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_mp_ring.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_main.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_netmap.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_sched.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_sge.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_l2t.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_tracer.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/t4_vf.c optional cxgbev pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/common/t4_hw.c optional cxgbe pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/common/t4vf_hw.c optional cxgbev pci \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/cudbg_common.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/cudbg_flash_utils.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/cudbg_lib.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/cudbg_wtp.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/fastlz.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	dev/cxgbe/cudbg/fastlz_api.c optional cxgbe \
	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
	t4fw_cfg.c optional cxgbe \
	compile-with "${AWK} -f $S/tools/fw_stub.awk t4fw_cfg.fw:t4fw_cfg t4fw_cfg_uwire.fw:t4fw_cfg_uwire t4fw.fw:t4fw -mt4fw_cfg -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "t4fw_cfg.c"
	t4fw_cfg.fwo optional cxgbe \
	dependency "t4fw_cfg.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw_cfg.fwo"
	t4fw_cfg.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw_cfg.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t4fw_cfg.fw"
	t4fw_cfg_uwire.fwo optional cxgbe \
	dependency "t4fw_cfg_uwire.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw_cfg_uwire.fwo"
	t4fw_cfg_uwire.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw_cfg_uwire.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t4fw_cfg_uwire.fw"
	t4fw.fwo optional cxgbe \
	dependency "t4fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t4fw.fwo"
	t4fw.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t4fw-1.16.63.0.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "t4fw.fw"
	t5fw_cfg.c optional cxgbe \
	compile-with "${AWK} -f $S/tools/fw_stub.awk t5fw_cfg.fw:t5fw_cfg t5fw_cfg_uwire.fw:t5fw_cfg_uwire t5fw.fw:t5fw -mt5fw_cfg -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "t5fw_cfg.c"
	t5fw_cfg.fwo optional cxgbe \
	dependency "t5fw_cfg.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t5fw_cfg.fwo"
	t5fw_cfg.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t5fw_cfg.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t5fw_cfg.fw"
	t5fw_cfg_uwire.fwo optional cxgbe \
	dependency "t5fw_cfg_uwire.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t5fw_cfg_uwire.fwo"
	t5fw_cfg_uwire.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t5fw_cfg_uwire.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t5fw_cfg_uwire.fw"
	t5fw.fwo optional cxgbe \
	dependency "t5fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t5fw.fwo"
	t5fw.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t5fw-1.16.63.0.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "t5fw.fw"
	t6fw_cfg.c optional cxgbe \
	compile-with "${AWK} -f $S/tools/fw_stub.awk t6fw_cfg.fw:t6fw_cfg t6fw_cfg_uwire.fw:t6fw_cfg_uwire t6fw.fw:t6fw -mt6fw_cfg -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "t6fw_cfg.c"
	t6fw_cfg.fwo optional cxgbe \
	dependency "t6fw_cfg.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t6fw_cfg.fwo"
	t6fw_cfg.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t6fw_cfg.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t6fw_cfg.fw"
	t6fw_cfg_uwire.fwo optional cxgbe \
	dependency "t6fw_cfg_uwire.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t6fw_cfg_uwire.fwo"
	t6fw_cfg_uwire.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t6fw_cfg_uwire.txt" \
	compile-with "${CP} ${.ALLSRC} ${.TARGET}" \
	no-obj no-implicit-rule \
	clean "t6fw_cfg_uwire.fw"
	t6fw.fwo optional cxgbe \
	dependency "t6fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "t6fw.fwo"
	t6fw.fw optional cxgbe \
	dependency "$S/dev/cxgbe/firmware/t6fw-1.16.63.0.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "t6fw.fw"
	dev/cy/cy.c optional cy
	dev/cy/cy_isa.c optional cy isa
	dev/cy/cy_pci.c optional cy pci
	dev/cyapa/cyapa.c optional cyapa iicbus
	dev/dc/if_dc.c optional dc pci
	dev/dc/dcphy.c optional dc pci
	dev/dc/pnphy.c optional dc pci
	dev/dcons/dcons.c optional dcons
	dev/dcons/dcons_crom.c optional dcons_crom
	dev/dcons/dcons_os.c optional dcons
	dev/de/if_de.c optional de pci
	dev/digi/CX.c optional digi_CX
	dev/digi/CX_PCI.c optional digi_CX_PCI
	dev/digi/EPCX.c optional digi_EPCX
	dev/digi/EPCX_PCI.c optional digi_EPCX_PCI
	dev/digi/Xe.c optional digi_Xe
	dev/digi/Xem.c optional digi_Xem
	dev/digi/Xr.c optional digi_Xr
	dev/digi/digi.c optional digi
	dev/digi/digi_isa.c optional digi isa
	dev/digi/digi_pci.c optional digi pci
	dev/dpt/dpt_eisa.c optional dpt eisa
	dev/dpt/dpt_pci.c optional dpt pci
	dev/dpt/dpt_scsi.c optional dpt
	dev/drm/ati_pcigart.c optional drm
	dev/drm/drm_agpsupport.c optional drm
	dev/drm/drm_auth.c optional drm
	dev/drm/drm_bufs.c optional drm
	dev/drm/drm_context.c optional drm
	dev/drm/drm_dma.c optional drm
	dev/drm/drm_drawable.c optional drm
	dev/drm/drm_drv.c optional drm
	dev/drm/drm_fops.c optional drm
	dev/drm/drm_hashtab.c optional drm
	dev/drm/drm_ioctl.c optional drm
	dev/drm/drm_irq.c optional drm
	dev/drm/drm_lock.c optional drm
	dev/drm/drm_memory.c optional drm
	dev/drm/drm_mm.c optional drm
	dev/drm/drm_pci.c optional drm
	dev/drm/drm_scatter.c optional drm
	dev/drm/drm_sman.c optional drm
	dev/drm/drm_sysctl.c optional drm
	dev/drm/drm_vm.c optional drm
	dev/drm/i915_dma.c optional i915drm
	dev/drm/i915_drv.c optional i915drm
	dev/drm/i915_irq.c optional i915drm
	dev/drm/i915_mem.c optional i915drm
	dev/drm/i915_suspend.c optional i915drm
	dev/drm/mach64_dma.c optional mach64drm
	dev/drm/mach64_drv.c optional mach64drm
	dev/drm/mach64_irq.c optional mach64drm
	dev/drm/mach64_state.c optional mach64drm
	dev/drm/mga_dma.c optional mgadrm
	dev/drm/mga_drv.c optional mgadrm
	dev/drm/mga_irq.c optional mgadrm
	dev/drm/mga_state.c optional mgadrm
	dev/drm/mga_warp.c optional mgadrm
	dev/drm/r128_cce.c optional r128drm \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/drm/r128_drv.c optional r128drm
	dev/drm/r128_irq.c optional r128drm
	dev/drm/r128_state.c optional r128drm
	dev/drm/r300_cmdbuf.c optional radeondrm
	dev/drm/r600_blit.c optional radeondrm
	dev/drm/r600_cp.c optional radeondrm \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/drm/radeon_cp.c optional radeondrm \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/drm/radeon_cs.c optional radeondrm
	dev/drm/radeon_drv.c optional radeondrm
	dev/drm/radeon_irq.c optional radeondrm
	dev/drm/radeon_mem.c optional radeondrm
	dev/drm/radeon_state.c optional radeondrm
	dev/drm/savage_bci.c optional savagedrm
	dev/drm/savage_drv.c optional savagedrm
	dev/drm/savage_state.c optional savagedrm
	dev/drm/sis_drv.c optional sisdrm
	dev/drm/sis_ds.c optional sisdrm
	dev/drm/sis_mm.c optional sisdrm
	dev/drm/tdfx_drv.c optional tdfxdrm
	dev/drm/via_dma.c optional viadrm
	dev/drm/via_dmablit.c optional viadrm
	dev/drm/via_drv.c optional viadrm
	dev/drm/via_irq.c optional viadrm
	dev/drm/via_map.c optional viadrm
	dev/drm/via_mm.c optional viadrm
	dev/drm/via_verifier.c optional viadrm
	dev/drm/via_video.c optional viadrm
	dev/drm2/drm_agpsupport.c optional drm2
	dev/drm2/drm_auth.c optional drm2
	dev/drm2/drm_bufs.c optional drm2
	dev/drm2/drm_buffer.c optional drm2
	dev/drm2/drm_context.c optional drm2
	dev/drm2/drm_crtc.c optional drm2
	dev/drm2/drm_crtc_helper.c optional drm2
	dev/drm2/drm_dma.c optional drm2
	dev/drm2/drm_dp_helper.c optional drm2
	dev/drm2/drm_dp_iic_helper.c optional drm2
	dev/drm2/drm_drv.c optional drm2
	dev/drm2/drm_edid.c optional drm2
	dev/drm2/drm_fb_helper.c optional drm2
	dev/drm2/drm_fops.c optional drm2
	dev/drm2/drm_gem.c optional drm2
	dev/drm2/drm_gem_names.c optional drm2
	dev/drm2/drm_global.c optional drm2
	dev/drm2/drm_hashtab.c optional drm2
	dev/drm2/drm_ioctl.c optional drm2
	dev/drm2/drm_irq.c optional drm2
	dev/drm2/drm_linux_list_sort.c optional drm2
	dev/drm2/drm_lock.c optional drm2
	dev/drm2/drm_memory.c optional drm2
	dev/drm2/drm_mm.c optional drm2
	dev/drm2/drm_modes.c optional drm2
	dev/drm2/drm_pci.c optional drm2
	dev/drm2/drm_platform.c optional drm2
	dev/drm2/drm_scatter.c optional drm2
	dev/drm2/drm_stub.c optional drm2
	dev/drm2/drm_sysctl.c optional drm2
	dev/drm2/drm_vm.c optional drm2
	dev/drm2/drm_os_freebsd.c optional drm2
	dev/drm2/ttm/ttm_agp_backend.c optional drm2
	dev/drm2/ttm/ttm_lock.c optional drm2
	dev/drm2/ttm/ttm_object.c optional drm2
	dev/drm2/ttm/ttm_tt.c optional drm2
	dev/drm2/ttm/ttm_bo_util.c optional drm2
	dev/drm2/ttm/ttm_bo.c optional drm2
	dev/drm2/ttm/ttm_bo_manager.c optional drm2
	dev/drm2/ttm/ttm_execbuf_util.c optional drm2
	dev/drm2/ttm/ttm_memory.c optional drm2
	dev/drm2/ttm/ttm_page_alloc.c optional drm2
	dev/drm2/ttm/ttm_bo_vm.c optional drm2
	dev/drm2/ati_pcigart.c optional drm2 agp pci
	dev/ed/if_ed.c optional ed
	dev/ed/if_ed_novell.c optional ed
	dev/ed/if_ed_rtl80x9.c optional ed
	dev/ed/if_ed_pccard.c optional ed pccard
	dev/ed/if_ed_pci.c optional ed pci
	dev/efidev/efidev.c optional efirt
	dev/efidev/efirt.c optional efirt
	dev/efidev/efirtc.c optional efirt
	dev/eisa/eisa_if.m standard
	dev/eisa/eisaconf.c optional eisa
	dev/e1000/if_em.c optional em \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/if_lem.c optional em \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/if_igb.c optional igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_80003es2lan.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82540.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82541.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82542.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82543.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82571.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_82575.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_ich8lan.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_i210.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_api.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_mac.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_manage.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_nvm.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_phy.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_vf.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_mbx.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/e1000/e1000_osdep.c optional em \| igb \
	compile-with "${NORMAL_C} -I$S/dev/e1000"
	dev/et/if_et.c optional et
	dev/ena/ena.c optional ena \
	compile-with "${NORMAL_C} -I$S/contrib"
	dev/ena/ena_sysctl.c optional ena \
	compile-with "${NORMAL_C} -I$S/contrib"
	contrib/ena-com/ena_com.c optional ena
	contrib/ena-com/ena_eth_com.c optional ena
	dev/en/if_en_pci.c optional en pci
	dev/en/midway.c optional en
	dev/ep/if_ep.c optional ep
	dev/ep/if_ep_eisa.c optional ep eisa
	dev/ep/if_ep_isa.c optional ep isa
	dev/ep/if_ep_mca.c optional ep mca
	dev/ep/if_ep_pccard.c optional ep pccard
	dev/esp/esp_pci.c optional esp pci
	dev/esp/ncr53c9x.c optional esp
	dev/etherswitch/arswitch/arswitch.c optional arswitch
	dev/etherswitch/arswitch/arswitch_reg.c optional arswitch
	dev/etherswitch/arswitch/arswitch_phy.c optional arswitch
	dev/etherswitch/arswitch/arswitch_8216.c optional arswitch
	dev/etherswitch/arswitch/arswitch_8226.c optional arswitch
	dev/etherswitch/arswitch/arswitch_8316.c optional arswitch
	dev/etherswitch/arswitch/arswitch_8327.c optional arswitch
	dev/etherswitch/arswitch/arswitch_7240.c optional arswitch
	dev/etherswitch/arswitch/arswitch_9340.c optional arswitch
	dev/etherswitch/arswitch/arswitch_vlans.c optional arswitch
	dev/etherswitch/etherswitch.c optional etherswitch
	dev/etherswitch/etherswitch_if.m optional etherswitch
	dev/etherswitch/ip17x/ip17x.c optional ip17x
	dev/etherswitch/ip17x/ip175c.c optional ip17x
	dev/etherswitch/ip17x/ip175d.c optional ip17x
	dev/etherswitch/ip17x/ip17x_phy.c optional ip17x
	dev/etherswitch/ip17x/ip17x_vlans.c optional ip17x
	dev/etherswitch/miiproxy.c optional miiproxy
	dev/etherswitch/rtl8366/rtl8366rb.c optional rtl8366rb
	dev/etherswitch/ukswitch/ukswitch.c optional ukswitch
	dev/evdev/cdev.c optional evdev
	dev/evdev/evdev.c optional evdev
	dev/evdev/evdev_mt.c optional evdev
	dev/evdev/evdev_utils.c optional evdev
	dev/evdev/uinput.c optional evdev uinput
	dev/ex/if_ex.c optional ex
	dev/ex/if_ex_isa.c optional ex isa
	dev/ex/if_ex_pccard.c optional ex pccard
	dev/exca/exca.c optional cbb
	dev/extres/clk/clk.c optional ext_resources clk
	dev/extres/clk/clkdev_if.m optional ext_resources clk
	dev/extres/clk/clknode_if.m optional ext_resources clk
	dev/extres/clk/clk_bus.c optional ext_resources clk fdt
	dev/extres/clk/clk_div.c optional ext_resources clk
	dev/extres/clk/clk_fixed.c optional ext_resources clk
	dev/extres/clk/clk_gate.c optional ext_resources clk
	dev/extres/clk/clk_mux.c optional ext_resources clk
	dev/extres/phy/phy.c optional ext_resources phy
	dev/extres/phy/phydev_if.m optional ext_resources phy
	dev/extres/phy/phynode_if.m optional ext_resources phy
	dev/extres/hwreset/hwreset.c optional ext_resources hwreset
	dev/extres/hwreset/hwreset_if.m optional ext_resources hwreset
	dev/extres/regulator/regdev_if.m optional ext_resources regulator
	dev/extres/regulator/regnode_if.m optional ext_resources regulator
	dev/extres/regulator/regulator.c optional ext_resources regulator
	dev/extres/regulator/regulator_bus.c optional ext_resources regulator fdt
	dev/extres/regulator/regulator_fixed.c optional ext_resources regulator
	dev/fatm/if_fatm.c optional fatm pci
	dev/fb/fbd.c optional fbd \| vt
	dev/fb/fb_if.m standard
	dev/fb/splash.c optional sc splash
	dev/fdt/fdt_clock.c optional fdt fdt_clock
	dev/fdt/fdt_clock_if.m optional fdt fdt_clock
	dev/fdt/fdt_common.c optional fdt
	dev/fdt/fdt_pinctrl.c optional fdt fdt_pinctrl
	dev/fdt/fdt_pinctrl_if.m optional fdt fdt_pinctrl
	dev/fdt/fdt_slicer.c optional fdt cfi \| fdt nand \| fdt mx25l
	dev/fdt/fdt_static_dtb.S optional fdt fdt_dtb_static \
	dependency "fdt_dtb_file"
	dev/fdt/simplebus.c optional fdt
	dev/fe/if_fe.c optional fe
	dev/fe/if_fe_pccard.c optional fe pccard
	dev/filemon/filemon.c optional filemon
	dev/firewire/firewire.c optional firewire
	dev/firewire/fwcrom.c optional firewire
	dev/firewire/fwdev.c optional firewire
	dev/firewire/fwdma.c optional firewire
	dev/firewire/fwmem.c optional firewire
	dev/firewire/fwohci.c optional firewire
	dev/firewire/fwohci_pci.c optional firewire pci
	dev/firewire/if_fwe.c optional fwe
	dev/firewire/if_fwip.c optional fwip
	dev/firewire/sbp.c optional sbp
	dev/firewire/sbp_targ.c optional sbp_targ
	dev/flash/at45d.c optional at45d
	dev/flash/mx25l.c optional mx25l
	dev/fxp/if_fxp.c optional fxp
	dev/fxp/inphy.c optional fxp
	dev/gem/if_gem.c optional gem
	dev/gem/if_gem_pci.c optional gem pci
	dev/gem/if_gem_sbus.c optional gem sbus
	dev/gpio/gpiobacklight.c optional gpiobacklight fdt
	dev/gpio/gpiokeys.c optional gpiokeys fdt
	dev/gpio/gpiokeys_codes.c optional gpiokeys fdt
	dev/gpio/gpiobus.c optional gpio \
	dependency "gpiobus_if.h"
	dev/gpio/gpioc.c optional gpio \
	dependency "gpio_if.h"
	dev/gpio/gpioiic.c optional gpioiic
	dev/gpio/gpioled.c optional gpioled !fdt
	dev/gpio/gpioled_fdt.c optional gpioled fdt
	dev/gpio/gpiospi.c optional gpiospi
	dev/gpio/gpio_if.m optional gpio
	dev/gpio/gpiobus_if.m optional gpio
	dev/gpio/gpiopps.c optional gpiopps
	dev/gpio/ofw_gpiobus.c optional fdt gpio
	dev/hatm/if_hatm.c optional hatm pci
	dev/hatm/if_hatm_intr.c optional hatm pci
	dev/hatm/if_hatm_ioctl.c optional hatm pci
	dev/hatm/if_hatm_rx.c optional hatm pci
	dev/hatm/if_hatm_tx.c optional hatm pci
	dev/hifn/hifn7751.c optional hifn
	dev/hme/if_hme.c optional hme
	dev/hme/if_hme_pci.c optional hme pci
	dev/hme/if_hme_sbus.c optional hme sbus
	dev/hptiop/hptiop.c optional hptiop scbus
	dev/hwpmc/hwpmc_logging.c optional hwpmc
	dev/hwpmc/hwpmc_mod.c optional hwpmc
	dev/hwpmc/hwpmc_soft.c optional hwpmc
	dev/ichiic/ig4_acpi.c optional ig4 acpi iicbus
	dev/ichiic/ig4_iic.c optional ig4 iicbus
	dev/ichiic/ig4_pci.c optional ig4 pci iicbus
	dev/ichsmb/ichsmb.c optional ichsmb
	dev/ichsmb/ichsmb_pci.c optional ichsmb pci
	dev/ida/ida.c optional ida
	dev/ida/ida_disk.c optional ida
	dev/ida/ida_eisa.c optional ida eisa
	dev/ida/ida_pci.c optional ida pci
	dev/ie/if_ie.c optional ie isa nowerror
	dev/ie/if_ie_isa.c optional ie isa
	dev/iicbus/ad7418.c optional ad7418
	dev/iicbus/ds1307.c optional ds1307
	dev/iicbus/ds13rtc.c optional ds13rtc \| ds133x \| ds1374
	dev/iicbus/ds1672.c optional ds1672
	dev/iicbus/ds3231.c optional ds3231
	dev/iicbus/rtc8583.c optional rtc8583
	dev/iicbus/icee.c optional icee
	dev/iicbus/if_ic.c optional ic
	dev/iicbus/iic.c optional iic
	dev/iicbus/iic_recover_bus.c optional iicbus
	dev/iicbus/iicbb.c optional iicbb
	dev/iicbus/iicbb_if.m optional iicbb
	dev/iicbus/iicbus.c optional iicbus
	dev/iicbus/iicbus_if.m optional iicbus
	dev/iicbus/iiconf.c optional iicbus
	dev/iicbus/iicsmb.c optional iicsmb \
	dependency "iicbus_if.h"
	dev/iicbus/iicoc.c optional iicoc
	dev/iicbus/isl12xx.c optional isl12xx
	dev/iicbus/lm75.c optional lm75
	dev/iicbus/nxprtc.c optional nxprtc \| pcf8563
	dev/iicbus/ofw_iicbus.c optional fdt iicbus
	dev/iicbus/s35390a.c optional s35390a
	dev/iir/iir.c optional iir
	dev/iir/iir_ctrl.c optional iir
	dev/iir/iir_pci.c optional iir pci
	dev/intpm/intpm.c optional intpm pci
	# XXX Work around clang warning, until maintainer approves fix.
	dev/ips/ips.c optional ips \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
	dev/ips/ips_commands.c optional ips
	dev/ips/ips_disk.c optional ips
	dev/ips/ips_ioctl.c optional ips
	dev/ips/ips_pci.c optional ips pci
	dev/ipw/if_ipw.c optional ipw
	ipwbssfw.c optional ipwbssfw \| ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_bss.fw:ipw_bss:130 -lintel_ipw -mipw_bss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwbssfw.c"
	ipw_bss.fwo optional ipwbssfw \| ipwfw \
	dependency "ipw_bss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_bss.fwo"
	ipw_bss.fw optional ipwbssfw \| ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_bss.fw"
	ipwibssfw.c optional ipwibssfw \| ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_ibss.fw:ipw_ibss:130 -lintel_ipw -mipw_ibss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwibssfw.c"
	ipw_ibss.fwo optional ipwibssfw \| ipwfw \
	dependency "ipw_ibss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_ibss.fwo"
	ipw_ibss.fw optional ipwibssfw \| ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3-i.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_ibss.fw"
	ipwmonitorfw.c optional ipwmonitorfw \| ipwfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk ipw_monitor.fw:ipw_monitor:130 -lintel_ipw -mipw_monitor -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "ipwmonitorfw.c"
	ipw_monitor.fwo optional ipwmonitorfw \| ipwfw \
	dependency "ipw_monitor.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "ipw_monitor.fwo"
	ipw_monitor.fw optional ipwmonitorfw \| ipwfw \
	dependency "$S/contrib/dev/ipw/ipw2100-1.3-p.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "ipw_monitor.fw"
	dev/iscsi/icl.c optional iscsi
	dev/iscsi/icl_conn_if.m optional cfiscsi \| iscsi
	dev/iscsi/icl_soft.c optional iscsi
	dev/iscsi/icl_soft_proxy.c optional iscsi
	dev/iscsi/iscsi.c optional iscsi scbus
	dev/iscsi_initiator/iscsi.c optional iscsi_initiator scbus
	dev/iscsi_initiator/iscsi_subr.c optional iscsi_initiator scbus
	dev/iscsi_initiator/isc_cam.c optional iscsi_initiator scbus
	dev/iscsi_initiator/isc_soc.c optional iscsi_initiator scbus
	dev/iscsi_initiator/isc_sm.c optional iscsi_initiator scbus
	dev/iscsi_initiator/isc_subr.c optional iscsi_initiator scbus
	dev/ismt/ismt.c optional ismt
	dev/isl/isl.c optional isl iicbus
	dev/isp/isp.c optional isp
	dev/isp/isp_freebsd.c optional isp
	dev/isp/isp_library.c optional isp
	dev/isp/isp_pci.c optional isp pci
	dev/isp/isp_sbus.c optional isp sbus
	dev/isp/isp_target.c optional isp
	dev/ispfw/ispfw.c optional ispfw
	dev/iwi/if_iwi.c optional iwi
	iwibssfw.c optional iwibssfw \| iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_bss.fw:iwi_bss:300 -lintel_iwi -miwi_bss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwibssfw.c"
	iwi_bss.fwo optional iwibssfw \| iwifw \
	dependency "iwi_bss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_bss.fwo"
	iwi_bss.fw optional iwibssfw \| iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-bss.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_bss.fw"
	iwiibssfw.c optional iwiibssfw \| iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_ibss.fw:iwi_ibss:300 -lintel_iwi -miwi_ibss -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwiibssfw.c"
	iwi_ibss.fwo optional iwiibssfw \| iwifw \
	dependency "iwi_ibss.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_ibss.fwo"
	iwi_ibss.fw optional iwiibssfw \| iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-ibss.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_ibss.fw"
	iwimonitorfw.c optional iwimonitorfw \| iwifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwi_monitor.fw:iwi_monitor:300 -lintel_iwi -miwi_monitor -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwimonitorfw.c"
	iwi_monitor.fwo optional iwimonitorfw \| iwifw \
	dependency "iwi_monitor.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwi_monitor.fwo"
	iwi_monitor.fw optional iwimonitorfw \| iwifw \
	dependency "$S/contrib/dev/iwi/ipw2200-sniffer.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwi_monitor.fw"
	dev/iwm/if_iwm.c optional iwm
	dev/iwm/if_iwm_7000.c optional iwm
	dev/iwm/if_iwm_8000.c optional iwm
	dev/iwm/if_iwm_binding.c optional iwm
	dev/iwm/if_iwm_fw.c optional iwm
	dev/iwm/if_iwm_led.c optional iwm
	dev/iwm/if_iwm_mac_ctxt.c optional iwm
	dev/iwm/if_iwm_notif_wait.c optional iwm
	dev/iwm/if_iwm_pcie_trans.c optional iwm
	dev/iwm/if_iwm_phy_ctxt.c optional iwm
	dev/iwm/if_iwm_phy_db.c optional iwm
	dev/iwm/if_iwm_power.c optional iwm
	dev/iwm/if_iwm_scan.c optional iwm
	dev/iwm/if_iwm_sta.c optional iwm
	dev/iwm/if_iwm_time_event.c optional iwm
	dev/iwm/if_iwm_util.c optional iwm
	iwm3160fw.c optional iwm3160fw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm3160.fw:iwm3160fw -miwm3160fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm3160fw.c"
	iwm3160fw.fwo optional iwm3160fw \| iwmfw \
	dependency "iwm3160.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm3160fw.fwo"
	iwm3160.fw optional iwm3160fw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-3160-17.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm3160.fw"
	iwm7260fw.c optional iwm7260fw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7260.fw:iwm7260fw -miwm7260fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm7260fw.c"
	iwm7260fw.fwo optional iwm7260fw \| iwmfw \
	dependency "iwm7260.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm7260fw.fwo"
	iwm7260.fw optional iwm7260fw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-7260-17.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm7260.fw"
	iwm7265fw.c optional iwm7265fw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7265.fw:iwm7265fw -miwm7265fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm7265fw.c"
	iwm7265fw.fwo optional iwm7265fw \| iwmfw \
	dependency "iwm7265.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm7265fw.fwo"
	iwm7265.fw optional iwm7265fw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-7265-17.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm7265.fw"
	iwm7265Dfw.c optional iwm7265Dfw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm7265D.fw:iwm7265Dfw -miwm7265Dfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm7265Dfw.c"
	iwm7265Dfw.fwo optional iwm7265Dfw \| iwmfw \
	dependency "iwm7265D.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm7265Dfw.fwo"
	iwm7265D.fw optional iwm7265Dfw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-7265D-17.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm7265D.fw"
	iwm8000Cfw.c optional iwm8000Cfw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm8000C.fw:iwm8000Cfw -miwm8000Cfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm8000Cfw.c"
	iwm8000Cfw.fwo optional iwm8000Cfw \| iwmfw \
	dependency "iwm8000C.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm8000Cfw.fwo"
	iwm8000C.fw optional iwm8000Cfw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-8000C-17.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm8000C.fw"
	iwm8265.fw optional iwm8265fw \| iwmfw \
	dependency "$S/contrib/dev/iwm/iwm-8265-22.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwm8265.fw"
	iwm8265fw.c optional iwm8265fw \| iwmfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwm8265.fw:iwm8265fw -miwm8265fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwm8265fw.c"
	iwm8265fw.fwo optional iwm8265fw \| iwmfw \
	dependency "iwm8265.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwm8265fw.fwo"
	dev/iwn/if_iwn.c optional iwn
	iwn1000fw.c optional iwn1000fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn1000.fw:iwn1000fw -miwn1000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn1000fw.c"
	iwn1000fw.fwo optional iwn1000fw \| iwnfw \
	dependency "iwn1000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn1000fw.fwo"
	iwn1000.fw optional iwn1000fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-1000-39.31.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn1000.fw"
	iwn100fw.c optional iwn100fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn100.fw:iwn100fw -miwn100fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn100fw.c"
	iwn100fw.fwo optional iwn100fw \| iwnfw \
	dependency "iwn100.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn100fw.fwo"
	iwn100.fw optional iwn100fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-100-39.31.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn100.fw"
	iwn105fw.c optional iwn105fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn105.fw:iwn105fw -miwn105fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn105fw.c"
	iwn105fw.fwo optional iwn105fw \| iwnfw \
	dependency "iwn105.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn105fw.fwo"
	iwn105.fw optional iwn105fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-105-6-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn105.fw"
	iwn135fw.c optional iwn135fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn135.fw:iwn135fw -miwn135fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn135fw.c"
	iwn135fw.fwo optional iwn135fw \| iwnfw \
	dependency "iwn135.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn135fw.fwo"
	iwn135.fw optional iwn135fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-135-6-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn135.fw"
	iwn2000fw.c optional iwn2000fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn2000.fw:iwn2000fw -miwn2000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn2000fw.c"
	iwn2000fw.fwo optional iwn2000fw \| iwnfw \
	dependency "iwn2000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn2000fw.fwo"
	iwn2000.fw optional iwn2000fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-2000-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn2000.fw"
	iwn2030fw.c optional iwn2030fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn2030.fw:iwn2030fw -miwn2030fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn2030fw.c"
	iwn2030fw.fwo optional iwn2030fw \| iwnfw \
	dependency "iwn2030.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn2030fw.fwo"
	iwn2030.fw optional iwn2030fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwnwifi-2030-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn2030.fw"
	iwn4965fw.c optional iwn4965fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn4965.fw:iwn4965fw -miwn4965fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn4965fw.c"
	iwn4965fw.fwo optional iwn4965fw \| iwnfw \
	dependency "iwn4965.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn4965fw.fwo"
	iwn4965.fw optional iwn4965fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-4965-228.61.2.24.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn4965.fw"
	iwn5000fw.c optional iwn5000fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5000.fw:iwn5000fw -miwn5000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn5000fw.c"
	iwn5000fw.fwo optional iwn5000fw \| iwnfw \
	dependency "iwn5000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn5000fw.fwo"
	iwn5000.fw optional iwn5000fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-5000-8.83.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn5000.fw"
	iwn5150fw.c optional iwn5150fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn5150.fw:iwn5150fw -miwn5150fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn5150fw.c"
	iwn5150fw.fwo optional iwn5150fw \| iwnfw \
	dependency "iwn5150.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn5150fw.fwo"
	iwn5150.fw optional iwn5150fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-5150-8.24.2.2.fw.uu"\
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn5150.fw"
	iwn6000fw.c optional iwn6000fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000.fw:iwn6000fw -miwn6000fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000fw.c"
	iwn6000fw.fwo optional iwn6000fw \| iwnfw \
	dependency "iwn6000.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000fw.fwo"
	iwn6000.fw optional iwn6000fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000-9.221.4.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000.fw"
	iwn6000g2afw.c optional iwn6000g2afw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2a.fw:iwn6000g2afw -miwn6000g2afw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000g2afw.c"
	iwn6000g2afw.fwo optional iwn6000g2afw \| iwnfw \
	dependency "iwn6000g2a.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000g2afw.fwo"
	iwn6000g2a.fw optional iwn6000g2afw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000g2a-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000g2a.fw"
	iwn6000g2bfw.c optional iwn6000g2bfw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6000g2b.fw:iwn6000g2bfw -miwn6000g2bfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6000g2bfw.c"
	iwn6000g2bfw.fwo optional iwn6000g2bfw \| iwnfw \
	dependency "iwn6000g2b.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6000g2bfw.fwo"
	iwn6000g2b.fw optional iwn6000g2bfw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6000g2b-18.168.6.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6000g2b.fw"
	iwn6050fw.c optional iwn6050fw \| iwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk iwn6050.fw:iwn6050fw -miwn6050fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "iwn6050fw.c"
	iwn6050fw.fwo optional iwn6050fw \| iwnfw \
	dependency "iwn6050.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "iwn6050fw.fwo"
	iwn6050.fw optional iwn6050fw \| iwnfw \
	dependency "$S/contrib/dev/iwn/iwlwifi-6050-41.28.5.1.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "iwn6050.fw"
	dev/ixgb/if_ixgb.c optional ixgb
	dev/ixgb/ixgb_ee.c optional ixgb
	dev/ixgb/ixgb_hw.c optional ixgb
	dev/ixgbe/if_ix.c optional ix inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe -DSMP"
	dev/ixgbe/if_ixv.c optional ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe -DSMP"
	dev/ixgbe/if_bypass.c optional ix inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_netmap.c optional ix inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/if_fdir.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/if_sriov.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ix_txrx.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_osdep.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_phy.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_api.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_common.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_mbx.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_vf.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_82598.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_82599.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_x540.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_x550.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_dcb.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_dcb_82598.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/ixgbe/ixgbe_dcb_82599.c optional ix inet \| ixv inet \
	compile-with "${NORMAL_C} -I$S/dev/ixgbe"
	dev/jedec_dimm/jedec_dimm.c optional jedec_dimm smbus
	dev/jedec_ts/jedec_ts.c optional jedec_ts smbus
	dev/jme/if_jme.c optional jme pci
	dev/joy/joy.c optional joy
	dev/joy/joy_isa.c optional joy isa
	dev/kbd/kbd.c optional atkbd \| pckbd \| sc \| ukbd \| vt
	dev/kbdmux/kbdmux.c optional kbdmux
	dev/ksyms/ksyms.c optional ksyms
	dev/le/am7990.c optional le
	dev/le/am79900.c optional le
	dev/le/if_le_pci.c optional le pci
	dev/le/lance.c optional le
	dev/led/led.c standard
	dev/lge/if_lge.c optional lge
	dev/liquidio/base/cn23xx_pf_device.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_console.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_ctrl.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_device.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_droq.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_mem_ops.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_request_manager.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/base/lio_response_manager.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_core.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_ioctl.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_main.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_rss.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_rxtx.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	dev/liquidio/lio_sysctl.c optional lio \
	compile-with "${NORMAL_C} \
	-I$S/dev/liquidio -I$S/dev/liquidio/base -DSMP"
	lio.c optional lio \
	compile-with "${AWK} -f $S/tools/fw_stub.awk lio_23xx_nic.bin.fw:lio_23xx_nic.bin -mlio_23xx_nic.bin -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "lio.c"
	lio_23xx_nic.bin.fw.fwo optional lio \
	dependency "lio_23xx_nic.bin.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "lio_23xx_nic.bin.fw.fwo"
	lio_23xx_nic.bin.fw optional lio \
	dependency "$S/contrib/dev/liquidio/lio_23xx_nic.bin.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "lio_23xx_nic.bin.fw"
	dev/lmc/if_lmc.c optional lmc
	dev/malo/if_malo.c optional malo
	dev/malo/if_malohal.c optional malo
	dev/malo/if_malo_pci.c optional malo pci
	dev/mc146818/mc146818.c optional mc146818
	dev/mca/mca_bus.c optional mca
	dev/mcd/mcd.c optional mcd isa nowerror
	dev/mcd/mcd_isa.c optional mcd isa nowerror
	dev/md/md.c optional md
	dev/mdio/mdio_if.m optional miiproxy \| mdio
	dev/mdio/mdio.c optional miiproxy \| mdio
	dev/mem/memdev.c optional mem
	dev/mem/memutil.c optional mem
	dev/mfi/mfi.c optional mfi
	dev/mfi/mfi_debug.c optional mfi
	dev/mfi/mfi_pci.c optional mfi pci
	dev/mfi/mfi_disk.c optional mfi
	dev/mfi/mfi_syspd.c optional mfi
	dev/mfi/mfi_tbolt.c optional mfi
	dev/mfi/mfi_linux.c optional mfi compat_linux
	dev/mfi/mfi_cam.c optional mfip scbus
	dev/mii/acphy.c optional miibus \| acphy
	dev/mii/amphy.c optional miibus \| amphy
	dev/mii/atphy.c optional miibus \| atphy
	dev/mii/axphy.c optional miibus \| axphy
	dev/mii/bmtphy.c optional miibus \| bmtphy
	dev/mii/brgphy.c optional miibus \| brgphy
	dev/mii/ciphy.c optional miibus \| ciphy
	dev/mii/e1000phy.c optional miibus \| e1000phy
	dev/mii/gentbi.c optional miibus \| gentbi
	dev/mii/icsphy.c optional miibus \| icsphy
	dev/mii/ip1000phy.c optional miibus \| ip1000phy
	dev/mii/jmphy.c optional miibus \| jmphy
	dev/mii/lxtphy.c optional miibus \| lxtphy
	dev/mii/micphy.c optional miibus fdt \| micphy fdt
	dev/mii/mii.c optional miibus \| mii
	dev/mii/mii_bitbang.c optional miibus \| mii_bitbang
	dev/mii/mii_physubr.c optional miibus \| mii
	dev/mii/mii_fdt.c optional miibus fdt \| mii fdt
	dev/mii/miibus_if.m optional miibus \| mii
	dev/mii/mlphy.c optional miibus \| mlphy
	dev/mii/nsgphy.c optional miibus \| nsgphy
	dev/mii/nsphy.c optional miibus \| nsphy
	dev/mii/nsphyter.c optional miibus \| nsphyter
	dev/mii/pnaphy.c optional miibus \| pnaphy
	dev/mii/qsphy.c optional miibus \| qsphy
	dev/mii/rdcphy.c optional miibus \| rdcphy
	dev/mii/rgephy.c optional miibus \| rgephy
	dev/mii/rlphy.c optional miibus \| rlphy
	dev/mii/rlswitch.c optional rlswitch
	dev/mii/smcphy.c optional miibus \| smcphy
	dev/mii/smscphy.c optional miibus \| smscphy
	dev/mii/tdkphy.c optional miibus \| tdkphy
	dev/mii/tlphy.c optional miibus \| tlphy
	dev/mii/truephy.c optional miibus \| truephy
	dev/mii/ukphy.c optional miibus \| mii
	dev/mii/ukphy_subr.c optional miibus \| mii
	dev/mii/vscphy.c optional miibus \| vscphy
	dev/mii/xmphy.c optional miibus \| xmphy
	dev/mk48txx/mk48txx.c optional mk48txx
	dev/mlx/mlx.c optional mlx
	dev/mlx/mlx_disk.c optional mlx
	dev/mlx/mlx_pci.c optional mlx pci
	dev/mly/mly.c optional mly
	dev/mmc/mmc_subr.c optional mmc \| mmcsd
	dev/mmc/mmc.c optional mmc
	dev/mmc/mmcbr_if.m standard
	dev/mmc/mmcbus_if.m standard
	dev/mmc/mmcsd.c optional mmcsd
	dev/mn/if_mn.c optional mn pci
	dev/mpr/mpr.c optional mpr
	dev/mpr/mpr_config.c optional mpr
	# XXX Work around clang warning, until maintainer approves fix.
	dev/mpr/mpr_mapping.c optional mpr \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
	dev/mpr/mpr_pci.c optional mpr pci
	dev/mpr/mpr_sas.c optional mpr \
	compile-with "${NORMAL_C} ${NO_WUNNEEDED_INTERNAL_DECL}"
	dev/mpr/mpr_sas_lsi.c optional mpr
	dev/mpr/mpr_table.c optional mpr
	dev/mpr/mpr_user.c optional mpr
	dev/mps/mps.c optional mps
	dev/mps/mps_config.c optional mps
	# XXX Work around clang warning, until maintainer approves fix.
	dev/mps/mps_mapping.c optional mps \
	compile-with "${NORMAL_C} ${NO_WSOMETIMES_UNINITIALIZED}"
	dev/mps/mps_pci.c optional mps pci
	dev/mps/mps_sas.c optional mps \
	compile-with "${NORMAL_C} ${NO_WUNNEEDED_INTERNAL_DECL}"
	dev/mps/mps_sas_lsi.c optional mps
	dev/mps/mps_table.c optional mps
	dev/mps/mps_user.c optional mps
	dev/mpt/mpt.c optional mpt
	dev/mpt/mpt_cam.c optional mpt
	dev/mpt/mpt_debug.c optional mpt
	dev/mpt/mpt_pci.c optional mpt pci
	dev/mpt/mpt_raid.c optional mpt
	dev/mpt/mpt_user.c optional mpt
	dev/mrsas/mrsas.c optional mrsas
	dev/mrsas/mrsas_cam.c optional mrsas
	dev/mrsas/mrsas_ioctl.c optional mrsas
	dev/mrsas/mrsas_fp.c optional mrsas
	dev/msk/if_msk.c optional msk
	dev/mvs/mvs.c optional mvs
	dev/mvs/mvs_if.m optional mvs
	dev/mvs/mvs_pci.c optional mvs pci
	dev/mwl/if_mwl.c optional mwl
	dev/mwl/if_mwl_pci.c optional mwl pci
	dev/mwl/mwlhal.c optional mwl
	mwlfw.c optional mwlfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk mw88W8363.fw:mw88W8363fw mwlboot.fw:mwlboot -mmwl -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "mwlfw.c"
	mw88W8363.fwo optional mwlfw \
	dependency "mw88W8363.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "mw88W8363.fwo"
	mw88W8363.fw optional mwlfw \
	dependency "$S/contrib/dev/mwl/mw88W8363.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "mw88W8363.fw"
	mwlboot.fwo optional mwlfw \
	dependency "mwlboot.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "mwlboot.fwo"
	mwlboot.fw optional mwlfw \
	dependency "$S/contrib/dev/mwl/mwlboot.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "mwlboot.fw"
	dev/mxge/if_mxge.c optional mxge pci
	dev/mxge/mxge_eth_z8e.c optional mxge pci
	dev/mxge/mxge_ethp_z8e.c optional mxge pci
	dev/mxge/mxge_rss_eth_z8e.c optional mxge pci
	dev/mxge/mxge_rss_ethp_z8e.c optional mxge pci
	dev/my/if_my.c optional my
	dev/nand/nand.c optional nand
	dev/nand/nand_bbt.c optional nand
	dev/nand/nand_cdev.c optional nand
	dev/nand/nand_generic.c optional nand
	dev/nand/nand_geom.c optional nand
	dev/nand/nand_id.c optional nand
	dev/nand/nandbus.c optional nand
	dev/nand/nandbus_if.m optional nand
	dev/nand/nand_if.m optional nand
	dev/nand/nandsim.c optional nandsim nand
	dev/nand/nandsim_chip.c optional nandsim nand
	dev/nand/nandsim_ctrl.c optional nandsim nand
	dev/nand/nandsim_log.c optional nandsim nand
	dev/nand/nandsim_swap.c optional nandsim nand
	dev/nand/nfc_if.m optional nand
	dev/ncr/ncr.c optional ncr pci
	dev/ncv/ncr53c500.c optional ncv
	dev/ncv/ncr53c500_pccard.c optional ncv pccard
	dev/netmap/netmap.c optional netmap
	dev/netmap/netmap_freebsd.c optional netmap
	dev/netmap/netmap_generic.c optional netmap
	dev/netmap/netmap_mbq.c optional netmap
	dev/netmap/netmap_mem2.c optional netmap
	dev/netmap/netmap_monitor.c optional netmap
	dev/netmap/netmap_offloadings.c optional netmap
	dev/netmap/netmap_pipe.c optional netmap
	dev/netmap/netmap_vale.c optional netmap
	# compile-with "${NORMAL_C} -Wconversion -Wextra"
	dev/nfsmb/nfsmb.c optional nfsmb pci
	dev/nge/if_nge.c optional nge
	dev/nxge/if_nxge.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-device.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-mm.c optional nxge
	dev/nxge/xgehal/xge-queue.c optional nxge
	dev/nxge/xgehal/xgehal-driver.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-ring.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-channel.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-fifo.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-stats.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nxge/xgehal/xgehal-config.c optional nxge
	dev/nxge/xgehal/xgehal-mgmt.c optional nxge \
	compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
	dev/nmdm/nmdm.c optional nmdm
	dev/nsp/nsp.c optional nsp
	dev/nsp/nsp_pccard.c optional nsp pccard
	dev/null/null.c standard
	dev/oce/oce_hw.c optional oce pci
	dev/oce/oce_if.c optional oce pci
	dev/oce/oce_mbox.c optional oce pci
	dev/oce/oce_queue.c optional oce pci
	dev/oce/oce_sysctl.c optional oce pci
	dev/oce/oce_util.c optional oce pci
	dev/ocs_fc/ocs_pci.c optional ocs_fc pci
	dev/ocs_fc/ocs_ioctl.c optional ocs_fc pci
	dev/ocs_fc/ocs_os.c optional ocs_fc pci
	dev/ocs_fc/ocs_utils.c optional ocs_fc pci
	dev/ocs_fc/ocs_hw.c optional ocs_fc pci
	dev/ocs_fc/ocs_hw_queues.c optional ocs_fc pci
	dev/ocs_fc/sli4.c optional ocs_fc pci
	dev/ocs_fc/ocs_sm.c optional ocs_fc pci
	dev/ocs_fc/ocs_device.c optional ocs_fc pci
	dev/ocs_fc/ocs_xport.c optional ocs_fc pci
	dev/ocs_fc/ocs_domain.c optional ocs_fc pci
	dev/ocs_fc/ocs_sport.c optional ocs_fc pci
	dev/ocs_fc/ocs_els.c optional ocs_fc pci
	dev/ocs_fc/ocs_fabric.c optional ocs_fc pci
	dev/ocs_fc/ocs_io.c optional ocs_fc pci
	dev/ocs_fc/ocs_node.c optional ocs_fc pci
	dev/ocs_fc/ocs_scsi.c optional ocs_fc pci
	dev/ocs_fc/ocs_unsol.c optional ocs_fc pci
	dev/ocs_fc/ocs_ddump.c optional ocs_fc pci
	dev/ocs_fc/ocs_mgmt.c optional ocs_fc pci
	dev/ocs_fc/ocs_cam.c optional ocs_fc pci
	dev/ofw/ofw_bus_if.m optional fdt
	dev/ofw/ofw_bus_subr.c optional fdt
	dev/ofw/ofw_fdt.c optional fdt
	dev/ofw/ofw_if.m optional fdt
	dev/ofw/ofw_subr.c optional fdt
	dev/ofw/ofwbus.c optional fdt
	dev/ofw/openfirm.c optional fdt
	dev/ofw/openfirmio.c optional fdt
	dev/ow/ow.c optional ow \
	dependency "owll_if.h" \
	dependency "own_if.h"
	dev/ow/owll_if.m optional ow
	dev/ow/own_if.m optional ow
	dev/ow/ow_temp.c optional ow_temp
	dev/ow/owc_gpiobus.c optional owc gpio
	dev/patm/if_patm.c optional patm pci
	dev/patm/if_patm_attach.c optional patm pci
	dev/patm/if_patm_intr.c optional patm pci
	dev/patm/if_patm_ioctl.c optional patm pci
	dev/patm/if_patm_rtables.c optional patm pci
	dev/patm/if_patm_rx.c optional patm pci
	dev/patm/if_patm_tx.c optional patm pci
	dev/pbio/pbio.c optional pbio isa
	dev/pccard/card_if.m standard
	dev/pccard/pccard.c optional pccard
	dev/pccard/pccard_cis.c optional pccard
	dev/pccard/pccard_cis_quirks.c optional pccard
	dev/pccard/pccard_device.c optional pccard
	dev/pccard/power_if.m standard
	dev/pccbb/pccbb.c optional cbb
	dev/pccbb/pccbb_isa.c optional cbb isa
	dev/pccbb/pccbb_pci.c optional cbb pci
	dev/pcf/pcf.c optional pcf
	dev/pci/eisa_pci.c optional pci eisa
	dev/pci/fixup_pci.c optional pci
	dev/pci/hostb_pci.c optional pci
	dev/pci/ignore_pci.c optional pci
	dev/pci/isa_pci.c optional pci isa
	dev/pci/pci.c optional pci
	dev/pci/pci_if.m standard
	dev/pci/pci_iov.c optional pci pci_iov
	dev/pci/pci_iov_if.m standard
	dev/pci/pci_iov_schema.c optional pci pci_iov
	dev/pci/pci_pci.c optional pci
	dev/pci/pci_subr.c optional pci
	dev/pci/pci_user.c optional pci
	dev/pci/pcib_if.m standard
	dev/pci/pcib_support.c standard
	dev/pci/vga_pci.c optional pci
	dev/pcn/if_pcn.c optional pcn pci
	dev/pdq/if_fea.c optional fea eisa
	dev/pdq/if_fpa.c optional fpa pci
	dev/pdq/pdq.c optional nowerror fea eisa \| fpa pci
	dev/pdq/pdq_ifsubr.c optional nowerror fea eisa \| fpa pci
	dev/pms/freebsd/driver/ini/src/agtiapi.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sadisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/mpi.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/saframe.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sahw.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sainit.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/saint.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sampicmd.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sampirsp.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/saphy.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/saport.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sasata.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sasmp.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sassp.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/satimer.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/sautil.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/saioctlcmd.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sallsdk/spc/mpidebug.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dminit.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dmsmp.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dmdisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dmport.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dmtimer.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/discovery/dm/dmmisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/sminit.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/smmisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/smsat.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/smsatcb.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/smsathw.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/sat/src/smtimer.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdinit.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdmisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdesgl.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdport.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdint.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdioctl.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdhw.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/ossacmnapi.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tddmcmnapi.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdsmcmnapi.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/common/tdtimers.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sas/ini/itdio.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sas/ini/itdcb.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sas/ini/itdinit.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sas/ini/itddisc.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sata/host/sat.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sata/host/ossasat.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/pms/RefTisa/tisa/sassata/sata/host/sathw.c optional pmspcv \
	compile-with "${NORMAL_C} -Wunused-variable -Woverflow -Wparentheses -w"
	dev/ppbus/if_plip.c optional plip
	dev/ppbus/immio.c optional vpo
	dev/ppbus/lpbb.c optional lpbb
	dev/ppbus/lpt.c optional lpt
	dev/ppbus/pcfclock.c optional pcfclock
	dev/ppbus/ppb_1284.c optional ppbus
	dev/ppbus/ppb_base.c optional ppbus
	dev/ppbus/ppb_msq.c optional ppbus
	dev/ppbus/ppbconf.c optional ppbus
	dev/ppbus/ppbus_if.m optional ppbus
	dev/ppbus/ppi.c optional ppi
	dev/ppbus/pps.c optional pps
	dev/ppbus/vpo.c optional vpo
	dev/ppbus/vpoio.c optional vpo
	dev/ppc/ppc.c optional ppc
	dev/ppc/ppc_acpi.c optional ppc acpi
	dev/ppc/ppc_isa.c optional ppc isa
	dev/ppc/ppc_pci.c optional ppc pci
	dev/ppc/ppc_puc.c optional ppc puc
	dev/proto/proto_bus_isa.c optional proto acpi \| proto isa
	dev/proto/proto_bus_pci.c optional proto pci
	dev/proto/proto_busdma.c optional proto
	dev/proto/proto_core.c optional proto
	dev/pst/pst-iop.c optional pst
	dev/pst/pst-pci.c optional pst pci
	dev/pst/pst-raid.c optional pst
	dev/pty/pty.c optional pty
	dev/puc/puc.c optional puc
	dev/puc/puc_cfg.c optional puc
	dev/puc/puc_pccard.c optional puc pccard
	dev/puc/puc_pci.c optional puc pci
	dev/puc/pucdata.c optional puc pci
	dev/quicc/quicc_core.c optional quicc
	dev/ral/rt2560.c optional ral
	dev/ral/rt2661.c optional ral
	dev/ral/rt2860.c optional ral
	dev/ral/if_ral_pci.c optional ral pci
	rt2561fw.c optional rt2561fw \| ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561.fw:rt2561fw -mrt2561 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2561fw.c"
	rt2561fw.fwo optional rt2561fw \| ralfw \
	dependency "rt2561.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2561fw.fwo"
	rt2561.fw optional rt2561fw \| ralfw \
	dependency "$S/contrib/dev/ral/rt2561.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2561.fw"
	rt2561sfw.c optional rt2561sfw \| ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2561s.fw:rt2561sfw -mrt2561s -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2561sfw.c"
	rt2561sfw.fwo optional rt2561sfw \| ralfw \
	dependency "rt2561s.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2561sfw.fwo"
	rt2561s.fw optional rt2561sfw \| ralfw \
	dependency "$S/contrib/dev/ral/rt2561s.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2561s.fw"
	rt2661fw.c optional rt2661fw \| ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2661.fw:rt2661fw -mrt2661 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2661fw.c"
	rt2661fw.fwo optional rt2661fw \| ralfw \
	dependency "rt2661.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2661fw.fwo"
	rt2661.fw optional rt2661fw \| ralfw \
	dependency "$S/contrib/dev/ral/rt2661.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2661.fw"
	rt2860fw.c optional rt2860fw \| ralfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rt2860.fw:rt2860fw -mrt2860 -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rt2860fw.c"
	rt2860fw.fwo optional rt2860fw \| ralfw \
	dependency "rt2860.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rt2860fw.fwo"
	rt2860.fw optional rt2860fw \| ralfw \
	dependency "$S/contrib/dev/ral/rt2860.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rt2860.fw"
	dev/random/random_infra.c optional random
	dev/random/random_harvestq.c optional random
	dev/random/randomdev.c optional random random_yarrow \| \
	random !random_yarrow !random_loadable
	dev/random/yarrow.c optional random random_yarrow
	dev/random/fortuna.c optional random !random_yarrow !random_loadable
	dev/random/hash.c optional random random_yarrow \| \
	random !random_yarrow !random_loadable
	dev/rc/rc.c optional rc
	dev/rccgpio/rccgpio.c optional rccgpio gpio
	dev/re/if_re.c optional re
	dev/rl/if_rl.c optional rl pci
	dev/rndtest/rndtest.c optional rndtest
	dev/rp/rp.c optional rp
	dev/rp/rp_isa.c optional rp isa
	dev/rp/rp_pci.c optional rp pci
	dev/rtwn/if_rtwn.c optional rtwn
	rtwn-rtl8192cfwU.c optional rtwn-rtl8192cfwU \| rtwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rtwn-rtl8192cfwU.fw:rtwn-rtl8192cfwU:111 -mrtwn-rtl8192cfwU -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rtwn-rtl8192cfwU.c"
	rtwn-rtl8192cfwU.fwo optional rtwn-rtl8192cfwU \| rtwnfw \
	dependency "rtwn-rtl8192cfwU.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rtwn-rtl8192cfwU.fwo"
	rtwn-rtl8192cfwU.fw optional rtwn-rtl8192cfwU \| rtwnfw \
	dependency "$S/contrib/dev/rtwn/rtwn-rtl8192cfwU.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rtwn-rtl8192cfwU.fw"
	rtwn-rtl8192cfwU_B.c optional rtwn-rtl8192cfwU_B \| rtwnfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rtwn-rtl8192cfwU_B.fw:rtwn-rtl8192cfwU_B:111 -mrtwn-rtl8192cfwU_B -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rtwn-rtl8192cfwU_B.c"
	rtwn-rtl8192cfwU_B.fwo optional rtwn-rtl8192cfwU_B \| rtwnfw \
	dependency "rtwn-rtl8192cfwU_B.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rtwn-rtl8192cfwU_B.fwo"
	rtwn-rtl8192cfwU_B.fw optional rtwn-rtl8192cfwU_B \| rtwnfw \
	dependency "$S/contrib/dev/rtwn/rtwn-rtl8192cfwU_B.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rtwn-rtl8192cfwU_B.fw"
	dev/safe/safe.c optional safe
	dev/scc/scc_if.m optional scc
	dev/scc/scc_bfe_ebus.c optional scc ebus
	dev/scc/scc_bfe_quicc.c optional scc quicc
	dev/scc/scc_bfe_sbus.c optional scc fhc \| scc sbus
	dev/scc/scc_core.c optional scc
	dev/scc/scc_dev_quicc.c optional scc quicc
	dev/scc/scc_dev_sab82532.c optional scc
	dev/scc/scc_dev_z8530.c optional scc
	dev/scd/scd.c optional scd isa
	dev/scd/scd_isa.c optional scd isa
	dev/sdhci/sdhci.c optional sdhci
	dev/sdhci/sdhci_fdt_gpio.c optional sdhci fdt gpio
	dev/sdhci/sdhci_if.m optional sdhci
	dev/sdhci/sdhci_acpi.c optional sdhci acpi
	dev/sdhci/sdhci_pci.c optional sdhci pci
	dev/sf/if_sf.c optional sf pci
	dev/sge/if_sge.c optional sge pci
	dev/si/si.c optional si \
	compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}"
	dev/si/si2_z280.c optional si
	dev/si/si3_t225.c optional si
	dev/si/si_eisa.c optional si eisa
	dev/si/si_isa.c optional si isa
	dev/si/si_pci.c optional si pci
	dev/siba/siba_bwn.c optional siba_bwn pci
	dev/siba/siba_core.c optional siba_bwn pci
	dev/siis/siis.c optional siis pci
	dev/sis/if_sis.c optional sis pci
	dev/sk/if_sk.c optional sk pci
	dev/smbus/smb.c optional smb
	dev/smbus/smbconf.c optional smbus
	dev/smbus/smbus.c optional smbus
	dev/smbus/smbus_if.m optional smbus
	dev/smc/if_smc.c optional smc
	dev/smc/if_smc_fdt.c optional smc fdt
	dev/sn/if_sn.c optional sn
	dev/sn/if_sn_isa.c optional sn isa
	dev/sn/if_sn_pccard.c optional sn pccard
	dev/snp/snp.c optional snp
	dev/sound/clone.c optional sound
	dev/sound/unit.c optional sound
	dev/sound/isa/ad1816.c optional snd_ad1816 isa
	dev/sound/isa/ess.c optional snd_ess isa
	dev/sound/isa/gusc.c optional snd_gusc isa
	dev/sound/isa/mss.c optional snd_mss isa
	dev/sound/isa/sb16.c optional snd_sb16 isa
	dev/sound/isa/sb8.c optional snd_sb8 isa
	dev/sound/isa/sbc.c optional snd_sbc isa
	dev/sound/isa/sndbuf_dma.c optional sound isa
	dev/sound/pci/als4000.c optional snd_als4000 pci
	dev/sound/pci/atiixp.c optional snd_atiixp pci
	dev/sound/pci/cmi.c optional snd_cmi pci
	dev/sound/pci/cs4281.c optional snd_cs4281 pci
	dev/sound/pci/csa.c optional snd_csa pci
	dev/sound/pci/csapcm.c optional snd_csa pci
	dev/sound/pci/ds1.c optional snd_ds1 pci
	dev/sound/pci/emu10k1.c optional snd_emu10k1 pci
	dev/sound/pci/emu10kx.c optional snd_emu10kx pci
	dev/sound/pci/emu10kx-pcm.c optional snd_emu10kx pci
	dev/sound/pci/emu10kx-midi.c optional snd_emu10kx pci
	dev/sound/pci/envy24.c optional snd_envy24 pci
	dev/sound/pci/envy24ht.c optional snd_envy24ht pci
	dev/sound/pci/es137x.c optional snd_es137x pci
	dev/sound/pci/fm801.c optional snd_fm801 pci
	dev/sound/pci/ich.c optional snd_ich pci
	dev/sound/pci/maestro.c optional snd_maestro pci
	dev/sound/pci/maestro3.c optional snd_maestro3 pci
	dev/sound/pci/neomagic.c optional snd_neomagic pci
	dev/sound/pci/solo.c optional snd_solo pci
	dev/sound/pci/spicds.c optional snd_spicds pci
	dev/sound/pci/t4dwave.c optional snd_t4dwave pci
	dev/sound/pci/via8233.c optional snd_via8233 pci
	dev/sound/pci/via82c686.c optional snd_via82c686 pci
	dev/sound/pci/vibes.c optional snd_vibes pci
	dev/sound/pci/hda/hdaa.c optional snd_hda pci
	dev/sound/pci/hda/hdaa_patches.c optional snd_hda pci
	dev/sound/pci/hda/hdac.c optional snd_hda pci
	dev/sound/pci/hda/hdac_if.m optional snd_hda pci
	dev/sound/pci/hda/hdacc.c optional snd_hda pci
	dev/sound/pci/hdspe.c optional snd_hdspe pci
	dev/sound/pci/hdspe-pcm.c optional snd_hdspe pci
	dev/sound/pcm/ac97.c optional sound
	dev/sound/pcm/ac97_if.m optional sound
	dev/sound/pcm/ac97_patch.c optional sound
	dev/sound/pcm/buffer.c optional sound \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/channel.c optional sound
	dev/sound/pcm/channel_if.m optional sound
	dev/sound/pcm/dsp.c optional sound
	dev/sound/pcm/feeder.c optional sound
	dev/sound/pcm/feeder_chain.c optional sound
	dev/sound/pcm/feeder_eq.c optional sound \
	dependency "feeder_eq_gen.h" \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/feeder_if.m optional sound
	dev/sound/pcm/feeder_format.c optional sound \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/feeder_matrix.c optional sound \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/feeder_mixer.c optional sound \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/feeder_rate.c optional sound \
	dependency "feeder_rate_gen.h" \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/feeder_volume.c optional sound \
	dependency "snd_fxdiv_gen.h"
	dev/sound/pcm/mixer.c optional sound
	dev/sound/pcm/mixer_if.m optional sound
	dev/sound/pcm/sndstat.c optional sound
	dev/sound/pcm/sound.c optional sound
	dev/sound/pcm/vchan.c optional sound
	dev/sound/usb/uaudio.c optional snd_uaudio usb
	dev/sound/usb/uaudio_pcm.c optional snd_uaudio usb
	dev/sound/midi/midi.c optional sound
	dev/sound/midi/mpu401.c optional sound
	dev/sound/midi/mpu_if.m optional sound
	dev/sound/midi/mpufoi_if.m optional sound
	dev/sound/midi/sequencer.c optional sound
	dev/sound/midi/synth_if.m optional sound
	dev/spibus/ofw_spibus.c optional fdt spibus
	dev/spibus/spibus.c optional spibus \
	dependency "spibus_if.h"
	dev/spibus/spigen.c optional spigen
	dev/spibus/spibus_if.m optional spibus
	dev/ste/if_ste.c optional ste pci
	dev/stg/tmc18c30.c optional stg
	dev/stg/tmc18c30_isa.c optional stg isa
	dev/stg/tmc18c30_pccard.c optional stg pccard
	dev/stg/tmc18c30_pci.c optional stg pci
	dev/stg/tmc18c30_subr.c optional stg
	dev/stge/if_stge.c optional stge
	dev/streams/streams.c optional streams
	dev/sym/sym_hipd.c optional sym \
	dependency "$S/dev/sym/sym_{conf,defs}.h"
	dev/syscons/blank/blank_saver.c optional blank_saver
	dev/syscons/daemon/daemon_saver.c optional daemon_saver
	dev/syscons/dragon/dragon_saver.c optional dragon_saver
	dev/syscons/fade/fade_saver.c optional fade_saver
	dev/syscons/fire/fire_saver.c optional fire_saver
	dev/syscons/green/green_saver.c optional green_saver
	dev/syscons/logo/logo.c optional logo_saver
	dev/syscons/logo/logo_saver.c optional logo_saver
	dev/syscons/rain/rain_saver.c optional rain_saver
	dev/syscons/schistory.c optional sc
	dev/syscons/scmouse.c optional sc
	dev/syscons/scterm.c optional sc
	dev/syscons/scvidctl.c optional sc
	dev/syscons/snake/snake_saver.c optional snake_saver
	dev/syscons/star/star_saver.c optional star_saver
	dev/syscons/syscons.c optional sc
	dev/syscons/sysmouse.c optional sc
	dev/syscons/warp/warp_saver.c optional warp_saver
	dev/tdfx/tdfx_linux.c optional tdfx_linux tdfx compat_linux
	dev/tdfx/tdfx_pci.c optional tdfx pci
	dev/ti/if_ti.c optional ti pci
	dev/tl/if_tl.c optional tl pci
	dev/trm/trm.c optional trm
	dev/twa/tw_cl_init.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twa/tw_cl_intr.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twa/tw_cl_io.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twa/tw_cl_misc.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twa/tw_osl_cam.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twa/tw_osl_freebsd.c optional twa \
	compile-with "${NORMAL_C} -I$S/dev/twa"
	dev/twe/twe.c optional twe
	dev/twe/twe_freebsd.c optional twe
	dev/tws/tws.c optional tws
	dev/tws/tws_cam.c optional tws
	dev/tws/tws_hdm.c optional tws
	dev/tws/tws_services.c optional tws
	dev/tws/tws_user.c optional tws
	dev/tx/if_tx.c optional tx
	dev/txp/if_txp.c optional txp
	dev/uart/uart_bus_acpi.c optional uart acpi
	dev/uart/uart_bus_ebus.c optional uart ebus
	dev/uart/uart_bus_fdt.c optional uart fdt
	dev/uart/uart_bus_isa.c optional uart isa
	dev/uart/uart_bus_pccard.c optional uart pccard
	dev/uart/uart_bus_pci.c optional uart pci
	dev/uart/uart_bus_puc.c optional uart puc
	dev/uart/uart_bus_scc.c optional uart scc
	dev/uart/uart_core.c optional uart
	dev/uart/uart_dbg.c optional uart gdb
	dev/uart/uart_dev_ns8250.c optional uart uart_ns8250 \| uart uart_snps
	dev/uart/uart_dev_pl011.c optional uart pl011
	dev/uart/uart_dev_quicc.c optional uart quicc
	dev/uart/uart_dev_sab82532.c optional uart uart_sab82532
	dev/uart/uart_dev_sab82532.c optional uart scc
	dev/uart/uart_dev_snps.c optional uart uart_snps
	dev/uart/uart_dev_z8530.c optional uart uart_z8530
	dev/uart/uart_dev_z8530.c optional uart scc
	dev/uart/uart_if.m optional uart
	dev/uart/uart_subr.c optional uart
	dev/uart/uart_tty.c optional uart
	dev/ubsec/ubsec.c optional ubsec
	#
	# USB controller drivers
	#
	dev/usb/controller/at91dci.c optional at91dci
	dev/usb/controller/at91dci_atmelarm.c optional at91dci at91rm9200
	dev/usb/controller/musb_otg.c optional musb
	dev/usb/controller/musb_otg_atmelarm.c optional musb at91rm9200
	dev/usb/controller/dwc_otg.c optional dwcotg
	dev/usb/controller/dwc_otg_fdt.c optional dwcotg fdt
	dev/usb/controller/ehci.c optional ehci
	dev/usb/controller/ehci_pci.c optional ehci pci
	dev/usb/controller/ohci.c optional ohci
	dev/usb/controller/ohci_pci.c optional ohci pci
	dev/usb/controller/uhci.c optional uhci
	dev/usb/controller/uhci_pci.c optional uhci pci
	dev/usb/controller/xhci.c optional xhci
	dev/usb/controller/xhci_pci.c optional xhci pci
	dev/usb/controller/saf1761_otg.c optional saf1761otg
	dev/usb/controller/saf1761_otg_fdt.c optional saf1761otg fdt
	dev/usb/controller/uss820dci.c optional uss820dci
	dev/usb/controller/uss820dci_atmelarm.c optional uss820dci at91rm9200
	dev/usb/controller/usb_controller.c optional usb
	#
	# USB storage drivers
	#
	dev/usb/storage/cfumass.c optional cfumass ctl
	dev/usb/storage/umass.c optional umass
	dev/usb/storage/urio.c optional urio
	dev/usb/storage/ustorage_fs.c optional usfs
	#
	# USB core
	#
	dev/usb/usb_busdma.c optional usb
	dev/usb/usb_core.c optional usb
	dev/usb/usb_debug.c optional usb
	dev/usb/usb_dev.c optional usb
	dev/usb/usb_device.c optional usb
	dev/usb/usb_dynamic.c optional usb
	dev/usb/usb_error.c optional usb
	dev/usb/usb_generic.c optional usb
	dev/usb/usb_handle_request.c optional usb
	dev/usb/usb_hid.c optional usb
	dev/usb/usb_hub.c optional usb
	dev/usb/usb_if.m optional usb
	dev/usb/usb_lookup.c optional usb
	dev/usb/usb_mbuf.c optional usb
	dev/usb/usb_msctest.c optional usb
	dev/usb/usb_parse.c optional usb
	dev/usb/usb_pf.c optional usb
	dev/usb/usb_process.c optional usb
	dev/usb/usb_request.c optional usb
	dev/usb/usb_transfer.c optional usb
	dev/usb/usb_util.c optional usb
	#
	# USB network drivers
	#
	dev/usb/net/if_aue.c optional aue
	dev/usb/net/if_axe.c optional axe
	dev/usb/net/if_axge.c optional axge
	dev/usb/net/if_cdce.c optional cdce
	dev/usb/net/if_cue.c optional cue
	dev/usb/net/if_ipheth.c optional ipheth
	dev/usb/net/if_kue.c optional kue
	dev/usb/net/if_mos.c optional mos
	dev/usb/net/if_rue.c optional rue
	dev/usb/net/if_smsc.c optional smsc
	dev/usb/net/if_udav.c optional udav
	dev/usb/net/if_ure.c optional ure
	dev/usb/net/if_usie.c optional usie
	dev/usb/net/if_urndis.c optional urndis
	dev/usb/net/ruephy.c optional rue
	dev/usb/net/usb_ethernet.c optional uether \| aue \| axe \| axge \| cdce \| \
	cue \| ipheth \| kue \| mos \| rue \| \
	smsc \| udav \| ure \| urndis
	dev/usb/net/uhso.c optional uhso
	#
	# USB WLAN drivers
	#
	dev/usb/wlan/if_rsu.c optional rsu
	rsu-rtl8712fw.c optional rsu-rtl8712fw \| rsufw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk rsu-rtl8712fw.fw:rsu-rtl8712fw:120 -mrsu-rtl8712fw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "rsu-rtl8712fw.c"
	rsu-rtl8712fw.fwo optional rsu-rtl8712fw \| rsufw \
	dependency "rsu-rtl8712fw.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "rsu-rtl8712fw.fwo"
	rsu-rtl8712fw.fw optional rsu-rtl8712fw \| rsufw \
	dependency "$S/contrib/dev/rsu/rsu-rtl8712fw.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "rsu-rtl8712fw.fw"
	dev/usb/wlan/if_rum.c optional rum
	dev/usb/wlan/if_run.c optional run
	runfw.c optional runfw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk run.fw:runfw -mrunfw -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "runfw.c"
	runfw.fwo optional runfw \
	dependency "run.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "runfw.fwo"
	run.fw optional runfw \
	dependency "$S/contrib/dev/run/rt2870.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "run.fw"
	dev/usb/wlan/if_uath.c optional uath
	dev/usb/wlan/if_upgt.c optional upgt
	dev/usb/wlan/if_ural.c optional ural
	dev/usb/wlan/if_urtw.c optional urtw
	dev/usb/wlan/if_zyd.c optional zyd
	#
	# USB serial and parallel port drivers
	#
	dev/usb/serial/u3g.c optional u3g
	dev/usb/serial/uark.c optional uark
	dev/usb/serial/ubsa.c optional ubsa
	dev/usb/serial/ubser.c optional ubser
	dev/usb/serial/uchcom.c optional uchcom
	dev/usb/serial/ucycom.c optional ucycom
	dev/usb/serial/ufoma.c optional ufoma
	dev/usb/serial/uftdi.c optional uftdi
	dev/usb/serial/ugensa.c optional ugensa
	dev/usb/serial/uipaq.c optional uipaq
	dev/usb/serial/ulpt.c optional ulpt
	dev/usb/serial/umcs.c optional umcs
	dev/usb/serial/umct.c optional umct
	dev/usb/serial/umodem.c optional umodem
	dev/usb/serial/umoscom.c optional umoscom
	dev/usb/serial/uplcom.c optional uplcom
	dev/usb/serial/uslcom.c optional uslcom
	dev/usb/serial/uvisor.c optional uvisor
	dev/usb/serial/uvscom.c optional uvscom
	dev/usb/serial/usb_serial.c optional ucom \| u3g \| uark \| ubsa \| ubser \| \
	uchcom \| ucycom \| ufoma \| uftdi \| \
	ugensa \| uipaq \| umcs \| umct \| \
	umodem \| umoscom \| uplcom \| usie \| \
	uslcom \| uvisor \| uvscom
	#
	# USB misc drivers
	#
	dev/usb/misc/ufm.c optional ufm
	dev/usb/misc/udbp.c optional udbp
	dev/usb/misc/ugold.c optional ugold
	dev/usb/misc/uled.c optional uled
	#
	# USB input drivers
	#
	dev/usb/input/atp.c optional atp
	dev/usb/input/uep.c optional uep
	dev/usb/input/uhid.c optional uhid
	dev/usb/input/ukbd.c optional ukbd
	dev/usb/input/ums.c optional ums
	dev/usb/input/wmt.c optional wmt
	dev/usb/input/wsp.c optional wsp
	#
	# USB quirks
	#
	dev/usb/quirk/usb_quirk.c optional usb
	#
	# USB templates
	#
	dev/usb/template/usb_template.c optional usb_template
	dev/usb/template/usb_template_audio.c optional usb_template
	dev/usb/template/usb_template_cdce.c optional usb_template
	dev/usb/template/usb_template_kbd.c optional usb_template
	dev/usb/template/usb_template_modem.c optional usb_template
	dev/usb/template/usb_template_mouse.c optional usb_template
	dev/usb/template/usb_template_msc.c optional usb_template
	dev/usb/template/usb_template_mtp.c optional usb_template
	dev/usb/template/usb_template_phone.c optional usb_template
	dev/usb/template/usb_template_serialnet.c optional usb_template
	dev/usb/template/usb_template_midi.c optional usb_template
	#
	# USB video drivers
	#
	dev/usb/video/udl.c optional udl
	#
	# USB END
	#
	dev/videomode/videomode.c optional videomode
	dev/videomode/edid.c optional videomode
	dev/videomode/pickmode.c optional videomode
	dev/videomode/vesagtf.c optional videomode
	dev/utopia/idtphy.c optional utopia
	dev/utopia/suni.c optional utopia
	dev/utopia/utopia.c optional utopia
	dev/vge/if_vge.c optional vge
	dev/viapm/viapm.c optional viapm pci
	dev/virtio/virtio.c optional virtio
	dev/virtio/virtqueue.c optional virtio
	dev/virtio/virtio_bus_if.m optional virtio
	dev/virtio/virtio_if.m optional virtio
	dev/virtio/pci/virtio_pci.c optional virtio_pci
	dev/virtio/mmio/virtio_mmio.c optional virtio_mmio
	dev/virtio/mmio/virtio_mmio_if.m optional virtio_mmio
	dev/virtio/network/if_vtnet.c optional vtnet
	dev/virtio/block/virtio_blk.c optional virtio_blk
	dev/virtio/balloon/virtio_balloon.c optional virtio_balloon
	dev/virtio/scsi/virtio_scsi.c optional virtio_scsi
	dev/virtio/random/virtio_random.c optional virtio_random
	dev/virtio/console/virtio_console.c optional virtio_console
	dev/vkbd/vkbd.c optional vkbd
	dev/vr/if_vr.c optional vr pci
	dev/vt/colors/vt_termcolors.c optional vt
	dev/vt/font/vt_font_default.c optional vt
	dev/vt/font/vt_mouse_cursor.c optional vt
	dev/vt/hw/efifb/efifb.c optional vt_efifb
	dev/vt/hw/fb/vt_fb.c optional vt
	dev/vt/hw/vga/vt_vga.c optional vt vt_vga
	dev/vt/logo/logo_freebsd.c optional vt splash
	dev/vt/logo/logo_beastie.c optional vt splash
	dev/vt/vt_buf.c optional vt
	dev/vt/vt_consolectl.c optional vt
	dev/vt/vt_core.c optional vt
	dev/vt/vt_cpulogos.c optional vt splash
	dev/vt/vt_font.c optional vt
	dev/vt/vt_sysmouse.c optional vt
	dev/vte/if_vte.c optional vte pci
	dev/vx/if_vx.c optional vx
	dev/vx/if_vx_eisa.c optional vx eisa
	dev/vx/if_vx_pci.c optional vx pci
	dev/vxge/vxge.c optional vxge
	dev/vxge/vxgehal/vxgehal-ifmsg.c optional vxge
	dev/vxge/vxgehal/vxgehal-mrpcim.c optional vxge
	dev/vxge/vxgehal/vxge-queue.c optional vxge
	dev/vxge/vxgehal/vxgehal-ring.c optional vxge
	dev/vxge/vxgehal/vxgehal-swapper.c optional vxge
	dev/vxge/vxgehal/vxgehal-mgmt.c optional vxge
	dev/vxge/vxgehal/vxgehal-srpcim.c optional vxge
	dev/vxge/vxgehal/vxgehal-config.c optional vxge
	dev/vxge/vxgehal/vxgehal-blockpool.c optional vxge
	dev/vxge/vxgehal/vxgehal-doorbells.c optional vxge
	dev/vxge/vxgehal/vxgehal-mgmtaux.c optional vxge
	dev/vxge/vxgehal/vxgehal-device.c optional vxge
	dev/vxge/vxgehal/vxgehal-mm.c optional vxge
	dev/vxge/vxgehal/vxgehal-driver.c optional vxge
	dev/vxge/vxgehal/vxgehal-virtualpath.c optional vxge
	dev/vxge/vxgehal/vxgehal-channel.c optional vxge
	dev/vxge/vxgehal/vxgehal-fifo.c optional vxge
	dev/watchdog/watchdog.c standard
	dev/wb/if_wb.c optional wb pci
	dev/wds/wd7000.c optional wds isa
	dev/wi/if_wi.c optional wi
	dev/wi/if_wi_pccard.c optional wi pccard
	dev/wi/if_wi_pci.c optional wi pci
	dev/wl/if_wl.c optional wl isa
	dev/wpi/if_wpi.c optional wpi pci
	wpifw.c optional wpifw \
	compile-with "${AWK} -f $S/tools/fw_stub.awk wpi.fw:wpifw:153229 -mwpi -c${.TARGET}" \
	no-implicit-rule before-depend local \
	clean "wpifw.c"
	wpifw.fwo optional wpifw \
	dependency "wpi.fw" \
	compile-with "${NORMAL_FWO}" \
	no-implicit-rule \
	clean "wpifw.fwo"
	wpi.fw optional wpifw \
	dependency "$S/contrib/dev/wpi/iwlwifi-3945-15.32.2.9.fw.uu" \
	compile-with "${NORMAL_FW}" \
	no-obj no-implicit-rule \
	clean "wpi.fw"
	dev/xe/if_xe.c optional xe
	dev/xe/if_xe_pccard.c optional xe pccard
	dev/xen/balloon/balloon.c optional xenhvm
	dev/xen/blkfront/blkfront.c optional xenhvm
	dev/xen/blkback/blkback.c optional xenhvm
	dev/xen/console/xen_console.c optional xenhvm
	dev/xen/control/control.c optional xenhvm
	dev/xen/grant_table/grant_table.c optional xenhvm
	dev/xen/netback/netback.c optional xenhvm
	dev/xen/netfront/netfront.c optional xenhvm
	dev/xen/xenpci/xenpci.c optional xenpci
	dev/xen/timer/timer.c optional xenhvm
	dev/xen/pvcpu/pvcpu.c optional xenhvm
	dev/xen/xenstore/xenstore.c optional xenhvm
	dev/xen/xenstore/xenstore_dev.c optional xenhvm
	dev/xen/xenstore/xenstored_dev.c optional xenhvm
	dev/xen/evtchn/evtchn_dev.c optional xenhvm
	dev/xen/privcmd/privcmd.c optional xenhvm
	dev/xen/debug/debug.c optional xenhvm
	dev/xl/if_xl.c optional xl pci
	dev/xl/xlphy.c optional xl pci
	fs/autofs/autofs.c optional autofs
	fs/autofs/autofs_vfsops.c optional autofs
	fs/autofs/autofs_vnops.c optional autofs
	fs/deadfs/dead_vnops.c standard
	fs/devfs/devfs_devs.c standard
	fs/devfs/devfs_dir.c standard
	fs/devfs/devfs_rule.c standard
	fs/devfs/devfs_vfsops.c standard
	fs/devfs/devfs_vnops.c standard
	fs/fdescfs/fdesc_vfsops.c optional fdescfs
	fs/fdescfs/fdesc_vnops.c optional fdescfs
	fs/fifofs/fifo_vnops.c standard
	fs/cuse/cuse.c optional cuse
	fs/fuse/fuse_device.c optional fuse
	fs/fuse/fuse_file.c optional fuse
	fs/fuse/fuse_internal.c optional fuse
	fs/fuse/fuse_io.c optional fuse
	fs/fuse/fuse_ipc.c optional fuse
	fs/fuse/fuse_main.c optional fuse
	fs/fuse/fuse_node.c optional fuse
	fs/fuse/fuse_vfsops.c optional fuse
	fs/fuse/fuse_vnops.c optional fuse
	fs/msdosfs/msdosfs_conv.c optional msdosfs
	fs/msdosfs/msdosfs_denode.c optional msdosfs
	fs/msdosfs/msdosfs_fat.c optional msdosfs
	fs/msdosfs/msdosfs_fileno.c optional msdosfs
	fs/msdosfs/msdosfs_iconv.c optional msdosfs_iconv
	fs/msdosfs/msdosfs_lookup.c optional msdosfs
	fs/msdosfs/msdosfs_vfsops.c optional msdosfs
	fs/msdosfs/msdosfs_vnops.c optional msdosfs
	fs/nandfs/bmap.c optional nandfs
	fs/nandfs/nandfs_alloc.c optional nandfs
	fs/nandfs/nandfs_bmap.c optional nandfs
	fs/nandfs/nandfs_buffer.c optional nandfs
	fs/nandfs/nandfs_cleaner.c optional nandfs
	fs/nandfs/nandfs_cpfile.c optional nandfs
	fs/nandfs/nandfs_dat.c optional nandfs
	fs/nandfs/nandfs_dir.c optional nandfs
	fs/nandfs/nandfs_ifile.c optional nandfs
	fs/nandfs/nandfs_segment.c optional nandfs
	fs/nandfs/nandfs_subr.c optional nandfs
	fs/nandfs/nandfs_sufile.c optional nandfs
	fs/nandfs/nandfs_vfsops.c optional nandfs
	fs/nandfs/nandfs_vnops.c optional nandfs
	fs/nfs/nfs_commonkrpc.c optional nfscl \| nfsd
	fs/nfs/nfs_commonsubs.c optional nfscl \| nfsd
	fs/nfs/nfs_commonport.c optional nfscl \| nfsd
	fs/nfs/nfs_commonacl.c optional nfscl \| nfsd
	fs/nfsclient/nfs_clcomsubs.c optional nfscl
	fs/nfsclient/nfs_clsubs.c optional nfscl
	fs/nfsclient/nfs_clstate.c optional nfscl
	fs/nfsclient/nfs_clkrpc.c optional nfscl
	fs/nfsclient/nfs_clrpcops.c optional nfscl
	fs/nfsclient/nfs_clvnops.c optional nfscl
	fs/nfsclient/nfs_clnode.c optional nfscl
	fs/nfsclient/nfs_clvfsops.c optional nfscl
	fs/nfsclient/nfs_clport.c optional nfscl
	fs/nfsclient/nfs_clbio.c optional nfscl
	fs/nfsclient/nfs_clnfsiod.c optional nfscl
	fs/nfsserver/nfs_fha_new.c optional nfsd inet
	fs/nfsserver/nfs_nfsdsocket.c optional nfsd inet
	fs/nfsserver/nfs_nfsdsubs.c optional nfsd inet
	fs/nfsserver/nfs_nfsdstate.c optional nfsd inet
	fs/nfsserver/nfs_nfsdkrpc.c optional nfsd inet
	fs/nfsserver/nfs_nfsdserv.c optional nfsd inet
	fs/nfsserver/nfs_nfsdport.c optional nfsd inet
	fs/nfsserver/nfs_nfsdcache.c optional nfsd inet
	fs/nullfs/null_subr.c optional nullfs
	fs/nullfs/null_vfsops.c optional nullfs
	fs/nullfs/null_vnops.c optional nullfs
	fs/procfs/procfs.c optional procfs
	fs/procfs/procfs_ctl.c optional procfs
	fs/procfs/procfs_dbregs.c optional procfs
	fs/procfs/procfs_fpregs.c optional procfs
	fs/procfs/procfs_ioctl.c optional procfs
	fs/procfs/procfs_map.c optional procfs
	fs/procfs/procfs_mem.c optional procfs
	fs/procfs/procfs_note.c optional procfs
	fs/procfs/procfs_osrel.c optional procfs
	fs/procfs/procfs_regs.c optional procfs
	fs/procfs/procfs_rlimit.c optional procfs
	fs/procfs/procfs_status.c optional procfs
	fs/procfs/procfs_type.c optional procfs
	fs/pseudofs/pseudofs.c optional pseudofs
	fs/pseudofs/pseudofs_fileno.c optional pseudofs
	fs/pseudofs/pseudofs_vncache.c optional pseudofs
	fs/pseudofs/pseudofs_vnops.c optional pseudofs
	fs/smbfs/smbfs_io.c optional smbfs
	fs/smbfs/smbfs_node.c optional smbfs
	fs/smbfs/smbfs_smb.c optional smbfs
	fs/smbfs/smbfs_subr.c optional smbfs
	fs/smbfs/smbfs_vfsops.c optional smbfs
	fs/smbfs/smbfs_vnops.c optional smbfs
	fs/udf/osta.c optional udf
	fs/udf/udf_iconv.c optional udf_iconv
	fs/udf/udf_vfsops.c optional udf
	fs/udf/udf_vnops.c optional udf
	fs/unionfs/union_subr.c optional unionfs
	fs/unionfs/union_vfsops.c optional unionfs
	fs/unionfs/union_vnops.c optional unionfs
	fs/tmpfs/tmpfs_vnops.c optional tmpfs
	fs/tmpfs/tmpfs_fifoops.c optional tmpfs
	fs/tmpfs/tmpfs_vfsops.c optional tmpfs
	fs/tmpfs/tmpfs_subr.c optional tmpfs
	gdb/gdb_cons.c optional gdb
	gdb/gdb_main.c optional gdb
	gdb/gdb_packet.c optional gdb
	geom/bde/g_bde.c optional geom_bde
	geom/bde/g_bde_crypt.c optional geom_bde
	geom/bde/g_bde_lock.c optional geom_bde
	geom/bde/g_bde_work.c optional geom_bde
	geom/cache/g_cache.c optional geom_cache
	geom/concat/g_concat.c optional geom_concat
	geom/eli/g_eli.c optional geom_eli
	geom/eli/g_eli_crypto.c optional geom_eli
	geom/eli/g_eli_ctl.c optional geom_eli
	geom/eli/g_eli_hmac.c optional geom_eli
	geom/eli/g_eli_integrity.c optional geom_eli
	geom/eli/g_eli_key.c optional geom_eli
	geom/eli/g_eli_key_cache.c optional geom_eli
	geom/eli/g_eli_privacy.c optional geom_eli
	geom/eli/pkcs5v2.c optional geom_eli
	geom/gate/g_gate.c optional geom_gate
	geom/geom_aes.c optional geom_aes
	geom/geom_bsd.c optional geom_bsd
	geom/geom_bsd_enc.c optional geom_bsd \| geom_part_bsd
	geom/geom_ccd.c optional ccd \| geom_ccd
	geom/geom_ctl.c standard
	geom/geom_dev.c standard
	geom/geom_disk.c standard
	geom/geom_dump.c standard
	geom/geom_event.c standard
	geom/geom_fox.c optional geom_fox
	geom/geom_flashmap.c optional fdt cfi \| fdt nand \| fdt mx25l \| mmcsd
	geom/geom_io.c standard
	geom/geom_kern.c standard
	geom/geom_map.c optional geom_map
	geom/geom_mbr.c optional geom_mbr
	geom/geom_mbr_enc.c optional geom_mbr
	geom/geom_pc98.c optional geom_pc98
	geom/geom_pc98_enc.c optional geom_pc98
	geom/geom_redboot.c optional geom_redboot
	geom/geom_slice.c standard
	geom/geom_subr.c standard
	geom/geom_sunlabel.c optional geom_sunlabel
	geom/geom_sunlabel_enc.c optional geom_sunlabel
	geom/geom_vfs.c standard
	geom/geom_vol_ffs.c optional geom_vol
	geom/journal/g_journal.c optional geom_journal
	geom/journal/g_journal_ufs.c optional geom_journal
	geom/label/g_label.c optional geom_label \| geom_label_gpt
	geom/label/g_label_ext2fs.c optional geom_label
	geom/label/g_label_iso9660.c optional geom_label
	geom/label/g_label_msdosfs.c optional geom_label
	geom/label/g_label_ntfs.c optional geom_label
	geom/label/g_label_reiserfs.c optional geom_label
	geom/label/g_label_ufs.c optional geom_label
	geom/label/g_label_gpt.c optional geom_label \| geom_label_gpt
	geom/label/g_label_disk_ident.c optional geom_label
	geom/linux_lvm/g_linux_lvm.c optional geom_linux_lvm
	geom/mirror/g_mirror.c optional geom_mirror
	geom/mirror/g_mirror_ctl.c optional geom_mirror
	geom/mountver/g_mountver.c optional geom_mountver
	geom/multipath/g_multipath.c optional geom_multipath
	geom/nop/g_nop.c optional geom_nop
	geom/part/g_part.c standard
	geom/part/g_part_if.m standard
	geom/part/g_part_apm.c optional geom_part_apm
	geom/part/g_part_bsd.c optional geom_part_bsd
	geom/part/g_part_bsd64.c optional geom_part_bsd64
	geom/part/g_part_ebr.c optional geom_part_ebr
	geom/part/g_part_gpt.c optional geom_part_gpt
	geom/part/g_part_ldm.c optional geom_part_ldm
	geom/part/g_part_mbr.c optional geom_part_mbr
	geom/part/g_part_pc98.c optional geom_part_pc98
	geom/part/g_part_vtoc8.c optional geom_part_vtoc8
	geom/raid/g_raid.c optional geom_raid
	geom/raid/g_raid_ctl.c optional geom_raid
	geom/raid/g_raid_md_if.m optional geom_raid
	geom/raid/g_raid_tr_if.m optional geom_raid
	geom/raid/md_ddf.c optional geom_raid
	geom/raid/md_intel.c optional geom_raid
	geom/raid/md_jmicron.c optional geom_raid
	geom/raid/md_nvidia.c optional geom_raid
	geom/raid/md_promise.c optional geom_raid
	geom/raid/md_sii.c optional geom_raid
	geom/raid/tr_concat.c optional geom_raid
	geom/raid/tr_raid0.c optional geom_raid
	geom/raid/tr_raid1.c optional geom_raid
	geom/raid/tr_raid1e.c optional geom_raid
	geom/raid/tr_raid5.c optional geom_raid
	geom/raid3/g_raid3.c optional geom_raid3
	geom/raid3/g_raid3_ctl.c optional geom_raid3
	geom/shsec/g_shsec.c optional geom_shsec
	geom/stripe/g_stripe.c optional geom_stripe
	contrib/xz-embedded/freebsd/xz_malloc.c \
	optional xz_embedded \| geom_uzip \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
	contrib/xz-embedded/linux/lib/xz/xz_crc32.c \
	optional xz_embedded \| geom_uzip \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
	contrib/xz-embedded/linux/lib/xz/xz_dec_bcj.c \
	optional xz_embedded \| geom_uzip \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
	contrib/xz-embedded/linux/lib/xz/xz_dec_lzma2.c \
	optional xz_embedded \| geom_uzip \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
	contrib/xz-embedded/linux/lib/xz/xz_dec_stream.c \
	optional xz_embedded \| geom_uzip \
	compile-with "${NORMAL_C} -I$S/contrib/xz-embedded/freebsd/ -I$S/contrib/xz-embedded/linux/lib/xz/ -I$S/contrib/xz-embedded/linux/include/linux/"
	geom/uzip/g_uzip.c optional geom_uzip
	geom/uzip/g_uzip_lzma.c optional geom_uzip
	geom/uzip/g_uzip_wrkthr.c optional geom_uzip
	geom/uzip/g_uzip_zlib.c optional geom_uzip
	geom/vinum/geom_vinum.c optional geom_vinum
	geom/vinum/geom_vinum_create.c optional geom_vinum
	geom/vinum/geom_vinum_drive.c optional geom_vinum
	geom/vinum/geom_vinum_plex.c optional geom_vinum
	geom/vinum/geom_vinum_volume.c optional geom_vinum
	geom/vinum/geom_vinum_subr.c optional geom_vinum
	geom/vinum/geom_vinum_raid5.c optional geom_vinum
	geom/vinum/geom_vinum_share.c optional geom_vinum
	geom/vinum/geom_vinum_list.c optional geom_vinum
	geom/vinum/geom_vinum_rm.c optional geom_vinum
	geom/vinum/geom_vinum_init.c optional geom_vinum
	geom/vinum/geom_vinum_state.c optional geom_vinum
	geom/vinum/geom_vinum_rename.c optional geom_vinum
	geom/vinum/geom_vinum_move.c optional geom_vinum
	geom/vinum/geom_vinum_events.c optional geom_vinum
	geom/virstor/binstream.c optional geom_virstor
	geom/virstor/g_virstor.c optional geom_virstor
	geom/virstor/g_virstor_md.c optional geom_virstor
	geom/zero/g_zero.c optional geom_zero
	fs/ext2fs/ext2_acl.c optional ext2fs
	fs/ext2fs/ext2_alloc.c optional ext2fs
	fs/ext2fs/ext2_balloc.c optional ext2fs
	fs/ext2fs/ext2_bmap.c optional ext2fs
	fs/ext2fs/ext2_csum.c optional ext2fs
	fs/ext2fs/ext2_extattr.c optional ext2fs
	fs/ext2fs/ext2_extents.c optional ext2fs
	fs/ext2fs/ext2_inode.c optional ext2fs
	fs/ext2fs/ext2_inode_cnv.c optional ext2fs
	fs/ext2fs/ext2_hash.c optional ext2fs
	fs/ext2fs/ext2_htree.c optional ext2fs
	fs/ext2fs/ext2_lookup.c optional ext2fs
	fs/ext2fs/ext2_subr.c optional ext2fs
	fs/ext2fs/ext2_vfsops.c optional ext2fs
	fs/ext2fs/ext2_vnops.c optional ext2fs
	#
	isa/isa_if.m standard
	isa/isa_common.c optional isa
	isa/isahint.c optional isa
	isa/pnp.c optional isa isapnp
	isa/pnpparse.c optional isa isapnp
	fs/cd9660/cd9660_bmap.c optional cd9660
	fs/cd9660/cd9660_lookup.c optional cd9660
	fs/cd9660/cd9660_node.c optional cd9660
	fs/cd9660/cd9660_rrip.c optional cd9660
	fs/cd9660/cd9660_util.c optional cd9660
	fs/cd9660/cd9660_vfsops.c optional cd9660
	fs/cd9660/cd9660_vnops.c optional cd9660
	fs/cd9660/cd9660_iconv.c optional cd9660_iconv
	kern/bus_if.m standard
	kern/clock_if.m standard
	kern/cpufreq_if.m standard
	kern/device_if.m standard
	kern/imgact_binmisc.c optional imagact_binmisc
	kern/imgact_elf.c standard
	kern/imgact_elf32.c optional compat_freebsd32
	kern/imgact_shell.c standard
	kern/inflate.c optional gzip
	kern/init_main.c standard
	kern/init_sysent.c standard
	kern/ksched.c optional _kposix_priority_scheduling
	kern/kern_acct.c standard
	kern/kern_alq.c optional alq
	kern/kern_clock.c standard
	kern/kern_condvar.c standard
	kern/kern_conf.c standard
	kern/kern_cons.c standard
	kern/kern_cpu.c standard
	kern/kern_cpuset.c standard
	kern/kern_context.c standard
	kern/kern_descrip.c standard
	kern/kern_dtrace.c optional kdtrace_hooks
	kern/kern_dump.c standard
	kern/kern_environment.c standard
	kern/kern_et.c standard
	kern/kern_event.c standard
	kern/kern_exec.c standard
	kern/kern_exit.c standard
	kern/kern_fail.c standard
	kern/kern_ffclock.c standard
	kern/kern_fork.c standard
	kern/kern_gzio.c optional gzio
	kern/kern_hhook.c standard
	kern/kern_idle.c standard
	kern/kern_intr.c standard
	kern/kern_jail.c standard
	kern/kern_khelp.c standard
	kern/kern_kthread.c standard
	kern/kern_ktr.c optional ktr
	kern/kern_ktrace.c standard
	kern/kern_linker.c standard
	kern/kern_lock.c standard
	kern/kern_lockf.c standard
	kern/kern_lockstat.c optional kdtrace_hooks
	kern/kern_loginclass.c standard
	kern/kern_malloc.c standard
	kern/kern_mbuf.c standard
	kern/kern_mib.c standard
	kern/kern_module.c standard
	kern/kern_mtxpool.c standard
	kern/kern_mutex.c standard
	kern/kern_ntptime.c standard
	kern/kern_numa.c standard
	kern/kern_osd.c standard
	kern/kern_physio.c standard
	kern/kern_pmc.c standard
	kern/kern_poll.c optional device_polling
	kern/kern_priv.c standard
	kern/kern_proc.c standard
	kern/kern_procctl.c standard
	kern/kern_prot.c standard
	kern/kern_racct.c standard
	kern/kern_rangelock.c standard
	kern/kern_rctl.c standard
	kern/kern_resource.c standard
	kern/kern_rmlock.c standard
	kern/kern_rwlock.c standard
	kern/kern_sdt.c optional kdtrace_hooks
	kern/kern_sema.c standard
	kern/kern_sendfile.c standard
	kern/kern_sharedpage.c standard
	kern/kern_shutdown.c standard
	kern/kern_sig.c standard
	kern/kern_switch.c standard
	kern/kern_sx.c standard
	kern/kern_synch.c standard
	kern/kern_syscalls.c standard
	kern/kern_sysctl.c standard
	kern/kern_tc.c standard
	kern/kern_thr.c standard
	kern/kern_thread.c standard
	kern/kern_time.c standard
	kern/kern_timeout.c standard
	kern/kern_umtx.c standard
	kern/kern_uuid.c standard
	kern/kern_xxx.c standard
	kern/link_elf.c standard
	kern/linker_if.m standard
	kern/md4c.c optional netsmb
	kern/md5c.c standard
	kern/p1003_1b.c standard
	kern/posix4_mib.c standard
	kern/sched_4bsd.c optional sched_4bsd
	kern/sched_ule.c optional sched_ule
	kern/serdev_if.m standard
	kern/stack_protector.c standard \
	compile-with "${NORMAL_C:N-fstack-protector*}"
	kern/subr_acl_nfs4.c optional ufs_acl \| zfs
	kern/subr_acl_posix1e.c optional ufs_acl
	kern/subr_autoconf.c standard
	kern/subr_blist.c standard
	kern/subr_bus.c standard
	kern/subr_bus_dma.c standard
	kern/subr_bufring.c standard
	kern/subr_capability.c standard
	kern/subr_clock.c standard
	kern/subr_counter.c standard
	kern/subr_devstat.c standard
	kern/subr_disk.c standard
	kern/subr_eventhandler.c standard
	kern/subr_fattime.c standard
	kern/subr_firmware.c optional firmware
	kern/subr_gtaskqueue.c standard
	kern/subr_hash.c standard
	kern/subr_hints.c standard
	kern/subr_kdb.c standard
	kern/subr_kobj.c standard
	kern/subr_lock.c standard
	kern/subr_log.c standard
	kern/subr_mbpool.c optional libmbpool
	kern/subr_mchain.c optional libmchain
	kern/subr_module.c standard
	kern/subr_msgbuf.c standard
	kern/subr_param.c standard
	kern/subr_pcpu.c standard
	kern/subr_pctrie.c standard
	kern/subr_power.c standard
	kern/subr_prf.c standard
	kern/subr_prof.c standard
	kern/subr_rman.c standard
	kern/subr_rtc.c standard
	kern/subr_sbuf.c standard
	kern/subr_scanf.c standard
	kern/subr_sglist.c standard
	kern/subr_sleepqueue.c standard
	kern/subr_smp.c standard
	kern/subr_stack.c optional ddb \| stack \| ktr
	kern/subr_taskqueue.c standard
	kern/subr_terminal.c optional vt
	kern/subr_trap.c standard
	kern/subr_turnstile.c standard
	kern/subr_uio.c standard
	kern/subr_unit.c standard
	kern/subr_vmem.c standard
	kern/subr_witness.c optional witness
	kern/sys_capability.c standard
	kern/sys_generic.c standard
	kern/sys_pipe.c standard
	kern/sys_procdesc.c standard
	kern/sys_process.c standard
	kern/sys_socket.c standard
	kern/syscalls.c standard
	kern/sysv_ipc.c standard
	kern/sysv_msg.c optional sysvmsg
	kern/sysv_sem.c optional sysvsem
	kern/sysv_shm.c optional sysvshm
	kern/tty.c standard
	kern/tty_compat.c optional compat_43tty
	kern/tty_info.c standard
	kern/tty_inq.c standard
	kern/tty_outq.c standard
	kern/tty_pts.c standard
	kern/tty_tty.c standard
	kern/tty_ttydisc.c standard
	kern/uipc_accf.c standard
	kern/uipc_debug.c optional ddb
	kern/uipc_domain.c standard
	kern/uipc_mbuf.c standard
	kern/uipc_mbuf2.c standard
	kern/uipc_mbufhash.c standard
	kern/uipc_mqueue.c optional p1003_1b_mqueue
	kern/uipc_sem.c optional p1003_1b_semaphores
	kern/uipc_shm.c standard
	kern/uipc_sockbuf.c standard
	kern/uipc_socket.c standard
	kern/uipc_syscalls.c standard
	kern/uipc_usrreq.c standard
	kern/vfs_acl.c standard
	kern/vfs_aio.c standard
	kern/vfs_bio.c standard
	kern/vfs_cache.c standard
	kern/vfs_cluster.c standard
	kern/vfs_default.c standard
	kern/vfs_export.c standard
	kern/vfs_extattr.c standard
	kern/vfs_hash.c standard
	kern/vfs_init.c standard
	kern/vfs_lookup.c standard
	kern/vfs_mount.c standard
	kern/vfs_mountroot.c standard
	kern/vfs_subr.c standard
	kern/vfs_syscalls.c standard
	kern/vfs_vnops.c standard
	#
	# Kernel GSS-API
	#
	gssd.h optional kgssapi \
	dependency "$S/kgssapi/gssd.x" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -hM $S/kgssapi/gssd.x \| grep -v pthread.h > gssd.h" \
	no-obj no-implicit-rule before-depend local \
	clean "gssd.h"
	gssd_xdr.c optional kgssapi \
	dependency "$S/kgssapi/gssd.x gssd.h" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -c $S/kgssapi/gssd.x -o gssd_xdr.c" \
	no-implicit-rule before-depend local \
	clean "gssd_xdr.c"
	gssd_clnt.c optional kgssapi \
	dependency "$S/kgssapi/gssd.x gssd.h" \
	compile-with "RPCGEN_CPP='${CPP}' rpcgen -lM $S/kgssapi/gssd.x \| grep -v string.h > gssd_clnt.c" \
	no-implicit-rule before-depend local \
	clean "gssd_clnt.c"
	kgssapi/gss_accept_sec_context.c optional kgssapi
	kgssapi/gss_add_oid_set_member.c optional kgssapi
	kgssapi/gss_acquire_cred.c optional kgssapi
	kgssapi/gss_canonicalize_name.c optional kgssapi
	kgssapi/gss_create_empty_oid_set.c optional kgssapi
	kgssapi/gss_delete_sec_context.c optional kgssapi
	kgssapi/gss_display_status.c optional kgssapi
	kgssapi/gss_export_name.c optional kgssapi
	kgssapi/gss_get_mic.c optional kgssapi
	kgssapi/gss_init_sec_context.c optional kgssapi
	kgssapi/gss_impl.c optional kgssapi
	kgssapi/gss_import_name.c optional kgssapi
	kgssapi/gss_names.c optional kgssapi
	kgssapi/gss_pname_to_uid.c optional kgssapi
	kgssapi/gss_release_buffer.c optional kgssapi
	kgssapi/gss_release_cred.c optional kgssapi
	kgssapi/gss_release_name.c optional kgssapi
	kgssapi/gss_release_oid_set.c optional kgssapi
	kgssapi/gss_set_cred_option.c optional kgssapi
	kgssapi/gss_test_oid_set_member.c optional kgssapi
	kgssapi/gss_unwrap.c optional kgssapi
	kgssapi/gss_verify_mic.c optional kgssapi
	kgssapi/gss_wrap.c optional kgssapi
	kgssapi/gss_wrap_size_limit.c optional kgssapi
	kgssapi/gssd_prot.c optional kgssapi
	kgssapi/krb5/krb5_mech.c optional kgssapi
	kgssapi/krb5/kcrypto.c optional kgssapi
	kgssapi/krb5/kcrypto_aes.c optional kgssapi
	kgssapi/krb5/kcrypto_arcfour.c optional kgssapi
	kgssapi/krb5/kcrypto_des.c optional kgssapi
	kgssapi/krb5/kcrypto_des3.c optional kgssapi
	kgssapi/kgss_if.m optional kgssapi
	kgssapi/gsstest.c optional kgssapi_debug
	# These files in libkern/ are those needed by all architectures. Some
	# of the files in libkern/ are only needed on some architectures, e.g.,
	# libkern/divdi3.c is needed by i386 but not alpha. Also, some of these
	# routines may be optimized for a particular platform. In either case,
	# the file should be moved to conf/files.<arch> from here.
	#
	libkern/arc4random.c standard
	libkern/asprintf.c standard
	libkern/bcd.c standard
	libkern/bsearch.c standard
	libkern/crc32.c standard
	libkern/explicit_bzero.c standard
	libkern/fnmatch.c standard
	libkern/iconv.c optional libiconv
	libkern/iconv_converter_if.m optional libiconv
	libkern/iconv_ucs.c optional libiconv
	libkern/iconv_xlat.c optional libiconv
	libkern/iconv_xlat16.c optional libiconv
	libkern/inet_aton.c standard
	libkern/inet_ntoa.c standard
	libkern/inet_ntop.c standard
	libkern/inet_pton.c standard
	libkern/jenkins_hash.c standard
	libkern/murmur3_32.c standard
	libkern/mcount.c optional profiling-routine
	libkern/memcchr.c standard
	libkern/memchr.c standard
	libkern/memcmp.c standard
	libkern/memmem.c optional gdb
	libkern/qsort.c standard
	libkern/qsort_r.c standard
	libkern/random.c standard
	libkern/scanc.c standard
	libkern/strcasecmp.c standard
	libkern/strcat.c standard
	libkern/strchr.c standard
	libkern/strcmp.c standard
	libkern/strcpy.c standard
	libkern/strcspn.c standard
	libkern/strdup.c standard
	libkern/strndup.c standard
	libkern/strlcat.c standard
	libkern/strlcpy.c standard
	libkern/strlen.c standard
	libkern/strncat.c standard
	libkern/strncmp.c standard
	libkern/strncpy.c standard
	libkern/strnlen.c standard
	libkern/strrchr.c standard
	libkern/strsep.c standard
	libkern/strspn.c standard
	libkern/strstr.c standard
	libkern/strtol.c standard
	libkern/strtoq.c standard
	libkern/strtoul.c standard
	libkern/strtouq.c standard
	libkern/strvalid.c standard
	libkern/timingsafe_bcmp.c standard
	libkern/zlib.c optional crypto \| geom_uzip \| ipsec \| \
	ipsec_support \| mxge \| netgraph_deflate \| ddb_ctf \| gzio
	net/altq/altq_cbq.c optional altq
	net/altq/altq_cdnr.c optional altq
	net/altq/altq_codel.c optional altq
	net/altq/altq_hfsc.c optional altq
	net/altq/altq_fairq.c optional altq
	net/altq/altq_priq.c optional altq
	net/altq/altq_red.c optional altq
	net/altq/altq_rio.c optional altq
	net/altq/altq_rmclass.c optional altq
	net/altq/altq_subr.c optional altq
	net/bpf.c standard
	net/bpf_buffer.c optional bpf
	net/bpf_jitter.c optional bpf_jitter
	net/bpf_filter.c optional bpf \| netgraph_bpf
	net/bpf_zerocopy.c optional bpf
	net/bridgestp.c optional bridge \| if_bridge
	net/flowtable.c optional flowtable inet \| flowtable inet6
	net/ieee8023ad_lacp.c optional lagg
	net/if.c standard
	net/if_arcsubr.c optional arcnet
	net/if_atmsubr.c optional atm
	net/if_bridge.c optional bridge inet \| if_bridge inet
	net/if_clone.c standard
	net/if_dead.c standard
	net/if_debug.c optional ddb
	net/if_disc.c optional disc
	net/if_edsc.c optional edsc
	net/if_enc.c optional enc inet \| enc inet6
	net/if_epair.c optional epair
	net/if_ethersubr.c optional ether
	net/if_fddisubr.c optional fddi
	net/if_fwsubr.c optional fwip
	net/if_gif.c optional gif inet \| gif inet6 \| \
	netgraph_gif inet \| netgraph_gif inet6
	net/if_gre.c optional gre inet \| gre inet6
	net/if_ipsec.c optional inet ipsec \| inet6 ipsec
	net/if_iso88025subr.c optional token
	net/if_lagg.c optional lagg
	net/if_loop.c optional loop
	net/if_llatbl.c standard
	net/if_me.c optional me inet
	net/if_media.c standard
	net/if_mib.c standard
	net/if_spppfr.c optional sppp \| netgraph_sppp
	net/if_spppsubr.c optional sppp \| netgraph_sppp
	net/if_stf.c optional stf inet inet6
	net/if_tun.c optional tun
	net/if_tap.c optional tap
	net/if_vlan.c optional vlan
	net/if_vxlan.c optional vxlan inet \| vxlan inet6
	net/ifdi_if.m optional ether pci
	net/iflib.c optional ether pci
	net/mp_ring.c optional ether
	net/mppcc.c optional netgraph_mppc_compression
	net/mppcd.c optional netgraph_mppc_compression
	net/netisr.c standard
	net/pfil.c optional ether \| inet
	net/radix.c standard
	net/radix_mpath.c standard
	net/raw_cb.c standard
	net/raw_usrreq.c standard
	net/route.c standard
	net/rss_config.c optional inet rss \| inet6 rss
	net/rtsock.c standard
	net/slcompress.c optional netgraph_vjc \| sppp \| \
	netgraph_sppp
	net/toeplitz.c optional inet rss \| inet6 rss
	net/vnet.c optional vimage
	net80211/ieee80211.c optional wlan
	net80211/ieee80211_acl.c optional wlan wlan_acl
	net80211/ieee80211_action.c optional wlan
	net80211/ieee80211_ageq.c optional wlan
	net80211/ieee80211_adhoc.c optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
	net80211/ieee80211_ageq.c optional wlan
	net80211/ieee80211_amrr.c optional wlan \| wlan_amrr
	net80211/ieee80211_crypto.c optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
	net80211/ieee80211_crypto_ccmp.c optional wlan wlan_ccmp
	net80211/ieee80211_crypto_none.c optional wlan
	net80211/ieee80211_crypto_tkip.c optional wlan wlan_tkip
	net80211/ieee80211_crypto_wep.c optional wlan wlan_wep
	net80211/ieee80211_ddb.c optional wlan ddb
	net80211/ieee80211_dfs.c optional wlan
	net80211/ieee80211_freebsd.c optional wlan
	net80211/ieee80211_hostap.c optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
	net80211/ieee80211_ht.c optional wlan
	net80211/ieee80211_hwmp.c optional wlan ieee80211_support_mesh
	net80211/ieee80211_input.c optional wlan
	net80211/ieee80211_ioctl.c optional wlan
	net80211/ieee80211_mesh.c optional wlan ieee80211_support_mesh \
	compile-with "${NORMAL_C} -Wno-unused-function"
	net80211/ieee80211_monitor.c optional wlan
	net80211/ieee80211_node.c optional wlan
	net80211/ieee80211_output.c optional wlan
	net80211/ieee80211_phy.c optional wlan
	net80211/ieee80211_power.c optional wlan
	net80211/ieee80211_proto.c optional wlan
	net80211/ieee80211_radiotap.c optional wlan
	net80211/ieee80211_ratectl.c optional wlan
	net80211/ieee80211_ratectl_none.c optional wlan
	net80211/ieee80211_regdomain.c optional wlan
	net80211/ieee80211_rssadapt.c optional wlan wlan_rssadapt
	net80211/ieee80211_scan.c optional wlan
	net80211/ieee80211_scan_sta.c optional wlan
	net80211/ieee80211_sta.c optional wlan \
	compile-with "${NORMAL_C} -Wno-unused-function"
	net80211/ieee80211_superg.c optional wlan ieee80211_support_superg
	net80211/ieee80211_scan_sw.c optional wlan
	net80211/ieee80211_tdma.c optional wlan ieee80211_support_tdma
	net80211/ieee80211_wds.c optional wlan
	net80211/ieee80211_xauth.c optional wlan wlan_xauth
	net80211/ieee80211_alq.c optional wlan ieee80211_alq
	netgraph/atm/ccatm/ng_ccatm.c optional ngatm_ccatm \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	netgraph/atm/ng_atm.c optional ngatm_atm
	netgraph/atm/ngatmbase.c optional ngatm_atmbase \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	netgraph/atm/sscfu/ng_sscfu.c optional ngatm_sscfu \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	netgraph/atm/sscop/ng_sscop.c optional ngatm_sscop \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	netgraph/atm/uni/ng_uni.c optional ngatm_uni \
	compile-with "${NORMAL_C} -I$S/contrib/ngatm"
	netgraph/bluetooth/common/ng_bluetooth.c optional netgraph_bluetooth
	netgraph/bluetooth/drivers/bt3c/ng_bt3c_pccard.c optional netgraph_bluetooth_bt3c
	netgraph/bluetooth/drivers/h4/ng_h4.c optional netgraph_bluetooth_h4
	netgraph/bluetooth/drivers/ubt/ng_ubt.c optional netgraph_bluetooth_ubt usb
	netgraph/bluetooth/drivers/ubtbcmfw/ubtbcmfw.c optional netgraph_bluetooth_ubtbcmfw usb
	netgraph/bluetooth/hci/ng_hci_cmds.c optional netgraph_bluetooth_hci
	netgraph/bluetooth/hci/ng_hci_evnt.c optional netgraph_bluetooth_hci
	netgraph/bluetooth/hci/ng_hci_main.c optional netgraph_bluetooth_hci
	netgraph/bluetooth/hci/ng_hci_misc.c optional netgraph_bluetooth_hci
	netgraph/bluetooth/hci/ng_hci_ulpi.c optional netgraph_bluetooth_hci
	netgraph/bluetooth/l2cap/ng_l2cap_cmds.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/l2cap/ng_l2cap_evnt.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/l2cap/ng_l2cap_llpi.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/l2cap/ng_l2cap_main.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/l2cap/ng_l2cap_misc.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/l2cap/ng_l2cap_ulpi.c optional netgraph_bluetooth_l2cap
	netgraph/bluetooth/socket/ng_btsocket.c optional netgraph_bluetooth_socket
	netgraph/bluetooth/socket/ng_btsocket_hci_raw.c optional netgraph_bluetooth_socket
	netgraph/bluetooth/socket/ng_btsocket_l2cap.c optional netgraph_bluetooth_socket
	netgraph/bluetooth/socket/ng_btsocket_l2cap_raw.c optional netgraph_bluetooth_socket
	netgraph/bluetooth/socket/ng_btsocket_rfcomm.c optional netgraph_bluetooth_socket
	netgraph/bluetooth/socket/ng_btsocket_sco.c optional netgraph_bluetooth_socket
	netgraph/netflow/netflow.c optional netgraph_netflow
	netgraph/netflow/netflow_v9.c optional netgraph_netflow
	netgraph/netflow/ng_netflow.c optional netgraph_netflow
	netgraph/ng_UI.c optional netgraph_UI
	netgraph/ng_async.c optional netgraph_async
	netgraph/ng_atmllc.c optional netgraph_atmllc
	netgraph/ng_base.c optional netgraph
	netgraph/ng_bpf.c optional netgraph_bpf
	netgraph/ng_bridge.c optional netgraph_bridge
	netgraph/ng_car.c optional netgraph_car
	netgraph/ng_cisco.c optional netgraph_cisco
	netgraph/ng_deflate.c optional netgraph_deflate
	netgraph/ng_device.c optional netgraph_device
	netgraph/ng_echo.c optional netgraph_echo
	netgraph/ng_eiface.c optional netgraph_eiface
	netgraph/ng_ether.c optional netgraph_ether
	netgraph/ng_ether_echo.c optional netgraph_ether_echo
	netgraph/ng_frame_relay.c optional netgraph_frame_relay
	netgraph/ng_gif.c optional netgraph_gif inet6 \| netgraph_gif inet
	netgraph/ng_gif_demux.c optional netgraph_gif_demux
	netgraph/ng_hole.c optional netgraph_hole
	netgraph/ng_iface.c optional netgraph_iface
	netgraph/ng_ip_input.c optional netgraph_ip_input
	netgraph/ng_ipfw.c optional netgraph_ipfw inet ipfirewall
	netgraph/ng_ksocket.c optional netgraph_ksocket
	netgraph/ng_l2tp.c optional netgraph_l2tp
	netgraph/ng_lmi.c optional netgraph_lmi
	netgraph/ng_mppc.c optional netgraph_mppc_compression \| \
	netgraph_mppc_encryption
	netgraph/ng_nat.c optional netgraph_nat inet libalias
	netgraph/ng_one2many.c optional netgraph_one2many
	netgraph/ng_parse.c optional netgraph
	netgraph/ng_patch.c optional netgraph_patch
	netgraph/ng_pipe.c optional netgraph_pipe
	netgraph/ng_ppp.c optional netgraph_ppp
	netgraph/ng_pppoe.c optional netgraph_pppoe
	netgraph/ng_pptpgre.c optional netgraph_pptpgre
	netgraph/ng_pred1.c optional netgraph_pred1
	netgraph/ng_rfc1490.c optional netgraph_rfc1490
	netgraph/ng_socket.c optional netgraph_socket
	netgraph/ng_split.c optional netgraph_split
	netgraph/ng_sppp.c optional netgraph_sppp
	netgraph/ng_tag.c optional netgraph_tag
	netgraph/ng_tcpmss.c optional netgraph_tcpmss
	netgraph/ng_tee.c optional netgraph_tee
	netgraph/ng_tty.c optional netgraph_tty
	netgraph/ng_vjc.c optional netgraph_vjc
	netgraph/ng_vlan.c optional netgraph_vlan
	netinet/accf_data.c optional accept_filter_data inet
	netinet/accf_dns.c optional accept_filter_dns inet
	netinet/accf_http.c optional accept_filter_http inet
	netinet/if_atm.c optional atm
	netinet/if_ether.c optional inet ether
	netinet/igmp.c optional inet
	netinet/in.c optional inet
	netinet/in_debug.c optional inet ddb
	netinet/in_kdtrace.c optional inet \| inet6
	netinet/ip_carp.c optional inet carp \| inet6 carp
	netinet/in_fib.c optional inet
	netinet/in_gif.c optional gif inet \| netgraph_gif inet
	netinet/ip_gre.c optional gre inet
	netinet/ip_id.c optional inet
	netinet/in_jail.c optional inet
	netinet/in_mcast.c optional inet
	netinet/in_pcb.c optional inet \| inet6
	netinet/in_pcbgroup.c optional inet pcbgroup \| inet6 pcbgroup
	netinet/in_proto.c optional inet \| inet6
	netinet/in_rmx.c optional inet
	netinet/in_rss.c optional inet rss
	netinet/ip_divert.c optional inet ipdivert ipfirewall
	netinet/ip_ecn.c optional inet \| inet6
	netinet/ip_encap.c optional inet \| inet6
	netinet/ip_fastfwd.c optional inet
	netinet/ip_icmp.c optional inet \| inet6
	netinet/ip_input.c optional inet
	netinet/ip_mroute.c optional mrouting inet
	netinet/ip_options.c optional inet
	netinet/ip_output.c optional inet
	netinet/ip_reass.c optional inet
	netinet/raw_ip.c optional inet \| inet6
	netinet/cc/cc.c optional inet \| inet6
	netinet/cc/cc_newreno.c optional inet \| inet6
	netinet/sctp_asconf.c optional inet sctp \| inet6 sctp
	netinet/sctp_auth.c optional inet sctp \| inet6 sctp
	netinet/sctp_bsd_addr.c optional inet sctp \| inet6 sctp
	netinet/sctp_cc_functions.c optional inet sctp \| inet6 sctp
	netinet/sctp_crc32.c optional inet \| inet6
	netinet/sctp_indata.c optional inet sctp \| inet6 sctp
	netinet/sctp_input.c optional inet sctp \| inet6 sctp
	netinet/sctp_output.c optional inet sctp \| inet6 sctp
	netinet/sctp_pcb.c optional inet sctp \| inet6 sctp
	netinet/sctp_peeloff.c optional inet sctp \| inet6 sctp
	netinet/sctp_ss_functions.c optional inet sctp \| inet6 sctp
	netinet/sctp_syscalls.c optional inet sctp \| inet6 sctp
	netinet/sctp_sysctl.c optional inet sctp \| inet6 sctp
	netinet/sctp_timer.c optional inet sctp \| inet6 sctp
	netinet/sctp_usrreq.c optional inet sctp \| inet6 sctp
	netinet/sctputil.c optional inet sctp \| inet6 sctp
	netinet/siftr.c optional inet siftr alq \| inet6 siftr alq
	netinet/tcp_debug.c optional tcpdebug
	netinet/tcp_fastopen.c optional inet tcp_rfc7413 \| inet6 tcp_rfc7413
	netinet/tcp_hostcache.c optional inet \| inet6
	netinet/tcp_input.c optional inet \| inet6
	netinet/tcp_lro.c optional inet \| inet6
	netinet/tcp_output.c optional inet \| inet6
	netinet/tcp_offload.c optional tcp_offload inet \| tcp_offload inet6
	netinet/tcp_pcap.c optional inet tcppcap \| inet6 tcppcap
	netinet/tcp_reass.c optional inet \| inet6
	netinet/tcp_sack.c optional inet \| inet6
	netinet/tcp_subr.c optional inet \| inet6
	netinet/tcp_syncache.c optional inet \| inet6
	netinet/tcp_timer.c optional inet \| inet6
	netinet/tcp_timewait.c optional inet \| inet6
	netinet/tcp_usrreq.c optional inet \| inet6
	netinet/udp_usrreq.c optional inet \| inet6
	netinet/libalias/alias.c optional libalias inet \| netgraph_nat inet
	netinet/libalias/alias_db.c optional libalias inet \| netgraph_nat inet
	netinet/libalias/alias_mod.c optional libalias \| netgraph_nat
	netinet/libalias/alias_proxy.c optional libalias inet \| netgraph_nat inet
	netinet/libalias/alias_util.c optional libalias inet \| netgraph_nat inet
	netinet/libalias/alias_sctp.c optional libalias inet \| netgraph_nat inet
	netinet6/dest6.c optional inet6
	netinet6/frag6.c optional inet6
	netinet6/icmp6.c optional inet6
	netinet6/in6.c optional inet6
	netinet6/in6_cksum.c optional inet6
	netinet6/in6_fib.c optional inet6
	netinet6/in6_gif.c optional gif inet6 \| netgraph_gif inet6
	netinet6/in6_ifattach.c optional inet6
	netinet6/in6_jail.c optional inet6
	netinet6/in6_mcast.c optional inet6
	netinet6/in6_pcb.c optional inet6
	netinet6/in6_pcbgroup.c optional inet6 pcbgroup
	netinet6/in6_proto.c optional inet6
	netinet6/in6_rmx.c optional inet6
	netinet6/in6_rss.c optional inet6 rss
	netinet6/in6_src.c optional inet6
	netinet6/ip6_fastfwd.c optional inet6
	netinet6/ip6_forward.c optional inet6
	netinet6/ip6_gre.c optional gre inet6
	netinet6/ip6_id.c optional inet6
	netinet6/ip6_input.c optional inet6
	netinet6/ip6_mroute.c optional mrouting inet6
	netinet6/ip6_output.c optional inet6
	netinet6/mld6.c optional inet6
	netinet6/nd6.c optional inet6
	netinet6/nd6_nbr.c optional inet6
	netinet6/nd6_rtr.c optional inet6
	netinet6/raw_ip6.c optional inet6
	netinet6/route6.c optional inet6
	netinet6/scope6.c optional inet6
	netinet6/sctp6_usrreq.c optional inet6 sctp
	netinet6/udp6_usrreq.c optional inet6
	netipsec/ipsec.c optional ipsec inet \| ipsec inet6
	netipsec/ipsec_input.c optional ipsec inet \| ipsec inet6
	netipsec/ipsec_mbuf.c optional ipsec inet \| ipsec inet6
	netipsec/ipsec_mod.c optional ipsec inet \| ipsec inet6
	netipsec/ipsec_output.c optional ipsec inet \| ipsec inet6
	netipsec/ipsec_pcb.c optional ipsec inet \| ipsec inet6 \| \
	ipsec_support inet \| ipsec_support inet6
	netipsec/key.c optional ipsec inet \| ipsec inet6 \| \
	ipsec_support inet \| ipsec_support inet6
	netipsec/key_debug.c optional ipsec inet \| ipsec inet6 \| \
	ipsec_support inet \| ipsec_support inet6
	netipsec/keysock.c optional ipsec inet \| ipsec inet6 \| \
	ipsec_support inet \| ipsec_support inet6
	netipsec/subr_ipsec.c optional ipsec inet \| ipsec inet6 \| \
	ipsec_support inet \| ipsec_support inet6
	netipsec/udpencap.c optional ipsec inet
	netipsec/xform_ah.c optional ipsec inet \| ipsec inet6
	netipsec/xform_esp.c optional ipsec inet \| ipsec inet6
	netipsec/xform_ipcomp.c optional ipsec inet \| ipsec inet6
	netipsec/xform_tcp.c optional ipsec inet tcp_signature \| \
	ipsec inet6 tcp_signature \| ipsec_support inet tcp_signature \| \
	ipsec_support inet6 tcp_signature
	netnatm/natm.c optional natm
	netnatm/natm_pcb.c optional natm
	netnatm/natm_proto.c optional natm
	netpfil/ipfw/dn_aqm_codel.c optional inet dummynet
	netpfil/ipfw/dn_aqm_pie.c optional inet dummynet
	netpfil/ipfw/dn_heap.c optional inet dummynet
	netpfil/ipfw/dn_sched_fifo.c optional inet dummynet
	netpfil/ipfw/dn_sched_fq_codel.c optional inet dummynet
	netpfil/ipfw/dn_sched_fq_pie.c optional inet dummynet
	netpfil/ipfw/dn_sched_prio.c optional inet dummynet
	netpfil/ipfw/dn_sched_qfq.c optional inet dummynet
	netpfil/ipfw/dn_sched_rr.c optional inet dummynet
	netpfil/ipfw/dn_sched_wf2q.c optional inet dummynet
	netpfil/ipfw/ip_dummynet.c optional inet dummynet
	netpfil/ipfw/ip_dn_io.c optional inet dummynet
	netpfil/ipfw/ip_dn_glue.c optional inet dummynet
	netpfil/ipfw/ip_fw2.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \
	compile-with "${NORMAL_C} -I$S/contrib/ck/include"
	netpfil/ipfw/ip_fw_eaction.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_log.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_pfil.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_sockopt.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_table.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_table_algo.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_table_value.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_iface.c optional inet ipfirewall
	netpfil/ipfw/ip_fw_nat.c optional inet ipfirewall_nat
	netpfil/ipfw/nat64/ip_fw_nat64.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nat64/nat64lsn.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nat64/nat64lsn_control.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nat64/nat64stl.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nat64/nat64stl_control.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nat64/nat64_translate.c optional inet inet6 ipfirewall \
	ipfirewall_nat64
	netpfil/ipfw/nptv6/ip_fw_nptv6.c optional inet inet6 ipfirewall \
	ipfirewall_nptv6
	netpfil/ipfw/nptv6/nptv6.c optional inet inet6 ipfirewall \
	ipfirewall_nptv6
	netpfil/ipfw/pmod/ip_fw_pmod.c optional inet ipfirewall_pmod
	netpfil/ipfw/pmod/tcpmod.c optional inet ipfirewall_pmod
	netpfil/pf/if_pflog.c optional pflog pf inet
	netpfil/pf/if_pfsync.c optional pfsync pf inet
	netpfil/pf/pf.c optional pf inet
	netpfil/pf/pf_if.c optional pf inet
	netpfil/pf/pf_ioctl.c optional pf inet
	netpfil/pf/pf_lb.c optional pf inet
	netpfil/pf/pf_norm.c optional pf inet
	netpfil/pf/pf_osfp.c optional pf inet
	netpfil/pf/pf_ruleset.c optional pf inet
	netpfil/pf/pf_table.c optional pf inet
	netpfil/pf/in4_cksum.c optional pf inet
	netsmb/smb_conn.c optional netsmb
	netsmb/smb_crypt.c optional netsmb
	netsmb/smb_dev.c optional netsmb
	netsmb/smb_iod.c optional netsmb
	netsmb/smb_rq.c optional netsmb
	netsmb/smb_smb.c optional netsmb
	netsmb/smb_subr.c optional netsmb
	netsmb/smb_trantcp.c optional netsmb
	netsmb/smb_usr.c optional netsmb
	nfs/bootp_subr.c optional bootp nfscl
	nfs/krpc_subr.c optional bootp nfscl
	nfs/nfs_diskless.c optional nfscl nfs_root
	nfs/nfs_fha.c optional nfsd
	nfs/nfs_lock.c optional nfscl \| nfslockd \| nfsd
	nfs/nfs_nfssvc.c optional nfscl \| nfsd
	nlm/nlm_advlock.c optional nfslockd \| nfsd
	nlm/nlm_prot_clnt.c optional nfslockd \| nfsd
	nlm/nlm_prot_impl.c optional nfslockd \| nfsd
	nlm/nlm_prot_server.c optional nfslockd \| nfsd
	nlm/nlm_prot_svc.c optional nfslockd \| nfsd
	nlm/nlm_prot_xdr.c optional nfslockd \| nfsd
	nlm/sm_inter_xdr.c optional nfslockd \| nfsd

	# Linux Kernel Programming Interface
	compat/linuxkpi/common/src/linux_kmod.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_compat.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_current.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_hrtimer.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_kthread.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_lock.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_page.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_pci.c optional compat_linuxkpi pci \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_tasklet.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_idr.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_radix.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_rcu.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C} -I$S/contrib/ck/include"
	compat/linuxkpi/common/src/linux_schedule.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_slab.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_usb.c optional compat_linuxkpi usb \
	compile-with "${LINUXKPI_C}"
	compat/linuxkpi/common/src/linux_work.c optional compat_linuxkpi \
	compile-with "${LINUXKPI_C}"

	# OpenFabrics Enterprise Distribution (Infiniband)
	ofed/drivers/infiniband/core/ib_addr.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_agent.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_cache.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_cm.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_cma.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_cq.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_device.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_fmr_pool.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_iwcm.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_iwpm_msg.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_iwpm_util.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_mad.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_mad_rmpp.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_multicast.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_packer.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_sa_query.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_smi.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_sysfs.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_ucm.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_ucma.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_ud_header.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_umem.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_user_mad.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_uverbs_cmd.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_uverbs_main.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_uverbs_marshall.c optional ofed \
	compile-with "${OFED_C}"
	ofed/drivers/infiniband/core/ib_verbs.c optional ofed \
	compile-with "${OFED_C}"

	ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c optional ipoib \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	#ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c optional ipoib \
	# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c optional ipoib \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c optional ipoib \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c optional ipoib \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c optional ipoib \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"
	#ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c optional ipoib \
	# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/"

	ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c optional sdp inet \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
	ofed/drivers/infiniband/ulp/sdp/sdp_main.c optional sdp inet \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
	ofed/drivers/infiniband/ulp/sdp/sdp_rx.c optional sdp inet \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
	ofed/drivers/infiniband/ulp/sdp/sdp_cma.c optional sdp inet \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"
	ofed/drivers/infiniband/ulp/sdp/sdp_tx.c optional sdp inet \
	compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/"

	dev/mthca/mthca_allocator.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_av.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_catas.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_cmd.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_cq.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_eq.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_mad.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_main.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_mcg.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_memfree.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_mr.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_pd.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_profile.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_provider.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_qp.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_reset.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_srq.c optional mthca pci ofed \
	compile-with "${OFED_C}"
	dev/mthca/mthca_uar.c optional mthca pci ofed \
	compile-with "${OFED_C}"

	dev/mlx4/mlx4_ib/mlx4_ib_alias_GUID.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_mcg.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_cm.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_ah.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_cq.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_doorbell.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_mad.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_main.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_mr.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_qp.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_srq.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_ib/mlx4_ib_wc.c optional mlx4ib pci ofed \
	compile-with "${OFED_C}"

	dev/mlx4/mlx4_core/mlx4_alloc.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_catas.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_cmd.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_cq.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_eq.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_fw.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_fw_qos.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_icm.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_intf.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_main.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_mcg.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_mr.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_pd.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_port.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_profile.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_qp.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_reset.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_sense.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_srq.c optional mlx4 pci \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_core/mlx4_resource_tracker.c optional mlx4 pci \
	compile-with "${OFED_C}"

	dev/mlx4/mlx4_en/mlx4_en_cq.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_main.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_netdev.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_port.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_resources.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_rx.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx4/mlx4_en/mlx4_en_tx.c optional mlx4en pci inet inet6 \
	compile-with "${OFED_C}"

	dev/mlx5/mlx5_ib/mlx5_ib_ah.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_cong.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_cq.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_gsi.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_mad.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_main.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_mem.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_mr.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_qp.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_srq.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_ib/mlx5_ib_virt.c optional mlx5ib pci ofed \
	compile-with "${OFED_C}"

	dev/mlx5/mlx5_core/mlx5_alloc.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_cmd.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_cq.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_crspace.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_diagnostics.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_eq.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_fs_cmd.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_fs_tree.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_fw.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_fwdump.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_fwdump_regmaps.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_health.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_mad.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_main.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_mcg.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_mr.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_pagealloc.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_pd.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_port.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_qp.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_srq.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_transobj.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_uar.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_vport.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_vsc.c optional mlx5 pci \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_core/mlx5_wq.c optional mlx5 pci \
	compile-with "${OFED_C}"

	dev/mlx5/mlx5_en/mlx5_en_ethtool.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_en/mlx5_en_main.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_en/mlx5_en_tx.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_en/mlx5_en_flow_table.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_en/mlx5_en_rx.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"
	dev/mlx5/mlx5_en/mlx5_en_txrx.c optional mlx5en pci inet inet6 \
	compile-with "${OFED_C}"

	# crypto support
	opencrypto/cast.c optional crypto \| ipsec \| ipsec_support
	opencrypto/criov.c optional crypto \| ipsec \| ipsec_support
	opencrypto/crypto.c optional crypto \| ipsec \| ipsec_support
	opencrypto/cryptodev.c optional cryptodev
	opencrypto/cryptodev_if.m optional crypto \| ipsec \| ipsec_support
	opencrypto/cryptosoft.c optional crypto \| ipsec \| ipsec_support
	opencrypto/cryptodeflate.c optional crypto \| ipsec \| ipsec_support
	opencrypto/gmac.c optional crypto \| ipsec \| ipsec_support
	opencrypto/gfmult.c optional crypto \| ipsec \| ipsec_support
	opencrypto/rmd160.c optional crypto \| ipsec \| ipsec_support
	opencrypto/skipjack.c optional crypto \| ipsec \| ipsec_support
	opencrypto/xform.c optional crypto \| ipsec \| ipsec_support
	rpc/auth_none.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/auth_unix.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/authunix_prot.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/clnt_bck.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/clnt_dg.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/clnt_rc.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/clnt_vc.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/getnetconfig.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/replay.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpc_callmsg.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpc_generic.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpc_prot.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpcb_clnt.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpcb_prot.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc_auth.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc_auth_unix.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc_dg.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc_generic.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/svc_vc.c optional krpc \| nfslockd \| nfscl \| nfsd
	rpc/rpcsec_gss/rpcsec_gss.c optional krpc kgssapi \| nfslockd kgssapi \| nfscl kgssapi \| nfsd kgssapi
	rpc/rpcsec_gss/rpcsec_gss_conf.c optional krpc kgssapi \| nfslockd kgssapi \| nfscl kgssapi \| nfsd kgssapi
	rpc/rpcsec_gss/rpcsec_gss_misc.c optional krpc kgssapi \| nfslockd kgssapi \| nfscl kgssapi \| nfsd kgssapi
	rpc/rpcsec_gss/rpcsec_gss_prot.c optional krpc kgssapi \| nfslockd kgssapi \| nfscl kgssapi \| nfsd kgssapi
	rpc/rpcsec_gss/svc_rpcsec_gss.c optional krpc kgssapi \| nfslockd kgssapi \| nfscl kgssapi \| nfsd kgssapi
	security/audit/audit.c optional audit
	security/audit/audit_arg.c optional audit
	security/audit/audit_bsm.c optional audit
	security/audit/audit_bsm_klib.c optional audit
	security/audit/audit_pipe.c optional audit
	security/audit/audit_syscalls.c standard
	security/audit/audit_trigger.c optional audit
	security/audit/audit_worker.c optional audit
	security/audit/bsm_domain.c optional audit
	security/audit/bsm_errno.c optional audit
	security/audit/bsm_fcntl.c optional audit
	security/audit/bsm_socket_type.c optional audit
	security/audit/bsm_token.c optional audit
	security/mac/mac_audit.c optional mac audit
	security/mac/mac_cred.c optional mac
	security/mac/mac_framework.c optional mac
	security/mac/mac_inet.c optional mac inet \| mac inet6
	security/mac/mac_inet6.c optional mac inet6
	security/mac/mac_label.c optional mac
	security/mac/mac_net.c optional mac
	security/mac/mac_pipe.c optional mac
	security/mac/mac_posix_sem.c optional mac
	security/mac/mac_posix_shm.c optional mac
	security/mac/mac_priv.c optional mac
	security/mac/mac_process.c optional mac
	security/mac/mac_socket.c optional mac
	security/mac/mac_syscalls.c standard
	security/mac/mac_system.c optional mac
	security/mac/mac_sysv_msg.c optional mac
	security/mac/mac_sysv_sem.c optional mac
	security/mac/mac_sysv_shm.c optional mac
	security/mac/mac_vfs.c optional mac
	security/mac_biba/mac_biba.c optional mac_biba
	security/mac_bsdextended/mac_bsdextended.c optional mac_bsdextended
	security/mac_bsdextended/ugidfw_system.c optional mac_bsdextended
	security/mac_bsdextended/ugidfw_vnode.c optional mac_bsdextended
	security/mac_ifoff/mac_ifoff.c optional mac_ifoff
	security/mac_lomac/mac_lomac.c optional mac_lomac
	security/mac_mls/mac_mls.c optional mac_mls
	security/mac_none/mac_none.c optional mac_none
	security/mac_partition/mac_partition.c optional mac_partition
	security/mac_portacl/mac_portacl.c optional mac_portacl
	security/mac_seeotheruids/mac_seeotheruids.c optional mac_seeotheruids
	security/mac_stub/mac_stub.c optional mac_stub
	security/mac_test/mac_test.c optional mac_test
	teken/teken.c optional sc \| vt
	ufs/ffs/ffs_alloc.c optional ffs
	ufs/ffs/ffs_balloc.c optional ffs
	ufs/ffs/ffs_inode.c optional ffs
	ufs/ffs/ffs_snapshot.c optional ffs
	ufs/ffs/ffs_softdep.c optional ffs
	ufs/ffs/ffs_subr.c optional ffs
	ufs/ffs/ffs_tables.c optional ffs
	ufs/ffs/ffs_vfsops.c optional ffs
	ufs/ffs/ffs_vnops.c optional ffs
	ufs/ffs/ffs_rawread.c optional ffs directio
	ufs/ffs/ffs_suspend.c optional ffs
	ufs/ufs/ufs_acl.c optional ffs
	ufs/ufs/ufs_bmap.c optional ffs
	ufs/ufs/ufs_dirhash.c optional ffs
	ufs/ufs/ufs_extattr.c optional ffs
	ufs/ufs/ufs_gjournal.c optional ffs UFS_GJOURNAL
	ufs/ufs/ufs_inode.c optional ffs
	ufs/ufs/ufs_lookup.c optional ffs
	ufs/ufs/ufs_quota.c optional ffs
	ufs/ufs/ufs_vfsops.c optional ffs
	ufs/ufs/ufs_vnops.c optional ffs
	vm/default_pager.c standard
	vm/device_pager.c standard
	vm/phys_pager.c standard
	vm/redzone.c optional DEBUG_REDZONE
	vm/sg_pager.c standard
	vm/swap_pager.c standard
	vm/uma_core.c standard
	vm/uma_dbg.c standard
	vm/memguard.c optional DEBUG_MEMGUARD
	vm/vm_domain.c standard
	vm/vm_fault.c standard
	vm/vm_glue.c standard
	vm/vm_init.c standard
	vm/vm_kern.c standard
	vm/vm_map.c standard
	vm/vm_meter.c standard
	vm/vm_mmap.c standard
	vm/vm_object.c standard
	vm/vm_page.c standard
	vm/vm_pageout.c standard
	vm/vm_pager.c standard
	vm/vm_phys.c standard
	vm/vm_radix.c standard
	vm/vm_reserv.c standard
	vm/vm_swapout.c optional !NO_SWAPPING
	vm/vm_swapout_dummy.c optional NO_SWAPPING
	vm/vm_unix.c standard
	vm/vm_zeroidle.c standard
	vm/vnode_pager.c standard
	xen/features.c optional xenhvm
	xen/xenbus/xenbus_if.m optional xenhvm
	xen/xenbus/xenbus.c optional xenhvm
	xen/xenbus/xenbusb_if.m optional xenhvm
	xen/xenbus/xenbusb.c optional xenhvm
	xen/xenbus/xenbusb_front.c optional xenhvm
	xen/xenbus/xenbusb_back.c optional xenhvm
	xen/xenmem/xenmem_if.m optional xenhvm
	xdr/xdr.c optional krpc \| nfslockd \| nfscl \| nfsd
	xdr/xdr_array.c optional krpc \| nfslockd \| nfscl \| nfsd
	xdr/xdr_mbuf.c optional krpc \| nfslockd \| nfscl \| nfsd
	xdr/xdr_mem.c optional krpc \| nfslockd \| nfscl \| nfsd
	xdr/xdr_reference.c optional krpc \| nfslockd \| nfscl \| nfsd
	xdr/xdr_sizeof.c optional krpc \| nfslockd \| nfscl \| nfsd
	Index: stable/11
	===================================================================
	--- stable/11 (revision 332524)
	+++ stable/11 (revision 332525)

	Property changes on: stable/11
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head:r329732

File Metadata

Mime Type: application/octet-stream
Expires: Mon, May 6, 10:42 AM (2 d)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: 2LpEwBq4K3Gd
Default Alt Text: (4 MB)

Offset	End	Complete
0	4194304	Yes
4194304	4203124	Yes

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions