diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index e1b1ae3ffff7..46dd903f8449 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -1,7250 +1,7255 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  */
 
 /*
  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  * that runs entirely in userland, is easy to use, and easy to extend.
  *
  * The overall design of the ztest program is as follows:
  *
  * (1) For each major functional area (e.g. adding vdevs to a pool,
  *     creating and destroying datasets, reading and writing objects, etc)
  *     we have a simple routine to test that functionality.  These
  *     individual routines do not have to do anything "stressful".
  *
  * (2) We turn these simple functionality tests into a stress test by
  *     running them all in parallel, with as many threads as desired,
  *     and spread across as many datasets, objects, and vdevs as desired.
  *
  * (3) While all this is happening, we inject faults into the pool to
  *     verify that self-healing data really works.
  *
  * (4) Every time we open a dataset, we change its checksum and compression
  *     functions.  Thus even individual objects vary from block to block
  *     in which checksum they use and whether they're compressed.
  *
  * (5) To verify that we never lose on-disk consistency after a crash,
  *     we run the entire test in a child of the main process.
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
  *     storage pool, as many times as desired. If backwards compatibility
  *     testing is enabled ztest will sometimes run the "older" version
  *     of ztest after a SIGKILL.
  *
  * (6) To verify that we don't have future leaks or temporal incursions,
  *     many of the functional tests record the transaction group number
  *     as part of their data.  When reading old data, they verify that
  *     the transaction group number is less than the current, open txg.
  *     If you add a new test, please do this if applicable.
  *
  * (7) Threads are created with a reduced stack size, for sanity checking.
  *     Therefore, it's important not to allocate huge buffers on the stack.
  *
  * When run with no arguments, ztest runs for about five minutes and
  * produces no output if successful.  To get a little bit of information,
  * specify -V.  To get more information, specify -VV, and so on.
  *
  * To turn this into an overnight stress test, use -T to specify run time.
  *
  * You can ask more more vdevs [-v], datasets [-d], or threads [-t]
  * to increase the pool capacity, fanout, and overall stress level.
  *
  * Use the -k option to set the desired frequency of kills.
  *
  * When ztest invokes itself it passes all relevant information through a
  * temporary file which is mmap-ed in the child process. This allows shared
  * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
  * stored at offset 0 of this file and contains information on the size and
  * number of shared structures in the file. The information stored in this file
  * must remain backwards compatible with older versions of ztest so that
  * ztest can invoke them during backwards compatibility testing (-B).
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/dmu_objset.h>
 #include <sys/poll.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_scan.h>
 #include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_userhold.h>
 #include <sys/abd.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
 #include <umem.h>
 #include <ctype.h>
 #include <math.h>
 #include <sys/fs/zfs.h>
 #include <zfs_fletcher.h>
 #include <libnvpair.h>
 #include <libzfs.h>
 #ifdef __GLIBC__
 #include <execinfo.h> /* for backtrace() */
 #endif
 
 static int ztest_fd_data = -1;
 static int ztest_fd_rand = -1;
 
 typedef struct ztest_shared_hdr {
 	uint64_t	zh_hdr_size;
 	uint64_t	zh_opts_size;
 	uint64_t	zh_size;
 	uint64_t	zh_stats_size;
 	uint64_t	zh_stats_count;
 	uint64_t	zh_ds_size;
 	uint64_t	zh_ds_count;
 } ztest_shared_hdr_t;
 
 static ztest_shared_hdr_t *ztest_shared_hdr;
 
 typedef struct ztest_shared_opts {
 	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
 	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
 	char zo_alt_ztest[MAXNAMELEN];
 	char zo_alt_libpath[MAXNAMELEN];
 	uint64_t zo_vdevs;
 	uint64_t zo_vdevtime;
 	size_t zo_vdev_size;
 	int zo_ashift;
 	int zo_mirrors;
 	int zo_raidz;
 	int zo_raidz_parity;
 	int zo_datasets;
 	int zo_threads;
 	uint64_t zo_passtime;
 	uint64_t zo_killrate;
 	int zo_verbose;
 	int zo_init;
 	uint64_t zo_time;
 	uint64_t zo_maxloops;
 	uint64_t zo_metaslab_gang_bang;
 	int zo_mmp_test;
 } ztest_shared_opts_t;
 
 static const ztest_shared_opts_t ztest_opts_defaults = {
 	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
 	.zo_dir = { '/', 't', 'm', 'p', '\0' },
 	.zo_alt_ztest = { '\0' },
 	.zo_alt_libpath = { '\0' },
 	.zo_vdevs = 5,
 	.zo_ashift = SPA_MINBLOCKSHIFT,
 	.zo_mirrors = 2,
 	.zo_raidz = 4,
 	.zo_raidz_parity = 1,
 	.zo_vdev_size = SPA_MINDEVSIZE * 4,	/* 256m default size */
 	.zo_datasets = 7,
 	.zo_threads = 23,
 	.zo_passtime = 60,		/* 60 seconds */
 	.zo_killrate = 70,		/* 70% kill rate */
 	.zo_verbose = 0,
 	.zo_mmp_test = 0,
 	.zo_init = 1,
 	.zo_time = 300,			/* 5 minutes */
 	.zo_maxloops = 50,		/* max loops during spa_freeze() */
 	.zo_metaslab_gang_bang = 32 << 10
 };
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern int  zfs_abd_scatter_enabled;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
 static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";
 
 typedef struct ztest_shared_ds {
 	uint64_t	zd_seq;
 } ztest_shared_ds_t;
 
 static ztest_shared_ds_t *ztest_shared_ds;
 #define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
 
 #define	BT_MAGIC	0x123456789abcdefULL
 #define	MAXFAULTS() \
 	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
 
 enum ztest_io_type {
 	ZTEST_IO_WRITE_TAG,
 	ZTEST_IO_WRITE_PATTERN,
 	ZTEST_IO_WRITE_ZEROES,
 	ZTEST_IO_TRUNCATE,
 	ZTEST_IO_SETATTR,
 	ZTEST_IO_REWRITE,
 	ZTEST_IO_TYPES
 };
 
 typedef struct ztest_block_tag {
 	uint64_t	bt_magic;
 	uint64_t	bt_objset;
 	uint64_t	bt_object;
 	uint64_t	bt_dnodesize;
 	uint64_t	bt_offset;
 	uint64_t	bt_gen;
 	uint64_t	bt_txg;
 	uint64_t	bt_crtxg;
 } ztest_block_tag_t;
 
 typedef struct bufwad {
 	uint64_t	bw_index;
 	uint64_t	bw_txg;
 	uint64_t	bw_data;
 } bufwad_t;
 
 typedef struct rll {
 	void		*rll_writer;
 	int		rll_readers;
 	kmutex_t	rll_lock;
 	kcondvar_t	rll_cv;
 } rll_t;
 
 typedef struct zll {
 	list_t z_list;
 	kmutex_t z_lock;
 } zll_t;
 
 #define	ZTEST_RANGE_LOCKS	64
 #define	ZTEST_OBJECT_LOCKS	64
 
 /*
  * Object descriptor.  Used as a template for object lookup/create/remove.
  */
 typedef struct ztest_od {
 	uint64_t	od_dir;
 	uint64_t	od_object;
 	dmu_object_type_t od_type;
 	dmu_object_type_t od_crtype;
 	uint64_t	od_blocksize;
 	uint64_t	od_crblocksize;
 	uint64_t	od_crdnodesize;
 	uint64_t	od_gen;
 	uint64_t	od_crgen;
 	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
 } ztest_od_t;
 
 /*
  * Per-dataset state.
  */
 typedef struct ztest_ds {
 	ztest_shared_ds_t *zd_shared;
 	objset_t	*zd_os;
 	rwlock_t	zd_zilog_lock;
 	zilog_t		*zd_zilog;
 	ztest_od_t	*zd_od;		/* debugging aid */
 	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
 	kmutex_t	zd_dirobj_lock;
 	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
 	zll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
 } ztest_ds_t;
 
 /*
  * Per-iteration state.
  */
 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
 
 typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
 	uint64_t	zi_iters;	/* iterations per execution */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
 	const char	*zi_funcname;	/* name of test function */
 } ztest_info_t;
 
 typedef struct ztest_shared_callstate {
 	uint64_t	zc_count;	/* per-pass count */
 	uint64_t	zc_time;	/* per-pass time */
 	uint64_t	zc_next;	/* next time to call this function */
 } ztest_shared_callstate_t;
 
 static ztest_shared_callstate_t *ztest_shared_callstate;
 #define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
 
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_zil_commit;
 ztest_func_t ztest_zil_remount;
 ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_prealloc;
 ztest_func_t ztest_fzap;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
 ztest_func_t ztest_dsl_prop_get_set;
 ztest_func_t ztest_spa_prop_get_set;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
 ztest_func_t ztest_ddt_repair;
 ztest_func_t ztest_dmu_snapshot_hold;
 ztest_func_t ztest_mmp_enable_disable;
 ztest_func_t ztest_spa_rename;
 ztest_func_t ztest_scrub;
 ztest_func_t ztest_dsl_dataset_promote_busy;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
 ztest_func_t ztest_split_pool;
 ztest_func_t ztest_reguid;
 ztest_func_t ztest_spa_upgrade;
 ztest_func_t ztest_fletcher;
 ztest_func_t ztest_fletcher_incr;
 ztest_func_t ztest_verify_dnode_bt;
 
 uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
 uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
 uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
 uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
 uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */
 
 #define	ZTI_INIT(func, iters, interval) \
 	{   .zi_func = (func), \
 	    .zi_iters = (iters), \
 	    .zi_interval = (interval), \
 	    .zi_funcname = # func }
 
 ztest_info_t ztest_info[] = {
 	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
 	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
 	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
 	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
 	ZTI_INIT(ztest_zap, 30, &zopt_always),
 	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
 	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
 	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
 	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
 	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
 	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
 	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
 #if 0
 	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
 #endif
 	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_ddt_repair, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
 	ZTI_INIT(ztest_spa_rename, 1, &zopt_rarely),
 	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
 	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
 	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
 	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
 	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
 	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
 	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
 	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
 	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
 
 /*
  * The following struct is used to hold a list of uncalled commit callbacks.
  * The callbacks are ordered by txg number.
  */
 typedef struct ztest_cb_list {
 	kmutex_t	zcl_callbacks_lock;
 	list_t		zcl_callbacks;
 } ztest_cb_list_t;
 
 /*
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
 	boolean_t	zs_do_init;
 	hrtime_t	zs_proc_start;
 	hrtime_t	zs_proc_stop;
 	hrtime_t	zs_thread_start;
 	hrtime_t	zs_thread_stop;
 	hrtime_t	zs_thread_kill;
 	uint64_t	zs_enospc_count;
 	uint64_t	zs_vdev_next_leaf;
 	uint64_t	zs_vdev_aux;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
 	uint64_t	zs_splits;
 	uint64_t	zs_mirrors;
 	uint64_t	zs_metaslab_sz;
 	uint64_t	zs_metaslab_df_alloc_threshold;
 	uint64_t	zs_guid;
 } ztest_shared_t;
 
 #define	ID_PARALLEL	-1ULL
 
 static char ztest_dev_template[] = "%s/%s.%llua";
 static char ztest_aux_template[] = "%s/%s.%s.%llu";
 ztest_shared_t *ztest_shared;
 
 static spa_t *ztest_spa = NULL;
 static ztest_ds_t *ztest_ds;
 
 static kmutex_t ztest_vdev_lock;
 
 /*
  * The ztest_name_lock protects the pool and dataset namespace used by
  * the individual tests. To modify the namespace, consumers must grab
  * this lock as writer. Grabbing the lock as reader will ensure that the
  * namespace does not change while the lock is held.
  */
 static rwlock_t ztest_name_lock;
 
 static boolean_t ztest_dump_core = B_TRUE;
 static boolean_t ztest_exiting;
 
 /* Global commit callback list */
 static ztest_cb_list_t zcl;
 /* Commit cb delay */
 static uint64_t zc_min_txg_delay = UINT64_MAX;
 static int zc_cb_counter = 0;
 
 /*
  * Minimum number of commit callbacks that need to be registered for us to check
  * whether the minimum txg delay is acceptable.
  */
 #define	ZTEST_COMMIT_CB_MIN_REG	100
 
 /*
  * If a number of txgs equal to this threshold have been created after a commit
  * callback has been registered but not called, then we assume there is an
  * implementation bug.
  */
 #define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 
 enum ztest_object {
 	ZTEST_META_DNODE = 0,
 	ZTEST_DIROBJ,
 	ZTEST_OBJECTS
 };
 
 static void usage(boolean_t) __NORETURN;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 const char *
 _umem_logging_init(void)
 {
 	return ("fail,contents"); /* $UMEM_LOGGING setting */
 }
 
 #define	BACKTRACE_SZ	100
 
 static void sig_handler(int signo)
 {
 	struct sigaction action;
 #ifdef __GLIBC__ /* backtrace() is a GNU extension */
 	int nptrs;
 	void *buffer[BACKTRACE_SZ];
 
 	nptrs = backtrace(buffer, BACKTRACE_SZ);
 	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
 #endif
 
 	/*
 	 * Restore default action and re-raise signal so SIGSEGV and
 	 * SIGABRT can trigger a core dump.
 	 */
 	action.sa_handler = SIG_DFL;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 	(void) sigaction(signo, &action, NULL);
 	raise(signo);
 }
 
 #define	FATAL_MSG_SZ	1024
 
 char *fatal_msg;
 
 static void
 fatal(int do_perror, char *message, ...)
 {
 	va_list args;
 	int save_errno = errno;
 	char *buf;
 
 	(void) fflush(stdout);
 	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
 
 	va_start(args, message);
 	(void) sprintf(buf, "ztest: ");
 	/* LINTED */
 	(void) vsprintf(buf + strlen(buf), message, args);
 	va_end(args);
 	if (do_perror) {
 		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
 		    ": %s", strerror(save_errno));
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
 	if (ztest_dump_core)
 		abort();
 	exit(3);
 }
 
 static int
 str2shift(const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	int i;
 
 	if (buf[0] == '\0')
 		return (0);
 	for (i = 0; i < strlen(ends); i++) {
 		if (toupper(buf[0]) == ends[i])
 			break;
 	}
 	if (i == strlen(ends)) {
 		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
 		    buf);
 		usage(B_FALSE);
 	}
 	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
 		return (10*i);
 	}
 	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
 	usage(B_FALSE);
 	/* NOTREACHED */
 }
 
 static uint64_t
 nicenumtoull(const char *buf)
 {
 	char *end;
 	uint64_t val;
 
 	val = strtoull(buf, &end, 0);
 	if (end == buf) {
 		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
 		usage(B_FALSE);
 	} else if (end[0] == '.') {
 		double fval = strtod(buf, &end);
 		fval *= pow(2, str2shift(end));
 		if (fval > UINT64_MAX) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val = (uint64_t)fval;
 	} else {
 		int shift = str2shift(end);
 		if (shift >= 64 || (val << shift) >> shift != val) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val <<= shift;
 	}
 	return (val);
 }
 
 static void
 usage(boolean_t requested)
 {
 	const ztest_shared_opts_t *zo = &ztest_opts_defaults;
 
 	char nice_vdev_size[NN_NUMBUF_SZ];
 	char nice_gang_bang[NN_NUMBUF_SZ];
 	FILE *fp = requested ? stdout : stderr;
 
 	nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
 	nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang,
 	    sizeof (nice_gang_bang));
 
 	(void) fprintf(fp, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
 	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-R raidz_parity (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
 	    "\t[-t threads (default: %d)]\n"
 	    "\t[-g gang_block_threshold (default: %s)]\n"
 	    "\t[-i init_count (default: %d)] initialize pool i times\n"
 	    "\t[-k kill_percentage (default: %llu%%)]\n"
 	    "\t[-p pool_name (default: %s)]\n"
 	    "\t[-f dir (default: %s)] file directory for vdev files\n"
 	    "\t[-M] Multi-host simulate pool imported on remote host\n"
 	    "\t[-V] verbose (use multiple times for ever more blather)\n"
 	    "\t[-E] use existing pool instead of creating new one\n"
 	    "\t[-T time (default: %llu sec)] total run time\n"
 	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
 	    "\t[-P passtime (default: %llu sec)] time per pass\n"
 	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
 	    "\t[-o variable=value] ... set global variable to an unsigned\n"
 	    "\t    32-bit integer value\n"
 	    "\t[-h] (print help)\n"
 	    "",
 	    zo->zo_pool,
 	    (u_longlong_t)zo->zo_vdevs,			/* -v */
 	    nice_vdev_size,				/* -s */
 	    zo->zo_ashift,				/* -a */
 	    zo->zo_mirrors,				/* -m */
 	    zo->zo_raidz,				/* -r */
 	    zo->zo_raidz_parity,			/* -R */
 	    zo->zo_datasets,				/* -d */
 	    zo->zo_threads,				/* -t */
 	    nice_gang_bang,				/* -g */
 	    zo->zo_init,				/* -i */
 	    (u_longlong_t)zo->zo_killrate,		/* -k */
 	    zo->zo_pool,				/* -p */
 	    zo->zo_dir,					/* -f */
 	    (u_longlong_t)zo->zo_time,			/* -T */
 	    (u_longlong_t)zo->zo_maxloops,		/* -F */
 	    (u_longlong_t)zo->zo_passtime);
 	exit(requested ? 0 : 1);
 }
 
 static void
 process_options(int argc, char **argv)
 {
 	char *path;
 	ztest_shared_opts_t *zo = &ztest_opts;
 
 	int opt;
 	uint64_t value;
 	char altdir[MAXNAMELEN] = { 0 };
 
 	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
 
 	while ((opt = getopt(argc, argv,
 	    "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) {
 		value = 0;
 		switch (opt) {
 		case 'v':
 		case 's':
 		case 'a':
 		case 'm':
 		case 'r':
 		case 'R':
 		case 'd':
 		case 't':
 		case 'g':
 		case 'i':
 		case 'k':
 		case 'T':
 		case 'P':
 		case 'F':
 			value = nicenumtoull(optarg);
 		}
 		switch (opt) {
 		case 'v':
 			zo->zo_vdevs = value;
 			break;
 		case 's':
 			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
 			break;
 		case 'a':
 			zo->zo_ashift = value;
 			break;
 		case 'm':
 			zo->zo_mirrors = value;
 			break;
 		case 'r':
 			zo->zo_raidz = MAX(1, value);
 			break;
 		case 'R':
 			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
 			break;
 		case 'd':
 			zo->zo_datasets = MAX(1, value);
 			break;
 		case 't':
 			zo->zo_threads = MAX(1, value);
 			break;
 		case 'g':
 			zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
 			    value);
 			break;
 		case 'i':
 			zo->zo_init = value;
 			break;
 		case 'k':
 			zo->zo_killrate = value;
 			break;
 		case 'p':
 			(void) strlcpy(zo->zo_pool, optarg,
 			    sizeof (zo->zo_pool));
 			break;
 		case 'f':
 			path = realpath(optarg, NULL);
 			if (path == NULL) {
 				(void) fprintf(stderr, "error: %s: %s\n",
 				    optarg, strerror(errno));
 				usage(B_FALSE);
 			} else {
 				(void) strlcpy(zo->zo_dir, path,
 				    sizeof (zo->zo_dir));
 				free(path);
 			}
 			break;
 		case 'M':
 			zo->zo_mmp_test = 1;
 			break;
 		case 'V':
 			zo->zo_verbose++;
 			break;
 		case 'E':
 			zo->zo_init = 0;
 			break;
 		case 'T':
 			zo->zo_time = value;
 			break;
 		case 'P':
 			zo->zo_passtime = MAX(1, value);
 			break;
 		case 'F':
 			zo->zo_maxloops = MAX(1, value);
 			break;
 		case 'B':
 			(void) strlcpy(altdir, optarg, sizeof (altdir));
 			break;
 		case 'o':
 			if (set_global_var(optarg) != 0)
 				usage(B_FALSE);
 			break;
 		case 'h':
 			usage(B_TRUE);
 			break;
 		case '?':
 		default:
 			usage(B_FALSE);
 			break;
 		}
 	}
 
 	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
 
 	zo->zo_vdevtime =
 	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
 	    UINT64_MAX >> 2);
 
 	if (strlen(altdir) > 0) {
 		char *cmd;
 		char *realaltdir;
 		char *bin;
 		char *ztest;
 		char *isa;
 		int isalen;
 
 		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 		VERIFY(NULL != realpath(getexecname(), cmd));
 		if (0 != access(altdir, F_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate ztest path: %s",
 			    altdir);
 		}
 		VERIFY(NULL != realpath(altdir, realaltdir));
 
 		/*
 		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
 		 * We want to extract <isa> to determine if we should use
 		 * 32 or 64 bit binaries.
 		 */
 		bin = strstr(cmd, "/usr/bin/");
 		ztest = strstr(bin, "/ztest");
 		isa = bin + 9;
 		isalen = ztest - isa;
 		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
 		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
 		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
 		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);
 
 		if (0 != access(zo->zo_alt_ztest, X_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate ztest: %s",
 			    zo->zo_alt_ztest);
 		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate lib directory %s",
 			    zo->zo_alt_libpath);
 		}
 
 		umem_free(cmd, MAXPATHLEN);
 		umem_free(realaltdir, MAXPATHLEN);
 	}
 }
 
 static void
 ztest_kill(ztest_shared_t *zs)
 {
 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
 	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
 
 	/*
 	 * Before we kill off ztest, make sure that the config is updated.
 	 * See comment above spa_config_sync().
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
 	mutex_exit(&spa_namespace_lock);
 
 	(void) kill(getpid(), SIGKILL);
 }
 
 static uint64_t
 ztest_random(uint64_t range)
 {
 	uint64_t r;
 
 	ASSERT3S(ztest_fd_rand, >=, 0);
 
 	if (range == 0)
 		return (0);
 
 	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
 		fatal(1, "short read from /dev/urandom");
 
 	return (r % range);
 }
 
 /* ARGSUSED */
 static void
 ztest_record_enospc(const char *s)
 {
 	ztest_shared->zs_enospc_count++;
 }
 
 static uint64_t
 ztest_get_ashift(void)
 {
 	if (ztest_opts.zo_ashift == 0)
 		return (SPA_MINBLOCKSHIFT + ztest_random(5));
 	return (ztest_opts.zo_ashift);
 }
 
 static nvlist_t *
 make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
 {
 	char *pathbuf;
 	uint64_t vdev;
 	nvlist_t *file;
 
 	pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 	if (ashift == 0)
 		ashift = ztest_get_ashift();
 
 	if (path == NULL) {
 		path = pathbuf;
 
 		if (aux != NULL) {
 			vdev = ztest_shared->zs_vdev_aux;
 			(void) snprintf(path, MAXPATHLEN,
 			    ztest_aux_template, ztest_opts.zo_dir,
 			    pool == NULL ? ztest_opts.zo_pool : pool,
 			    aux, vdev);
 		} else {
 			vdev = ztest_shared->zs_vdev_next_leaf++;
 			(void) snprintf(path, MAXPATHLEN,
 			    ztest_dev_template, ztest_opts.zo_dir,
 			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
 		}
 	}
 
 	if (size != 0) {
 		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", path);
 		if (ftruncate(fd, size) != 0)
 			fatal(1, "can't ftruncate %s", path);
 		(void) close(fd);
 	}
 
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 	umem_free(pathbuf, MAXPATHLEN);
 
 	return (file);
 }
 
 static nvlist_t *
 make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
     uint64_t ashift, int r)
 {
 	nvlist_t *raidz, **child;
 	int c;
 
 	if (r < 2)
 		return (make_vdev_file(path, aux, pool, size, ashift));
 	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < r; c++)
 		child[c] = make_vdev_file(path, aux, pool, size, ashift);
 
 	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_RAIDZ) == 0);
 	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
 	    ztest_opts.zo_raidz_parity) == 0);
 	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
 	    child, r) == 0);
 
 	for (c = 0; c < r; c++)
 		nvlist_free(child[c]);
 
 	umem_free(child, r * sizeof (nvlist_t *));
 
 	return (raidz);
 }
 
 static nvlist_t *
 make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
     uint64_t ashift, int r, int m)
 {
 	nvlist_t *mirror, **child;
 	int c;
 
 	if (m < 1)
 		return (make_vdev_raidz(path, aux, pool, size, ashift, r));
 
 	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < m; c++)
 		child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
 
 	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_MIRROR) == 0);
 	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
 	    child, m) == 0);
 
 	for (c = 0; c < m; c++)
 		nvlist_free(child[c]);
 
 	umem_free(child, m * sizeof (nvlist_t *));
 
 	return (mirror);
 }
 
 static nvlist_t *
 make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
     int log, int r, int m, int t)
 {
 	nvlist_t *root, **child;
 	int c;
 
 	ASSERT(t > 0);
 
 	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < t; c++) {
 		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
 		    r, m);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
 		    log) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
 	    child, t) == 0);
 
 	for (c = 0; c < t; c++)
 		nvlist_free(child[c]);
 
 	umem_free(child, t * sizeof (nvlist_t *));
 
 	return (root);
 }
 
 /*
  * Find a random spa version. Returns back a random spa version in the
  * range [initial_version, SPA_VERSION_FEATURES].
  */
 static uint64_t
 ztest_random_spa_version(uint64_t initial_version)
 {
 	uint64_t version = initial_version;
 
 	if (version <= SPA_VERSION_BEFORE_FEATURES) {
 		version = version +
 		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
 	}
 
 	if (version > SPA_VERSION_BEFORE_FEATURES)
 		version = SPA_VERSION_FEATURES;
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	return (version);
 }
 
 static int
 ztest_random_blocksize(void)
 {
 	/*
 	 * Choose a block size >= the ashift.
 	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
 	 */
 	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
 	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
 		maxbs = 20;
 	uint64_t block_shift =
 	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
 	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
 
 static int
 ztest_random_dnodesize(void)
 {
 	int slots;
 	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;
 
 	if (max_slots == DNODE_MIN_SLOTS)
 		return (DNODE_MIN_SIZE);
 
 	/*
 	 * Weight the random distribution more heavily toward smaller
 	 * dnode sizes since that is more likely to reflect real-world
 	 * usage.
 	 */
 	ASSERT3U(max_slots, >, 4);
 	switch (ztest_random(10)) {
 	case 0:
 		slots = 5 + ztest_random(max_slots - 4);
 		break;
 	case 1 ... 4:
 		slots = 2 + ztest_random(3);
 		break;
 	default:
 		slots = 1;
 		break;
 	}
 
 	return (slots << DNODE_SHIFT);
 }
 
 static int
 ztest_random_ibshift(void)
 {
 	return (DN_MIN_INDBLKSHIFT +
 	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
 }
 
 static uint64_t
 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
 {
 	uint64_t top;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *tvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	do {
 		top = ztest_random(rvd->vdev_children);
 		tvd = rvd->vdev_child[top];
 	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
 	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
 
 	return (top);
 }
 
 static uint64_t
 ztest_random_dsl_prop(zfs_prop_t prop)
 {
 	uint64_t value;
 
 	do {
 		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
 	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
 
 	return (value);
 }
 
 static int
 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
     boolean_t inherit)
 {
 	const char *propname = zfs_prop_to_name(prop);
 	const char *valname;
 	char *setpoint;
 	uint64_t curval;
 	int error;
 
 	error = dsl_prop_set_int(osname, propname,
 	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
 	ASSERT0(error);
 
 	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
 
 	if (ztest_opts.zo_verbose >= 6) {
 		int err;
 
 		err = zfs_prop_index_to_string(prop, curval, &valname);
 		if (err)
 			(void) printf("%s %s = %llu at '%s'\n", osname,
 			    propname, (unsigned long long)curval, setpoint);
 		else
 			(void) printf("%s %s = %s at '%s'\n",
 			    osname, propname, valname, setpoint);
 	}
 	umem_free(setpoint, MAXPATHLEN);
 
 	return (error);
 }
 
 static int
 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
 {
 	spa_t *spa = ztest_spa;
 	nvlist_t *props = NULL;
 	int error;
 
 	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
 
 	error = spa_prop_set(spa, props);
 
 	nvlist_free(props);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
 	ASSERT0(error);
 
 	return (error);
 }
 
 static int
 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
 {
 	int err;
 
 	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
 	if (decrypt && err == EACCES) {
 		char ddname[ZFS_MAX_DATASET_NAME_LEN];
 		dsl_crypto_params_t *dcp;
 		nvlist_t *crypto_args = fnvlist_alloc();
 		char *cp = NULL;
 
 		/* spa_keystore_load_wkey() expects a dsl dir name */
 		strcpy(ddname, name);
 		cp = strchr(ddname, '@');
 		if (cp != NULL)
 			*cp = '\0';
 
 		fnvlist_add_uint8_array(crypto_args, "wkeydata",
 		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
 		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
 		    crypto_args, &dcp));
 		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
 		dsl_crypto_params_free(dcp, B_FALSE);
 		fnvlist_free(crypto_args);
 
 		if (err != 0)
 			return (err);
 
 		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
 	}
 
 	return (err);
 }
 
 
 /*
  * Object and range lock mechanics
  */
 typedef struct {
 	list_node_t z_lnode;
 	refcount_t z_refcnt;
 	uint64_t z_object;
 	zfs_rlock_t z_range_lock;
 } ztest_znode_t;
 
 typedef struct {
 	rl_t *z_rl;
 	ztest_znode_t *z_ztznode;
 } ztest_zrl_t;
 
 static ztest_znode_t *
 ztest_znode_init(uint64_t object)
 {
 	ztest_znode_t *zp = umem_alloc(sizeof (*zp), UMEM_NOFAIL);
 
 	list_link_init(&zp->z_lnode);
 	refcount_create(&zp->z_refcnt);
 	zp->z_object = object;
 	zfs_rlock_init(&zp->z_range_lock);
 
 	return (zp);
 }
 
 static void
 ztest_znode_fini(ztest_znode_t *zp)
 {
 	ASSERT(refcount_is_zero(&zp->z_refcnt));
 	zfs_rlock_destroy(&zp->z_range_lock);
 	zp->z_object = 0;
 	refcount_destroy(&zp->z_refcnt);
 	list_link_init(&zp->z_lnode);
 	umem_free(zp, sizeof (*zp));
 }
 
 static void
 ztest_zll_init(zll_t *zll)
 {
 	mutex_init(&zll->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zll->z_list, sizeof (ztest_znode_t),
 	    offsetof(ztest_znode_t, z_lnode));
 }
 
 static void
 ztest_zll_destroy(zll_t *zll)
 {
 	list_destroy(&zll->z_list);
 	mutex_destroy(&zll->z_lock);
 }
 
 #define	RL_TAG "range_lock"
 static ztest_znode_t *
 ztest_znode_get(ztest_ds_t *zd, uint64_t object)
 {
 	zll_t *zll = &zd->zd_range_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
 	ztest_znode_t *zp = NULL;
 	mutex_enter(&zll->z_lock);
 	for (zp = list_head(&zll->z_list); (zp);
 	    zp = list_next(&zll->z_list, zp)) {
 		if (zp->z_object == object) {
 			refcount_add(&zp->z_refcnt, RL_TAG);
 			break;
 		}
 	}
 	if (zp == NULL) {
 		zp = ztest_znode_init(object);
 		refcount_add(&zp->z_refcnt, RL_TAG);
 		list_insert_head(&zll->z_list, zp);
 	}
 	mutex_exit(&zll->z_lock);
 	return (zp);
 }
 
 static void
 ztest_znode_put(ztest_ds_t *zd, ztest_znode_t *zp)
 {
 	zll_t *zll = NULL;
 	ASSERT3U(zp->z_object, !=, 0);
 	zll = &zd->zd_range_lock[zp->z_object & (ZTEST_OBJECT_LOCKS - 1)];
 	mutex_enter(&zll->z_lock);
 	refcount_remove(&zp->z_refcnt, RL_TAG);
 	if (refcount_is_zero(&zp->z_refcnt)) {
 		list_remove(&zll->z_list, zp);
 		ztest_znode_fini(zp);
 	}
 	mutex_exit(&zll->z_lock);
 }
 
 
 static void
 ztest_rll_init(rll_t *rll)
 {
 	rll->rll_writer = NULL;
 	rll->rll_readers = 0;
 	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
 }
 
 static void
 ztest_rll_destroy(rll_t *rll)
 {
 	ASSERT(rll->rll_writer == NULL);
 	ASSERT(rll->rll_readers == 0);
 	mutex_destroy(&rll->rll_lock);
 	cv_destroy(&rll->rll_cv);
 }
 
 static void
 ztest_rll_lock(rll_t *rll, rl_type_t type)
 {
 	mutex_enter(&rll->rll_lock);
 
 	if (type == RL_READER) {
 		while (rll->rll_writer != NULL)
 			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
 		rll->rll_readers++;
 	} else {
 		while (rll->rll_writer != NULL || rll->rll_readers)
 			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
 		rll->rll_writer = curthread;
 	}
 
 	mutex_exit(&rll->rll_lock);
 }
 
 static void
 ztest_rll_unlock(rll_t *rll)
 {
 	mutex_enter(&rll->rll_lock);
 
 	if (rll->rll_writer) {
 		ASSERT(rll->rll_readers == 0);
 		rll->rll_writer = NULL;
 	} else {
 		ASSERT(rll->rll_readers != 0);
 		ASSERT(rll->rll_writer == NULL);
 		rll->rll_readers--;
 	}
 
 	if (rll->rll_writer == NULL && rll->rll_readers == 0)
 		cv_broadcast(&rll->rll_cv);
 
 	mutex_exit(&rll->rll_lock);
 }
 
 static void
 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
 {
 	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
 
 	ztest_rll_lock(rll, type);
 }
 
 static void
 ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
 {
 	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
 
 	ztest_rll_unlock(rll);
 }
 
 static ztest_zrl_t *
 ztest_zrl_init(rl_t *rl, ztest_znode_t *zp)
 {
 	ztest_zrl_t *zrl = umem_alloc(sizeof (*zrl), UMEM_NOFAIL);
 	zrl->z_rl = rl;
 	zrl->z_ztznode = zp;
 	return (zrl);
 }
 
 static void
 ztest_zrl_fini(ztest_zrl_t *zrl)
 {
 	umem_free(zrl, sizeof (*zrl));
 }
 
 static ztest_zrl_t *
 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
     uint64_t size, rl_type_t type)
 {
 	ztest_znode_t *zp = ztest_znode_get(zd, object);
 	rl_t *rl = zfs_range_lock(&zp->z_range_lock, offset,
 	    size, type);
 	return (ztest_zrl_init(rl, zp));
 }
 
 static void
 ztest_range_unlock(ztest_ds_t *zd, ztest_zrl_t *zrl)
 {
 	zfs_range_unlock(zrl->z_rl);
 	ztest_znode_put(zd, zrl->z_ztznode);
 	ztest_zrl_fini(zrl);
 }
 
 static void
 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
 {
 	zd->zd_os = os;
 	zd->zd_zilog = dmu_objset_zil(os);
 	zd->zd_shared = szd;
 	dmu_objset_name(os, zd->zd_name);
 	int l;
 
 	if (zd->zd_shared != NULL)
 		zd->zd_shared->zd_seq = 0;
 
 	VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
 	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
 		ztest_rll_init(&zd->zd_object_lock[l]);
 
 	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
 		ztest_zll_init(&zd->zd_range_lock[l]);
 }
 
 static void
 ztest_zd_fini(ztest_ds_t *zd)
 {
 	int l;
 
 	mutex_destroy(&zd->zd_dirobj_lock);
 	(void) rwlock_destroy(&zd->zd_zilog_lock);
 
 	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
 		ztest_rll_destroy(&zd->zd_object_lock[l]);
 
 	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
 		ztest_zll_destroy(&zd->zd_range_lock[l]);
 }
 
 #define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
 
 static uint64_t
 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
 {
 	uint64_t txg;
 	int error;
 
 	/*
 	 * Attempt to assign tx to some transaction group.
 	 */
 	error = dmu_tx_assign(tx, txg_how);
 	if (error) {
 		if (error == ERESTART) {
 			ASSERT(txg_how == TXG_NOWAIT);
 			dmu_tx_wait(tx);
 		} else {
 			ASSERT3U(error, ==, ENOSPC);
 			ztest_record_enospc(tag);
 		}
 		dmu_tx_abort(tx);
 		return (0);
 	}
 	txg = dmu_tx_get_txg(tx);
 	ASSERT(txg != 0);
 	return (txg);
 }
 
 static void
 ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
 {
 	uint64_t *ip = buf;
 	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
 
 	while (ip < ip_end)
 		*ip++ = value;
 }
 
 #ifndef NDEBUG
 static boolean_t
 ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
 {
 	uint64_t *ip = buf;
 	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
 	uint64_t diff = 0;
 
 	while (ip < ip_end)
 		diff |= (value - *ip++);
 
 	return (diff == 0);
 }
 #endif
 
 static void
 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
     uint64_t crtxg)
 {
 	bt->bt_magic = BT_MAGIC;
 	bt->bt_objset = dmu_objset_id(os);
 	bt->bt_object = object;
 	bt->bt_dnodesize = dnodesize;
 	bt->bt_offset = offset;
 	bt->bt_gen = gen;
 	bt->bt_txg = txg;
 	bt->bt_crtxg = crtxg;
 }
 
 static void
 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
     uint64_t crtxg)
 {
 	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
 	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
 	ASSERT3U(bt->bt_object, ==, object);
 	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
 	ASSERT3U(bt->bt_offset, ==, offset);
 	ASSERT3U(bt->bt_gen, <=, gen);
 	ASSERT3U(bt->bt_txg, <=, txg);
 	ASSERT3U(bt->bt_crtxg, ==, crtxg);
 }
 
 static ztest_block_tag_t *
 ztest_bt_bonus(dmu_buf_t *db)
 {
 	dmu_object_info_t doi;
 	ztest_block_tag_t *bt;
 
 	dmu_object_info_from_db(db, &doi);
 	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
 	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
 
 	return (bt);
 }
 
 /*
  * Generate a token to fill up unused bonus buffer space.  Try to make
  * it unique to the object, generation, and offset to verify that data
  * is not getting overwritten by data from other dnodes.
  */
 #define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
 	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
 
 /*
  * Fill up the unused bonus buffer region before the block tag with a
  * verifiable pattern. Filling the whole bonus area with non-zero data
  * helps ensure that all dnode traversal code properly skips the
  * interior regions of large dnodes.
  */
 void
 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
     objset_t *os, uint64_t gen)
 {
 	uint64_t *bonusp;
 
 	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));
 
 	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
 		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
 		    gen, bonusp - (uint64_t *)db->db_data);
 		*bonusp = token;
 	}
 }
 
 /*
  * Verify that the unused area of a bonus buffer is filled with the
  * expected tokens.
  */
 void
 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
     objset_t *os, uint64_t gen)
 {
 	uint64_t *bonusp;
 
 	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
 		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
 		    gen, bonusp - (uint64_t *)db->db_data);
 		VERIFY3U(*bonusp, ==, token);
 	}
 }
 
 /*
  * ZIL logging ops
  */
 
 #define	lrz_type	lr_mode
 #define	lrz_blocksize	lr_uid
 #define	lrz_ibshift	lr_gid
 #define	lrz_bonustype	lr_rdev
 #define	lrz_dnodesize	lr_crtime[1]
 
 static void
 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	size_t namesize = strlen(name) + 1;
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) + namesize - sizeof (lr_t));
 
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 static void
 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	size_t namesize = strlen(name) + 1;
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) + namesize - sizeof (lr_t));
 
 	itx->itx_oid = object;
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 static void
 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 {
 	itx_t *itx;
 	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	if (lr->lr_length > ZIL_MAX_LOG_DATA)
 		write_state = WR_INDIRECT;
 
 	itx = zil_itx_create(TX_WRITE,
 	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
 
 	if (write_state == WR_COPIED &&
 	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
 	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
 		zil_itx_destroy(itx);
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 		write_state = WR_NEED_COPY;
 	}
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	itx->itx_sync = (ztest_random(8) == 0);
 
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 static void
 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
 {
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
 	itx->itx_sync = B_FALSE;
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 static void
 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
 {
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
 	itx->itx_sync = B_FALSE;
 	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
  * ZIL replay ops
  */
 static int
 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
 {
 	ztest_ds_t *zd = arg1;
 	lr_create_t *lr = arg2;
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	objset_t *os = zd->zd_os;
 	ztest_block_tag_t *bbt;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	int error = 0;
 	int bonuslen;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
 	ASSERT(name[0] != '\0');
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
 
 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 	} else {
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 	}
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0)
 		return (ENOSPC);
 
 	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
 	bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);
 
 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
 		if (lr->lr_foid == 0) {
 			lr->lr_foid = zap_create_dnsize(os,
 			    lr->lrz_type, lr->lrz_bonustype,
 			    bonuslen, lr->lrz_dnodesize, tx);
 		} else {
 			error = zap_create_claim_dnsize(os, lr->lr_foid,
 			    lr->lrz_type, lr->lrz_bonustype,
 			    bonuslen, lr->lrz_dnodesize, tx);
 		}
 	} else {
 		if (lr->lr_foid == 0) {
 			lr->lr_foid = dmu_object_alloc_dnsize(os,
 			    lr->lrz_type, 0, lr->lrz_bonustype,
 			    bonuslen, lr->lrz_dnodesize, tx);
 		} else {
 			error = dmu_object_claim_dnsize(os, lr->lr_foid,
 			    lr->lrz_type, 0, lr->lrz_bonustype,
 			    bonuslen, lr->lrz_dnodesize, tx);
 		}
 	}
 
 	if (error) {
 		ASSERT3U(error, ==, EEXIST);
 		ASSERT(zd->zd_zilog->zl_replay);
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	ASSERT(lr->lr_foid != 0);
 
 	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
 		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
 		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
 	bbt = ztest_bt_bonus(db);
 	dmu_buf_will_dirty(db, tx);
 	ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
 	    lr->lr_gen, txg, txg);
 	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
 	dmu_buf_rele(db, FTAG);
 
 	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
 	    &lr->lr_foid, tx));
 
 	(void) ztest_log_create(zd, tx, lr);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 static int
 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
 {
 	ztest_ds_t *zd = arg1;
 	lr_remove_t *lr = arg2;
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	objset_t *os = zd->zd_os;
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	uint64_t object, txg;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
 	ASSERT(name[0] != '\0');
 
 	VERIFY3U(0, ==,
 	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
 	ASSERT(object != 0);
 
 	ztest_object_lock(zd, object, RL_WRITER);
 
 	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		ztest_object_unlock(zd, object);
 		return (ENOSPC);
 	}
 
 	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
 		VERIFY3U(0, ==, zap_destroy(os, object, tx));
 	} else {
 		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
 	}
 
 	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
 
 	(void) ztest_log_remove(zd, tx, lr, object);
 
 	dmu_tx_commit(tx);
 
 	ztest_object_unlock(zd, object);
 
 	return (0);
 }
 
 static int
 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 {
 	ztest_ds_t *zd = arg1;
 	lr_write_t *lr = arg2;
 	objset_t *os = zd->zd_os;
 	void *data = lr + 1;			/* data follows lr */
 	uint64_t offset, length;
 	ztest_block_tag_t *bt = data;
 	ztest_block_tag_t *bbt;
 	uint64_t gen, txg, lrtxg, crtxg;
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	dmu_buf_t *db;
 	arc_buf_t *abuf = NULL;
 	ztest_zrl_t *rl;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 		if (length < blocksize) {
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
 	}
 
 	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
 		byteswap_uint64_array(bt, sizeof (*bt));
 
 	if (bt->bt_magic != BT_MAGIC)
 		bt = NULL;
 
 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
 	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
 
 	dmu_object_info_from_db(db, &doi);
 
 	bbt = ztest_bt_bonus(db);
 	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
 	gen = bbt->bt_gen;
 	crtxg = bbt->bt_crtxg;
 	lrtxg = lr->lr_common.lrc_txg;
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
 
 	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
 	    P2PHASE(offset, length) == 0)
 		abuf = dmu_request_arcbuf(db, length);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		if (abuf != NULL)
 			dmu_return_arcbuf(abuf);
 		dmu_buf_rele(db, FTAG);
 		ztest_range_unlock(zd, rl);
 		ztest_object_unlock(zd, lr->lr_foid);
 		return (ENOSPC);
 	}
 
 	if (bt != NULL) {
 		/*
 		 * Usually, verify the old data before writing new data --
 		 * but not always, because we also want to verify correct
 		 * behavior when the data was not recently read into cache.
 		 */
 		ASSERT(offset % doi.doi_data_block_size == 0);
 		if (ztest_random(4) != 0) {
 			int prefetch = ztest_random(2) ?
 			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
 			ztest_block_tag_t rbt;
 
 			VERIFY(dmu_read(os, lr->lr_foid, offset,
 			    sizeof (rbt), &rbt, prefetch) == 0);
 			if (rbt.bt_magic == BT_MAGIC) {
 				ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
 				    offset, gen, txg, crtxg);
 			}
 		}
 
 		/*
 		 * Writes can appear to be newer than the bonus buffer because
 		 * the ztest_get_data() callback does a dmu_read() of the
 		 * open-context data, which may be different than the data
 		 * as it was when the write was generated.
 		 */
 		if (zd->zd_zilog->zl_replay) {
 			ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
 			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
 			    bt->bt_crtxg);
 		}
 
 		/*
 		 * Set the bt's gen/txg to the bonus buffer's gen/txg
 		 * so that all of the usual ASSERTs will work.
 		 */
 		ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
 		    crtxg);
 	}
 
 	if (abuf == NULL) {
 		dmu_write(os, lr->lr_foid, offset, length, data, tx);
 	} else {
 		bcopy(data, abuf->b_data, length);
 		dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx);
 	}
 
 	(void) ztest_log_write(zd, tx, lr);
 
 	dmu_buf_rele(db, FTAG);
 
 	dmu_tx_commit(tx);
 
 	ztest_range_unlock(zd, rl);
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (0);
 }
 
 static int
 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 {
 	ztest_ds_t *zd = arg1;
 	lr_truncate_t *lr = arg2;
 	objset_t *os = zd->zd_os;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	ztest_zrl_t *rl;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
 	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
 	    RL_WRITER);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		ztest_range_unlock(zd, rl);
 		ztest_object_unlock(zd, lr->lr_foid);
 		return (ENOSPC);
 	}
 
 	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
 	    lr->lr_length, tx) == 0);
 
 	(void) ztest_log_truncate(zd, tx, lr);
 
 	dmu_tx_commit(tx);
 
 	ztest_range_unlock(zd, rl);
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (0);
 }
 
 static int
 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
 {
 	ztest_ds_t *zd = arg1;
 	lr_setattr_t *lr = arg2;
 	objset_t *os = zd->zd_os;
 	dmu_tx_t *tx;
 	dmu_buf_t *db;
 	ztest_block_tag_t *bbt;
 	uint64_t txg, lrtxg, crtxg, dnodesize;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, lr->lr_foid);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		dmu_buf_rele(db, FTAG);
 		ztest_object_unlock(zd, lr->lr_foid);
 		return (ENOSPC);
 	}
 
 	bbt = ztest_bt_bonus(db);
 	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
 	crtxg = bbt->bt_crtxg;
 	lrtxg = lr->lr_common.lrc_txg;
 	dnodesize = bbt->bt_dnodesize;
 
 	if (zd->zd_zilog->zl_replay) {
 		ASSERT(lr->lr_size != 0);
 		ASSERT(lr->lr_mode != 0);
 		ASSERT(lrtxg != 0);
 	} else {
 		/*
 		 * Randomly change the size and increment the generation.
 		 */
 		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
 		    sizeof (*bbt);
 		lr->lr_mode = bbt->bt_gen + 1;
 		ASSERT(lrtxg == 0);
 	}
 
 	/*
 	 * Verify that the current bonus buffer is not newer than our txg.
 	 */
 	ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
 	    MAX(txg, lrtxg), crtxg);
 
 	dmu_buf_will_dirty(db, tx);
 
 	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
 	ASSERT3U(lr->lr_size, <=, db->db_size);
 	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
 	bbt = ztest_bt_bonus(db);
 
 	ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
 	    txg, crtxg);
 	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
 	dmu_buf_rele(db, FTAG);
 
 	(void) ztest_log_setattr(zd, tx, lr);
 
 	dmu_tx_commit(tx);
 
 	ztest_object_unlock(zd, lr->lr_foid);
 
 	return (0);
 }
 
 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
 	NULL,			/* 0 no such transaction type */
 	ztest_replay_create,	/* TX_CREATE */
 	NULL,			/* TX_MKDIR */
 	NULL,			/* TX_MKXATTR */
 	NULL,			/* TX_SYMLINK */
 	ztest_replay_remove,	/* TX_REMOVE */
 	NULL,			/* TX_RMDIR */
 	NULL,			/* TX_LINK */
 	NULL,			/* TX_RENAME */
 	ztest_replay_write,	/* TX_WRITE */
 	ztest_replay_truncate,	/* TX_TRUNCATE */
 	ztest_replay_setattr,	/* TX_SETATTR */
 	NULL,			/* TX_ACL */
 	NULL,			/* TX_CREATE_ACL */
 	NULL,			/* TX_CREATE_ATTR */
 	NULL,			/* TX_CREATE_ACL_ATTR */
 	NULL,			/* TX_MKDIR_ACL */
 	NULL,			/* TX_MKDIR_ATTR */
 	NULL,			/* TX_MKDIR_ACL_ATTR */
 	NULL,			/* TX_WRITE2 */
 };
 
 /*
  * ZIL get_data callbacks
  */
 typedef struct ztest_zgd_private {
 	ztest_ds_t *z_zd;
 	ztest_zrl_t *z_rl;
 	uint64_t z_object;
 } ztest_zgd_private_t;
 
 static void
 ztest_get_done(zgd_t *zgd, int error)
 {
 	ztest_zgd_private_t *zzp = zgd->zgd_private;
 	ztest_ds_t *zd = zzp->z_zd;
 	uint64_t object = zzp->z_object;
 
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	ztest_range_unlock(zd, zzp->z_rl);
 	ztest_object_unlock(zd, object);
 
 	if (error == 0 && zgd->zgd_bp)
-		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 	umem_free(zgd, sizeof (*zgd));
 	umem_free(zzp, sizeof (*zzp));
 }
 
 static int
-ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
+    zio_t *zio)
 {
 	ztest_ds_t *zd = arg;
 	objset_t *os = zd->zd_os;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	uint64_t txg = lr->lr_common.lrc_txg;
 	uint64_t crtxg;
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
 	ztest_zgd_private_t *zgd_private;
 
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
 	ztest_object_lock(zd, object, RL_READER);
 	error = dmu_bonus_hold(os, object, FTAG, &db);
 	if (error) {
 		ztest_object_unlock(zd, object);
 		return (error);
 	}
 
 	crtxg = ztest_bt_bonus(db)->bt_crtxg;
 
 	if (crtxg == 0 || crtxg > txg) {
 		dmu_buf_rele(db, FTAG);
 		ztest_object_unlock(zd, object);
 		return (ENOENT);
 	}
 
 	dmu_object_info_from_db(db, &doi);
 	dmu_buf_rele(db, FTAG);
 	db = NULL;
 
 	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
-	zgd->zgd_zilog = zd->zd_zilog;
+	zgd->zgd_lwb = lwb;
 	zgd_private = umem_zalloc(sizeof (ztest_zgd_private_t), UMEM_NOFAIL);
 	zgd_private->z_zd = zd;
 	zgd_private->z_object = object;
 	zgd->zgd_private = zgd_private;
 
 	if (buf != NULL) {	/* immediate write */
 		zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
 		    RL_READER);
 		zgd->zgd_rl = zgd_private->z_rl->z_rl;
 
 		error = dmu_read(os, object, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
 		ASSERT(error == 0);
 	} else {
 		size = doi.doi_data_block_size;
 		if (ISP2(size)) {
 			offset = P2ALIGN(offset, size);
 		} else {
 			ASSERT(offset < size);
 			offset = 0;
 		}
 
 		zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
 		    RL_READER);
 		zgd->zgd_rl = zgd_private->z_rl->z_rl;
 
 		error = dmu_buf_hold(os, object, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    ztest_get_done, zgd);
 
 			if (error == 0)
 				return (0);
 		}
 	}
 
 	ztest_get_done(zgd, error);
 
 	return (error);
 }
 
 static void *
 ztest_lr_alloc(size_t lrsize, char *name)
 {
 	char *lr;
 	size_t namesize = name ? strlen(name) + 1 : 0;
 
 	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
 
 	if (name)
 		bcopy(name, lr + lrsize, namesize);
 
 	return (lr);
 }
 
 void
 ztest_lr_free(void *lr, size_t lrsize, char *name)
 {
 	size_t namesize = name ? strlen(name) + 1 : 0;
 
 	umem_free(lr, lrsize + namesize);
 }
 
 /*
  * Lookup a bunch of objects.  Returns the number of objects not found.
  */
 static int
 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 	int error;
 	int i;
 
 	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
 
 	for (i = 0; i < count; i++, od++) {
 		od->od_object = 0;
 		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
 		    sizeof (uint64_t), 1, &od->od_object);
 		if (error) {
 			ASSERT(error == ENOENT);
 			ASSERT(od->od_object == 0);
 			missing++;
 		} else {
 			dmu_buf_t *db;
 			ztest_block_tag_t *bbt;
 			dmu_object_info_t doi;
 
 			ASSERT(od->od_object != 0);
 			ASSERT(missing == 0);	/* there should be no gaps */
 
 			ztest_object_lock(zd, od->od_object, RL_READER);
 			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
 			    od->od_object, FTAG, &db));
 			dmu_object_info_from_db(db, &doi);
 			bbt = ztest_bt_bonus(db);
 			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
 			od->od_type = doi.doi_type;
 			od->od_blocksize = doi.doi_data_block_size;
 			od->od_gen = bbt->bt_gen;
 			dmu_buf_rele(db, FTAG);
 			ztest_object_unlock(zd, od->od_object);
 		}
 	}
 
 	return (missing);
 }
 
 static int
 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 	int i;
 
 	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
 
 	for (i = 0; i < count; i++, od++) {
 		if (missing) {
 			od->od_object = 0;
 			missing++;
 			continue;
 		}
 
 		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
 
 		lr->lr_doid = od->od_dir;
 		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
 		lr->lrz_type = od->od_crtype;
 		lr->lrz_blocksize = od->od_crblocksize;
 		lr->lrz_ibshift = ztest_random_ibshift();
 		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
 		lr->lrz_dnodesize = od->od_crdnodesize;
 		lr->lr_gen = od->od_crgen;
 		lr->lr_crtime[0] = time(NULL);
 
 		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
 			ASSERT(missing == 0);
 			od->od_object = 0;
 			missing++;
 		} else {
 			od->od_object = lr->lr_foid;
 			od->od_type = od->od_crtype;
 			od->od_blocksize = od->od_crblocksize;
 			od->od_gen = od->od_crgen;
 			ASSERT(od->od_object != 0);
 		}
 
 		ztest_lr_free(lr, sizeof (*lr), od->od_name);
 	}
 
 	return (missing);
 }
 
 static int
 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
 {
 	int missing = 0;
 	int error;
 	int i;
 
 	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
 
 	od += count - 1;
 
 	for (i = count - 1; i >= 0; i--, od--) {
 		if (missing) {
 			missing++;
 			continue;
 		}
 
 		/*
 		 * No object was found.
 		 */
 		if (od->od_object == 0)
 			continue;
 
 		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
 
 		lr->lr_doid = od->od_dir;
 
 		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
 			ASSERT3U(error, ==, ENOSPC);
 			missing++;
 		} else {
 			od->od_object = 0;
 		}
 		ztest_lr_free(lr, sizeof (*lr), od->od_name);
 	}
 
 	return (missing);
 }
 
 static int
 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
     void *data)
 {
 	lr_write_t *lr;
 	int error;
 
 	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
 
 	lr->lr_foid = object;
 	lr->lr_offset = offset;
 	lr->lr_length = size;
 	lr->lr_blkoff = 0;
 	BP_ZERO(&lr->lr_blkptr);
 
 	bcopy(data, lr + 1, size);
 
 	error = ztest_replay_write(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, sizeof (*lr) + size, NULL);
 
 	return (error);
 }
 
 static int
 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
 {
 	lr_truncate_t *lr;
 	int error;
 
 	lr = ztest_lr_alloc(sizeof (*lr), NULL);
 
 	lr->lr_foid = object;
 	lr->lr_offset = offset;
 	lr->lr_length = size;
 
 	error = ztest_replay_truncate(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, sizeof (*lr), NULL);
 
 	return (error);
 }
 
 static int
 ztest_setattr(ztest_ds_t *zd, uint64_t object)
 {
 	lr_setattr_t *lr;
 	int error;
 
 	lr = ztest_lr_alloc(sizeof (*lr), NULL);
 
 	lr->lr_foid = object;
 	lr->lr_size = 0;
 	lr->lr_mode = 0;
 
 	error = ztest_replay_setattr(zd, lr, B_FALSE);
 
 	ztest_lr_free(lr, sizeof (*lr), NULL);
 
 	return (error);
 }
 
 static void
 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
 {
 	objset_t *os = zd->zd_os;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	ztest_zrl_t *rl;
 
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	ztest_object_lock(zd, object, RL_READER);
 	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, object, offset, size);
 
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 
 	if (txg != 0) {
 		dmu_prealloc(os, object, offset, size, tx);
 		dmu_tx_commit(tx);
 		txg_wait_synced(dmu_objset_pool(os), txg);
 	} else {
 		(void) dmu_free_long_range(os, object, offset, size);
 	}
 
 	ztest_range_unlock(zd, rl);
 	ztest_object_unlock(zd, object);
 }
 
 static void
 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 {
 	int err;
 	ztest_block_tag_t wbt;
 	dmu_object_info_t doi;
 	enum ztest_io_type io_type;
 	uint64_t blocksize;
 	void *data;
 
 	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
 	blocksize = doi.doi_data_block_size;
 	data = umem_alloc(blocksize, UMEM_NOFAIL);
 
 	/*
 	 * Pick an i/o type at random, biased toward writing block tags.
 	 */
 	io_type = ztest_random(ZTEST_IO_TYPES);
 	if (ztest_random(2) == 0)
 		io_type = ZTEST_IO_WRITE_TAG;
 
 	(void) rw_rdlock(&zd->zd_zilog_lock);
 
 	switch (io_type) {
 
 	case ZTEST_IO_WRITE_TAG:
 		ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize,
 		    offset, 0, 0, 0);
 		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
 		break;
 
 	case ZTEST_IO_WRITE_PATTERN:
 		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
 		if (ztest_random(2) == 0) {
 			/*
 			 * Induce fletcher2 collisions to ensure that
 			 * zio_ddt_collision() detects and resolves them
 			 * when using fletcher2-verify for deduplication.
 			 */
 			((uint64_t *)data)[0] ^= 1ULL << 63;
 			((uint64_t *)data)[4] ^= 1ULL << 63;
 		}
 		(void) ztest_write(zd, object, offset, blocksize, data);
 		break;
 
 	case ZTEST_IO_WRITE_ZEROES:
 		bzero(data, blocksize);
 		(void) ztest_write(zd, object, offset, blocksize, data);
 		break;
 
 	case ZTEST_IO_TRUNCATE:
 		(void) ztest_truncate(zd, object, offset, blocksize);
 		break;
 
 	case ZTEST_IO_SETATTR:
 		(void) ztest_setattr(zd, object);
 		break;
 	default:
 		break;
 
 	case ZTEST_IO_REWRITE:
 		(void) rw_rdlock(&ztest_name_lock);
 		err = ztest_dsl_prop_set_uint64(zd->zd_name,
 		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
 		    B_FALSE);
 		VERIFY(err == 0 || err == ENOSPC);
 		err = ztest_dsl_prop_set_uint64(zd->zd_name,
 		    ZFS_PROP_COMPRESSION,
 		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
 		    B_FALSE);
 		VERIFY(err == 0 || err == ENOSPC);
 		(void) rw_unlock(&ztest_name_lock);
 
 		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
 		    DMU_READ_NO_PREFETCH));
 
 		(void) ztest_write(zd, object, offset, blocksize, data);
 		break;
 	}
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 
 	umem_free(data, blocksize);
 }
 
 /*
  * Initialize an object description template.
  */
 static void
 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
     dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
     uint64_t gen)
 {
 	od->od_dir = ZTEST_DIROBJ;
 	od->od_object = 0;
 
 	od->od_crtype = type;
 	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
 	od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize();
 	od->od_crgen = gen;
 
 	od->od_type = DMU_OT_NONE;
 	od->od_blocksize = 0;
 	od->od_gen = 0;
 
 	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
 	    tag, (longlong_t)id, (u_longlong_t)index);
 }
 
 /*
  * Lookup or create the objects for a test using the od template.
  * If the objects do not all exist, or if 'remove' is specified,
  * remove any existing objects and create new ones.  Otherwise,
  * use the existing objects.
  */
 static int
 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
 {
 	int count = size / sizeof (*od);
 	int rv = 0;
 
 	mutex_enter(&zd->zd_dirobj_lock);
 	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
 	    (ztest_remove(zd, od, count) != 0 ||
 	    ztest_create(zd, od, count) != 0))
 		rv = -1;
 	zd->zd_od = od;
 	mutex_exit(&zd->zd_dirobj_lock);
 
 	return (rv);
 }
 
 /* ARGSUSED */
 void
 ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 {
 	zilog_t *zilog = zd->zd_zilog;
 
 	(void) rw_rdlock(&zd->zd_zilog_lock);
 
 	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
 
 	/*
 	 * Remember the committed values in zd, which is in parent/child
 	 * shared memory.  If we die, the next iteration of ztest_run()
 	 * will verify that the log really does contain this record.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zd->zd_shared != NULL);
 	ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
 	zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
 	mutex_exit(&zilog->zl_lock);
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 }
 
 /*
  * This function is designed to simulate the operations that occur during a
  * mount/unmount operation.  We hold the dataset across these operations in an
  * attempt to expose any implicit assumptions about ZIL management.
  */
 /* ARGSUSED */
 void
 ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 
 	/*
 	 * We grab the zd_dirobj_lock to ensure that no other thread is
 	 * updating the zil (i.e. adding in-memory log records) and the
 	 * zd_zilog_lock to block any I/O.
 	 */
 	mutex_enter(&zd->zd_dirobj_lock);
 	(void) rw_wrlock(&zd->zd_zilog_lock);
 
 	/* zfsvfs_teardown() */
 	zil_close(zd->zd_zilog);
 
 	/* zfsvfs_setup() */
 	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
 	zil_replay(os, zd, ztest_replay_vector);
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
 	mutex_exit(&zd->zd_dirobj_lock);
 }
 
 /*
  * Verify that we can't destroy an active pool, create an existing pool,
  * or create a pool with a bad vdev spec.
  */
 /* ARGSUSED */
 void
 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_opts_t *zo = &ztest_opts;
 	spa_t *spa;
 	nvlist_t *nvroot;
 
 	if (zo->zo_mmp_test)
 		return;
 
 	/*
 	 * Attempt to create using a bad file.
 	 */
 	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
 	VERIFY3U(ENOENT, ==,
 	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
 
 	/*
 	 * Attempt to create using a bad mirror.
 	 */
 	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
 	VERIFY3U(ENOENT, ==,
 	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
 
 	/*
 	 * Attempt to create an existing pool.  It shouldn't matter
 	 * what's in the nvroot; we should fail with EEXIST.
 	 */
 	(void) rw_rdlock(&ztest_name_lock);
 	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
 	VERIFY3U(EEXIST, ==,
 	    spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
 	nvlist_free(nvroot);
 	VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
 	VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
 	spa_close(spa, FTAG);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Start and then stop the MMP threads to ensure the startup and shutdown code
  * works properly.  Actual protection and property-related code tested via ZTS.
  */
 /* ARGSUSED */
 void
 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_opts_t *zo = &ztest_opts;
 	spa_t *spa = ztest_spa;
 
 	if (zo->zo_mmp_test)
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	mutex_enter(&spa->spa_props_lock);
 
 	if (!spa_multihost(spa)) {
 		spa->spa_multihost = B_TRUE;
 		mmp_thread_start(spa);
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 	mmp_signal_all_threads();
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	mutex_enter(&spa->spa_props_lock);
 
 	if (spa_multihost(spa)) {
 		mmp_thread_stop(spa);
 		spa->spa_multihost = B_FALSE;
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /* ARGSUSED */
 void
 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa;
 	uint64_t initial_version = SPA_VERSION_INITIAL;
 	uint64_t version, newversion;
 	nvlist_t *nvroot, *props;
 	char *name;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	mutex_enter(&ztest_vdev_lock);
 	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
 
 	/*
 	 * Clean up from previous runs.
 	 */
 	(void) spa_destroy(name);
 
 	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
 	    0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
 
 	/*
 	 * If we're configuring a RAIDZ device then make sure that the
 	 * the initial version is capable of supporting that feature.
 	 */
 	switch (ztest_opts.zo_raidz_parity) {
 	case 0:
 	case 1:
 		initial_version = SPA_VERSION_INITIAL;
 		break;
 	case 2:
 		initial_version = SPA_VERSION_RAIDZ2;
 		break;
 	case 3:
 		initial_version = SPA_VERSION_RAIDZ3;
 		break;
 	}
 
 	/*
 	 * Create a pool with a spa version that can be upgraded. Pick
 	 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
 	 */
 	do {
 		version = ztest_random_spa_version(initial_version);
 	} while (version > SPA_VERSION_BEFORE_FEATURES);
 
 	props = fnvlist_alloc();
 	fnvlist_add_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
 	VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0);
 	fnvlist_free(nvroot);
 	fnvlist_free(props);
 
 	VERIFY3S(spa_open(name, &spa, FTAG), ==, 0);
 	VERIFY3U(spa_version(spa), ==, version);
 	newversion = ztest_random_spa_version(version + 1);
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("upgrading spa version from %llu to %llu\n",
 		    (u_longlong_t)version, (u_longlong_t)newversion);
 	}
 
 	spa_upgrade(spa, newversion);
 	VERIFY3U(spa_version(spa), >, version);
 	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
 	spa_close(spa, FTAG);
 
 	strfree(name);
 	mutex_exit(&ztest_vdev_lock);
 }
 
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
 	vdev_t *mvd;
 	int c;
 
 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
 		return (vd);
 
 	for (c = 0; c < vd->vdev_children; c++)
 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
 		    NULL)
 			return (mvd);
 
 	return (NULL);
 }
 
 /*
  * Find the first available hole which can be used as a top-level.
  */
 int
 find_vdev_hole(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
 
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *cvd = rvd->vdev_child[c];
 
 		if (cvd->vdev_ishole)
 			break;
 	}
 	return (c);
 }
 
 /*
  * Verify that vdev_add() works as expected.
  */
 /* ARGSUSED */
 void
 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	uint64_t leaves;
 	uint64_t guid;
 	nvlist_t *nvroot;
 	int error;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	mutex_enter(&ztest_vdev_lock);
 	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
 
 	/*
 	 * If we have slogs then remove them 1/4 of the time.
 	 */
 	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
 		/*
 		 * Grab the guid from the head of the log class rotor.
 		 */
 		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
 
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 
 		/*
 		 * We have to grab the zs_name_lock as writer to
 		 * prevent a race between removing a slog (dmu_objset_find)
 		 * and destroying a dataset. Removing the slog will
 		 * grab a reference on the dataset which may cause
 		 * dsl_destroy_head() to fail with EBUSY thus
 		 * leaving the dataset in an inconsistent state.
 		 */
 		rw_wrlock(&ztest_name_lock);
 		error = spa_vdev_remove(spa, guid, B_FALSE);
 		rw_unlock(&ztest_name_lock);
 
 		if (error && error != EEXIST)
 			fatal(0, "spa_vdev_remove() = %d", error);
 	} else {
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 
 		/*
 		 * Make 1/4 of the devices be log devices.
 		 */
 		nvroot = make_vdev_root(NULL, NULL, NULL,
 		    ztest_opts.zo_vdev_size, 0,
 		    ztest_random(4) == 0, ztest_opts.zo_raidz,
 		    zs->zs_mirrors, 1);
 
 		error = spa_vdev_add(spa, nvroot);
 		nvlist_free(nvroot);
 
 		if (error == ENOSPC)
 			ztest_record_enospc("spa_vdev_add");
 		else if (error != 0)
 			fatal(0, "spa_vdev_add() = %d", error);
 	}
 
 	mutex_exit(&ztest_vdev_lock);
 }
 
 /*
  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
  */
 /* ARGSUSED */
 void
 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	spa_aux_vdev_t *sav;
 	char *aux;
 	char *path;
 	uint64_t guid = 0;
 	int error;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 	if (ztest_random(2) == 0) {
 		sav = &spa->spa_spares;
 		aux = ZPOOL_CONFIG_SPARES;
 	} else {
 		sav = &spa->spa_l2cache;
 		aux = ZPOOL_CONFIG_L2CACHE;
 	}
 
 	mutex_enter(&ztest_vdev_lock);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	if (sav->sav_count != 0 && ztest_random(4) == 0) {
 		/*
 		 * Pick a random device to remove.
 		 */
 		guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
 	} else {
 		/*
 		 * Find an unused device we can add.
 		 */
 		zs->zs_vdev_aux = 0;
 		for (;;) {
 			int c;
 			(void) snprintf(path, MAXPATHLEN, ztest_aux_template,
 			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
 			    zs->zs_vdev_aux);
 			for (c = 0; c < sav->sav_count; c++)
 				if (strcmp(sav->sav_vdevs[c]->vdev_path,
 				    path) == 0)
 					break;
 			if (c == sav->sav_count &&
 			    vdev_lookup_by_path(rvd, path) == NULL)
 				break;
 			zs->zs_vdev_aux++;
 		}
 	}
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	if (guid == 0) {
 		/*
 		 * Add a new device.
 		 */
 		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
 		    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
 		error = spa_vdev_add(spa, nvroot);
 		if (error != 0)
 			fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
 		nvlist_free(nvroot);
 	} else {
 		/*
 		 * Remove an existing device.  Sometimes, dirty its
 		 * vdev state first to make sure we handle removal
 		 * of devices that have pending state changes.
 		 */
 		if (ztest_random(2) == 0)
 			(void) vdev_online(spa, guid, 0, NULL);
 
 		error = spa_vdev_remove(spa, guid, B_FALSE);
 		if (error != 0 && error != EBUSY)
 			fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
 	}
 
 	mutex_exit(&ztest_vdev_lock);
 
 	umem_free(path, MAXPATHLEN);
 }
 
 /*
  * split a pool if it has mirror tlvdevs
  */
 /* ARGSUSED */
 void
 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *tree, **child, *config, *split, **schild;
 	uint_t c, children, schildren = 0, lastlogid = 0;
 	int error = 0;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	mutex_enter(&ztest_vdev_lock);
 
 	/* ensure we have a useable config; mirrors of raidz aren't supported */
 	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
 		mutex_exit(&ztest_vdev_lock);
 		return;
 	}
 
 	/* clean up the old pool, if any */
 	(void) spa_destroy("splitp");
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/* generate a config from the existing config */
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
 	    &tree) == 0);
 	mutex_exit(&spa->spa_props_lock);
 
 	VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) == 0);
 
 	schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
 	for (c = 0; c < children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		nvlist_t **mchild;
 		uint_t mchildren;
 
 		if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
 			VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
 			    0) == 0);
 			VERIFY(nvlist_add_string(schild[schildren],
 			    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
 			VERIFY(nvlist_add_uint64(schild[schildren],
 			    ZPOOL_CONFIG_IS_HOLE, 1) == 0);
 			if (lastlogid == 0)
 				lastlogid = schildren;
 			++schildren;
 			continue;
 		}
 		lastlogid = 0;
 		VERIFY(nvlist_lookup_nvlist_array(child[c],
 		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
 		VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
 	}
 
 	/* OK, create a config that can be used to split */
 	VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
 	    lastlogid != 0 ? lastlogid : schildren) == 0);
 
 	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
 
 	for (c = 0; c < schildren; c++)
 		nvlist_free(schild[c]);
 	free(schild);
 	nvlist_free(split);
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	(void) rw_wrlock(&ztest_name_lock);
 	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
 	(void) rw_unlock(&ztest_name_lock);
 
 	nvlist_free(config);
 
 	if (error == 0) {
 		(void) printf("successful split - results:\n");
 		mutex_enter(&spa_namespace_lock);
 		show_pool_stats(spa);
 		show_pool_stats(spa_lookup("splitp"));
 		mutex_exit(&spa_namespace_lock);
 		++zs->zs_splits;
 		--zs->zs_mirrors;
 	}
 	mutex_exit(&ztest_vdev_lock);
 
 }
 
 /*
  * Verify that we can attach and detach devices.
  */
 /* ARGSUSED */
 void
 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
 	nvlist_t *root;
 	uint64_t leaves;
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
 	uint64_t oldguid, pguid;
 	uint64_t oldsize, newsize;
 	char *oldpath, *newpath;
 	int replacing;
 	int oldvd_has_siblings = B_FALSE;
 	int newvd_is_spare = B_FALSE;
 	int oldvd_is_log;
 	int error, expected_error;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 	newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 	mutex_enter(&ztest_vdev_lock);
 	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/*
 	 * Decide whether to do an attach or a replace.
 	 */
 	replacing = ztest_random(2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	top = ztest_random_vdev_top(spa, B_TRUE);
 
 	/*
 	 * Pick a random leaf within it.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Locate this vdev.
 	 */
 	oldvd = rvd->vdev_child[top];
 	if (zs->zs_mirrors >= 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
 		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
 		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
 	}
 	if (ztest_opts.zo_raidz > 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
 		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
 		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
 	}
 
 	/*
 	 * If we're already doing an attach or replace, oldvd may be a
 	 * mirror vdev -- in which case, pick a random child.
 	 */
 	while (oldvd->vdev_children != 0) {
 		oldvd_has_siblings = B_TRUE;
 		ASSERT(oldvd->vdev_children >= 2);
 		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
 	}
 
 	oldguid = oldvd->vdev_guid;
 	oldsize = vdev_get_min_asize(oldvd);
 	oldvd_is_log = oldvd->vdev_top->vdev_islog;
 	(void) strcpy(oldpath, oldvd->vdev_path);
 	pvd = oldvd->vdev_parent;
 	pguid = pvd->vdev_guid;
 
 	/*
 	 * If oldvd has siblings, then half of the time, detach it.
 	 */
 	if (oldvd_has_siblings && ztest_random(2) == 0) {
 		spa_config_exit(spa, SCL_VDEV, FTAG);
 		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
 		if (error != 0 && error != ENODEV && error != EBUSY &&
 		    error != ENOTSUP)
 			fatal(0, "detach (%s) returned %d", oldpath, error);
 		goto out;
 	}
 
 	/*
 	 * For the new vdev, choose with equal probability between the two
 	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
 	 */
 	if (sav->sav_count != 0 && ztest_random(3) == 0) {
 		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
 		newvd_is_spare = B_TRUE;
 		(void) strcpy(newpath, newvd->vdev_path);
 	} else {
 		(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + leaf);
 		if (ztest_random(2) == 0)
 			newpath[strlen(newpath) - 1] = 'b';
 		newvd = vdev_lookup_by_path(rvd, newpath);
 	}
 
 	if (newvd) {
 		newsize = vdev_get_min_asize(newvd);
 	} else {
 		/*
 		 * Make newsize a little bigger or smaller than oldsize.
 		 * If it's smaller, the attach should fail.
 		 * If it's larger, and we're doing a replace,
 		 * we should get dynamic LUN growth when we're done.
 		 */
 		newsize = 10 * oldsize / (9 + ztest_random(3));
 	}
 
 	/*
 	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
 	 * unless it's a replace; in that case any non-replacing parent is OK.
 	 *
 	 * If newvd is already part of the pool, it should fail with EBUSY.
 	 *
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 */
 	if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
 	    pvd->vdev_ops == &vdev_replacing_ops ||
 	    pvd->vdev_ops == &vdev_spare_ops))
 		expected_error = ENOTSUP;
 	else if (newvd_is_spare && (!replacing || oldvd_is_log))
 		expected_error = ENOTSUP;
 	else if (newvd == oldvd)
 		expected_error = replacing ? 0 : EBUSY;
 	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
 		expected_error = EBUSY;
 	else if (newsize < oldsize)
 		expected_error = EOVERFLOW;
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else
 		expected_error = 0;
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	/*
 	 * Build the nvlist describing newpath.
 	 */
 	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
 	    ashift, 0, 0, 0, 1);
 
 	error = spa_vdev_attach(spa, oldguid, root, replacing);
 
 	nvlist_free(root);
 
 	/*
 	 * If our parent was the replacing vdev, but the replace completed,
 	 * then instead of failing with ENOTSUP we may either succeed,
 	 * fail with ENODEV, or fail with EOVERFLOW.
 	 */
 	if (expected_error == ENOTSUP &&
 	    (error == 0 || error == ENODEV || error == EOVERFLOW))
 		expected_error = error;
 
 	/*
 	 * If someone grew the LUN, the replacement may be too small.
 	 */
 	if (error == EOVERFLOW || error == EBUSY)
 		expected_error = error;
 
 	/* XXX workaround 6690467 */
 	if (error != expected_error && expected_error != EBUSY) {
 		fatal(0, "attach (%s %llu, %s %llu, %d) "
 		    "returned %d, expected %d",
 		    oldpath, oldsize, newpath,
 		    newsize, replacing, error, expected_error);
 	}
 out:
 	mutex_exit(&ztest_vdev_lock);
 
 	umem_free(oldpath, MAXPATHLEN);
 	umem_free(newpath, MAXPATHLEN);
 }
 
 /*
  * Callback function which expands the physical size of the vdev.
  */
 vdev_t *
 grow_vdev(vdev_t *vd, void *arg)
 {
 	ASSERTV(spa_t *spa = vd->vdev_spa);
 	size_t *newsize = arg;
 	size_t fsize;
 	int fd;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
 		return (vd);
 
 	fsize = lseek(fd, 0, SEEK_END);
 	VERIFY(ftruncate(fd, *newsize) == 0);
 
 	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("%s grew from %lu to %lu bytes\n",
 		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
 	}
 	(void) close(fd);
 	return (NULL);
 }
 
 /*
  * Callback function which expands a given vdev by calling vdev_online().
  */
 /* ARGSUSED */
 vdev_t *
 online_vdev(vdev_t *vd, void *arg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *tvd = vd->vdev_top;
 	uint64_t guid = vd->vdev_guid;
 	uint64_t generation = spa->spa_config_generation + 1;
 	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/* Calling vdev_online will initialize the new metaslabs */
 	spa_config_exit(spa, SCL_STATE, spa);
 	error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/*
 	 * If vdev_online returned an error or the underlying vdev_open
 	 * failed then we abort the expand. The only way to know that
 	 * vdev_open fails is by checking the returned newstate.
 	 */
 	if (error || newstate != VDEV_STATE_HEALTHY) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Unable to expand vdev, state %llu, "
 			    "error %d\n", (u_longlong_t)newstate, error);
 		}
 		return (vd);
 	}
 	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
 
 	/*
 	 * Since we dropped the lock we need to ensure that we're
 	 * still talking to the original vdev. It's possible this
 	 * vdev may have been detached/replaced while we were
 	 * trying to online it.
 	 */
 	if (generation != spa->spa_config_generation) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("vdev configuration has changed, "
 			    "guid %llu, state %llu, expected gen %llu, "
 			    "got gen %llu\n",
 			    (u_longlong_t)guid,
 			    (u_longlong_t)tvd->vdev_state,
 			    (u_longlong_t)generation,
 			    (u_longlong_t)spa->spa_config_generation);
 		}
 		return (vd);
 	}
 	return (NULL);
 }
 
 /*
  * Traverse the vdev tree calling the supplied function.
  * We continue to walk the tree until we either have walked all
  * children or we receive a non-NULL return from the callback.
  * If a NULL callback is passed, then we just return back the first
  * leaf vdev we encounter.
  */
 vdev_t *
 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
 {
 	uint_t c;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (func == NULL)
 			return (vd);
 		else
 			return (func(vd, arg));
 	}
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 		if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
 			return (cvd);
 	}
 	return (NULL);
 }
 
 /*
  * Verify that dynamic LUN growth works as expected.
  */
 /* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa = ztest_spa;
 	vdev_t *vd, *tvd;
 	metaslab_class_t *mc;
 	metaslab_group_t *mg;
 	size_t psize, newsize;
 	uint64_t top;
 	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
 
 	mutex_enter(&ztest_vdev_lock);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	top = ztest_random_vdev_top(spa, B_TRUE);
 
 	tvd = spa->spa_root_vdev->vdev_child[top];
 	mg = tvd->vdev_mg;
 	mc = mg->mg_class;
 	old_ms_count = tvd->vdev_ms_count;
 	old_class_space = metaslab_class_get_space(mc);
 
 	/*
 	 * Determine the size of the first leaf vdev associated with
 	 * our top-level device.
 	 */
 	vd = vdev_walk_tree(tvd, NULL, NULL);
 	ASSERT3P(vd, !=, NULL);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	psize = vd->vdev_psize;
 
 	/*
 	 * We only try to expand the vdev if it's healthy, less than 4x its
 	 * original size, and it has a valid psize.
 	 */
 	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
 	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
 		spa_config_exit(spa, SCL_STATE, spa);
 		mutex_exit(&ztest_vdev_lock);
 		return;
 	}
 	ASSERT(psize > 0);
 	newsize = psize + psize / 8;
 	ASSERT3U(newsize, >, psize);
 
 	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("Expanding LUN %s from %lu to %lu\n",
 		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
 	}
 
 	/*
 	 * Growing the vdev is a two step process:
 	 *	1). expand the physical size (i.e. relabel)
 	 *	2). online the vdev to create the new metaslabs
 	 */
 	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
 	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
 	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not expand LUN because "
 			    "the vdev configuration changed.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
 		mutex_exit(&ztest_vdev_lock);
 		return;
 	}
 
 	spa_config_exit(spa, SCL_STATE, spa);
 
 	/*
 	 * Expanding the LUN will update the config asynchronously,
 	 * thus we must wait for the async thread to complete any
 	 * pending tasks before proceeding.
 	 */
 	for (;;) {
 		boolean_t done;
 		mutex_enter(&spa->spa_async_lock);
 		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
 		mutex_exit(&spa->spa_async_lock);
 		if (done)
 			break;
 		txg_wait_synced(spa_get_dsl(spa), 0);
 		(void) poll(NULL, 0, 100);
 	}
 
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	tvd = spa->spa_root_vdev->vdev_child[top];
 	new_ms_count = tvd->vdev_ms_count;
 	new_class_space = metaslab_class_get_space(mc);
 
 	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
 		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not verify LUN expansion due to "
 			    "intervening vdev offline or remove.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
 		mutex_exit(&ztest_vdev_lock);
 		return;
 	}
 
 	/*
 	 * Make sure we were able to grow the vdev.
 	 */
 	if (new_ms_count <= old_ms_count)
 		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
 		    old_ms_count, new_ms_count);
 
 	/*
 	 * Make sure we were able to grow the pool.
 	 */
 	if (new_class_space <= old_class_space)
 		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
 		    old_class_space, new_class_space);
 
 	if (ztest_opts.zo_verbose >= 5) {
 		char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
 
 		nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
 		nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
 		(void) printf("%s grew from %s to %s\n",
 		    spa->spa_name, oldnumbuf, newnumbuf);
 	}
 
 	spa_config_exit(spa, SCL_STATE, spa);
 	mutex_exit(&ztest_vdev_lock);
 }
 
 /*
  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
  */
 /* ARGSUSED */
 static void
 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	/*
 	 * Create the objects common to all ztest datasets.
 	 */
 	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
 static int
 ztest_dataset_create(char *dsname)
 {
 	int err;
 	uint64_t rand;
 	dsl_crypto_params_t *dcp = NULL;
 
 	/*
 	 * 50% of the time, we create encrypted datasets
 	 * using a random cipher suite and a hard-coded
 	 * wrapping key.
 	 */
 	rand = ztest_random(2);
 	if (rand != 0) {
 		nvlist_t *crypto_args = fnvlist_alloc();
 		nvlist_t *props = fnvlist_alloc();
 
 		/* slight bias towards the default cipher suite */
 		rand = ztest_random(ZIO_CRYPT_FUNCTIONS);
 		if (rand < ZIO_CRYPT_AES_128_CCM)
 			rand = ZIO_CRYPT_ON;
 
 		fnvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand);
 		fnvlist_add_uint8_array(crypto_args, "wkeydata",
 		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
 
 		/*
 		 * These parameters aren't really used by the kernel. They
 		 * are simply stored so that userspace knows how to load
 		 * the wrapping key.
 		 */
 		fnvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW);
 		fnvlist_add_string(props,
 		    zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt");
 		fnvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL);
 		fnvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL);
 
 		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props,
 		    crypto_args, &dcp));
 
 		fnvlist_free(crypto_args);
 		fnvlist_free(props);
 	}
 
 	err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp,
 	    ztest_objset_create_cb, NULL);
 	dsl_crypto_params_free(dcp, !!err);
 
 	rand = ztest_random(100);
 	if (err || rand < 80)
 		return (err);
 
 	if (ztest_opts.zo_verbose >= 5)
 		(void) printf("Setting dataset %s to sync always\n", dsname);
 	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
 	    ZFS_SYNC_ALWAYS, B_FALSE));
 }
 
 /* ARGSUSED */
 static int
 ztest_objset_destroy_cb(const char *name, void *arg)
 {
 	objset_t *os;
 	dmu_object_info_t doi;
 	int error;
 
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
 	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
 	    B_TRUE, FTAG, &os));
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
 		ASSERT0(error);
 		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
 		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
 	}
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
 	/*
 	 * Destroy the dataset.
 	 */
 	if (strchr(name, '@') != NULL) {
 		VERIFY0(dsl_destroy_snapshot(name, B_TRUE));
 	} else {
 		error = dsl_destroy_head(name);
 		/* There could be a hold on this dataset */
 		if (error != EBUSY)
 			ASSERT0(error);
 	}
 	return (0);
 }
 
 static boolean_t
 ztest_snapshot_create(char *osname, uint64_t id)
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
 	(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
 
 	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (B_FALSE);
 	}
 	if (error != 0 && error != EEXIST) {
 		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
 		    snapname, error);
 	}
 	return (B_TRUE);
 }
 
 static boolean_t
 ztest_snapshot_destroy(char *osname, uint64_t id)
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
 	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
 	    (u_longlong_t)id);
 
 	error = dsl_destroy_snapshot(snapname, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
 	return (B_TRUE);
 }
 
 /* ARGSUSED */
 void
 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_ds_t *zdtmp;
 	int iters;
 	int error;
 	objset_t *os, *os2;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	zilog_t *zilog;
 	int i;
 
 	zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	(void) snprintf(name, sizeof (name), "%s/temp_%llu",
 	    ztest_opts.zo_pool, (u_longlong_t)id);
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
 	 * half of the time.  If we don't replay it, then dsl_destroy_head()
 	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
 	 */
 	if (ztest_random(2) == 0 &&
 	    ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
 	    B_TRUE, FTAG, &os) == 0) {
 		ztest_zd_init(zdtmp, NULL, os);
 		zil_replay(os, zdtmp, ztest_replay_vector);
 		ztest_zd_fini(zdtmp);
 		txg_wait_synced(dmu_objset_pool(os), 0);
 		dmu_objset_disown(os, B_TRUE, FTAG);
 	}
 
 	/*
 	 * There may be an old instance of the dataset we're about to
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
 	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
 	VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
 	    B_TRUE, FTAG, &os));
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
 	error = ztest_dataset_create(name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
 	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE,
 	    FTAG, &os));
 
 	ztest_zd_init(zdtmp, NULL, os);
 
 	/*
 	 * Open the intent log for it.
 	 */
 	zilog = zil_open(os, ztest_get_data);
 
 	/*
 	 * Put some objects in there, do a little I/O to them,
 	 * and randomly take a couple of snapshots along the way.
 	 */
 	iters = ztest_random(5);
 	for (i = 0; i < iters; i++) {
 		ztest_dmu_object_alloc_free(zdtmp, id);
 		if (ztest_random(iters) == 0)
 			(void) ztest_snapshot_create(name, i);
 	}
 
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
 	VERIFY3U(EEXIST, ==,
 	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL));
 
 	/*
 	 * Verify that we can hold an objset that is also owned.
 	 */
 	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
 	dmu_objset_rele(os2, FTAG);
 
 	/*
 	 * Verify that we cannot own an objset that is already owned.
 	 */
 	VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER,
 	    B_FALSE, B_TRUE, FTAG, &os2));
 
 	zil_close(zilog);
 	txg_wait_synced(spa_get_dsl(os->os_spa), 0);
 	dmu_objset_disown(os, B_TRUE, FTAG);
 	ztest_zd_fini(zdtmp);
 out:
 	(void) rw_unlock(&ztest_name_lock);
 
 	umem_free(zdtmp, sizeof (ztest_ds_t));
 }
 
 /*
  * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
  */
 void
 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
 	(void) rw_rdlock(&ztest_name_lock);
 	(void) ztest_snapshot_destroy(zd->zd_name, id);
 	(void) ztest_snapshot_create(zd->zd_name, id);
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Cleanup non-standard snapshots and clones.
  */
 void
 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 {
 	char *snap1name;
 	char *clone1name;
 	char *snap2name;
 	char *clone2name;
 	char *snap3name;
 	int error;
 
 	snap1name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	snap2name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	snap3name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 
 	(void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s/c1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s2_%llu", clone1name, (u_longlong_t)id);
 	(void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s/c2_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s3_%llu", clone1name, (u_longlong_t)id);
 
 	error = dsl_destroy_head(clone2name);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
 	error = dsl_destroy_snapshot(snap3name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
 	error = dsl_destroy_snapshot(snap2name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
 	error = dsl_destroy_head(clone1name);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
 	error = dsl_destroy_snapshot(snap1name, B_FALSE);
 	if (error && error != ENOENT)
 		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
 
 	umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN);
 }
 
 /*
  * Verify dsl_dataset_promote handles EBUSY
  */
 void
 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os;
 	char *snap1name;
 	char *clone1name;
 	char *snap2name;
 	char *clone2name;
 	char *snap3name;
 	char *osname = zd->zd_name;
 	int error;
 
 	snap1name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	snap2name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 	snap3name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	ztest_dsl_dataset_cleanup(osname, id);
 
 	(void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s/c1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s2_%llu", clone1name, (u_longlong_t)id);
 	(void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s/c2_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN,
 	    "%s@s3_%llu", clone1name, (u_longlong_t)id);
 
 	error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
 	}
 
 	error = dmu_objset_clone(clone1name, snap1name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
 	}
 
 	error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
 	}
 
 	error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
 	}
 
 	error = dmu_objset_clone(clone2name, snap3name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
 			goto out;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
 	}
 
 	error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE,
 	    FTAG, &os);
 	if (error)
 		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
 	error = dsl_dataset_promote(clone2name, NULL);
 	if (error == ENOSPC) {
 		dmu_objset_disown(os, B_TRUE, FTAG);
 		ztest_record_enospc(FTAG);
 		goto out;
 	}
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
 		    error);
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
 out:
 	ztest_dsl_dataset_cleanup(osname, id);
 
 	(void) rw_unlock(&ztest_name_lock);
 
 	umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN);
 	umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN);
 }
 
 #undef OD_ARRAY_SIZE
 #define	OD_ARRAY_SIZE	4
 
 /*
  * Verify that dmu_object_{alloc,free} work as expected.
  */
 void
 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t *od;
 	int batchsize;
 	int size;
 	int b;
 
 	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
 	od = umem_alloc(size, UMEM_NOFAIL);
 	batchsize = OD_ARRAY_SIZE;
 
 	for (b = 0; b < batchsize; b++)
 		ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER,
 		    0, 0, 0);
 
 	/*
 	 * Destroy the previous batch of objects, create a new batch,
 	 * and do some I/O on the new objects.
 	 */
 	if (ztest_object_init(zd, od, size, B_TRUE) != 0)
 		return;
 
 	while (ztest_random(4 * batchsize) != 0)
 		ztest_io(zd, od[ztest_random(batchsize)].od_object,
 		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 
 	umem_free(od, size);
 }
 
 #undef OD_ARRAY_SIZE
 #define	OD_ARRAY_SIZE	2
 
 /*
  * Verify that dmu_{read,write} work as expected.
  */
 void
 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 {
 	int size;
 	ztest_od_t *od;
 
 	objset_t *os = zd->zd_os;
 	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
 	od = umem_alloc(size, UMEM_NOFAIL);
 	dmu_tx_t *tx;
 	int i, freeit, error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
 	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
 	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
 	int free_percent = 5;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is arbitrary. It doesn't have to be a power of two,
 	 * and it doesn't have any relation to the object blocksize.
 	 * The only requirement is that it can hold at least two bufwads.
 	 *
 	 * Normally, we write the bufwad to each of these locations.
 	 * However, free_percent of the time we instead write zeroes to
 	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
 	 * bigobj to packobj, we can verify that the DMU is correctly
 	 * tracking which parts of an object are allocated and free,
 	 * and that the contents of the allocated blocks are correct.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
 	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
 	    chunksize);
 
 	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
 		umem_free(od, size);
 		return;
 	}
 
 	bigobj = od[0].od_object;
 	packobj = od[1].od_object;
 	chunksize = od[0].od_gen;
 	ASSERT(chunksize == od[1].od_gen);
 
 	/*
 	 * Prefetch a random chunk of the big object.
 	 * Our aim here is to get some async reads in flight
 	 * for blocks that we may free below; the DMU should
 	 * handle this race correctly.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
 	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
 	    ZIO_PRIORITY_SYNC_READ);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * chunksize;
 	bigsize = s * chunksize;
 
 	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
 
 	/*
 	 * free_percent of the time, free a range of bigobj rather than
 	 * overwriting it.
 	 */
 	freeit = (ztest_random(100) < free_percent);
 
 	/*
 	 * Read the current contents of our objects.
 	 */
 	error = dmu_read(os, packobj, packoff, packsize, packbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT0(error);
 	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
 	    DMU_READ_PREFETCH);
 	ASSERT0(error);
 
 	/*
 	 * Get a tx for the mods to both packobj and bigobj.
 	 */
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, packobj, packoff, packsize);
 
 	if (freeit)
 		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
 	else
 		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
 	/* This accounts for setting the checksum/compression. */
 	dmu_tx_hold_bonus(tx, bigobj);
 
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0) {
 		umem_free(packbuf, packsize);
 		umem_free(bigbuf, bigsize);
 		umem_free(od, size);
 		return;
 	}
 
 	enum zio_checksum cksum;
 	do {
 		cksum = (enum zio_checksum)
 		    ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
 	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 	dmu_object_set_checksum(os, bigobj, cksum, tx);
 
 	enum zio_compress comp;
 	do {
 		comp = (enum zio_compress)
 		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
 	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
 	dmu_object_set_compress(os, bigobj, comp, tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    pack->bw_txg, txg);
 
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    pack->bw_index, n, i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		if (freeit) {
 			bzero(pack, sizeof (bufwad_t));
 		} else {
 			pack->bw_index = n + i;
 			pack->bw_txg = txg;
 			pack->bw_data = 1 + ztest_random(-2ULL);
 		}
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 
 	/*
 	 * We've verified all the old bufwads, and made new ones.
 	 * Now write them out.
 	 */
 	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
 	} else {
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Sanity check the stuff we just wrote.
 	 */
 	{
 		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 		VERIFY(0 == dmu_read(os, packobj, packoff,
 		    packsize, packcheck, DMU_READ_PREFETCH));
 		VERIFY(0 == dmu_read(os, bigobj, bigoff,
 		    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 		umem_free(packcheck, packsize);
 		umem_free(bigcheck, bigsize);
 	}
 
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 	umem_free(od, size);
 }
 
 void
 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
     uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
 {
 	uint64_t i;
 	bufwad_t *pack;
 	bufwad_t *bigH;
 	bufwad_t *bigT;
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    pack->bw_txg, txg);
 
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    pack->bw_index, n, i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		pack->bw_index = n + i;
 		pack->bw_txg = txg;
 		pack->bw_data = 1 + ztest_random(-2ULL);
 
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 }
 
 #undef OD_ARRAY_SIZE
 #define	OD_ARRAY_SIZE	2
 
 void
 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	dmu_tx_t *tx;
 	uint64_t i;
 	int error;
 	int size;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf;
 	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
 	uint64_t blocksize = ztest_random_blocksize();
 	uint64_t chunksize = blocksize;
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 9;
 	dmu_buf_t *bonus_db;
 	arc_buf_t **bigbuf_arcbufs;
 	dmu_object_info_t doi;
 
 	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
 	od = umem_alloc(size, UMEM_NOFAIL);
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is set equal to bigobj block size so that
 	 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
 	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
 	    chunksize);
 
 
 	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
 		umem_free(od, size);
 		return;
 	}
 
 	bigobj = od[0].od_object;
 	packobj = od[1].od_object;
 	blocksize = od[0].od_blocksize;
 	chunksize = blocksize;
 	ASSERT(chunksize == od[1].od_gen);
 
 	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
 	VERIFY(ISP2(doi.doi_data_block_size));
 	VERIFY(chunksize == doi.doi_data_block_size);
 	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * chunksize;
 	bigsize = s * chunksize;
 
 	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
 
 	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
 
 	/*
 	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
 	 * Iteration 1 test zcopy to already referenced dbufs.
 	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
 	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
 	 * Iteration 4 test zcopy when dbuf is no longer dirty.
 	 * Iteration 5 test zcopy when it can't be done.
 	 * Iteration 6 one more zcopy write.
 	 */
 	for (i = 0; i < 7; i++) {
 		uint64_t j;
 		uint64_t off;
 
 		/*
 		 * In iteration 5 (i == 5) use arcbufs
 		 * that don't match bigobj blksz to test
 		 * dmu_assign_arcbuf_by_dbuf() when it can't directly
 		 * assign an arcbuf to a dbuf.
 		 */
 		for (j = 0; j < s; j++) {
 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
 				bigbuf_arcbufs[j] =
 				    dmu_request_arcbuf(bonus_db, chunksize);
 			} else {
 				bigbuf_arcbufs[2 * j] =
 				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 				bigbuf_arcbufs[2 * j + 1] =
 				    dmu_request_arcbuf(bonus_db, chunksize / 2);
 			}
 		}
 
 		/*
 		 * Get a tx for the mods to both packobj and bigobj.
 		 */
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_write(tx, packobj, packoff, packsize);
 		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0) {
 			umem_free(packbuf, packsize);
 			umem_free(bigbuf, bigsize);
 			for (j = 0; j < s; j++) {
 				if (i != 5 ||
 				    chunksize < (SPA_MINBLOCKSIZE * 2)) {
 					dmu_return_arcbuf(bigbuf_arcbufs[j]);
 				} else {
 					dmu_return_arcbuf(
 					    bigbuf_arcbufs[2 * j]);
 					dmu_return_arcbuf(
 					    bigbuf_arcbufs[2 * j + 1]);
 				}
 			}
 			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 			umem_free(od, size);
 			dmu_buf_rele(bonus_db, FTAG);
 			return;
 		}
 
 		/*
 		 * 50% of the time don't read objects in the 1st iteration to
 		 * test dmu_assign_arcbuf_by_dbuf() for the case when there are
 		 * no existing dbufs for the specified offsets.
 		 */
 		if (i != 0 || ztest_random(2) != 0) {
 			error = dmu_read(os, packobj, packoff,
 			    packsize, packbuf, DMU_READ_PREFETCH);
 			ASSERT0(error);
 			error = dmu_read(os, bigobj, bigoff, bigsize,
 			    bigbuf, DMU_READ_PREFETCH);
 			ASSERT0(error);
 		}
 		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
 		    n, chunksize, txg);
 
 		/*
 		 * We've verified all the old bufwads, and made new ones.
 		 * Now write them out.
 		 */
 		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
 			dmu_buf_t *dbt;
 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
 				    bigbuf_arcbufs[j]->b_data, chunksize);
 			} else {
 				bcopy((caddr_t)bigbuf + (off - bigoff),
 				    bigbuf_arcbufs[2 * j]->b_data,
 				    chunksize / 2);
 				bcopy((caddr_t)bigbuf + (off - bigoff) +
 				    chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1]->b_data,
 				    chunksize / 2);
 			}
 
 			if (i == 1) {
 				VERIFY(dmu_buf_hold(os, bigobj, off,
 				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
 			}
 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
 				dmu_assign_arcbuf_by_dbuf(bonus_db, off,
 				    bigbuf_arcbufs[j], tx);
 			} else {
 				dmu_assign_arcbuf_by_dbuf(bonus_db, off,
 				    bigbuf_arcbufs[2 * j], tx);
 				dmu_assign_arcbuf_by_dbuf(bonus_db,
 				    off + chunksize / 2,
 				    bigbuf_arcbufs[2 * j + 1], tx);
 			}
 			if (i == 1) {
 				dmu_buf_rele(dbt, FTAG);
 			}
 		}
 		dmu_tx_commit(tx);
 
 		/*
 		 * Sanity check the stuff we just wrote.
 		 */
 		{
 			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 			VERIFY(0 == dmu_read(os, packobj, packoff,
 			    packsize, packcheck, DMU_READ_PREFETCH));
 			VERIFY(0 == dmu_read(os, bigobj, bigoff,
 			    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 			umem_free(packcheck, packsize);
 			umem_free(bigcheck, bigsize);
 		}
 		if (i == 2) {
 			txg_wait_open(dmu_objset_pool(os), 0);
 		} else if (i == 3) {
 			txg_wait_synced(dmu_objset_pool(os), 0);
 		}
 	}
 
 	dmu_buf_rele(bonus_db, FTAG);
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 	umem_free(od, size);
 }
 
 /* ARGSUSED */
 void
 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t *od;
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
 	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 
 	/*
 	 * Have multiple threads write to large offsets in an object
 	 * to verify that parallel writes to an object -- even to the
 	 * same blocks within the object -- doesn't cause any trouble.
 	 */
 	ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
 		return;
 
 	while (ztest_random(10) != 0)
 		ztest_io(zd, od->od_object, offset);
 
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 void
 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_od_t *od;
 	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
 	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 	uint64_t count = ztest_random(20) + 1;
 	uint64_t blocksize = ztest_random_blocksize();
 	void *data;
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
 	    !ztest_random(2)) != 0) {
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) {
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	ztest_prealloc(zd, od->od_object, offset, count * blocksize);
 
 	data = umem_zalloc(blocksize, UMEM_NOFAIL);
 
 	while (ztest_random(count) != 0) {
 		uint64_t randoff = offset + (ztest_random(count) * blocksize);
 		if (ztest_write(zd, od->od_object, randoff, blocksize,
 		    data) != 0)
 			break;
 		while (ztest_random(4) != 0)
 			ztest_io(zd, od->od_object, randoff);
 	}
 
 	umem_free(data, blocksize);
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /*
  * Verify that zap_{create,destroy,add,remove,update} work as expected.
  */
 #define	ZTEST_ZAP_MIN_INTS	1
 #define	ZTEST_ZAP_MAX_INTS	4
 #define	ZTEST_ZAP_MAX_PROPS	1000
 
 void
 ztest_zap(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	uint64_t object;
 	uint64_t txg, last_txg;
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
 	uint64_t zl_ints, zl_intsize, prop;
 	int i, ints;
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
 	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
 	    !ztest_random(2)) != 0)
 		goto out;
 
 	object = od->od_object;
 
 	/*
 	 * Generate a known hash collision, and verify that
 	 * we can lookup and remove both entries.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		goto out;
 	for (i = 0; i < 2; i++) {
 		value[i] = i;
 		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
 		    1, &value[i], tx));
 	}
 	for (i = 0; i < 2; i++) {
 		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
 		    sizeof (uint64_t), 1, &value[i], tx));
 		VERIFY3U(0, ==,
 		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, 1);
 	}
 	for (i = 0; i < 2; i++) {
 		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
 	}
 	dmu_tx_commit(tx);
 
 	/*
 	 * Generate a buch of random entries.
 	 */
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 	bzero(value, sizeof (value));
 	last_txg = 0;
 
 	/*
 	 * If these zap entries already exist, validate their contents.
 	 */
 	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 	if (error == 0) {
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, 1);
 
 		VERIFY(zap_lookup(os, object, txgname, zl_intsize,
 		    zl_ints, &last_txg) == 0);
 
 		VERIFY(zap_length(os, object, propname, &zl_intsize,
 		    &zl_ints) == 0);
 
 		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 		ASSERT3U(zl_ints, ==, ints);
 
 		VERIFY(zap_lookup(os, object, propname, zl_intsize,
 		    zl_ints, value) == 0);
 
 		for (i = 0; i < ints; i++) {
 			ASSERT3U(value[i], ==, last_txg + object + i);
 		}
 	} else {
 		ASSERT3U(error, ==, ENOENT);
 	}
 
 	/*
 	 * Atomically update two entries in our zap object.
 	 * The first is named txg_%llu, and contains the txg
 	 * in which the property was last updated.  The second
 	 * is named prop_%llu, and the nth element of its value
 	 * should be txg + object + n.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		goto out;
 
 	if (last_txg > txg)
 		fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
 
 	for (i = 0; i < ints; i++)
 		value[i] = txg + object + i;
 
 	VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
 	    1, &txg, tx));
 	VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
 	    ints, value, tx));
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Remove a random pair of entries.
 	 */
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 
 	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 
 	if (error == ENOENT)
 		goto out;
 
 	ASSERT0(error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0)
 		goto out;
 	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
 	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
 	dmu_tx_commit(tx);
 out:
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /*
  * Testcase to test the upgrading of a microzap to fatzap.
  */
 void
 ztest_fzap(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	uint64_t object, txg;
 	int i;
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
 	    !ztest_random(2)) != 0)
 		goto out;
 	object = od->od_object;
 
 	/*
 	 * Add entries to this ZAP and make sure it spills over
 	 * and gets upgraded to a fatzap. Also, since we are adding
 	 * 2050 entries we should see ptrtbl growth and leaf-block split.
 	 */
 	for (i = 0; i < 2050; i++) {
 		char name[ZFS_MAX_DATASET_NAME_LEN];
 		uint64_t value = i;
 		dmu_tx_t *tx;
 		int error;
 
 		(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
 		    (u_longlong_t)id, (u_longlong_t)value);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, B_TRUE, name);
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0)
 			goto out;
 		error = zap_add(os, object, name, sizeof (uint64_t), 1,
 		    &value, tx);
 		ASSERT(error == 0 || error == EEXIST);
 		dmu_tx_commit(tx);
 	}
 out:
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /* ARGSUSED */
 void
 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
 	dmu_tx_t *tx;
 	int i, namelen, error;
 	int micro = ztest_random(2);
 	char name[20], string_value[20];
 	void *data;
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	object = od->od_object;
 
 	/*
 	 * Generate a random name of the form 'xxx.....' where each
 	 * x is a random printable character and the dots are dots.
 	 * There are 94 such characters, and the name length goes from
 	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
 	 */
 	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
 
 	for (i = 0; i < 3; i++)
 		name[i] = '!' + ztest_random('~' - '!' + 1);
 	for (; i < namelen - 1; i++)
 		name[i] = '.';
 	name[i] = '\0';
 
 	if ((namelen & 1) || micro) {
 		wsize = sizeof (txg);
 		wc = 1;
 		data = &txg;
 	} else {
 		wsize = 1;
 		wc = namelen;
 		data = string_value;
 	}
 
 	count = -1ULL;
 	VERIFY0(zap_count(os, object, &count));
 	ASSERT(count != -1ULL);
 
 	/*
 	 * Select an operation: length, lookup, add, update, remove.
 	 */
 	i = ztest_random(5);
 
 	if (i >= 2) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 		if (txg == 0) {
 			umem_free(od, sizeof (ztest_od_t));
 			return;
 		}
 		bcopy(name, string_value, namelen);
 	} else {
 		tx = NULL;
 		txg = 0;
 		bzero(string_value, namelen);
 	}
 
 	switch (i) {
 
 	case 0:
 		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
 		if (error == 0) {
 			ASSERT3U(wsize, ==, zl_wsize);
 			ASSERT3U(wc, ==, zl_wc);
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 		break;
 
 	case 1:
 		error = zap_lookup(os, object, name, wsize, wc, data);
 		if (error == 0) {
 			if (data == string_value &&
 			    bcmp(name, data, namelen) != 0)
 				fatal(0, "name '%s' != val '%s' len %d",
 				    name, data, namelen);
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 		break;
 
 	case 2:
 		error = zap_add(os, object, name, wsize, wc, data, tx);
 		ASSERT(error == 0 || error == EEXIST);
 		break;
 
 	case 3:
 		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
 		break;
 
 	case 4:
 		error = zap_remove(os, object, name, tx);
 		ASSERT(error == 0 || error == ENOENT);
 		break;
 	}
 
 	if (tx != NULL)
 		dmu_tx_commit(tx);
 
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /*
  * Commit callback data.
  */
 typedef struct ztest_cb_data {
 	list_node_t		zcd_node;
 	uint64_t		zcd_txg;
 	int			zcd_expected_err;
 	boolean_t		zcd_added;
 	boolean_t		zcd_called;
 	spa_t			*zcd_spa;
 } ztest_cb_data_t;
 
 /* This is the actual commit callback function */
 static void
 ztest_commit_callback(void *arg, int error)
 {
 	ztest_cb_data_t *data = arg;
 	uint64_t synced_txg;
 
 	VERIFY(data != NULL);
 	VERIFY3S(data->zcd_expected_err, ==, error);
 	VERIFY(!data->zcd_called);
 
 	synced_txg = spa_last_synced_txg(data->zcd_spa);
 	if (data->zcd_txg > synced_txg)
 		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
 		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
 		    synced_txg);
 
 	data->zcd_called = B_TRUE;
 
 	if (error == ECANCELED) {
 		ASSERT0(data->zcd_txg);
 		ASSERT(!data->zcd_added);
 
 		/*
 		 * The private callback data should be destroyed here, but
 		 * since we are going to check the zcd_called field after
 		 * dmu_tx_abort(), we will destroy it there.
 		 */
 		return;
 	}
 
 	ASSERT(data->zcd_added);
 	ASSERT3U(data->zcd_txg, !=, 0);
 
 	(void) mutex_enter(&zcl.zcl_callbacks_lock);
 
 	/* See if this cb was called more quickly */
 	if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
 		zc_min_txg_delay = synced_txg - data->zcd_txg;
 
 	/* Remove our callback from the list */
 	list_remove(&zcl.zcl_callbacks, data);
 
 	(void) mutex_exit(&zcl.zcl_callbacks_lock);
 
 	umem_free(data, sizeof (ztest_cb_data_t));
 }
 
 /* Allocate and initialize callback data structure */
 static ztest_cb_data_t *
 ztest_create_cb_data(objset_t *os, uint64_t txg)
 {
 	ztest_cb_data_t *cb_data;
 
 	cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
 
 	cb_data->zcd_txg = txg;
 	cb_data->zcd_spa = dmu_objset_spa(os);
 	list_link_init(&cb_data->zcd_node);
 
 	return (cb_data);
 }
 
 /*
  * Commit callback test.
  */
 void
 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	dmu_tx_t *tx;
 	ztest_cb_data_t *cb_data[3], *tmp_cb;
 	uint64_t old_txg, txg;
 	int i, error = 0;
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	tx = dmu_tx_create(os);
 
 	cb_data[0] = ztest_create_cb_data(os, 0);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
 
 	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
 
 	/* Every once in a while, abort the transaction on purpose */
 	if (ztest_random(100) == 0)
 		error = -1;
 
 	if (!error)
 		error = dmu_tx_assign(tx, TXG_NOWAIT);
 
 	txg = error ? 0 : dmu_tx_get_txg(tx);
 
 	cb_data[0]->zcd_txg = txg;
 	cb_data[1] = ztest_create_cb_data(os, txg);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
 
 	if (error) {
 		/*
 		 * It's not a strict requirement to call the registered
 		 * callbacks from inside dmu_tx_abort(), but that's what
 		 * it's supposed to happen in the current implementation
 		 * so we will check for that.
 		 */
 		for (i = 0; i < 2; i++) {
 			cb_data[i]->zcd_expected_err = ECANCELED;
 			VERIFY(!cb_data[i]->zcd_called);
 		}
 
 		dmu_tx_abort(tx);
 
 		for (i = 0; i < 2; i++) {
 			VERIFY(cb_data[i]->zcd_called);
 			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
 		}
 
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	cb_data[2] = ztest_create_cb_data(os, txg);
 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
 
 	/*
 	 * Read existing data to make sure there isn't a future leak.
 	 */
 	VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
 	    &old_txg, DMU_READ_PREFETCH));
 
 	if (old_txg > txg)
 		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
 		    old_txg, txg);
 
 	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
 
 	(void) mutex_enter(&zcl.zcl_callbacks_lock);
 
 	/*
 	 * Since commit callbacks don't have any ordering requirement and since
 	 * it is theoretically possible for a commit callback to be called
 	 * after an arbitrary amount of time has elapsed since its txg has been
 	 * synced, it is difficult to reliably determine whether a commit
 	 * callback hasn't been called due to high load or due to a flawed
 	 * implementation.
 	 *
 	 * In practice, we will assume that if after a certain number of txgs a
 	 * commit callback hasn't been called, then most likely there's an
 	 * implementation bug..
 	 */
 	tmp_cb = list_head(&zcl.zcl_callbacks);
 	if (tmp_cb != NULL &&
 	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
 		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
 		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
 	}
 
 	/*
 	 * Let's find the place to insert our callbacks.
 	 *
 	 * Even though the list is ordered by txg, it is possible for the
 	 * insertion point to not be the end because our txg may already be
 	 * quiescing at this point and other callbacks in the open txg
 	 * (from other objsets) may have sneaked in.
 	 */
 	tmp_cb = list_tail(&zcl.zcl_callbacks);
 	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
 		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
 
 	/* Add the 3 callbacks to the list */
 	for (i = 0; i < 3; i++) {
 		if (tmp_cb == NULL)
 			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
 		else
 			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
 			    cb_data[i]);
 
 		cb_data[i]->zcd_added = B_TRUE;
 		VERIFY(!cb_data[i]->zcd_called);
 
 		tmp_cb = cb_data[i];
 	}
 
 	zc_cb_counter += 3;
 
 	(void) mutex_exit(&zcl.zcl_callbacks_lock);
 
 	dmu_tx_commit(tx);
 
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /*
  * Visit each object in the dataset. Verify that its properties
  * are consistent what was stored in the block tag when it was created,
  * and that its unused bonus buffer space has not been overwritten.
  */
 /* ARGSUSED */
 void
 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
 {
 	objset_t *os = zd->zd_os;
 	uint64_t obj;
 	int err = 0;
 
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		ztest_block_tag_t *bt = NULL;
 		dmu_object_info_t doi;
 		dmu_buf_t *db;
 
 		if (dmu_bonus_hold(os, obj, FTAG, &db) != 0)
 			continue;
 
 		dmu_object_info_from_db(db, &doi);
 		if (doi.doi_bonus_size >= sizeof (*bt))
 			bt = ztest_bt_bonus(db);
 
 		if (bt && bt->bt_magic == BT_MAGIC) {
 			ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
 			    bt->bt_offset, bt->bt_gen, bt->bt_txg,
 			    bt->bt_crtxg);
 			ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen);
 		}
 
 		dmu_buf_rele(db, FTAG);
 	}
 }
 
 /* ARGSUSED */
 void
 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
 	zfs_prop_t proplist[] = {
 		ZFS_PROP_CHECKSUM,
 		ZFS_PROP_COMPRESSION,
 		ZFS_PROP_COPIES,
 		ZFS_PROP_DEDUP
 	};
 	int p;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
 		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
 		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
 
 	VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE,
 	    ztest_random_blocksize(), (int)ztest_random(2)));
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /* ARGSUSED */
 void
 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
 	nvlist_t *props = NULL;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
 	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
 
 	VERIFY0(spa_prop_get(ztest_spa, &props));
 
 	if (ztest_opts.zo_verbose >= 6)
 		dump_nvlist(props, 4);
 
 	nvlist_free(props);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 static int
 user_release_one(const char *snapname, const char *holdname)
 {
 	nvlist_t *snaps, *holds;
 	int error;
 
 	snaps = fnvlist_alloc();
 	holds = fnvlist_alloc();
 	fnvlist_add_boolean(holds, holdname);
 	fnvlist_add_nvlist(snaps, snapname, holds);
 	fnvlist_free(holds);
 	error = dsl_dataset_user_release(snaps, NULL);
 	fnvlist_free(snaps);
 	return (error);
 }
 
 /*
  * Test snapshot hold/release and deferred destroy.
  */
 void
 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
 {
 	int error;
 	objset_t *os = zd->zd_os;
 	objset_t *origin;
 	char snapname[100];
 	char fullname[100];
 	char clonename[100];
 	char tag[100];
 	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *holds;
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	dmu_objset_name(os, osname);
 
 	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
 	    (u_longlong_t)id);
 	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
 	(void) snprintf(clonename, sizeof (clonename),
 	    "%s/ch1_%llu", osname, (u_longlong_t)id);
 	(void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);
 
 	/*
 	 * Clean up from any previous run.
 	 */
 	error = dsl_destroy_head(clonename);
 	if (error != ENOENT)
 		ASSERT0(error);
 	error = user_release_one(fullname, tag);
 	if (error != ESRCH && error != ENOENT)
 		ASSERT0(error);
 	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != ENOENT)
 		ASSERT0(error);
 
 	/*
 	 * Create snapshot, clone it, mark snap for deferred destroy,
 	 * destroy clone, verify snap was also destroyed.
 	 */
 	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
 	error = dmu_objset_clone(clonename, fullname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_clone");
 			goto out;
 		}
 		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
 	}
 
 	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
 	error = dsl_destroy_head(clonename);
 	if (error)
 		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
 
 	error = dmu_objset_hold(fullname, FTAG, &origin);
 	if (error != ENOENT)
 		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
 
 	/*
 	 * Create snapshot, add temporary hold, verify that we can't
 	 * destroy a held snapshot, mark for deferred destroy,
 	 * release hold, verify snapshot was destroyed.
 	 */
 	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
 			goto out;
 		}
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
 	holds = fnvlist_alloc();
 	fnvlist_add_string(holds, fullname, tag);
 	error = dsl_dataset_user_hold(holds, 0, NULL);
 	fnvlist_free(holds);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc("dsl_dataset_user_hold");
 		goto out;
 	} else if (error) {
 		fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
 		    fullname, tag, error);
 	}
 
 	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != EBUSY) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
 		    fullname, error);
 	}
 
 	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
 		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
 	error = user_release_one(fullname, tag);
 	if (error)
 		fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
 
 	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
 
 out:
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
  * Inject random faults into the on-disk data.
  */
 /* ARGSUSED */
 void
 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	int fd;
 	uint64_t offset;
 	uint64_t leaves;
 	uint64_t bad = 0x1990c0ffeedecadeull;
 	uint64_t top, leaf;
 	char *path0;
 	char *pathrand;
 	size_t fsize;
 	int bshift = SPA_MAXBLOCKSHIFT + 2;
 	int iters = 1000;
 	int maxfaults;
 	int mirror_save;
 	vdev_t *vd0 = NULL;
 	uint64_t guid0 = 0;
 	boolean_t islog = B_FALSE;
 
 	path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 	pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 
 	mutex_enter(&ztest_vdev_lock);
 	maxfaults = MAXFAULTS();
 	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 	mirror_save = zs->zs_mirrors;
 	mutex_exit(&ztest_vdev_lock);
 
 	ASSERT(leaves >= 1);
 
 	/*
 	 * Grab the name lock as reader. There are some operations
 	 * which don't like to have their vdevs changed while
 	 * they are in progress (i.e. spa_change_guid). Those
 	 * operations will have grabbed the name lock as writer.
 	 */
 	(void) rw_rdlock(&ztest_name_lock);
 
 	/*
 	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	if (ztest_random(2) == 0) {
 		/*
 		 * Inject errors on a normal data device or slog device.
 		 */
 		top = ztest_random_vdev_top(spa, B_TRUE);
 		leaf = ztest_random(leaves) + zs->zs_splits;
 
 		/*
 		 * Generate paths to the first leaf in this top-level vdev,
 		 * and to the random leaf we selected.  We'll induce transient
 		 * write failures and random online/offline activity on leaf 0,
 		 * and we'll write random garbage to the randomly chosen leaf.
 		 */
 		(void) snprintf(path0, MAXPATHLEN, ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + zs->zs_splits);
 		(void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template,
 		    ztest_opts.zo_dir, ztest_opts.zo_pool,
 		    top * leaves + leaf);
 
 		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
 		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
 			islog = B_TRUE;
 
 		/*
 		 * If the top-level vdev needs to be resilvered
 		 * then we only allow faults on the device that is
 		 * resilvering.
 		 */
 		if (vd0 != NULL && maxfaults != 1 &&
 		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
 		    vd0->vdev_resilver_txg != 0)) {
 			/*
 			 * Make vd0 explicitly claim to be unreadable,
 			 * or unwriteable, or reach behind its back
 			 * and close the underlying fd.  We can do this if
 			 * maxfaults == 0 because we'll fail and reexecute,
 			 * and we can do it if maxfaults >= 2 because we'll
 			 * have enough redundancy.  If maxfaults == 1, the
 			 * combination of this with injection of random data
 			 * corruption below exceeds the pool's fault tolerance.
 			 */
 			vdev_file_t *vf = vd0->vdev_tsd;
 
 			if (vf != NULL && ztest_random(3) == 0) {
 				(void) close(vf->vf_vnode->v_fd);
 				vf->vf_vnode->v_fd = -1;
 			} else if (ztest_random(2) == 0) {
 				vd0->vdev_cant_read = B_TRUE;
 			} else {
 				vd0->vdev_cant_write = B_TRUE;
 			}
 			guid0 = vd0->vdev_guid;
 		}
 	} else {
 		/*
 		 * Inject errors on an l2cache device.
 		 */
 		spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 		if (sav->sav_count == 0) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			(void) rw_unlock(&ztest_name_lock);
 			goto out;
 		}
 		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
 		guid0 = vd0->vdev_guid;
 		(void) strcpy(path0, vd0->vdev_path);
 		(void) strcpy(pathrand, vd0->vdev_path);
 
 		leaf = 0;
 		leaves = 1;
 		maxfaults = INT_MAX;	/* no limit on cache devices */
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 	(void) rw_unlock(&ztest_name_lock);
 
 	/*
 	 * If we can tolerate two or more faults, or we're dealing
 	 * with a slog, randomly online/offline vd0.
 	 */
 	if ((maxfaults >= 2 || islog) && guid0 != 0) {
 		if (ztest_random(10) < 6) {
 			int flags = (ztest_random(2) == 0 ?
 			    ZFS_OFFLINE_TEMPORARY : 0);
 
 			/*
 			 * We have to grab the zs_name_lock as writer to
 			 * prevent a race between offlining a slog and
 			 * destroying a dataset. Offlining the slog will
 			 * grab a reference on the dataset which may cause
 			 * dsl_destroy_head() to fail with EBUSY thus
 			 * leaving the dataset in an inconsistent state.
 			 */
 			if (islog)
 				(void) rw_wrlock(&ztest_name_lock);
 
 			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
 
 			if (islog)
 				(void) rw_unlock(&ztest_name_lock);
 		} else {
 			/*
 			 * Ideally we would like to be able to randomly
 			 * call vdev_[on|off]line without holding locks
 			 * to force unpredictable failures but the side
 			 * effects of vdev_[on|off]line prevent us from
 			 * doing so. We grab the ztest_vdev_lock here to
 			 * prevent a race between injection testing and
 			 * aux_vdev removal.
 			 */
 			mutex_enter(&ztest_vdev_lock);
 			(void) vdev_online(spa, guid0, 0, NULL);
 			mutex_exit(&ztest_vdev_lock);
 		}
 	}
 
 	if (maxfaults == 0)
 		goto out;
 
 	/*
 	 * We have at least single-fault tolerance, so inject data corruption.
 	 */
 	fd = open(pathrand, O_RDWR);
 
 	if (fd == -1)	/* we hit a gap in the device namespace */
 		goto out;
 
 	fsize = lseek(fd, 0, SEEK_END);
 
 	while (--iters != 0) {
 		/*
 		 * The offset must be chosen carefully to ensure that
 		 * we do not inject a given logical block with errors
 		 * on two different leaf devices, because ZFS can not
 		 * tolerate that (if maxfaults==1).
 		 *
 		 * We divide each leaf into chunks of size
 		 * (# leaves * SPA_MAXBLOCKSIZE * 4).  Within each chunk
 		 * there is a series of ranges to which we can inject errors.
 		 * Each range can accept errors on only a single leaf vdev.
 		 * The error injection ranges are separated by ranges
 		 * which we will not inject errors on any device (DMZs).
 		 * Each DMZ must be large enough such that a single block
 		 * can not straddle it, so that a single block can not be
 		 * a target in two different injection ranges (on different
 		 * leaf vdevs).
 		 *
 		 * For example, with 3 leaves, each chunk looks like:
 		 *    0 to  32M: injection range for leaf 0
 		 *  32M to  64M: DMZ - no injection allowed
 		 *  64M to  96M: injection range for leaf 1
 		 *  96M to 128M: DMZ - no injection allowed
 		 * 128M to 160M: injection range for leaf 2
 		 * 160M to 192M: DMZ - no injection allowed
 		 */
 		offset = ztest_random(fsize / (leaves << bshift)) *
 		    (leaves << bshift) + (leaf << bshift) +
 		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
 
 		/*
 		 * Only allow damage to the labels at one end of the vdev.
 		 *
 		 * If all labels are damaged, the device will be totally
 		 * inaccessible, which will result in loss of data,
 		 * because we also damage (parts of) the other side of
 		 * the mirror/raidz.
 		 *
 		 * Additionally, we will always have both an even and an
 		 * odd label, so that we can handle crashes in the
 		 * middle of vdev_config_sync().
 		 */
 		if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
 			continue;
 
 		/*
 		 * The two end labels are stored at the "end" of the disk, but
 		 * the end of the disk (vdev_psize) is aligned to
 		 * sizeof (vdev_label_t).
 		 */
 		uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
 		if ((leaf & 1) == 1 &&
 		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
 			continue;
 
 		mutex_enter(&ztest_vdev_lock);
 		if (mirror_save != zs->zs_mirrors) {
 			mutex_exit(&ztest_vdev_lock);
 			(void) close(fd);
 			goto out;
 		}
 
 		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
 			fatal(1, "can't inject bad word at 0x%llx in %s",
 			    offset, pathrand);
 
 		mutex_exit(&ztest_vdev_lock);
 
 		if (ztest_opts.zo_verbose >= 7)
 			(void) printf("injected bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 	}
 
 	(void) close(fd);
 out:
 	umem_free(path0, MAXPATHLEN);
 	umem_free(pathrand, MAXPATHLEN);
 }
 
 /*
  * Verify that DDT repair works as expected.
  */
 void
 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
 	spa_t *spa = ztest_spa;
 	objset_t *os = zd->zd_os;
 	ztest_od_t *od;
 	uint64_t object, blocksize, txg, pattern, psize;
 	enum zio_checksum checksum = spa_dedup_checksum(spa);
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	abd_t *abd;
 	blkptr_t blk;
 	int copies = 2 * ZIO_DEDUPDITTO_MIN;
 	int i;
 
 	blocksize = ztest_random_blocksize();
 	blocksize = MIN(blocksize, 2048);	/* because we write so many */
 
 	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
 
 	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	/*
 	 * Take the name lock as writer to prevent anyone else from changing
 	 * the pool and dataset properies we need to maintain during this test.
 	 */
 	(void) rw_wrlock(&ztest_name_lock);
 
 	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
 	    B_FALSE) != 0 ||
 	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
 	    B_FALSE) != 0) {
 		(void) rw_unlock(&ztest_name_lock);
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	dmu_objset_stats_t dds;
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	object = od[0].od_object;
 	blocksize = od[0].od_blocksize;
 	pattern = zs->zs_guid ^ dds.dds_guid;
 
 	ASSERT(object != 0);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
 		(void) rw_unlock(&ztest_name_lock);
 		umem_free(od, sizeof (ztest_od_t));
 		return;
 	}
 
 	/*
 	 * Write all the copies of our block.
 	 */
 	for (i = 0; i < copies; i++) {
 		uint64_t offset = i * blocksize;
 		int error = dmu_buf_hold(os, object, offset, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error != 0) {
 			fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
 			    os, (long long)object, (long long) offset, error);
 		}
 		ASSERT(db->db_offset == offset);
 		ASSERT(db->db_size == blocksize);
 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
 		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
 		dmu_buf_will_fill(db, tx);
 		ztest_pattern_set(db->db_data, db->db_size, pattern);
 		dmu_buf_rele(db, FTAG);
 	}
 
 	dmu_tx_commit(tx);
 	txg_wait_synced(spa_get_dsl(spa), txg);
 
 	/*
 	 * Find out what block we got.
 	 */
 	VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
 	    DMU_READ_NO_PREFETCH));
 	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
 	dmu_buf_rele(db, FTAG);
 
 	/*
 	 * Damage the block.  Dedup-ditto will save us when we read it later.
 	 */
 	psize = BP_GET_PSIZE(&blk);
 	abd = abd_alloc_linear(psize, B_TRUE);
 	ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
 
 	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
 	    abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
 
 	abd_free(abd);
 
 	(void) rw_unlock(&ztest_name_lock);
 	umem_free(od, sizeof (ztest_od_t));
 }
 
 /*
  * Scrub the pool.
  */
 /* ARGSUSED */
 void
 ztest_scrub(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa = ztest_spa;
 
 	(void) spa_scan(spa, POOL_SCAN_SCRUB);
 	(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
 	(void) spa_scan(spa, POOL_SCAN_SCRUB);
 }
 
 /*
  * Change the guid for the pool.
  */
 /* ARGSUSED */
 void
 ztest_reguid(ztest_ds_t *zd, uint64_t id)
 {
 	spa_t *spa = ztest_spa;
 	uint64_t orig, load;
 	int error;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	orig = spa_guid(spa);
 	load = spa_load_guid(spa);
 
 	(void) rw_wrlock(&ztest_name_lock);
 	error = spa_change_guid(spa);
 	(void) rw_unlock(&ztest_name_lock);
 
 	if (error != 0)
 		return;
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("Changed guid old %llu -> %llu\n",
 		    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
 	}
 
 	VERIFY3U(orig, !=, spa_guid(spa));
 	VERIFY3U(load, ==, spa_load_guid(spa));
 }
 
 /*
  * Rename the pool to a different name and then rename it back.
  */
 /* ARGSUSED */
 void
 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
 {
 	char *oldname, *newname;
 	spa_t *spa;
 
 	if (ztest_opts.zo_mmp_test)
 		return;
 
 	(void) rw_wrlock(&ztest_name_lock);
 
 	oldname = ztest_opts.zo_pool;
 	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
 	(void) strcpy(newname, oldname);
 	(void) strcat(newname, "_tmp");
 
 	/*
 	 * Do the rename
 	 */
 	VERIFY3U(0, ==, spa_rename(oldname, newname));
 
 	/*
 	 * Try to open it under the old name, which shouldn't exist
 	 */
 	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Open it under the new name and make sure it's still the same spa_t.
 	 */
 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 
 	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	/*
 	 * Rename it back to the original
 	 */
 	VERIFY3U(0, ==, spa_rename(newname, oldname));
 
 	/*
 	 * Make sure it can still be opened
 	 */
 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
 	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	umem_free(newname, strlen(newname) + 1);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
 
 void
 ztest_fletcher(ztest_ds_t *zd, uint64_t id)
 {
 	hrtime_t end = gethrtime() + NANOSEC;
 
 	while (gethrtime() <= end) {
 		int run_count = 100;
 		void *buf;
 		struct abd *abd_data, *abd_meta;
 		uint32_t size;
 		int *ptr;
 		int i;
 		zio_cksum_t zc_ref;
 		zio_cksum_t zc_ref_byteswap;
 
 		size = ztest_random_blocksize();
 
 		buf = umem_alloc(size, UMEM_NOFAIL);
 		abd_data = abd_alloc(size, B_FALSE);
 		abd_meta = abd_alloc(size, B_TRUE);
 
 		for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
 			*ptr = ztest_random(UINT_MAX);
 
 		abd_copy_from_buf_off(abd_data, buf, 0, size);
 		abd_copy_from_buf_off(abd_meta, buf, 0, size);
 
 		VERIFY0(fletcher_4_impl_set("scalar"));
 		fletcher_4_native(buf, size, NULL, &zc_ref);
 		fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap);
 
 		VERIFY0(fletcher_4_impl_set("cycle"));
 		while (run_count-- > 0) {
 			zio_cksum_t zc;
 			zio_cksum_t zc_byteswap;
 
 			fletcher_4_byteswap(buf, size, NULL, &zc_byteswap);
 			fletcher_4_native(buf, size, NULL, &zc);
 
 			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
 			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
 			    sizeof (zc_byteswap)));
 
 			/* Test ABD - data */
 			abd_fletcher_4_byteswap(abd_data, size, NULL,
 			    &zc_byteswap);
 			abd_fletcher_4_native(abd_data, size, NULL, &zc);
 
 			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
 			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
 			    sizeof (zc_byteswap)));
 
 			/* Test ABD - metadata */
 			abd_fletcher_4_byteswap(abd_meta, size, NULL,
 			    &zc_byteswap);
 			abd_fletcher_4_native(abd_meta, size, NULL, &zc);
 
 			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
 			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
 			    sizeof (zc_byteswap)));
 
 		}
 
 		umem_free(buf, size);
 		abd_free(abd_data);
 		abd_free(abd_meta);
 	}
 }
 
 void
 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
 {
 	void *buf;
 	size_t size;
 	int *ptr;
 	int i;
 	zio_cksum_t zc_ref;
 	zio_cksum_t zc_ref_bswap;
 
 	hrtime_t end = gethrtime() + NANOSEC;
 
 	while (gethrtime() <= end) {
 		int run_count = 100;
 
 		size = ztest_random_blocksize();
 		buf = umem_alloc(size, UMEM_NOFAIL);
 
 		for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
 			*ptr = ztest_random(UINT_MAX);
 
 		VERIFY0(fletcher_4_impl_set("scalar"));
 		fletcher_4_native(buf, size, NULL, &zc_ref);
 		fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap);
 
 		VERIFY0(fletcher_4_impl_set("cycle"));
 
 		while (run_count-- > 0) {
 			zio_cksum_t zc;
 			zio_cksum_t zc_bswap;
 			size_t pos = 0;
 
 			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
 			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
 
 			while (pos < size) {
 				size_t inc = 64 * ztest_random(size / 67);
 				/* sometimes add few bytes to test non-simd */
 				if (ztest_random(100) < 10)
 					inc += P2ALIGN(ztest_random(64),
 					    sizeof (uint32_t));
 
 				if (inc > (size - pos))
 					inc = size - pos;
 
 				fletcher_4_incremental_native(buf + pos, inc,
 				    &zc);
 				fletcher_4_incremental_byteswap(buf + pos, inc,
 				    &zc_bswap);
 
 				pos += inc;
 			}
 
 			VERIFY3U(pos, ==, size);
 
 			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
 			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
 
 			/*
 			 * verify if incremental on the whole buffer is
 			 * equivalent to non-incremental version
 			 */
 			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
 			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
 
 			fletcher_4_incremental_native(buf, size, &zc);
 			fletcher_4_incremental_byteswap(buf, size, &zc_bswap);
 
 			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
 			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
 		}
 
 		umem_free(buf, size);
 	}
 }
 
 static int
 ztest_check_path(char *path)
 {
 	struct stat s;
 	/* return true on success */
 	return (!stat(path, &s));
 }
 
 static void
 ztest_get_zdb_bin(char *bin, int len)
 {
 	char *zdb_path;
 	/*
 	 * Try to use ZDB_PATH and in-tree zdb path. If not successful, just
 	 * let popen to search through PATH.
 	 */
 	if ((zdb_path = getenv("ZDB_PATH"))) {
 		strlcpy(bin, zdb_path, len); /* In env */
 		if (!ztest_check_path(bin)) {
 			ztest_dump_core = 0;
 			fatal(1, "invalid ZDB_PATH '%s'", bin);
 		}
 		return;
 	}
 
 	VERIFY(realpath(getexecname(), bin) != NULL);
 	if (strstr(bin, "/ztest/")) {
 		strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */
 		strcat(bin, "/zdb/zdb");
 		if (ztest_check_path(bin))
 			return;
 	}
 	strcpy(bin, "zdb");
 }
 
 /*
  * Verify pool integrity by running zdb.
  */
 static void
 ztest_run_zdb(char *pool)
 {
 	int status;
 	char *bin;
 	char *zdb;
 	char *zbuf;
 	const int len = MAXPATHLEN + MAXNAMELEN + 20;
 	FILE *fp;
 
 	bin = umem_alloc(len, UMEM_NOFAIL);
 	zdb = umem_alloc(len, UMEM_NOFAIL);
 	zbuf = umem_alloc(1024, UMEM_NOFAIL);
 
 	ztest_get_zdb_bin(bin, len);
 
 	(void) sprintf(zdb,
 	    "%s -bcc%s%s -G -d -U %s %s",
 	    bin,
 	    ztest_opts.zo_verbose >= 3 ? "s" : "",
 	    ztest_opts.zo_verbose >= 4 ? "v" : "",
 	    spa_config_path,
 	    pool);
 
 	if (ztest_opts.zo_verbose >= 5)
 		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
 
 	fp = popen(zdb, "r");
 
 	while (fgets(zbuf, 1024, fp) != NULL)
 		if (ztest_opts.zo_verbose >= 3)
 			(void) printf("%s", zbuf);
 
 	status = pclose(fp);
 
 	if (status == 0)
 		goto out;
 
 	ztest_dump_core = 0;
 	if (WIFEXITED(status))
 		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
 	else
 		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
 out:
 	umem_free(bin, len);
 	umem_free(zdb, len);
 	umem_free(zbuf, 1024);
 }
 
 static void
 ztest_walk_pool_directory(char *header)
 {
 	spa_t *spa = NULL;
 
 	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s\n", header);
 
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL)
 		if (ztest_opts.zo_verbose >= 6)
 			(void) printf("\t%s\n", spa_name(spa));
 	mutex_exit(&spa_namespace_lock);
 }
 
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
 	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
 	int error;
 
 	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
 		    oldname, newname);
 	}
 
 	/*
 	 * Clean up from previous runs.
 	 */
 	(void) spa_destroy(newname);
 
 	/*
 	 * Get the pool's configuration and guid.
 	 */
 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Kick off a scrub to tickle scrub/export races.
 	 */
 	if (ztest_random(2) == 0)
 		(void) spa_scan(spa, POOL_SCAN_SCRUB);
 
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
 	ztest_walk_pool_directory("pools before export");
 
 	/*
 	 * Export it.
 	 */
 	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
 
 	ztest_walk_pool_directory("pools after export");
 
 	/*
 	 * Try to import it.
 	 */
 	newconfig = spa_tryimport(config);
 	ASSERT(newconfig != NULL);
 	nvlist_free(newconfig);
 
 	/*
 	 * Import it under the new name.
 	 */
 	error = spa_import(newname, config, NULL, 0);
 	if (error != 0) {
 		dump_nvlist(config, 0);
 		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
 		    oldname, newname, error);
 	}
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
 	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 */
 	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
 	 */
 	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
 	/*
 	 * Verify that we can open and close the pool using the new name.
 	 */
 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 	ASSERT(pool_guid == spa_guid(spa));
 	spa_close(spa, FTAG);
 
 	nvlist_free(config);
 }
 
 static void
 ztest_resume(spa_t *spa)
 {
 	if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
 		(void) printf("resuming from suspended state\n");
 	spa_vdev_state_enter(spa, SCL_NONE);
 	vdev_clear(spa, NULL);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 	(void) zio_resume(spa);
 }
 
 static void
 ztest_resume_thread(void *arg)
 {
 	spa_t *spa = arg;
 
 	while (!ztest_exiting) {
 		if (spa_suspended(spa))
 			ztest_resume(spa);
 		(void) poll(NULL, 0, 100);
 
 		/*
 		 * Periodically change the zfs_compressed_arc_enabled setting.
 		 */
 		if (ztest_random(10) == 0)
 			zfs_compressed_arc_enabled = ztest_random(2);
 
 		/*
 		 * Periodically change the zfs_abd_scatter_enabled setting.
 		 */
 		if (ztest_random(10) == 0)
 			zfs_abd_scatter_enabled = ztest_random(2);
 	}
 
 	thread_exit();
 }
 
 #define	GRACE	300
 
 #if 0
 static void
 ztest_deadman_alarm(int sig)
 {
 	fatal(0, "failed to complete within %d seconds of deadline", GRACE);
 }
 #endif
 
 static void
 ztest_execute(int test, ztest_info_t *zi, uint64_t id)
 {
 	ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
 	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
 	hrtime_t functime = gethrtime();
 	int i;
 
 	for (i = 0; i < zi->zi_iters; i++)
 		zi->zi_func(zd, id);
 
 	functime = gethrtime() - functime;
 
 	atomic_add_64(&zc->zc_count, 1);
 	atomic_add_64(&zc->zc_time, functime);
 
 	if (ztest_opts.zo_verbose >= 4)
 		(void) printf("%6.2f sec in %s\n",
 		    (double)functime / NANOSEC, zi->zi_funcname);
 }
 
 static void
 ztest_thread(void *arg)
 {
 	int rand;
 	uint64_t id = (uintptr_t)arg;
 	ztest_shared_t *zs = ztest_shared;
 	uint64_t call_next;
 	hrtime_t now;
 	ztest_info_t *zi;
 	ztest_shared_callstate_t *zc;
 
 	while ((now = gethrtime()) < zs->zs_thread_stop) {
 		/*
 		 * See if it's time to force a crash.
 		 */
 		if (now > zs->zs_thread_kill)
 			ztest_kill(zs);
 
 		/*
 		 * If we're getting ENOSPC with some regularity, stop.
 		 */
 		if (zs->zs_enospc_count > 10)
 			break;
 
 		/*
 		 * Pick a random function to execute.
 		 */
 		rand = ztest_random(ZTEST_FUNCS);
 		zi = &ztest_info[rand];
 		zc = ZTEST_GET_SHARED_CALLSTATE(rand);
 		call_next = zc->zc_next;
 
 		if (now >= call_next &&
 		    atomic_cas_64(&zc->zc_next, call_next, call_next +
 		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
 			ztest_execute(rand, zi, id);
 		}
 	}
 
 	thread_exit();
 }
 
 static void
 ztest_dataset_name(char *dsname, char *pool, int d)
 {
 	(void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
 }
 
 static void
 ztest_dataset_destroy(int d)
 {
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int t;
 
 	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
 	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("Destroying %s to free up space\n", name);
 
 	/*
 	 * Cleanup any non-standard clones and snapshots.  In general,
 	 * ztest thread t operates on dataset (t % zopt_datasets),
 	 * so there may be more than one thing to clean up.
 	 */
 	for (t = d; t < ztest_opts.zo_threads;
 	    t += ztest_opts.zo_datasets)
 		ztest_dsl_dataset_cleanup(name, t);
 
 	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 }
 
 static void
 ztest_dataset_dirobj_verify(ztest_ds_t *zd)
 {
 	uint64_t usedobjs, dirobjs, scratch;
 
 	/*
 	 * ZTEST_DIROBJ is the object directory for the entire dataset.
 	 * Therefore, the number of objects in use should equal the
 	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
 	 * If not, we have an object leak.
 	 *
 	 * Note that we can only check this in ztest_dataset_open(),
 	 * when the open-context and syncing-context values agree.
 	 * That's because zap_count() returns the open-context value,
 	 * while dmu_objset_space() returns the rootbp fill count.
 	 */
 	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
 	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
 	ASSERT3U(dirobjs + 1, ==, usedobjs);
 }
 
 static int
 ztest_dataset_open(int d)
 {
 	ztest_ds_t *zd = &ztest_ds[d];
 	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
 	objset_t *os;
 	zilog_t *zilog;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
 	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
 	(void) rw_rdlock(&ztest_name_lock);
 
 	error = ztest_dataset_create(name);
 	if (error == ENOSPC) {
 		(void) rw_unlock(&ztest_name_lock);
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
 	ASSERT(error == 0 || error == EEXIST);
 
 	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
 	    B_TRUE, zd, &os));
 	(void) rw_unlock(&ztest_name_lock);
 
 	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
 
 	zilog = zd->zd_zilog;
 
 	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
 	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
 		fatal(0, "missing log records: claimed %llu < committed %llu",
 		    zilog->zl_header->zh_claim_lr_seq, committed_seq);
 
 	ztest_dataset_dirobj_verify(zd);
 
 	zil_replay(os, zd, ztest_replay_vector);
 
 	ztest_dataset_dirobj_verify(zd);
 
 	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
 		    zd->zd_name,
 		    (u_longlong_t)zilog->zl_parse_blk_count,
 		    (u_longlong_t)zilog->zl_parse_lr_count,
 		    (u_longlong_t)zilog->zl_replaying_seq);
 
 	zilog = zil_open(os, ztest_get_data);
 
 	if (zilog->zl_replaying_seq != 0 &&
 	    zilog->zl_replaying_seq < committed_seq)
 		fatal(0, "missing log records: replayed %llu < committed %llu",
 		    zilog->zl_replaying_seq, committed_seq);
 
 	return (0);
 }
 
 static void
 ztest_dataset_close(int d)
 {
 	ztest_ds_t *zd = &ztest_ds[d];
 
 	zil_close(zd->zd_zilog);
 	txg_wait_synced(spa_get_dsl(zd->zd_os->os_spa), 0);
 	dmu_objset_disown(zd->zd_os, B_TRUE, zd);
 
 	ztest_zd_fini(zd);
 }
 
 /*
  * Kick off threads to run tests on all datasets in parallel.
  */
 static void
 ztest_run(ztest_shared_t *zs)
 {
 	spa_t *spa;
 	objset_t *os;
 	kthread_t *resume_thread;
 	kthread_t **run_threads;
 	uint64_t object;
 	int error;
 	int t, d;
 
 	ztest_exiting = B_FALSE;
 
 	/*
 	 * Initialize parent/child shared state.
 	 */
 	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	zs->zs_thread_start = gethrtime();
 	zs->zs_thread_stop =
 	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
 	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
 	zs->zs_thread_kill = zs->zs_thread_stop;
 	if (ztest_random(100) < ztest_opts.zo_killrate) {
 		zs->zs_thread_kill -=
 		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
 	}
 
 	mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
 	    offsetof(ztest_cb_data_t, zcd_node));
 
 	/*
 	 * Open our pool.
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	spa->spa_debug = B_TRUE;
 	metaslab_preload_limit = ztest_random(20) + 1;
 	ztest_spa = spa;
 
 	dmu_objset_stats_t dds;
 	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
 	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	zs->zs_guid = dds.dds_guid;
 	dmu_objset_disown(os, B_TRUE, FTAG);
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
 	/*
 	 * We don't expect the pool to suspend unless maxfaults == 0,
 	 * in which case ztest_fault_inject() temporarily takes away
 	 * the only valid replica.
 	 */
 	if (MAXFAULTS() == 0)
 		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
 	else
 		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
 
 	/*
 	 * Create a thread to periodically resume suspended I/O.
 	 */
 	resume_thread = thread_create(NULL, 0, ztest_resume_thread,
 	    spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
 
 #if 0
 	/*
 	 * Set a deadman alarm to abort() if we hang.
 	 */
 	signal(SIGALRM, ztest_deadman_alarm);
 	alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);
 #endif
 
 	/*
 	 * Verify that we can safely inquire about about any object,
 	 * whether it's allocated or not.  To make it interesting,
 	 * we probe a 5-wide window around each power of two.
 	 * This hits all edge cases, including zero and the max.
 	 */
 	for (t = 0; t < 64; t++) {
 		for (d = -5; d <= 5; d++) {
 			error = dmu_object_info(spa->spa_meta_objset,
 			    (1ULL << t) + d, NULL);
 			ASSERT(error == 0 || error == ENOENT ||
 			    error == EINVAL);
 		}
 	}
 
 	/*
 	 * If we got any ENOSPC errors on the previous run, destroy something.
 	 */
 	if (zs->zs_enospc_count != 0) {
 		int d = ztest_random(ztest_opts.zo_datasets);
 		ztest_dataset_destroy(d);
 	}
 	zs->zs_enospc_count = 0;
 
 	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
 	    UMEM_NOFAIL);
 
 	if (ztest_opts.zo_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
 	/*
 	 * Kick off all the tests that run in parallel.
 	 */
 	for (t = 0; t < ztest_opts.zo_threads; t++) {
 		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
 			umem_free(run_threads, ztest_opts.zo_threads *
 			    sizeof (kthread_t *));
 			return;
 		}
 
 		run_threads[t] = thread_create(NULL, 0, ztest_thread,
 		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
 		    defclsyspri);
 	}
 
 	/*
 	 * Wait for all of the tests to complete.  We go in reverse order
 	 * so we don't close datasets while threads are still using them.
 	 */
 	for (t = ztest_opts.zo_threads - 1; t >= 0; t--) {
 		VERIFY0(thread_join(run_threads[t]));
 		if (t < ztest_opts.zo_datasets)
 			ztest_dataset_close(t);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
 
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
 	VERIFY0(thread_join(resume_thread));
 	ztest_resume(spa);
 
 	/*
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
 	for (object = 1; object < 50; object++) {
 		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/* Verify that at least one commit cb was called in a timely fashion */
 	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
 		VERIFY0(zc_min_txg_delay);
 
 	spa_close(spa, FTAG);
 
 	/*
 	 * Verify that we can loop over all pools.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		if (ztest_opts.zo_verbose > 3)
 			(void) printf("spa_next: found %s\n", spa_name(spa));
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Verify that we can export the pool and reimport it under a
 	 * different name.
 	 */
 	if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) {
 		char name[ZFS_MAX_DATASET_NAME_LEN];
 		(void) snprintf(name, sizeof (name), "%s_import",
 		    ztest_opts.zo_pool);
 		ztest_spa_import_export(ztest_opts.zo_pool, name);
 		ztest_spa_import_export(name, ztest_opts.zo_pool);
 	}
 
 	kernel_fini();
 
 	list_destroy(&zcl.zcl_callbacks);
 	mutex_destroy(&zcl.zcl_callbacks_lock);
 	(void) rwlock_destroy(&ztest_name_lock);
 	mutex_destroy(&ztest_vdev_lock);
 }
 
 static void
 ztest_freeze(void)
 {
 	ztest_ds_t *zd = &ztest_ds[0];
 	spa_t *spa;
 	int numloops = 0;
 
 	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("testing spa_freeze()...\n");
 
 	kernel_init(FREAD | FWRITE);
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	VERIFY3U(0, ==, ztest_dataset_open(0));
 	spa->spa_debug = B_TRUE;
 	ztest_spa = spa;
 
 	/*
 	 * Force the first log block to be transactionally allocated.
 	 * We have to do this before we freeze the pool -- otherwise
 	 * the log chain won't be anchored.
 	 */
 	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
 		ztest_dmu_object_alloc_free(zd, 0);
 		zil_commit(zd->zd_zilog, 0);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Freeze the pool.  This stops spa_sync() from doing anything,
 	 * so that the only way to record changes from now on is the ZIL.
 	 */
 	spa_freeze(spa);
 
 	/*
 	 * Because it is hard to predict how much space a write will actually
 	 * require beforehand, we leave ourselves some fudge space to write over
 	 * capacity.
 	 */
 	uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
 
 	/*
 	 * Run tests that generate log records but don't alter the pool config
 	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
 	 * We do a txg_wait_synced() after each iteration to force the txg
 	 * to increase well beyond the last synced value in the uberblock.
 	 * The ZIL should be OK with that.
 	 *
 	 * Run a random number of times less than zo_maxloops and ensure we do
 	 * not run out of space on the pool.
 	 */
 	while (ztest_random(10) != 0 &&
 	    numloops++ < ztest_opts.zo_maxloops &&
 	    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
 		ztest_od_t od;
 		ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 		VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
 		ztest_io(zd, od.od_object,
 		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 		txg_wait_synced(spa_get_dsl(spa), 0);
 	}
 
 	/*
 	 * Commit all of the changes we just generated.
 	 */
 	zil_commit(zd->zd_zilog, 0);
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Close our dataset and close the pool.
 	 */
 	ztest_dataset_close(0);
 	spa_close(spa, FTAG);
 	kernel_fini();
 
 	/*
 	 * Open and close the pool and dataset to induce log replay.
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
 	VERIFY3U(0, ==, ztest_dataset_open(0));
 	spa->spa_debug = B_TRUE;
 	ztest_spa = spa;
 	txg_wait_synced(spa_get_dsl(spa), 0);
 	ztest_dataset_close(0);
 	ztest_reguid(NULL, 0);
 
 	spa_close(spa, FTAG);
 	kernel_fini();
 }
 
 void
 print_time(hrtime_t t, char *timebuf)
 {
 	hrtime_t s = t / NANOSEC;
 	hrtime_t m = s / 60;
 	hrtime_t h = m / 60;
 	hrtime_t d = h / 24;
 
 	s -= m * 60;
 	m -= h * 60;
 	h -= d * 24;
 
 	timebuf[0] = '\0';
 
 	if (d)
 		(void) sprintf(timebuf,
 		    "%llud%02lluh%02llum%02llus", d, h, m, s);
 	else if (h)
 		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
 	else if (m)
 		(void) sprintf(timebuf, "%llum%02llus", m, s);
 	else
 		(void) sprintf(timebuf, "%llus", s);
 }
 
 static nvlist_t *
 make_random_props(void)
 {
 	nvlist_t *props;
 
 	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
 	if (ztest_random(2) == 0)
 		return (props);
 	VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
 
 	return (props);
 }
 
 /*
  * Import a storage pool with the given name.
  */
 static void
 ztest_import(ztest_shared_t *zs)
 {
 	libzfs_handle_t *hdl;
 	importargs_t args = { 0 };
 	spa_t *spa;
 	nvlist_t *cfg = NULL;
 	int nsearch = 1;
 	char *searchdirs[nsearch];
 	char *name = ztest_opts.zo_pool;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int error;
 
 	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	kernel_init(FREAD | FWRITE);
 	hdl = libzfs_init();
 
 	searchdirs[0] = ztest_opts.zo_dir;
 	args.paths = nsearch;
 	args.path = searchdirs;
 	args.can_be_active = B_FALSE;
 
 	error = zpool_tryimport(hdl, name, &cfg, &args);
 	if (error)
 		(void) fatal(0, "No pools found\n");
 
 	VERIFY0(spa_import(name, cfg, NULL, flags));
 	VERIFY0(spa_open(name, &spa, FTAG));
 	zs->zs_metaslab_sz =
 	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
 	spa_close(spa, FTAG);
 
 	libzfs_fini(hdl);
 	kernel_fini();
 
 	if (!ztest_opts.zo_mmp_test) {
 		ztest_run_zdb(ztest_opts.zo_pool);
 		ztest_freeze();
 		ztest_run_zdb(ztest_opts.zo_pool);
 	}
 
 	(void) rwlock_destroy(&ztest_name_lock);
 	mutex_destroy(&ztest_vdev_lock);
 }
 
 /*
  * Create a storage pool with the given name and initial vdev size.
  * Then test spa_freeze() functionality.
  */
 static void
 ztest_init(ztest_shared_t *zs)
 {
 	spa_t *spa;
 	nvlist_t *nvroot, *props;
 	int i;
 
 	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Create the storage pool.
 	 */
 	(void) spa_destroy(ztest_opts.zo_pool);
 	ztest_shared->zs_vdev_next_leaf = 0;
 	zs->zs_splits = 0;
 	zs->zs_mirrors = ztest_opts.zo_mirrors;
 	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
 	    0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
 	props = make_random_props();
 	for (i = 0; i < SPA_FEATURES; i++) {
 		char *buf;
 		VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
 		    spa_feature_table[i].fi_uname));
 		VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
 		free(buf);
 	}
 	VERIFY3U(0, ==,
 	    spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL));
 	nvlist_free(nvroot);
 	nvlist_free(props);
 
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	zs->zs_metaslab_sz =
 	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 
 	if (!ztest_opts.zo_mmp_test) {
 		ztest_run_zdb(ztest_opts.zo_pool);
 		ztest_freeze();
 		ztest_run_zdb(ztest_opts.zo_pool);
 	}
 
 	(void) rwlock_destroy(&ztest_name_lock);
 	mutex_destroy(&ztest_vdev_lock);
 }
 
 static void
 setup_data_fd(void)
 {
 	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
 
 	ztest_fd_data = mkstemp(ztest_name_data);
 	ASSERT3S(ztest_fd_data, >=, 0);
 	(void) unlink(ztest_name_data);
 }
 
 static int
 shared_data_size(ztest_shared_hdr_t *hdr)
 {
 	int size;
 
 	size = hdr->zh_hdr_size;
 	size += hdr->zh_opts_size;
 	size += hdr->zh_size;
 	size += hdr->zh_stats_size * hdr->zh_stats_count;
 	size += hdr->zh_ds_size * hdr->zh_ds_count;
 
 	return (size);
 }
 
 static void
 setup_hdr(void)
 {
 	int size;
 	ztest_shared_hdr_t *hdr;
 
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 
 	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
 
 	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
 	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
 	hdr->zh_size = sizeof (ztest_shared_t);
 	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
 	hdr->zh_stats_count = ZTEST_FUNCS;
 	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
 	hdr->zh_ds_count = ztest_opts.zo_datasets;
 
 	size = shared_data_size(hdr);
 	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 }
 
 static void
 setup_data(void)
 {
 	int size, offset;
 	ztest_shared_hdr_t *hdr;
 	uint8_t *buf;
 
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
 	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 
 	size = shared_data_size(hdr);
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 	buf = (uint8_t *)hdr;
 
 	offset = hdr->zh_hdr_size;
 	ztest_shared_opts = (void *)&buf[offset];
 	offset += hdr->zh_opts_size;
 	ztest_shared = (void *)&buf[offset];
 	offset += hdr->zh_size;
 	ztest_shared_callstate = (void *)&buf[offset];
 	offset += hdr->zh_stats_size * hdr->zh_stats_count;
 	ztest_shared_ds = (void *)&buf[offset];
 }
 
 static boolean_t
 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
 {
 	pid_t pid;
 	int status;
 	char *cmdbuf = NULL;
 
 	pid = fork();
 
 	if (cmd == NULL) {
 		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
 		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
 		cmd = cmdbuf;
 	}
 
 	if (pid == -1)
 		fatal(1, "fork failed");
 
 	if (pid == 0) {	/* child */
 		char *emptyargv[2] = { cmd, NULL };
 		char fd_data_str[12];
 
 		struct rlimit rl = { 1024, 1024 };
 		(void) setrlimit(RLIMIT_NOFILE, &rl);
 
 		(void) close(ztest_fd_rand);
 		VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data));
 		VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1));
 
 		(void) enable_extended_FILE_stdio(-1, -1);
 		if (libpath != NULL)
 			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
 		(void) execv(cmd, emptyargv);
 		ztest_dump_core = B_FALSE;
 		fatal(B_TRUE, "exec failed: %s", cmd);
 	}
 
 	if (cmdbuf != NULL) {
 		umem_free(cmdbuf, MAXPATHLEN);
 		cmd = NULL;
 	}
 
 	while (waitpid(pid, &status, 0) != pid)
 		continue;
 	if (statusp != NULL)
 		*statusp = status;
 
 	if (WIFEXITED(status)) {
 		if (WEXITSTATUS(status) != 0) {
 			(void) fprintf(stderr, "child exited with code %d\n",
 			    WEXITSTATUS(status));
 			exit(2);
 		}
 		return (B_FALSE);
 	} else if (WIFSIGNALED(status)) {
 		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
 			(void) fprintf(stderr, "child died with signal %d\n",
 			    WTERMSIG(status));
 			exit(3);
 		}
 		return (B_TRUE);
 	} else {
 		(void) fprintf(stderr, "something strange happened to child\n");
 		exit(4);
 		/* NOTREACHED */
 	}
 }
 
 static void
 ztest_run_init(void)
 {
 	int i;
 
 	ztest_shared_t *zs = ztest_shared;
 
 	/*
 	 * Blow away any existing copy of zpool.cache
 	 */
 	(void) remove(spa_config_path);
 
 	if (ztest_opts.zo_init == 0) {
 		if (ztest_opts.zo_verbose >= 1)
 			(void) printf("Importing pool %s\n",
 			    ztest_opts.zo_pool);
 		ztest_import(zs);
 		return;
 	}
 
 	/*
 	 * Create and initialize our storage pool.
 	 */
 	for (i = 1; i <= ztest_opts.zo_init; i++) {
 		bzero(zs, sizeof (ztest_shared_t));
 		if (ztest_opts.zo_verbose >= 3 &&
 		    ztest_opts.zo_init != 1) {
 			(void) printf("ztest_init(), pass %d\n", i);
 		}
 		ztest_init(zs);
 	}
 }
 
 int
 main(int argc, char **argv)
 {
 	int kills = 0;
 	int iters = 0;
 	int older = 0;
 	int newer = 0;
 	ztest_shared_t *zs;
 	ztest_info_t *zi;
 	ztest_shared_callstate_t *zc;
 	char timebuf[100];
 	char numbuf[NN_NUMBUF_SZ];
 	spa_t *spa;
 	char *cmd;
 	boolean_t hasalt;
 	int f;
 	char *fd_data_str = getenv("ZTEST_FD_DATA");
 	struct sigaction action;
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	dprintf_setup(&argc, argv);
 
 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 
 	if (sigaction(SIGSEGV, &action, NULL) < 0) {
 		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
 		    strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 
 	if (sigaction(SIGABRT, &action, NULL) < 0) {
 		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
 		    strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);
 
 	if (!fd_data_str) {
 		process_options(argc, argv);
 
 		setup_data_fd();
 		setup_hdr();
 		setup_data();
 		bcopy(&ztest_opts, ztest_shared_opts,
 		    sizeof (*ztest_shared_opts));
 	} else {
 		ztest_fd_data = atoi(fd_data_str);
 		setup_data();
 		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
 	}
 	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
 
 	/* Override location of zpool.cache */
 	VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache",
 	    ztest_opts.zo_dir) != -1);
 
 	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
 	    UMEM_NOFAIL);
 	zs = ztest_shared;
 
 	if (fd_data_str) {
 		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
 		metaslab_df_alloc_threshold =
 		    zs->zs_metaslab_df_alloc_threshold;
 
 		if (zs->zs_do_init)
 			ztest_run_init();
 		else
 			ztest_run(zs);
 		exit(0);
 	}
 
 	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
 
 	if (ztest_opts.zo_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
 		    " %llu seconds...\n",
 		    (u_longlong_t)ztest_opts.zo_vdevs,
 		    ztest_opts.zo_datasets,
 		    ztest_opts.zo_threads,
 		    (u_longlong_t)ztest_opts.zo_time);
 	}
 
 	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
 	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
 
 	zs->zs_do_init = B_TRUE;
 	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
 		if (ztest_opts.zo_verbose >= 1) {
 			(void) printf("Executing older ztest for "
 			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
 		}
 		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
 		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
 	} else {
 		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
 	}
 	zs->zs_do_init = B_FALSE;
 
 	zs->zs_proc_start = gethrtime();
 	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
 
 	for (f = 0; f < ZTEST_FUNCS; f++) {
 		zi = &ztest_info[f];
 		zc = ZTEST_GET_SHARED_CALLSTATE(f);
 		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
 			zc->zc_next = UINT64_MAX;
 		else
 			zc->zc_next = zs->zs_proc_start +
 			    ztest_random(2 * zi->zi_interval[0] + 1);
 	}
 
 	/*
 	 * Run the tests in a loop.  These tests include fault injection
 	 * to verify that self-healing data works, and forced crashes
 	 * to verify that we never lose on-disk consistency.
 	 */
 	while (gethrtime() < zs->zs_proc_stop) {
 		int status;
 		boolean_t killed;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
 		for (f = 0; f < ZTEST_FUNCS; f++) {
 			zc = ZTEST_GET_SHARED_CALLSTATE(f);
 			zc->zc_count = 0;
 			zc->zc_time = 0;
 		}
 
 		/* Set the allocation switch size */
 		zs->zs_metaslab_df_alloc_threshold =
 		    ztest_random(zs->zs_metaslab_sz / 4) + 1;
 
 		if (!hasalt || ztest_random(2) == 0) {
 			if (hasalt && ztest_opts.zo_verbose >= 1) {
 				(void) printf("Executing newer ztest: %s\n",
 				    cmd);
 			}
 			newer++;
 			killed = exec_child(cmd, NULL, B_TRUE, &status);
 		} else {
 			if (hasalt && ztest_opts.zo_verbose >= 1) {
 				(void) printf("Executing older ztest: %s\n",
 				    ztest_opts.zo_alt_ztest);
 			}
 			older++;
 			killed = exec_child(ztest_opts.zo_alt_ztest,
 			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
 		}
 
 		if (killed)
 			kills++;
 		iters++;
 
 		if (ztest_opts.zo_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
 			now = MIN(now, zs->zs_proc_stop);
 			print_time(zs->zs_proc_stop - now, timebuf);
 			nicenum(zs->zs_space, numbuf, sizeof (numbuf));
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
 			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
 			    iters,
 			    WIFEXITED(status) ? "Complete" : "SIGKILL",
 			    (u_longlong_t)zs->zs_enospc_count,
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
 			    100.0 * (now - zs->zs_proc_start) /
 			    (ztest_opts.zo_time * NANOSEC), timebuf);
 		}
 
 		if (ztest_opts.zo_verbose >= 2) {
 			(void) printf("\nWorkload summary:\n\n");
 			(void) printf("%7s %9s   %s\n",
 			    "Calls", "Time", "Function");
 			(void) printf("%7s %9s   %s\n",
 			    "-----", "----", "--------");
 			for (f = 0; f < ZTEST_FUNCS; f++) {
 				zi = &ztest_info[f];
 				zc = ZTEST_GET_SHARED_CALLSTATE(f);
 				print_time(zc->zc_time, timebuf);
 				(void) printf("%7llu %9s   %s\n",
 				    (u_longlong_t)zc->zc_count, timebuf,
 				    zi->zi_funcname);
 			}
 			(void) printf("\n");
 		}
 
 		/*
 		 * It's possible that we killed a child during a rename test,
 		 * in which case we'll have a 'ztest_tmp' pool lying around
 		 * instead of 'ztest'.  Do a blind rename in case this happened.
 		 */
 		kernel_init(FREAD);
 		if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
 			spa_close(spa, FTAG);
 		} else {
 			char tmpname[ZFS_MAX_DATASET_NAME_LEN];
 			kernel_fini();
 			kernel_init(FREAD | FWRITE);
 			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
 			    ztest_opts.zo_pool);
 			(void) spa_rename(tmpname, ztest_opts.zo_pool);
 		}
 		kernel_fini();
 
 		if (!ztest_opts.zo_mmp_test)
 			ztest_run_zdb(ztest_opts.zo_pool);
 	}
 
 	if (ztest_opts.zo_verbose >= 1) {
 		if (hasalt) {
 			(void) printf("%d runs of older ztest: %s\n", older,
 			    ztest_opts.zo_alt_ztest);
 			(void) printf("%d runs of newer ztest: %s\n", newer,
 			    cmd);
 		}
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
 	umem_free(cmd, MAXNAMELEN);
 
 	return (0);
 }
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 8a92919003ed..f252a9cb63c0 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -1,1026 +1,1026 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/inttypes.h>
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_priority.h>
 #include <sys/uio.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
 struct sa_handle;
 struct dsl_crypto_params;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 typedef struct dnode dnode_t;
 
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
 	DMU_BSWAP_UINT16,
 	DMU_BSWAP_UINT32,
 	DMU_BSWAP_UINT64,
 	DMU_BSWAP_ZAP,
 	DMU_BSWAP_DNODE,
 	DMU_BSWAP_OBJSET,
 	DMU_BSWAP_ZNODE,
 	DMU_BSWAP_OLDACL,
 	DMU_BSWAP_ACL,
 	/*
 	 * Allocating a new byteswap type number makes the on-disk format
 	 * incompatible with any other format that uses the same number.
 	 *
 	 * Data can usually be structured to work with one of the
 	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
 	 */
 	DMU_BSWAP_NUMFUNCS
 } dmu_object_byteswap_t;
 
 #define	DMU_OT_NEWTYPE 0x80
 #define	DMU_OT_METADATA 0x40
 #define	DMU_OT_ENCRYPTED 0x20
 #define	DMU_OT_BYTESWAP_MASK 0x1f
 
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean) and how to byteswap the data
  * (dmu_object_byteswap_t).
  */
 #define	DMU_OT(byteswap, metadata, encrypted) \
 	(DMU_OT_NEWTYPE | \
 	((metadata) ? DMU_OT_METADATA : 0) | \
 	((encrypted) ? DMU_OT_ENCRYPTED : 0) | \
 	((byteswap) & DMU_OT_BYTESWAP_MASK))
 
 #define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)
 
 #define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(int)(ot)].ot_metadata)
 
 #define	DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_ENCRYPTED) : \
 	dmu_ot[(int)(ot)].ot_encrypt)
 
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
  * is repurposed for embedded BPs.
  */
 #define	DMU_OT_HAS_FILL(ot) \
 	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
 
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	dmu_ot[(int)(ot)].ot_byteswap)
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPOBJ,			/* UINT64 */
 	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_OLDACL,			/* Old ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 	DMU_OT_DSL_PERMS,		/* ZAP */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_SA,			/* System attr */
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_DEADLIST,		/* ZAP */
 	DMU_OT_DEADLIST_HDR,		/* UINT64 */
 	DMU_OT_DSL_CLONES,		/* ZAP */
 	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	/*
 	 * Do not allocate new object types here. Doing so makes the on-disk
 	 * format incompatible with any other format that uses the same object
 	 * type number.
 	 *
 	 * When creating an object which does not have one of the above types
 	 * use the DMU_OTN_* type with the correct byteswap and metadata
 	 * values.
 	 *
 	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
 	 * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
 	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
 	 * and DMU_OTN_* types).
 	 */
 	DMU_OT_NUMTYPES,
 
 	/*
 	 * Names for valid types declared with DMU_OT().
 	 */
 	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE),
 	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE),
 	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE),
 	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE),
 	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE),
 	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE),
 	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE),
 	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE),
 	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE),
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE),
 
 	DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE),
 	DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE),
 	DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE),
 	DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE),
 	DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE),
 	DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE),
 	DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE),
 	DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE),
 	DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE),
 	DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE),
 } dmu_object_type_t;
 
 typedef enum txg_how {
 	TXG_WAIT = 1,
 	TXG_NOWAIT,
 	TXG_WAITED,
 } txg_how_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 #define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
 
 /*
  * Zap prefix for object accounting in DMU_{USER,GROUP}USED_OBJECT.
  */
 #define	DMU_OBJACCT_PREFIX	"obj-"
 #define	DMU_OBJACCT_PREFIX_LEN	4
 
 /*
  * artificial blkids for bonus buffer and spill blocks
  */
 #define	DMU_BONUS_BLKID		(-1ULL)
 #define	DMU_SPILL_BLKID		(-2ULL)
 
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg,
     cred_t *cr, dmu_tx_t *tx);
 
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, boolean_t key_required, void *tag, objset_t **osp);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, boolean_t key_required, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func,
     void *arg);
 int dmu_objset_clone(const char *name, const char *origin);
 int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_snapshot_tmp(const char *, const char *, int);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
 #define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 #define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 #define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len,
     int dnodesize, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len,
     int dnodesize, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
     dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
     int bonuslen, int dnodesize, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on a objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the number of levels on a dnode. nlevels must be greater than the
  * current number of levels or an EINVAL will be returned.
  */
 int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels,
     dmu_tx_t *tx);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allcated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 int dmu_object_dirty_raw(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx);
 
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_read() to
  * read db_data, dmu_buf_will_dirty() before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release what you hold with dmu_buf_rele().
  *
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag,
     uint32_t flags, dmu_buf_t **dbp);
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
 dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
  * Special spill buffer support used by "SA" framework
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You must not access the dmu_buf_t after releasing
  * what you hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
 int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags);
 
 /*
  * Add a reference to a dmu buffer that has already been held via
  * dmu_buf_hold() in the current context.
  */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 
 /*
  * Attempt to add a reference to a dmu buffer that is in an unknown state,
  * using a pointer that may have been invalidated by eviction processing.
  * The request will succeed if the passed in dbuf still represents the
  * same os/object/blkid, is ineligible for eviction, and has at least
  * one hold by a user other than the syncer.
  */
 boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
     uint64_t blkid, void *tag);
 
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, boolean_t read, void *tag,
     int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 typedef void dmu_buf_evict_func_t(void *user_ptr);
 
 /*
  * A DMU buffer user object may be associated with a dbuf for the
  * duration of its lifetime.  This allows the user of a dbuf (client)
  * to attach private data to a dbuf (e.g. in-core only data such as a
  * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
  * when that dbuf has been evicted.  Clients typically respond to the
  * eviction notification by freeing their private data, thus ensuring
  * the same lifetime for both dbuf and private data.
  *
  * The mapping from a dmu_buf_user_t to any client private data is the
  * client's responsibility.  All current consumers of the API with private
  * data embed a dmu_buf_user_t as the first member of the structure for
  * their private data.  This allows conversions between the two types
  * with a simple cast.  Since the DMU buf user API never needs access
  * to the private data, other strategies can be employed if necessary
  * or convenient for the client (e.g. using container_of() to do the
  * conversion for private data that cannot have the dmu_buf_user_t as
  * its first member).
  *
  * Eviction callbacks are executed without the dbuf mutex held or any
  * other type of mechanism to guarantee that the dbuf is still available.
  * For this reason, users must assume the dbuf has already been freed
  * and not reference the dbuf from the callback context.
  *
  * Users requesting "immediate eviction" are notified as soon as the dbuf
  * is only referenced by dirty records (dirties == holds).  Otherwise the
  * notification occurs after eviction processing for the dbuf begins.
  */
 typedef struct dmu_buf_user {
 	/*
 	 * Asynchronous user eviction callback state.
 	 */
 	taskq_ent_t	dbu_tqent;
 
 	/*
 	 * This instance's eviction function pointers.
 	 *
 	 * dbu_evict_func_sync is called synchronously and then
 	 * dbu_evict_func_async is executed asynchronously on a taskq.
 	 */
 	dmu_buf_evict_func_t *dbu_evict_func_sync;
 	dmu_buf_evict_func_t *dbu_evict_func_async;
 #ifdef ZFS_DEBUG
 	/*
 	 * Pointer to user's dbuf pointer.  NULL for clients that do
 	 * not associate a dbuf with their user data.
 	 *
 	 * The dbuf pointer is cleared upon eviction so as to catch
 	 * use-after-evict bugs in clients.
 	 */
 	dmu_buf_t **dbu_clear_on_evict_dbufp;
 #endif
 } dmu_buf_user_t;
 
 /*
  * Initialize the given dmu_buf_user_t instance with the eviction function
  * evict_func, to be called when the user is evicted.
  *
  * NOTE: This function should only be called once on a given dmu_buf_user_t.
  *       To allow enforcement of this, dbu must already be zeroed on entry.
  */
 /*ARGSUSED*/
 static inline void
 dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
     dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
 {
 	ASSERT(dbu->dbu_evict_func_sync == NULL);
 	ASSERT(dbu->dbu_evict_func_async == NULL);
 
 	/* must have at least one evict func */
 	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
 	dbu->dbu_evict_func_sync = evict_func_sync;
 	dbu->dbu_evict_func_async = evict_func_async;
 	taskq_init_ent(&dbu->dbu_tqent);
 #ifdef ZFS_DEBUG
 	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
 #endif
 }
 
 /*
  * Attach user data to a dbuf and mark it for normal (when the dbuf's
  * data is cleared or its reference count goes to zero) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Attach user data to a dbuf and mark it for immediate (its dirty and
  * reference counts are equal) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Replace the current user of a dbuf.
  *
  * If given the current user of a dbuf, replaces the dbuf's user with
  * "new_user" and returns the user data pointer that was replaced.
  * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
 void *dmu_buf_replace_user(dmu_buf_t *db,
     dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
 
 /*
  * Remove the specified user data for a DMU buffer.
  *
  * Returns the user that was removed on success, or the current user if
  * another user currently owns the buffer.
  */
 void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 objset_t *dmu_buf_get_objset(dmu_buf_t *db);
 dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
 void dmu_buf_dnode_exit(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
 
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
 struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_change_crypt_params(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
     const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
 void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
  *
  * dcb_data is a pointer to caller private data that is passed on as a
  * callback parameter. The caller is responsible for properly allocating and
  * freeing it.
  *
  * When registering a callback, the transaction must be already created, but
  * it cannot be committed or aborted. It can be assigned to a txg or not.
  *
  * The callback will be called after the transaction has been safely written
  * to stable storage and will also be called if the dmu_tx is aborted.
  * If there is any error which prevents the transaction from being committed to
  * disk, the callback will be called with a value of error != 0.
  */
 typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
     void *dcb_data);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size);
 int dmu_free_long_range_raw(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size);
 int dmu_free_long_object(objset_t *os, uint64_t object);
 int dmu_free_long_object_raw(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 #define	DMU_READ_PREFETCH	0 /* prefetch */
 #define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 #define	DMU_READ_NO_DECRYPT	2 /* don't decrypt */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 #ifdef _KERNEL
 #include <linux/blkdev_compat.h>
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
 	dmu_tx_t *tx);
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
     struct arc_buf *buf, dmu_tx_t *tx);
 void dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
     struct arc_buf *buf, dmu_tx_t *tx);
 #define	dmu_assign_arcbuf	dmu_assign_arcbuf_by_dbuf
 void dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
 void dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
     dmu_buf_t *handle, dmu_tx_t *tx);
 #ifdef HAVE_UIO_ZEROCOPY
 int dmu_xuio_init(struct xuio *uio, int niov);
 void dmu_xuio_fini(struct xuio *uio);
 int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
     size_t n);
 int dmu_xuio_cnt(struct xuio *uio);
 struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
 void dmu_xuio_clear(struct xuio *uio, int i);
 #endif /* HAVE_UIO_ZEROCOPY */
 void xuio_stat_wbuf_copied(void);
 void xuio_stat_wbuf_nocopy(void);
 
 extern int zfs_prefetch_disable;
 extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	uint64_t len, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_nblkptr;
 	uint8_t doi_pad[4];
 	uint64_t doi_dnodesize;
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {
 	dmu_object_byteswap_t	ot_byteswap;
 	boolean_t		ot_metadata;
 	boolean_t		ot_encrypt;
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 typedef const struct dmu_object_byteswap_info {
 	arc_byteswap_func_t	 ob_func;
 	char			*ob_name;
 } dmu_object_byteswap_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
  * Like dmu_object_info_from_db, but faster still when you only care about
  * the size.  This is specifically optimized for zfs_getattr().
  */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
 
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 /*
  * Get the [cm]time for an objset's snapshot dir
  */
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern uint64_t dmu_objset_dnodesize(objset_t *os);
 extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
 extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 typedef int objset_used_cb_t(dmu_object_type_t bonustype,
     void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 
 /*
  * {zfs,zvol,ztest}_get_done() args
  */
 typedef struct zgd {
-	struct zilog	*zgd_zilog;
+	struct lwb	*zgd_lwb;
 	struct blkptr	*zgd_bp;
 	dmu_buf_t	*zgd_db;
 	struct rl	*zgd_rl;
 	void		*zgd_private;
 } zgd_t;
 
 typedef void dmu_sync_cb_t(zgd_t *arg, int error);
 int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
 int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
     struct vnode *vp, offset_t *offp);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 extern int zfs_mdcomp_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
diff --git a/include/sys/trace_zil.h b/include/sys/trace_zil.h
index eaa391270d76..ff16c8686cc4 100644
--- a/include/sys/trace_zil.h
+++ b/include/sys/trace_zil.h
@@ -1,129 +1,221 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
 
 #undef TRACE_SYSTEM
 #define	TRACE_SYSTEM zfs
 
 #undef TRACE_SYSTEM_VAR
 #define	TRACE_SYSTEM_VAR zfs_zil
 
 #if !defined(_TRACE_ZIL_H) || defined(TRACE_HEADER_MULTI_READ)
 #define	_TRACE_ZIL_H
 
 #include <linux/tracepoint.h>
 #include <sys/types.h>
 
+#define	ZILOG_TP_STRUCT_ENTRY						    \
+		__field(uint64_t,	zl_lr_seq)			    \
+		__field(uint64_t,	zl_commit_lr_seq)		    \
+		__field(uint64_t,	zl_destroy_txg)			    \
+		__field(uint64_t,	zl_replaying_seq)		    \
+		__field(uint32_t,	zl_suspend)			    \
+		__field(uint8_t,	zl_suspending)			    \
+		__field(uint8_t,	zl_keep_first)			    \
+		__field(uint8_t,	zl_replay)			    \
+		__field(uint8_t,	zl_stop_sync)			    \
+		__field(uint8_t,	zl_logbias)			    \
+		__field(uint8_t,	zl_sync)			    \
+		__field(int,		zl_parse_error)			    \
+		__field(uint64_t,	zl_parse_blk_seq)		    \
+		__field(uint64_t,	zl_parse_lr_seq)		    \
+		__field(uint64_t,	zl_parse_blk_count)		    \
+		__field(uint64_t,	zl_parse_lr_count)		    \
+		__field(uint64_t,	zl_cur_used)			    \
+		__field(clock_t,	zl_replay_time)			    \
+		__field(uint64_t,	zl_replay_blks)
+
+#define	ZILOG_TP_FAST_ASSIGN						    \
+		__entry->zl_lr_seq		= zilog->zl_lr_seq;	    \
+		__entry->zl_commit_lr_seq	= zilog->zl_commit_lr_seq;  \
+		__entry->zl_destroy_txg	= zilog->zl_destroy_txg;	    \
+		__entry->zl_replaying_seq	= zilog->zl_replaying_seq;  \
+		__entry->zl_suspend		= zilog->zl_suspend;	    \
+		__entry->zl_suspending	= zilog->zl_suspending;		    \
+		__entry->zl_keep_first	= zilog->zl_keep_first;		    \
+		__entry->zl_replay		= zilog->zl_replay;	    \
+		__entry->zl_stop_sync	= zilog->zl_stop_sync;		    \
+		__entry->zl_logbias		= zilog->zl_logbias;	    \
+		__entry->zl_sync		= zilog->zl_sync;	    \
+		__entry->zl_parse_error	= zilog->zl_parse_error;	    \
+		__entry->zl_parse_blk_seq	= zilog->zl_parse_blk_seq;  \
+		__entry->zl_parse_lr_seq	= zilog->zl_parse_lr_seq;   \
+		__entry->zl_parse_blk_count	= zilog->zl_parse_blk_count;\
+		__entry->zl_parse_lr_count	= zilog->zl_parse_lr_count; \
+		__entry->zl_cur_used	= zilog->zl_cur_used;		    \
+		__entry->zl_replay_time	= zilog->zl_replay_time;	    \
+		__entry->zl_replay_blks	= zilog->zl_replay_blks;
+
+#define	ZILOG_TP_PRINTK_FMT						    \
+	"zl { lr_seq %llu commit_lr_seq %llu destroy_txg %llu "		    \
+	"replaying_seq %llu suspend %u suspending %u keep_first %u "	    \
+	"replay %u stop_sync %u logbias %u sync %u "			    \
+	"parse_error %u parse_blk_seq %llu parse_lr_seq %llu "		    \
+	"parse_blk_count %llu parse_lr_count %llu "			    \
+	"cur_used %llu replay_time %lu replay_blks %llu }"
+
+#define	ZILOG_TP_PRINTK_ARGS						    \
+	    __entry->zl_lr_seq, __entry->zl_commit_lr_seq,		    \
+	    __entry->zl_destroy_txg, __entry->zl_replaying_seq,		    \
+	    __entry->zl_suspend, __entry->zl_suspending,		    \
+	    __entry->zl_keep_first, __entry->zl_replay,			    \
+	    __entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync,   \
+	    __entry->zl_parse_error, __entry->zl_parse_blk_seq,		    \
+	    __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count,	    \
+	    __entry->zl_parse_lr_count, __entry->zl_cur_used,		    \
+	    __entry->zl_replay_time, __entry->zl_replay_blks
+
+#define	ITX_TP_STRUCT_ENTRY						    \
+		__field(itx_wr_state_t,	itx_wr_state)			    \
+		__field(uint8_t,	itx_sync)			    \
+		__field(zil_callback_t,	itx_callback)			    \
+		__field(void *,		itx_callback_data)		    \
+		__field(uint64_t,	itx_oid)			    \
+									    \
+		__field(uint64_t,	lrc_txtype)			    \
+		__field(uint64_t,	lrc_reclen)			    \
+		__field(uint64_t,	lrc_txg)			    \
+		__field(uint64_t,	lrc_seq)
+
+#define	ITX_TP_FAST_ASSIGN						    \
+		__entry->itx_wr_state		= itx->itx_wr_state;	    \
+		__entry->itx_sync		= itx->itx_sync;	    \
+		__entry->itx_callback		= itx->itx_callback;	    \
+		__entry->itx_callback_data	= itx->itx_callback_data;   \
+		__entry->itx_oid		= itx->itx_oid;		    \
+									    \
+		__entry->lrc_txtype		= itx->itx_lr.lrc_txtype;   \
+		__entry->lrc_reclen		= itx->itx_lr.lrc_reclen;   \
+		__entry->lrc_txg		= itx->itx_lr.lrc_txg;	    \
+		__entry->lrc_seq		= itx->itx_lr.lrc_seq;
+
+#define	ITX_TP_PRINTK_FMT						    \
+	"itx { wr_state %u sync %u callback %p callback_data %p oid %llu"   \
+	" { txtype %llu reclen %llu txg %llu seq %llu } }"
+
+#define	ITX_TP_PRINTK_ARGS						    \
+	    __entry->itx_wr_state, __entry->itx_sync, __entry->itx_callback,\
+	    __entry->itx_callback_data, __entry->itx_oid,		    \
+	    __entry->lrc_txtype, __entry->lrc_reclen, __entry->lrc_txg,	    \
+	    __entry->lrc_seq
+
+#define	ZCW_TP_STRUCT_ENTRY						    \
+		__field(lwb_t *,	zcw_lwb)			    \
+		__field(boolean_t,	zcw_done)			    \
+		__field(int,		zcw_zio_error)			    \
+
+#define	ZCW_TP_FAST_ASSIGN						    \
+		__entry->zcw_lwb		= zcw->zcw_lwb;		    \
+		__entry->zcw_done		= zcw->zcw_done;	    \
+		__entry->zcw_zio_error		= zcw->zcw_zio_error;
+
+#define	ZCW_TP_PRINTK_FMT						    \
+	"zcw { lwb %p done %u error %u }"
+
+#define	ZCW_TP_PRINTK_ARGS						    \
+	    __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_zio_error
+
 /*
- * Generic support for one argument tracepoints of the form:
+ * Generic support for two argument tracepoints of the form:
  *
- * DTRACE_PROBE1(...,
- *     zilog_t *, ...);
+ * DTRACE_PROBE2(...,
+ *     zilog_t *, ...,
+ *     itx_t *, ...);
  */
 /* BEGIN CSTYLED */
-DECLARE_EVENT_CLASS(zfs_zil_class,
-	TP_PROTO(zilog_t *zilog),
-	TP_ARGS(zilog),
+DECLARE_EVENT_CLASS(zfs_zil_process_itx_class,
+	TP_PROTO(zilog_t *zilog, itx_t *itx),
+	TP_ARGS(zilog, itx),
 	TP_STRUCT__entry(
-	    __field(uint64_t,	zl_lr_seq)
-	    __field(uint64_t,	zl_commit_lr_seq)
-	    __field(uint64_t,	zl_destroy_txg)
-	    __field(uint64_t,	zl_replaying_seq)
-	    __field(uint32_t,	zl_suspend)
-	    __field(uint8_t,	zl_suspending)
-	    __field(uint8_t,	zl_keep_first)
-	    __field(uint8_t,	zl_replay)
-	    __field(uint8_t,	zl_stop_sync)
-	    __field(uint8_t,	zl_writer)
-	    __field(uint8_t,	zl_logbias)
-	    __field(uint8_t,	zl_sync)
-	    __field(int,	zl_parse_error)
-	    __field(uint64_t,	zl_parse_blk_seq)
-	    __field(uint64_t,	zl_parse_lr_seq)
-	    __field(uint64_t,	zl_parse_blk_count)
-	    __field(uint64_t,	zl_parse_lr_count)
-	    __field(uint64_t,	zl_next_batch)
-	    __field(uint64_t,	zl_com_batch)
-	    __field(uint64_t,	zl_cur_used)
-	    __field(clock_t,	zl_replay_time)
-	    __field(uint64_t,	zl_replay_blks)
+	    ZILOG_TP_STRUCT_ENTRY
+	    ITX_TP_STRUCT_ENTRY
 	),
 	TP_fast_assign(
-	    __entry->zl_lr_seq		= zilog->zl_lr_seq;
-	    __entry->zl_commit_lr_seq	= zilog->zl_commit_lr_seq;
-	    __entry->zl_destroy_txg	= zilog->zl_destroy_txg;
-	    __entry->zl_replaying_seq	= zilog->zl_replaying_seq;
-	    __entry->zl_suspend		= zilog->zl_suspend;
-	    __entry->zl_suspending	= zilog->zl_suspending;
-	    __entry->zl_keep_first	= zilog->zl_keep_first;
-	    __entry->zl_replay		= zilog->zl_replay;
-	    __entry->zl_stop_sync	= zilog->zl_stop_sync;
-	    __entry->zl_writer		= zilog->zl_writer;
-	    __entry->zl_logbias		= zilog->zl_logbias;
-	    __entry->zl_sync		= zilog->zl_sync;
-	    __entry->zl_parse_error	= zilog->zl_parse_error;
-	    __entry->zl_parse_blk_seq	= zilog->zl_parse_blk_seq;
-	    __entry->zl_parse_lr_seq	= zilog->zl_parse_lr_seq;
-	    __entry->zl_parse_blk_count	= zilog->zl_parse_blk_count;
-	    __entry->zl_parse_lr_count	= zilog->zl_parse_lr_count;
-	    __entry->zl_next_batch	= zilog->zl_next_batch;
-	    __entry->zl_com_batch	= zilog->zl_com_batch;
-	    __entry->zl_cur_used	= zilog->zl_cur_used;
-	    __entry->zl_replay_time	= zilog->zl_replay_time;
-	    __entry->zl_replay_blks	= zilog->zl_replay_blks;
+	    ZILOG_TP_FAST_ASSIGN
+	    ITX_TP_FAST_ASSIGN
 	),
-	TP_printk("zl { lr_seq %llu commit_lr_seq %llu destroy_txg %llu "
-	    "replaying_seq %llu suspend %u suspending %u keep_first %u "
-	    "replay %u stop_sync %u writer %u logbias %u sync %u "
-	    "parse_error %u parse_blk_seq %llu parse_lr_seq %llu "
-	    "parse_blk_count %llu parse_lr_count %llu next_batch %llu "
-	    "com_batch %llu cur_used %llu replay_time %lu replay_blks %llu }",
-	    __entry->zl_lr_seq, __entry->zl_commit_lr_seq,
-	    __entry->zl_destroy_txg, __entry->zl_replaying_seq,
-	    __entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first,
-	    __entry->zl_replay, __entry->zl_stop_sync, __entry->zl_writer,
-	    __entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error,
-	    __entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq,
-	    __entry->zl_parse_blk_count, __entry->zl_parse_lr_count,
-	    __entry->zl_next_batch, __entry->zl_com_batch, __entry->zl_cur_used,
-	    __entry->zl_replay_time, __entry->zl_replay_blks)
+	TP_printk(
+	    ZILOG_TP_PRINTK_FMT " " ITX_TP_PRINTK_FMT,
+	    ZILOG_TP_PRINTK_ARGS, ITX_TP_PRINTK_ARGS)
 );
 /* END CSTYLED */
 
 /* BEGIN CSTYLED */
-#define	DEFINE_ZIL_EVENT(name) \
-DEFINE_EVENT(zfs_zil_class, name, \
-	TP_PROTO(zilog_t *zilog), \
-	TP_ARGS(zilog))
-DEFINE_ZIL_EVENT(zfs_zil__cw1);
-DEFINE_ZIL_EVENT(zfs_zil__cw2);
+#define	DEFINE_ZIL_PROCESS_ITX_EVENT(name) \
+DEFINE_EVENT(zfs_zil_process_itx_class, name, \
+	TP_PROTO(zilog_t *zilog, itx_t *itx), \
+	TP_ARGS(zilog, itx))
+DEFINE_ZIL_PROCESS_ITX_EVENT(zfs_zil__process__commit__itx);
+DEFINE_ZIL_PROCESS_ITX_EVENT(zfs_zil__process__normal__itx);
+/* END CSTYLED */
+
+/*
+ * Generic support for two argument tracepoints of the form:
+ *
+ * DTRACE_PROBE2(...,
+ *     zilog_t *, ...,
+ *     zil_commit_waiter_t *, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_zil_commit_io_error_class,
+	TP_PROTO(zilog_t *zilog, zil_commit_waiter_t *zcw),
+	TP_ARGS(zilog, zcw),
+	TP_STRUCT__entry(
+	    ZILOG_TP_STRUCT_ENTRY
+	    ZCW_TP_STRUCT_ENTRY
+	),
+	TP_fast_assign(
+	    ZILOG_TP_FAST_ASSIGN
+	    ZCW_TP_FAST_ASSIGN
+	),
+	TP_printk(
+	    ZILOG_TP_PRINTK_FMT " " ZCW_TP_PRINTK_FMT,
+	    ZILOG_TP_PRINTK_ARGS, ZCW_TP_PRINTK_ARGS)
+);
+
+/* BEGIN CSTYLED */
+#define	DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(name) \
+DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \
+	TP_PROTO(zilog_t *zilog, zil_commit_waiter_t *zcw), \
+	TP_ARGS(zilog, zcw))
+DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error);
 /* END CSTYLED */
 
 #endif /* _TRACE_ZIL_H */
 
 #undef TRACE_INCLUDE_PATH
 #undef TRACE_INCLUDE_FILE
 #define	TRACE_INCLUDE_PATH sys
 #define	TRACE_INCLUDE_FILE trace_zil
 #include <trace/define_trace.h>
 
 #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
diff --git a/include/sys/zil.h b/include/sys/zil.h
index 442fee4a1f8f..5513c7ce8ed9 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -1,519 +1,523 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_ZIL_H
 #define	_SYS_ZIL_H
 
 #include <sys/types.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
 #include <sys/zio_crypt.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_pool;
 struct dsl_dataset;
+struct lwb;
 
 /*
  * Intent log format:
  *
  * Each objset has its own intent log.  The log header (zil_header_t)
  * for objset N's intent log is kept in the Nth object of the SPA's
  * intent_log objset.  The log header points to a chain of log blocks,
  * each of which contains log records (i.e., transactions) followed by
  * a log block trailer (zil_trailer_t).  The format of a log record
  * depends on the record (or transaction) type, but all records begin
  * with a common structure that defines the type, length, and txg.
  */
 
 /*
  * Intent log header - this on disk structure holds fields to manage
  * the log.  All fields are 64 bit to easily handle cross architectures.
  */
 typedef struct zil_header {
 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
 	blkptr_t zh_log;	/* log chain */
 	uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
 	uint64_t zh_flags;	/* header flags */
 	uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
 	uint64_t zh_pad[3];
 } zil_header_t;
 
 /*
  * zh_flags bit settings
  */
 #define	ZIL_REPLAY_NEEDED	0x1	/* replay needed - internal only */
 #define	ZIL_CLAIM_LR_SEQ_VALID	0x2	/* zh_claim_lr_seq field is valid */
 
 /*
  * Log block chaining.
  *
  * Log blocks are chained together. Originally they were chained at the
  * end of the block. For performance reasons the chain was moved to the
  * beginning of the block which allows writes for only the data being used.
  * The older position is supported for backwards compatability.
  *
  * The zio_eck_t contains a zec_cksum which for the intent log is
  * the sequence number of this log block. A seq of 0 is invalid.
  * The zec_cksum is checked by the SPA against the sequence
  * number passed in the blk_cksum field of the blkptr_t
  */
 typedef struct zil_chain {
 	uint64_t zc_pad;
 	blkptr_t zc_next_blk;	/* next block in chain */
 	uint64_t zc_nused;	/* bytes in log block used */
 	zio_eck_t zc_eck;	/* block trailer */
 } zil_chain_t;
 
 #define	ZIL_MIN_BLKSZ	4096ULL
 
 /*
  * ziltest is by and large an ugly hack, but very useful in
  * checking replay without tedious work.
  * When running ziltest we want to keep all itx's and so maintain
  * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
  * We subtract TXG_CONCURRENT_STATES to allow for common code.
  */
 #define	ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
 
 /*
  * The words of a log block checksum.
  */
 #define	ZIL_ZC_GUID_0	0
 #define	ZIL_ZC_GUID_1	1
 #define	ZIL_ZC_OBJSET	2
 #define	ZIL_ZC_SEQ	3
 
 typedef enum zil_create {
 	Z_FILE,
 	Z_DIR,
 	Z_XATTRDIR,
 } zil_create_t;
 
 /*
  * size of xvattr log section.
  * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
  * for create time and a single 64 bit integer for all of the attributes,
  * and 4 64 bit integers (32 bytes) for the scanstamp.
  *
  */
 
 #define	ZIL_XVAT_SIZE(mapsize) \
 	sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
 	(sizeof (uint64_t) * 7)
 
 /*
  * Size of ACL in log.  The ACE data is padded out to properly align
  * on 8 byte boundary.
  */
 
 #define	ZIL_ACE_LENGTH(x)	(roundup(x, sizeof (uint64_t)))
 
 /*
  * Intent log transaction types and record structures
  */
+#define	TX_COMMIT		0	/* Commit marker (no on-disk state) */
 #define	TX_CREATE		1	/* Create file */
 #define	TX_MKDIR		2	/* Make directory */
 #define	TX_MKXATTR		3	/* Make XATTR directory */
 #define	TX_SYMLINK		4	/* Create symbolic link to a file */
 #define	TX_REMOVE		5	/* Remove file */
 #define	TX_RMDIR		6	/* Remove directory */
 #define	TX_LINK			7	/* Create hard link to a file */
 #define	TX_RENAME		8	/* Rename a file */
 #define	TX_WRITE		9	/* File write */
 #define	TX_TRUNCATE		10	/* Truncate a file */
 #define	TX_SETATTR		11	/* Set file attributes */
 #define	TX_ACL_V0		12	/* Set old formatted ACL */
 #define	TX_ACL			13	/* Set ACL */
 #define	TX_CREATE_ACL		14	/* create with ACL */
 #define	TX_CREATE_ATTR		15	/* create + attrs */
 #define	TX_CREATE_ACL_ATTR 	16	/* create with ACL + attrs */
 #define	TX_MKDIR_ACL		17	/* mkdir with ACL */
 #define	TX_MKDIR_ATTR		18	/* mkdir with attr */
 #define	TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
 #define	TX_WRITE2		20	/* dmu_sync EALREADY write */
 #define	TX_MAX_TYPE		21	/* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
  * may have the following bit set, indicating the original request
  * specified case-insensitive handling of names.
  */
 #define	TX_CI	((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
 
 /*
  * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
  * out of order.  For convenience in the code, all such records must have
  * lr_foid at the same offset.
  */
 #define	TX_OOO(txtype)			\
 	((txtype) == TX_WRITE ||	\
 	(txtype) == TX_TRUNCATE ||	\
 	(txtype) == TX_SETATTR ||	\
 	(txtype) == TX_ACL_V0 ||	\
 	(txtype) == TX_ACL ||		\
 	(txtype) == TX_WRITE2)
 
 /*
  * The number of dnode slots consumed by the object is stored in the 8
  * unused upper bits of the object ID. We subtract 1 from the value
  * stored on disk for compatibility with implementations that don't
  * support large dnodes. The slot count for a single-slot dnode will
  * contain 0 for those bits to preserve the log record format for
  * "small" dnodes.
  */
 #define	LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
 #define	LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
 #define	LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
 #define	LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
 
 /*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
  * Each log record has a common structure at the beginning.
  *
  * The log record on disk (lrc_seq) holds the sequence number of all log
  * records which is used to ensure we don't replay the same record.
  */
 typedef struct {			/* common log record header */
 	uint64_t	lrc_txtype;	/* intent log transaction type */
 	uint64_t	lrc_reclen;	/* transaction record length */
 	uint64_t	lrc_txg;	/* dmu transaction group number */
 	uint64_t	lrc_seq;	/* see comment above */
 } lr_t;
 
 /*
  * Common start of all out-of-order record types (TX_OOO() above).
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* object id */
 } lr_ooo_t;
 
 /*
  * Handle option extended vattr attributes.
  *
  * Whenever new attributes are added the version number
  * will need to be updated as will code in
  * zfs_log.c and zfs_replay.c
  */
 typedef struct {
 	uint32_t	lr_attr_masksize; /* number of elements in array */
 	uint32_t	lr_attr_bitmap; /* First entry of array */
 	/* remainder of array and any additional fields */
 } lr_attr_t;
 
 /*
  * log record for creates without optional ACL.
  * This log record does support optional xvattr_t attributes.
  */
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* object id of directory */
 	uint64_t	lr_foid;	/* object id of created file object */
 	uint64_t	lr_mode;	/* mode of object */
 	uint64_t	lr_uid;		/* uid of object */
 	uint64_t	lr_gid;		/* gid of object */
 	uint64_t	lr_gen;		/* generation (txg of creation) */
 	uint64_t	lr_crtime[2];	/* creation time */
 	uint64_t	lr_rdev;	/* rdev of object to create */
 	/* name of object to create follows this */
 	/* for symlinks, link content follows name */
 	/* for creates with xvattr data, the name follows the xvattr info */
 } lr_create_t;
 
 /*
  * FUID ACL record will be an array of ACEs from the original ACL.
  * If this array includes ephemeral IDs, the record will also include
  * an array of log-specific FUIDs to replace the ephemeral IDs.
  * Only one copy of each unique domain will be present, so the log-specific
  * FUIDs will use an index into a compressed domain table.  On replay this
  * information will be used to construct real FUIDs (and bypass idmap,
  * since it may not be available).
  */
 
 /*
  * Log record for creates with optional ACL
  * This log record is also used for recording any FUID
  * information needed for replaying the create.  If the
  * file doesn't have any actual ACEs then the lr_aclcnt
  * would be zero.
  *
  * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
  * If create is also setting xvattr's, then acl data follows xvattr.
  * If ACE FUIDs are needed then they will follow the xvattr_t.  Following
  * the FUIDs will be the domain table information.  The FUIDs for the owner
  * and group will be in lr_create.  Name follows ACL data.
  */
 typedef struct {
 	lr_create_t	lr_create;	/* common create portion */
 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
 	uint64_t	lr_domcnt;	/* number of unique domains */
 	uint64_t	lr_fuidcnt;	/* number of real fuids */
 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
 	uint64_t	lr_acl_flags;	/* ACL flags */
 } lr_acl_create_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* obj id of directory */
 	/* name of object to remove follows this */
 } lr_remove_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_doid;	/* obj id of directory */
 	uint64_t	lr_link_obj;	/* obj id of link */
 	/* name of object to link follows this */
 } lr_link_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_sdoid;	/* obj id of source directory */
 	uint64_t	lr_tdoid;	/* obj id of target directory */
 	/* 2 strings: names of source and destination follow this */
 } lr_rename_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* file object to write */
 	uint64_t	lr_offset;	/* offset to write to */
 	uint64_t	lr_length;	/* user data length to write */
 	uint64_t	lr_blkoff;	/* no longer used */
 	blkptr_t	lr_blkptr;	/* spa block pointer for replay */
 	/* write data will follow for small writes */
 } lr_write_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* object id of file to truncate */
 	uint64_t	lr_offset;	/* offset to truncate from */
 	uint64_t	lr_length;	/* length to truncate */
 } lr_truncate_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* file object to change attributes */
 	uint64_t	lr_mask;	/* mask of attributes to set */
 	uint64_t	lr_mode;	/* mode to set */
 	uint64_t	lr_uid;		/* uid to set */
 	uint64_t	lr_gid;		/* gid to set */
 	uint64_t	lr_size;	/* size to set */
 	uint64_t	lr_atime[2];	/* access time */
 	uint64_t	lr_mtime[2];	/* modification time */
 	/* optional attribute lr_attr_t may be here */
 } lr_setattr_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* obj id of file */
 	uint64_t	lr_aclcnt;	/* number of acl entries */
 	/* lr_aclcnt number of ace_t entries follow this */
 } lr_acl_v0_t;
 
 typedef struct {
 	lr_t		lr_common;	/* common portion of log record */
 	uint64_t	lr_foid;	/* obj id of file */
 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
 	uint64_t	lr_domcnt;	/* number of unique domains */
 	uint64_t	lr_fuidcnt;	/* number of real fuids */
 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
 	uint64_t	lr_acl_flags;	/* ACL flags */
 	/* lr_acl_bytes number of variable sized ace's follows */
 } lr_acl_t;
 
 /*
  * ZIL structure definitions, interface function prototype and globals.
  */
 
 /*
  * Writes are handled in three different ways:
  *
  * WR_INDIRECT:
  *    In this mode, if we need to commit the write later, then the block
  *    is immediately written into the file system (using dmu_sync),
  *    and a pointer to the block is put into the log record.
  *    When the txg commits the block is linked in.
  *    This saves additionally writing the data into the log record.
  *    There are a few requirements for this to occur:
  *	- write is greater than zfs/zvol_immediate_write_sz
  *	- not using slogs (as slogs are assumed to always be faster
  *	  than writing into the main pool)
  *	- the write occupies only one block
  * WR_COPIED:
  *    If we know we'll immediately be committing the
  *    transaction (FSYNC or FDSYNC), then we allocate a larger
  *    log record here for the data and copy the data in.
  * WR_NEED_COPY:
  *    Otherwise we don't allocate a buffer, and *if* we need to
  *    flush the write later then a buffer is allocated and
  *    we retrieve the data using the dmu.
  */
 typedef enum {
 	WR_INDIRECT,	/* indirect - a large write (dmu_sync() data */
 			/* and put blkptr in log, rather than actual data) */
 	WR_COPIED,	/* immediate - data is copied into lr_write_t */
 	WR_NEED_COPY,	/* immediate - data needs to be copied if pushed */
 	WR_NUM_STATES	/* number of states */
 } itx_wr_state_t;
 
 typedef void (*zil_callback_t)(void *data);
 
 typedef struct itx {
 	list_node_t	itx_node;	/* linkage on zl_itx_list */
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
 	zil_callback_t	itx_callback;   /* Called when the itx is persistent */
 	void		*itx_callback_data; /* User data for the callback */
 	size_t		itx_size;	/* allocated itx structure size */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
 
 /*
  * Used for zil kstat.
  */
 typedef struct zil_stats {
 	/*
 	 * Number of times a ZIL commit (e.g. fsync) has been requested.
 	 */
 	kstat_named_t zil_commit_count;
 
 	/*
 	 * Number of times the ZIL has been flushed to stable storage.
 	 * This is less than zil_commit_count when commits are "merged"
 	 * (see the documentation above zil_commit()).
 	 */
 	kstat_named_t zil_commit_writer_count;
 
 	/*
 	 * Number of transactions (reads, writes, renames, etc.)
 	 * that have been commited.
 	 */
 	kstat_named_t zil_itx_count;
 
 	/*
 	 * See the documentation for itx_wr_state_t above.
 	 * Note that "bytes" accumulates the length of the transactions
 	 * (i.e. data), not the actual log record sizes.
 	 */
 	kstat_named_t zil_itx_indirect_count;
 	kstat_named_t zil_itx_indirect_bytes;
 	kstat_named_t zil_itx_copied_count;
 	kstat_named_t zil_itx_copied_bytes;
 	kstat_named_t zil_itx_needcopy_count;
 	kstat_named_t zil_itx_needcopy_bytes;
 
 	/*
 	 * Transactions which have been allocated to the "normal"
 	 * (i.e. not slog) storage pool. Note that "bytes" accumulate
 	 * the actual log record sizes - which do not include the actual
 	 * data in case of indirect writes.
 	 */
 	kstat_named_t zil_itx_metaslab_normal_count;
 	kstat_named_t zil_itx_metaslab_normal_bytes;
 
 	/*
 	 * Transactions which have been allocated to the "slog" storage pool.
 	 * If there are no separate log devices, this is the same as the
 	 * "normal" pool.
 	 */
 	kstat_named_t zil_itx_metaslab_slog_count;
 	kstat_named_t zil_itx_metaslab_slog_bytes;
 } zil_stats_t;
 
 extern zil_stats_t zil_stats;
 
 #define	ZIL_STAT_INCR(stat, val) \
     atomic_add_64(&zil_stats.stat.value.ui64, (val));
 #define	ZIL_STAT_BUMP(stat) \
     ZIL_STAT_INCR(stat, 1);
 
 typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
     uint64_t txg);
 typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
+    struct lwb *lwb, zio_t *zio);
 
 extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt);
 
 extern void	zil_init(void);
 extern void	zil_fini(void);
 
 extern zilog_t	*zil_alloc(objset_t *os, zil_header_t *zh_phys);
 extern void	zil_free(zilog_t *zilog);
 
 extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
 extern void	zil_close(zilog_t *zilog);
 
 extern void	zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t	*zil_itx_create(uint64_t txtype, size_t lrsize);
 extern void	zil_itx_destroy(itx_t *itx);
 extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
 extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
 extern int	zil_claim(struct dsl_pool *dp,
     struct dsl_dataset *ds, void *txarg);
 extern int 	zil_check_log_chain(struct dsl_pool *dp,
     struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
 extern int	zil_suspend(const char *osname, void **cookiep);
 extern void	zil_resume(void *cookie);
 
-extern void	zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern void	zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
+extern void	zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
 extern int	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
 
 extern void	zil_set_sync(zilog_t *zilog, uint64_t syncval);
 
 extern void	zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
 extern int zil_replay_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZIL_H */
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index dd5304b79a91..3f69b326549b 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -1,165 +1,216 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_ZIL_IMPL_H
 #define	_SYS_ZIL_IMPL_H
 
 #include <sys/zil.h>
 #include <sys/dmu_objset.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
- * Log write buffer.
+ * Possbile states for a given lwb structure. An lwb will start out in
+ * the "closed" state, and then transition to the "opened" state via a
+ * call to zil_lwb_write_open(). After the lwb is "open", it can
+ * transition into the "issued" state via zil_lwb_write_issue(). After
+ * the lwb's zio completes, and the vdev's are flushed, the lwb will
+ * transition into the "done" state via zil_lwb_write_done(), and the
+ * structure eventually freed.
+ */
+typedef enum {
+    LWB_STATE_CLOSED,
+    LWB_STATE_OPENED,
+    LWB_STATE_ISSUED,
+    LWB_STATE_DONE,
+    LWB_NUM_STATES
+} lwb_state_t;
+
+/*
+ * Log write block (lwb)
+ *
+ * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it
+ * will be protected by the zilog's "zl_writer_lock". Basically, prior
+ * to it being issued, it will only be accessed by the thread that's
+ * holding the "zl_writer_lock". After the lwb is issued, the zilog's
+ * "zl_lock" is used to protect the lwb against concurrent access.
  */
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
 	boolean_t	lwb_fastwrite;	/* is blk marked for fastwrite? */
 	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
+	lwb_state_t	lwb_state;	/* the state of this lwb */
 	char		*lwb_buf;	/* log write buffer */
-	zio_t		*lwb_zio;	/* zio for this buffer */
+	zio_t		*lwb_write_zio;	/* zio for the lwb buffer */
+	zio_t		*lwb_root_zio;	/* root zio for lwb write and flushes */
 	dmu_tx_t	*lwb_tx;	/* tx for log block allocation */
 	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
 	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
+	list_t		lwb_itxs;	/* list of itx's */
+	list_t		lwb_waiters;	/* list of zil_commit_waiter's */
+	avl_tree_t	lwb_vdev_tree;	/* vdevs to flush after lwb write */
+	kmutex_t	lwb_vdev_lock;	/* protects lwb_vdev_tree */
+	hrtime_t	lwb_issued_timestamp; /* when was the lwb issued? */
 } lwb_t;
 
+/*
+ * ZIL commit waiter.
+ *
+ * This structure is allocated each time zil_commit() is called, and is
+ * used by zil_commit() to communicate with other parts of the ZIL, such
+ * that zil_commit() can know when it safe for it return. For more
+ * details, see the comment above zil_commit().
+ *
+ * The "zcw_lock" field is used to protect the commit waiter against
+ * concurrent access. This lock is often acquired while already holding
+ * the zilog's "zl_writer_lock" or "zl_lock"; see the functions
+ * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples
+ * of this. Thus, one must be careful not to acquire the
+ * "zl_writer_lock" or "zl_lock" when already holding the "zcw_lock";
+ * e.g. see the zil_commit_waiter_timeout() function.
+ */
+typedef struct zil_commit_waiter {
+	kcondvar_t	zcw_cv;		/* signalled when "done" */
+	kmutex_t	zcw_lock;	/* protects fields of this struct */
+	list_node_t	zcw_node;	/* linkage in lwb_t:lwb_waiter list */
+	lwb_t		*zcw_lwb;	/* back pointer to lwb when linked */
+	boolean_t	zcw_done;	/* B_TRUE when "done", else B_FALSE */
+	int		zcw_zio_error;	/* contains the zio io_error value */
+} zil_commit_waiter_t;
+
 /*
  * Intent log transaction lists
  */
 typedef struct itxs {
 	list_t		i_sync_list;	/* list of synchronous itxs */
 	avl_tree_t	i_async_tree;	/* tree of foids for async itxs */
 } itxs_t;
 
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
 /* for async nodes we build up an AVL tree of lists of async itxs per file */
 typedef struct itx_async_node {
 	uint64_t	ia_foid;	/* file object id */
 	list_t		ia_list;	/* list of async itxs for this foid */
 	avl_node_t	ia_node;	/* AVL tree linkage */
 } itx_async_node_t;
 
 /*
  * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
  * we've touched so we know which ones need a write cache flush at the end.
  */
 typedef struct zil_vdev_node {
 	uint64_t	zv_vdev;	/* vdev to be flushed */
 	avl_node_t	zv_node;	/* AVL tree linkage */
 } zil_vdev_node_t;
 
 #define	ZIL_PREV_BLKS 16
 
 /*
  * Stable storage intent log management structure.  One per dataset.
  */
 struct zilog {
 	kmutex_t	zl_lock;	/* protects most zilog_t fields */
 	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
 	spa_t		*zl_spa;	/* handle for read/write log */
 	const zil_header_t *zl_header;	/* log header buffer */
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
-	zio_t		*zl_root_zio;	/* log writer root zio */
+	lwb_t		*zl_last_lwb_opened; /* most recent lwb opened */
+	hrtime_t	zl_last_lwb_latency; /* zio latency of last lwb done */
 	uint64_t	zl_lr_seq;	/* on-disk log record sequence number */
 	uint64_t	zl_commit_lr_seq; /* last committed on-disk lr seq */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
 	uint64_t	zl_replaying_seq; /* current replay seq number */
 	uint32_t	zl_suspend;	/* log suspend count */
-	kcondvar_t	zl_cv_writer;	/* log writer thread completion */
 	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
 	uint8_t		zl_suspending;	/* log is currently suspending */
 	uint8_t		zl_keep_first;	/* keep first log block in destroy */
 	uint8_t		zl_replay;	/* replaying records while set */
 	uint8_t		zl_stop_sync;	/* for debugging */
-	uint8_t		zl_writer;	/* boolean: write setup in progress */
+	kmutex_t	zl_writer_lock;	/* single writer, per ZIL, at a time */
 	uint8_t		zl_logbias;	/* latency or throughput */
 	uint8_t		zl_sync;	/* synchronous or asynchronous */
 	int		zl_parse_error;	/* last zil_parse() error */
 	uint64_t	zl_parse_blk_seq; /* highest blk seq on last parse */
 	uint64_t	zl_parse_lr_seq; /* highest lr seq on last parse */
 	uint64_t	zl_parse_blk_count; /* number of blocks parsed */
 	uint64_t	zl_parse_lr_count; /* number of log records parsed */
-	uint64_t	zl_next_batch;	/* next batch number */
-	uint64_t	zl_com_batch;	/* committed batch number */
-	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
-	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
-	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
 	avl_tree_t	zl_bp_tree;	/* track bps during log parse */
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
 	zil_header_t	zl_old_header;	/* debugging aid */
 	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
 	uint_t		zl_prev_rotor;	/* rotor for zl_prev[] */
 	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
+	uint64_t	zl_dirty_max_txg; /* highest txg used to dirty zilog */
 };
 
 typedef struct zil_bp_node {
 	dva_t		zn_dva;
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
 /*
  * Maximum amount of write data that can be put into single log block.
  */
 #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 /*
  * Maximum amount of log space we agree to waste to reduce number of
  * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
  */
 #define	ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
 
 /*
  * Maximum amount of write data for WR_COPIED.  Fall back to WR_NEED_COPY
  * as more space efficient if we can't fit at least two log records into
  * maximum sized log block.
  */
 #define	ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
     sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZIL_IMPL_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 30c4ee3ce4c3..1f9f48268463 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,657 +1,658 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
 #include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Embedded checksum
  */
 #define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
 typedef struct zio_eck {
 	uint64_t	zec_magic;	/* for validation, endianness	*/
 	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
 } zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
  * of block pointers.
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
 	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
 	ZIO_CHECKSUM_ON,
 	ZIO_CHECKSUM_OFF,
 	ZIO_CHECKSUM_LABEL,
 	ZIO_CHECKSUM_GANG_HEADER,
 	ZIO_CHECKSUM_ZILOG,
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_NOPARITY,
 	ZIO_CHECKSUM_SHA512,
 	ZIO_CHECKSUM_SKEIN,
 	ZIO_CHECKSUM_EDONR,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
 
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
 #define	ZIO_CHECKSUM_MASK	0xffULL
 #define	ZIO_CHECKSUM_VERIFY	(1 << 8)
 
 #define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
 #define	ZIO_DEDUPDITTO_MIN	100
 
 /* supported encryption algorithms */
 enum zio_encrypt {
 	ZIO_CRYPT_INHERIT = 0,
 	ZIO_CRYPT_ON,
 	ZIO_CRYPT_OFF,
 	ZIO_CRYPT_AES_128_CCM,
 	ZIO_CRYPT_AES_192_CCM,
 	ZIO_CRYPT_AES_256_CCM,
 	ZIO_CRYPT_AES_128_GCM,
 	ZIO_CRYPT_AES_192_GCM,
 	ZIO_CRYPT_AES_256_GCM,
 	ZIO_CRYPT_FUNCTIONS
 };
 
 #define	ZIO_CRYPT_ON_VALUE	ZIO_CRYPT_AES_256_CCM
 #define	ZIO_CRYPT_DEFAULT	ZIO_CRYPT_OFF
 
 /* macros defining encryption lengths */
 #define	ZIO_OBJSET_MAC_LEN		32
 #define	ZIO_DATA_IV_LEN			12
 #define	ZIO_DATA_SALT_LEN		8
 #define	ZIO_DATA_MAC_LEN		16
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
 /*
  * The meaning of "compress = on" selected by the compression features enabled
  * on a given pool.
  */
 #define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
 
 #define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_OFF
 
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_1 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_2 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_3 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_5 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_6 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_7 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_8 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_9 ||		\
 	(compress) == ZIO_COMPRESS_ZLE ||		\
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 /*
  * Default Linux timeout for a sd device.
  */
 #define	ZIO_DELAY_MAX			(30 * MILLISEC)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
 	 * and that must be equal for two zios to aggregate
 	 */
 	ZIO_FLAG_DONT_AGGREGATE	= 1 << 0,
 	ZIO_FLAG_IO_REPAIR	= 1 << 1,
 	ZIO_FLAG_SELF_HEAL	= 1 << 2,
 	ZIO_FLAG_RESILVER	= 1 << 3,
 	ZIO_FLAG_SCRUB		= 1 << 4,
 	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
 	ZIO_FLAG_PHYSICAL	= 1 << 6,
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
 	ZIO_FLAG_CANFAIL	= 1 << 7,	/* must be first for INHERIT */
 	ZIO_FLAG_SPECULATIVE	= 1 << 8,
 	ZIO_FLAG_CONFIG_WRITER	= 1 << 9,
 	ZIO_FLAG_DONT_RETRY	= 1 << 10,
 	ZIO_FLAG_DONT_CACHE	= 1 << 11,
 	ZIO_FLAG_NODATA		= 1 << 12,
 	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
 	ZIO_FLAG_IO_ALLOCATING  = 1 << 14,
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 
 	/*
 	 * Flags inherited by vdev children.
 	 */
 	ZIO_FLAG_IO_RETRY	= 1 << 15,	/* must be first for INHERIT */
 	ZIO_FLAG_PROBE		= 1 << 16,
 	ZIO_FLAG_TRYHARD	= 1 << 17,
 	ZIO_FLAG_OPTIONAL	= 1 << 18,
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
 	ZIO_FLAG_DONT_QUEUE	= 1 << 19,	/* must be first for INHERIT */
 	ZIO_FLAG_DONT_PROPAGATE	= 1 << 20,
 	ZIO_FLAG_IO_BYPASS	= 1 << 21,
 	ZIO_FLAG_IO_REWRITE	= 1 << 22,
 	ZIO_FLAG_RAW_COMPRESS	= 1 << 23,
 	ZIO_FLAG_RAW_ENCRYPT	= 1 << 24,
 	ZIO_FLAG_GANG_CHILD	= 1 << 25,
 	ZIO_FLAG_DDT_CHILD	= 1 << 26,
 	ZIO_FLAG_GODFATHER	= 1 << 27,
 	ZIO_FLAG_NOPWRITE	= 1 << 28,
 	ZIO_FLAG_REEXECUTED	= 1 << 29,
 	ZIO_FLAG_DELEGATED	= 1 << 30,
 	ZIO_FLAG_FASTWRITE	= 1 << 31,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
 #define	ZIO_DDT_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
 	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_VDEV_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_CANFAIL)
 
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
 	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
 
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
 	ZIO_WAIT_TYPES
 };
 
 /*
  * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent
  * graveyard) to indicate checksum errors and fragmentation.
  */
 #define	ECKSUM	EBADE
 #define	EFRAGS	EBADR
 
 typedef void zio_done_func_t(zio_t *zio);
 
 extern int zio_dva_throttle_enabled;
 extern const char *zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
  * is objset 0, and the meta-dnode is object 0.  This covers all blocks
  * except root blocks and ZIL blocks, which are defined as follows:
  *
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel, and is
  * stored on disk (by virtue of being incorporated into other on-disk
  * structures, e.g. dsl_scan_phys_t).
  */
 struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 };
 
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
 	(zb)->zb_objset = objset;                       \
 	(zb)->zb_object = object;                       \
 	(zb)->zb_level = level;                         \
 	(zb)->zb_blkid = blkid;                         \
 }
 
 #define	ZB_DESTROYED_OBJSET	(-1ULL)
 
 #define	ZB_ROOT_OBJECT		(0ULL)
 #define	ZB_ROOT_LEVEL		(-1LL)
 #define	ZB_ROOT_BLKID		(0ULL)
 
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
 #define	ZB_DNODE_LEVEL		(-3LL)
 #define	ZB_DNODE_BLKID		(0ULL)
 
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
 #define	ZB_IS_ROOT(zb)				\
 	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
 	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
 	(zb)->zb_blkid == ZB_ROOT_BLKID)
 
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
 	boolean_t		zp_nopwrite;
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
 
 typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
     const abd_t *good_data);
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
 struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
 	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
 
 	/* internal use only */
 	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
 };
 
 typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
     void *arg);
 
 zio_vsd_cksum_report_f	zio_vsd_default_cksum_report;
 
 typedef struct zio_vsd_ops {
 	zio_done_func_t		*vsd_free;
 	zio_vsd_cksum_report_f	*vsd_cksum_report;
 } zio_vsd_ops_t;
 
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
 	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
     zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
 typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
 	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
 typedef int zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
  * be able to propagate them to the parent.  The normal io_flags are local
  * to the zio, not protected by any lock, and not modifiable by children;
  * the reexecute flags are protected by io_lock, modifiable by children,
  * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
 typedef struct zio_alloc_list {
 	list_t  zal_list;
 	uint64_t zal_size;
 } zio_alloc_list_t;
 
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
 	list_node_t	zl_parent_node;
 	list_node_t	zl_child_node;
 } zio_link_t;
 
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
 	zio_done_func_t	*io_physdone;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 	/* io_lsize != io_orig_size iff this is a raw write */
 	uint64_t	io_lsize;
 
 	/* Data represented by this I/O */
 	struct abd	*io_abd;
 	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
 	void		*io_vsd;
 	const zio_vsd_ops_t *io_vsd_ops;
 
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
 	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
 	avl_node_t	io_alloc_node;
 	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;
 	enum zio_stage	io_stage;
 	enum zio_stage	io_pipeline;
 	enum zio_flag	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
 	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	io_child_count;
 	uint64_t	io_phys_children;
 	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
 	kmutex_t	io_lock;
 	kcondvar_t	io_cv;
 
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
 
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *physdone, zio_done_func_t *done,
     void *private, zio_priority_t priority, enum zio_flag flags,
     const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
     boolean_t nopwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 
 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
 extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *private);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     struct abd *data, uint64_t size, int type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
 extern enum zio_compress zio_compress_select(spa_t *spa,
     enum zio_compress child, enum zio_compress parent);
 
+extern void zio_cancel(zio_t *zio);
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 /*
  * Initial setup and teardown.
  */
 extern void zio_init(void);
 extern void zio_fini(void);
 
 /*
  * Fault injection
  */
 struct zinject_record;
 extern uint32_t zio_injection_enabled;
 extern int zio_inject_fault(char *name, int flags, int *id,
     struct zinject_record *record);
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
 extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
     int err2);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
  */
 extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length,
     void *arg, struct zio_bad_cksum *info);
 extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
     const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical);
 
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
 extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length,
     const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info);
 
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
 boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZIO_H */
diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h
index 026d6fef9df1..c080bc8aa5bd 100644
--- a/lib/libspl/include/assert.h
+++ b/lib/libspl/include/assert.h
@@ -1,125 +1,128 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include_next <assert.h>
 
 #ifndef _LIBSPL_ASSERT_H
 #define	_LIBSPL_ASSERT_H
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 
 static inline int
 libspl_assert(const char *buf, const char *file, const char *func, int line)
 {
 	fprintf(stderr, "%s\n", buf);
 	fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
 	abort();
 }
 
 /* printf version of libspl_assert */
 static inline void
 libspl_assertf(const char *file, const char *func, int line, char *format, ...)
 {
 	va_list args;
 
 	va_start(args, format);
 	vfprintf(stderr, format, args);
 	fprintf(stderr, "\n");
 	fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
 	va_end(args);
 	abort();
 }
 
 #ifdef verify
 #undef verify
 #endif
 
 #define	VERIFY(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
 #define	verify(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
 
 #define	VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE)				\
 do {									\
 	const TYPE __left = (TYPE)(LEFT);				\
 	const TYPE __right = (TYPE)(RIGHT);				\
 	if (!(__left OP __right))					\
 		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
 		    "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT,	\
 		    (u_longlong_t)__left, #OP, (u_longlong_t)__right);	\
 } while (0)
 
+#define	VERIFY3B(x, y, z)	VERIFY3_IMPL(x, y, z, boolean_t)
 #define	VERIFY3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
 #define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
 #define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
 #define	VERIFY0(x)		VERIFY3_IMPL(x, ==, 0, uint64_t)
 
 #ifdef assert
 #undef assert
 #endif
 
 /* Compile time assert */
 #define	CTASSERT_GLOBAL(x)		_CTASSERT(x, __LINE__)
 #define	CTASSERT(x)			{ _CTASSERT(x, __LINE__); }
 #define	_CTASSERT(x, y)			__CTASSERT(x, y)
 #define	__CTASSERT(x, y)						\
 	typedef char __attribute__((unused))				\
 	__compile_time_assertion__ ## y[(x) ? 1 : -1]
 
 #ifdef NDEBUG
+#define	ASSERT3B(x, y, z)	((void)0)
 #define	ASSERT3S(x, y, z)	((void)0)
 #define	ASSERT3U(x, y, z)	((void)0)
 #define	ASSERT3P(x, y, z)	((void)0)
 #define	ASSERT0(x)		((void)0)
 #define	ASSERT(x)		((void)0)
 #define	assert(x)		((void)0)
 #define	ASSERTV(x)
 #define	IMPLY(A, B)		((void)0)
 #define	EQUIV(A, B)		((void)0)
 #else
+#define	ASSERT3B(x, y, z)	VERIFY3B(x, y, z)
 #define	ASSERT3S(x, y, z)	VERIFY3S(x, y, z)
 #define	ASSERT3U(x, y, z)	VERIFY3U(x, y, z)
 #define	ASSERT3P(x, y, z)	VERIFY3P(x, y, z)
 #define	ASSERT0(x)		VERIFY0(x)
 #define	ASSERT(x)		VERIFY(x)
 #define	assert(x)		VERIFY(x)
 #define	ASSERTV(x)		x
 #define	IMPLY(A, B) \
 	((void)(((!(A)) || (B)) || \
 	    libspl_assert("(" #A ") implies (" #B ")", \
 	    __FILE__, __FUNCTION__, __LINE__)))
 #define	EQUIV(A, B) \
 	((void)((!!(A) == !!(B)) || \
 	    libspl_assert("(" #A ") is equivalent to (" #B ")", \
 	    __FILE__, __FUNCTION__, __LINE__)))
 
 #endif  /* NDEBUG */
 
 #endif  /* _LIBSPL_ASSERT_H */
diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h
index f05fcaa1c663..b3dd74dd5715 100644
--- a/lib/libspl/include/sys/time.h
+++ b/lib/libspl/include/sys/time.h
@@ -1,78 +1,86 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _LIBSPL_SYS_TIME_H
 #define	_LIBSPL_SYS_TIME_H
 
 #include_next <sys/time.h>
 #include <sys/types.h>
 
 #ifndef SEC
 #define	SEC		1
 #endif
 
 #ifndef MILLISEC
 #define	MILLISEC	1000
 #endif
 
 #ifndef MICROSEC
 #define	MICROSEC	1000000
 #endif
 
 #ifndef NANOSEC
 #define	NANOSEC		1000000000
 #endif
 
 #ifndef NSEC_PER_USEC
 #define	NSEC_PER_USEC	1000L
 #endif
 
 #ifndef MSEC2NSEC
-#define	MSEC2NSEC(m)    ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))
 #endif
 
 #ifndef NSEC2MSEC
-#define	NSEC2MSEC(n)    ((n) / (NANOSEC / MILLISEC))
+#define	NSEC2MSEC(n)	((n) / (NANOSEC / MILLISEC))
 #endif
 
-#ifndef	NSEC2SEC
+#ifndef USEC2NSEC
+#define	USEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#endif
+
+#ifndef NSEC2USEC
+#define	NSEC2USEC(n)	((n) / (NANOSEC / MICROSEC))
+#endif
+
+#ifndef NSEC2SEC
 #define	NSEC2SEC(n)	((n) / (NANOSEC / SEC))
 #endif
 
 #ifndef SEC2NSEC
 #define	SEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / SEC))
 #endif
 
 
 typedef	long long		hrtime_t;
 typedef	struct	timespec	timestruc_t;
 typedef	struct	timespec	timespec_t;
 
 
 extern hrtime_t gethrtime(void);
 extern void gethrestime(timestruc_t *);
 
 #endif /* _LIBSPL_SYS_TIME_H */
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 108dfe15736d..56740ae375a8 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1,2512 +1,2519 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/trace_dmu.h>
 #include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable nopwrite feature.
  */
 int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control percentage of dirtied blocks from frees in one TXG.
  * After this threshold is crossed, additional dirty blocks from frees
  * wait until the next TXG.
  * A value of zero will disable this throttle.
  */
 unsigned long zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * Enable/disable forcing txg sync when dirty in dmu_offset_next.
  */
 int zfs_dmu_offset_next_sync = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"unallocated"		},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"object directory"	},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"object array"		},
 	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"packed nvlist"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"packed nvlist size"	},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj"			},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj header"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA space map header"	},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA space map"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	TRUE,	"ZIL intent log"	},
 	{ DMU_BSWAP_DNODE,	TRUE,	TRUE,	"DMU dnode"		},
 	{ DMU_BSWAP_OBJSET,	TRUE,	FALSE,	"DMU objset"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL directory"		},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL directory child map"},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dataset snap map"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL props"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL dataset"		},
 	{ DMU_BSWAP_ZNODE,	TRUE,	FALSE,	"ZFS znode"		},
 	{ DMU_BSWAP_OLDACL,	TRUE,	TRUE,	"ZFS V0 ACL"		},
 	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"ZFS plain file"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS directory"		},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"ZFS master node"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS delete queue"	},
 	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"zvol object"		},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"zvol prop"		},
 	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"other uint8[]"		},
 	{ DMU_BSWAP_UINT64,	FALSE,	TRUE,	"other uint64[]"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"other ZAP"		},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"persistent error log"	},
 	{ DMU_BSWAP_UINT8,	TRUE,	FALSE,	"SPA history"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"SPA history offsets"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"Pool properties"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL permissions"	},
 	{ DMU_BSWAP_ACL,	TRUE,	TRUE,	"ZFS ACL"		},
 	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"ZFS SYSACL"		},
 	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"FUID table"		},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"FUID table size"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dataset next clones"},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"scan work queue"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS user/group used"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"ZFS user/group quota"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"snapshot refcount tags"},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DDT ZAP algorithm"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DDT statistics"	},
 	{ DMU_BSWAP_UINT8,	TRUE,	TRUE,	"System attributes"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA master node"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA attr registration"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	TRUE,	"SA attr layouts"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"scan translations"	},
 	{ DMU_BSWAP_UINT8,	FALSE,	TRUE,	"deduplicated block"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL deadlist map"	},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"DSL deadlist map hdr"	},
 	{ DMU_BSWAP_ZAP,	TRUE,	FALSE,	"DSL dir clones"	},
 	{ DMU_BSWAP_UINT64,	TRUE,	FALSE,	"bpobj subobj"		}
 };
 
 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 
 	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (err);
 }
 
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_bonus_max(void)
 {
 	return (DN_OLD_MAX_BONUSLEN);
 }
 
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else if (newsize < 0 || newsize > db_fake->db_size) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonuslen(dn, newsize, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (!DMU_OT_IS_VALID(type)) {
 		error = SET_ERROR(EINVAL);
 	} else if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonus_type(dn, type, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	dmu_object_type_t type;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	type = dn->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (type);
 }
 
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  */
 int
 dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
     dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	int error;
 	uint32_t db_flags = DB_RF_MUST_SUCCEED;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		rw_exit(&dn->dn_struct_rwlock);
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 
 	error = dbuf_read(db, NULL, db_flags);
 	if (error) {
 		dnode_evict_bonus(dn);
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 		return (error);
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 
 int
 dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
 {
 	return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp));
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill blk
  * doesn't already exist on the dnode.
  *
  * if you only want to find an already existing spill db, then
  * dmu_spill_hold_existing() should be used.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = NULL;
 	int err;
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 	err = dbuf_read(db, NULL, flags);
 	if (err == 0)
 		*dbp = &db->db;
 	else {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 	return (err);
 }
 
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 		err = SET_ERROR(EINVAL);
 	} else {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dn->dn_have_spill) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 	    DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/* initiate async i/o */
 		if (read)
 			(void) dbuf_read(db, zio, dbuf_flags);
 		dbp[i] = &db->db;
 	}
 
 	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
 	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
 		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
 		    read && DNODE_IS_CACHEABLE(dn));
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
 	}
 
 	/* wait for other io to complete */
 	if (read) {
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 static int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 {
 	int i;
 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (i = 0; i < numbufs; i++) {
 		if (dbp[i])
 			dbuf_rele(dbp[i], tag);
 	}
 
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
  * indirect blocks prefeteched will be those that point to the blocks containing
  * the data starting at offset, and continuing to offset + len.
  *
  * Note that if the indirect blocks above the blocks being prefetched are not
  * in cache, they will be asychronously read in.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	int nblks, err;
 
 	if (len == 0) {  /* they're interested in the bonus buffer */
 		dn = DMU_META_DNODE(os);
 
 		if (object == 0 || object >= DN_MAX_OBJECT)
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		blkid = dbuf_whichblock(dn, level,
 		    object * sizeof (dnode_phys_t));
 		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
 
 	/*
 	 * XXX - Note, if the dnode for the requested object is not
 	 * already cached, we will do a *synchronous* read in the
 	 * dnode_hold() call.  The same is true for any indirects.
 	 */
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	/*
 	 * offset + len - 1 is the last byte we want to prefetch for, and offset
 	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
 	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
 	 * offset)  is the first.  Then the number we need to prefetch is the
 	 * last - first + 1.
 	 */
 	if (level > 0 || dn->dn_datablkshift != 0) {
 		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
 		    dbuf_whichblock(dn, level, offset) + 1;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
 		blkid = dbuf_whichblock(dn, level, offset);
 		for (int i = 0; i < nblks; i++)
 			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crashes in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 {
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	if (*start - minimum <= iblkrange * maxblks) {
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
 	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN(*start, iblkrange);
 	}
 	if (*start < minimum)
 		*start = minimum;
 	return (0);
 }
 
 /*
  * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
  * otherwise return false.
  * Used below in dmu_free_long_range_impl() to enable abort when unmounting
  */
 /*ARGSUSED*/
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS)
 		return (zfs_get_vfs_flag_unmounted(os));
 #endif
 	return (B_FALSE);
 }
 
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length, boolean_t raw)
 {
 	uint64_t object_size;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
 
 	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	if (offset >= object_size)
 		return (0);
 
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 4;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t long_free_dirty_all_txgs = 0;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		mutex_enter(&dp->dp_lock);
 		for (int t = 0; t < TXG_SIZE; t++) {
 			long_free_dirty_all_txgs +=
 			    dp->dp_long_free_dirty_pertxg[t];
 		}
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
 			txg_wait_open(dp, 0);
 			continue;
 		}
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
 		    chunk_len;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
 		    uint64_t, dmu_tx_get_txg(tx));
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		/* if this is a raw free, mark the dirty record as such */
 		if (raw) {
 			dbuf_dirty_record_t *dr = dn->dn_dbuf->db_last_dirty;
 
 			while (dr != NULL && dr->dr_txg > tx->tx_txg)
 				dr = dr->dr_next;
 			if (dr != NULL && dr->dr_txg == tx->tx_txg)
 				dr->dt.dl.dr_raw = B_TRUE;
 		}
 
 		dmu_tx_commit(tx);
 
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	err = dmu_free_long_range_impl(os, dn, offset, length, B_FALSE);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * This function is equivalent to dmu_free_long_range(), but also
  * marks the new dirty record as a raw write.
  */
 int
 dmu_free_long_range_raw(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	err = dmu_free_long_range_impl(os, dn, offset, length, B_TRUE);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 static int
 dmu_free_long_object_impl(objset_t *os, uint64_t object, boolean_t raw)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err == 0) {
 		if (raw)
 			err = dmu_object_dirty_raw(os, object, tx);
 		if (err == 0)
 			err = dmu_object_free(os, object, tx);
 
 		dmu_tx_commit(tx);
 	} else {
 		dmu_tx_abort(tx);
 	}
 
 	return (err);
 }
 
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	return (dmu_free_long_object_impl(os, object, B_FALSE));
 }
 
 int
 dmu_free_long_object_raw(objset_t *os, uint64_t object)
 {
 	return (dmu_free_long_object_impl(os, object, B_TRUE));
 }
 
 
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, err = 0;
 
 	/*
 	 * Deal with odd block sizes, where there can't be data past the first
 	 * block.  If we ever do the tail block optimization, we will need to
 	 * handle that here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		uint64_t newsz = offset > dn->dn_datablksz ? 0 :
 		    MIN(size, dn->dn_datablksz - offset);
 		bzero((char *)buf + newsz, size - newsz);
 		size = newsz;
 	}
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 		int i;
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err)
 			break;
 
 		for (i = 0; i < numbufs; i++) {
 			uint64_t tocpy;
 			int64_t bufoff;
 			dmu_buf_t *db = dbp[i];
 
 			ASSERT(size > 0);
 
 			bufoff = offset - db->db_offset;
 			tocpy = MIN(db->db_size - bufoff, size);
 
 			(void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
 
 			offset += tocpy;
 			size -= tocpy;
 			buf = (char *)buf + tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	return (err);
 }
 
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	err = dmu_read_impl(dn, offset, size, buf, flags);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags)
 {
 	return (dmu_read_impl(dn, offset, size, buf, flags));
 }
 
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	int i;
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = offset - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		offset += tocpy;
 		size -= tocpy;
 		buf = (char *)buf + tocpy;
 	}
 }
 
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 
 		dmu_buf_will_not_fill(db, tx);
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	dmu_buf_write_embedded(db,
 	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * DMU support for xuio
  */
 kstat_t *xuio_ksp = NULL;
 
 typedef struct xuio_stats {
 	/* loaned yet not returned arc_buf */
 	kstat_named_t xuiostat_onloan_rbuf;
 	kstat_named_t xuiostat_onloan_wbuf;
 	/* whether a copy is made when loaning out a read buffer */
 	kstat_named_t xuiostat_rbuf_copied;
 	kstat_named_t xuiostat_rbuf_nocopy;
 	/* whether a copy is made when assigning a write buffer */
 	kstat_named_t xuiostat_wbuf_copied;
 	kstat_named_t xuiostat_wbuf_nocopy;
 } xuio_stats_t;
 
 static xuio_stats_t xuio_stats = {
 	{ "onloan_read_buf",	KSTAT_DATA_UINT64 },
 	{ "onloan_write_buf",	KSTAT_DATA_UINT64 },
 	{ "read_buf_copied",	KSTAT_DATA_UINT64 },
 	{ "read_buf_nocopy",	KSTAT_DATA_UINT64 },
 	{ "write_buf_copied",	KSTAT_DATA_UINT64 },
 	{ "write_buf_nocopy",	KSTAT_DATA_UINT64 }
 };
 
 #define	XUIOSTAT_INCR(stat, val)        \
 	atomic_add_64(&xuio_stats.stat.value.ui64, (val))
 #define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1)
 
 #ifdef HAVE_UIO_ZEROCOPY
 int
 dmu_xuio_init(xuio_t *xuio, int nblk)
 {
 	dmu_xuio_t *priv;
 	uio_t *uio = &xuio->xu_uio;
 
 	uio->uio_iovcnt = nblk;
 	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
 
 	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
 	priv->cnt = nblk;
 	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
 	priv->iovp = (iovec_t *)uio->uio_iov;
 	XUIO_XUZC_PRIV(xuio) = priv;
 
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
 
 	return (0);
 }
 
 void
 dmu_xuio_fini(xuio_t *xuio)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	int nblk = priv->cnt;
 
 	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
 	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
 	kmem_free(priv, sizeof (dmu_xuio_t));
 
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
 }
 
 /*
  * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
  * and increase priv->next by 1.
  */
 int
 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
 {
 	struct iovec *iov;
 	uio_t *uio = &xuio->xu_uio;
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	int i = priv->next++;
 
 	ASSERT(i < priv->cnt);
 	ASSERT(off + n <= arc_buf_lsize(abuf));
 	iov = (iovec_t *)uio->uio_iov + i;
 	iov->iov_base = (char *)abuf->b_data + off;
 	iov->iov_len = n;
 	priv->bufs[i] = abuf;
 	return (0);
 }
 
 int
 dmu_xuio_cnt(xuio_t *xuio)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	return (priv->cnt);
 }
 
 arc_buf_t *
 dmu_xuio_arcbuf(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 
 	ASSERT(i < priv->cnt);
 	return (priv->bufs[i]);
 }
 
 void
 dmu_xuio_clear(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 
 	ASSERT(i < priv->cnt);
 	priv->bufs[i] = NULL;
 }
 #endif /* HAVE_UIO_ZEROCOPY */
 
 static void
 xuio_stat_init(void)
 {
 	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (xuio_ksp != NULL) {
 		xuio_ksp->ks_data = &xuio_stats;
 		kstat_install(xuio_ksp);
 	}
 }
 
 static void
 xuio_stat_fini(void)
 {
 	if (xuio_ksp != NULL) {
 		kstat_delete(xuio_ksp);
 		xuio_ksp = NULL;
 	}
 }
 
 void
 xuio_stat_wbuf_copied(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 }
 
 void
 xuio_stat_wbuf_nocopy(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
 }
 
 #ifdef _KERNEL
 int
 dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 #ifdef HAVE_UIO_ZEROCOPY
 	xuio_t *xuio = NULL;
 #endif
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 #ifdef HAVE_UIO_ZEROCOPY
 		if (xuio) {
 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 			arc_buf_t *dbuf_abuf = dbi->db_buf;
 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
 			if (!err) {
 				uio->uio_resid -= tocpy;
 				uio->uio_loffset += tocpy;
 			}
 
 			if (abuf == dbuf_abuf)
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 		} else
 #endif
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From object zdb->db_object.
  * Starting at offset uio->uio_loffset.
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_read_uio_dnode(dn, uio, size);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From the specified object
  * Starting at offset uio->uio_loffset.
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_read_uio_dnode(dn, uio, size);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	int i;
 
 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		/*
 		 * XXX uiomove could block forever (eg.nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
 		 * to lock the pages in memory, so that uiomove won't
 		 * block.
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To object zdb->db_object.
  * Starting at offset uio->uio_loffset.
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To the specified object.
  * Starting at offset uio->uio_loffset.
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 #endif /* _KERNEL */
 
 /*
  * Allocate a loaned anonymous arc buffer.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 
 	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 void
 dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
     const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
 {
 	dmu_object_type_t type;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	uint64_t dsobj = dmu_objset_id(db->db_objset);
 
 	ASSERT3P(db->db_buf, !=, NULL);
 	ASSERT3U(dsobj, !=, 0);
 
 	dmu_buf_will_change_crypt_params(handle, tx);
 
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * This technically violates the assumption the dmu code makes
 	 * that dnode blocks are only released in syncing context.
 	 */
 	(void) arc_release(db->db_buf, db);
 	arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
 }
 
 void
 dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
     dmu_buf_t *handle, dmu_tx_t *tx)
 {
 	dmu_buf_t *dst_handle;
 	dmu_buf_impl_t *dstdb;
 	dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
 	arc_buf_t *abuf;
 	uint64_t datalen;
 	boolean_t byteorder;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 
 	ASSERT3P(srcdb->db_buf, !=, NULL);
 
 	/* hold the db that we want to write to */
 	VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
 	    DMU_READ_NO_DECRYPT));
 	dstdb = (dmu_buf_impl_t *)dst_handle;
 	datalen = arc_buf_size(srcdb->db_buf);
 
 	/* allocated an arc buffer that matches the type of srcdb->db_buf */
 	if (arc_is_encrypted(srcdb->db_buf)) {
 		arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
 		abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
 		    byteorder, salt, iv, mac, DB_DNODE(dstdb)->dn_type,
 		    datalen, arc_buf_lsize(srcdb->db_buf),
 		    arc_get_compression(srcdb->db_buf));
 	} else {
 		/* we won't get a compressed db back from dmu_buf_hold() */
 		ASSERT3U(arc_get_compression(srcdb->db_buf),
 		    ==, ZIO_COMPRESS_OFF);
 		abuf = arc_loan_buf(os->os_spa,
 		    DMU_OT_IS_METADATA(DB_DNODE(dstdb)->dn_type), datalen);
 	}
 
 	ASSERT3U(datalen, ==, arc_buf_size(abuf));
 
 	/* copy the data to the new buffer and assign it to the dstdb */
 	bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
 	dbuf_assign_arcbuf(dstdb, abuf, tx);
 	dmu_buf_rele(dst_handle, FTAG);
 }
 
 /*
  * When possible directly assign passed loaned arc buffer to a dbuf.
  * If this is not possible copy the contents of passed arc buf via
  * dmu_write().
  */
 void
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	objset_t *os = dn->dn_objset;
 	uint64_t object = dn->dn_object;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	uint64_t blkid;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/*
 	 * We can only assign if the offset is aligned, the arc buf is the
 	 * same size as the dbuf, and the dbuf is not metadata.
 	 */
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
 }
 
 void
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 
 	DB_DNODE_ENTER(dbuf);
 	dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
 	DB_DNODE_EXIT(dbuf);
 }
 
 typedef struct {
 	dbuf_dirty_record_t	*dsa_dr;
 	dmu_sync_cb_t		*dsa_done;
 	zgd_t			*dsa_zgd;
 	dmu_tx_t		*dsa_tx;
 } dmu_sync_arg_t;
 
 /* ARGSUSED */
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0) {
 		if (BP_IS_HOLE(bp)) {
 			/*
 			 * A block of zeros may compress to a hole, but the
 			 * block size still needs to be known for replay.
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
 		} else if (!BP_IS_EMBEDDED(bp)) {
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			BP_SET_FILL(bp, 1);
 		}
 	}
 }
 
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	dmu_sync_ready(zio, NULL, zio->io_private);
 }
 
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			VERIFY(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
 		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
 		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	abd_put(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_waited_synced() */
 		return (SET_ERROR(EIO));
 	}
 
+	/*
+	 * In order to prevent the zgd's lwb from being free'd prior to
+	 * dmu_sync_late_arrival_done() being called, we have to ensure
+	 * the lwb's "max txg" takes this tx's txg into account.
+	 */
+	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	/* dbuf is within the locked range */
 	ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
 	ASSERT3U(db->db.db_offset + db->db.db_size, <=,
 	    zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = db->db_last_dirty;
 	while (dr && dr->dr_txg != txg)
 		dr = dr->dr_next;
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
 		zp.zp_nopwrite = B_FALSE;
 	DB_DNODE_EXIT(db);
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
 	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
 
 int
 dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	err = dnode_set_nlevels(dn, nlevels, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	err = dnode_set_blksz(dn, size, ibs, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's checksum function.  This
 	 * check ensures that the receiving system can understand the
 	 * checksum function transmitted.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's compression function.  This
 	 * check ensures that the receiving system can understand the
 	 * compression function transmitted.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Dirty an object and set the dirty record's raw flag. This is used
  * when writing raw data to an object that will not effect the
  * encryption parameters, specifically during raw receives.
  */
 int
 dmu_object_dirty_raw(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	dmu_buf_will_change_crypt_params((dmu_buf_t *)dn->dn_dbuf, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int zfs_mdcomp_disable = 0;
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  */
 int zfs_redundant_metadata_most_ditto_level = 2;
 
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		if (zfs_mdcomp_disable) {
 			compress = ZIO_COMPRESS_EMPTY;
 		} else {
 			/*
 			 * XXX -- we should design a compression algorithm
 			 * that specializes in arrays of bps.
 			 */
 			compress = zio_compress_select(os->os_spa,
 			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 		}
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
 		    (os->os_redundant_metadata ==
 		    ZFS_REDUNDANT_METADATA_MOST &&
 		    (level >= zfs_redundant_metadata_most_ditto_level ||
 		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
 			copies++;
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checkum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	/*
 	 * All objects in an encrypted objset are protected from modification
 	 * via a MAC. Encrypted objects store their IV and salt in the last DVA
 	 * in the bp, so we cannot use all copies. Encrypted objects are also
 	 * not subject to nopwrite since writing the same data will still
 	 * result in a new ciphertext. Only encrypted blocks can be dedup'd
 	 * to avoid ambiguity in the dedup code since the DDT does not store
 	 * object types.
 	 */
 	if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
 		encrypt = B_TRUE;
 
 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
 		}
 
 		if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
 			compress = ZIO_COMPRESS_EMPTY;
 	}
 
 	zp->zp_compress = compress;
 	zp->zp_checksum = checksum;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
 	bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
 	bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
 
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
 
 /*
  * This function is only called from zfs_holey_common() for zpl_llseek()
  * in order to determine the location of holes.  In order to accurately
  * report holes all dirty data must be synced to disk.  This causes extremely
  * poor performance when seeking for holes in a dirty file.  As a compromise,
  * only provide hole data when the dnode is clean.  When a dnode is dirty
  * report the dnode as having no holes which is always a safe thing to do.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
 	int i, err;
 	boolean_t clean = B_TRUE;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	/*
 	 * Check if dnode is dirty
 	 */
 	for (i = 0; i < TXG_SIZE; i++) {
 		if (list_link_active(&dn->dn_dirty_link[i])) {
 			clean = B_FALSE;
 			break;
 		}
 	}
 
 	/*
 	 * If compatibility option is on, sync any current changes before
 	 * we go trundling through the block pointers.
 	 */
 	if (!clean && zfs_dmu_offset_next_sync) {
 		clean = B_TRUE;
 		dnode_rele(dn, FTAG);
 		txg_wait_synced(dmu_objset_pool(os), 0);
 		err = dnode_hold(os, object, FTAG, &dn);
 		if (err)
 			return (err);
 	}
 
 	if (clean)
 		err = dnode_next_offset(dn,
 		    (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 	else
 		err = SET_ERROR(EBUSY);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 
 	doi->doi_data_block_size = dn->dn_datablksz;
 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
 	    1ULL << dn->dn_indblkshift : 0;
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 }
 
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	__dmu_object_info_from_dnode(dn, doi);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Get information on a DMU object.
  * If doi is NULL, just indicates whether the object exists.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err)
 		return (err);
 
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dn, doi);
 
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	DB_DNODE_ENTER(db);
 	dmu_object_info_from_dnode(DB_DNODE(db), doi);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Faster still when you only care about the size.
  * This is specifically optimized for zfs_getattr().
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	*blksize = dn->dn_datablksz;
 	/* add in number of slots used for the dnode itself */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
 	DB_DNODE_EXIT(db);
 }
 
 void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	*dnsize = dn->dn_num_slots << DNODE_SHIFT;
 	DB_DNODE_EXIT(db);
 }
 
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 	int i;
 
 	ASSERT((size & 7) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 	int i;
 
 	ASSERT((size & 3) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 	int i;
 
 	ASSERT((size & 1) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /* ARGSUSED */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 }
 
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	xuio_stat_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	dmu_tx_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	xuio_stat_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dmu_bonus_hold);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_range_raw);
 EXPORT_SYMBOL(dmu_free_long_object);
 EXPORT_SYMBOL(dmu_free_long_object_raw);
 EXPORT_SYMBOL(dmu_read);
 EXPORT_SYMBOL(dmu_read_by_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
 EXPORT_SYMBOL(dmu_prealloc);
 EXPORT_SYMBOL(dmu_object_info);
 EXPORT_SYMBOL(dmu_object_info_from_dnode);
 EXPORT_SYMBOL(dmu_object_info_from_db);
 EXPORT_SYMBOL(dmu_object_size_from_db);
 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
 EXPORT_SYMBOL(dmu_object_set_nlevels);
 EXPORT_SYMBOL(dmu_object_set_blocksize);
 EXPORT_SYMBOL(dmu_object_set_checksum);
 EXPORT_SYMBOL(dmu_object_set_compress);
 EXPORT_SYMBOL(dmu_write_policy);
 EXPORT_SYMBOL(dmu_sync);
 EXPORT_SYMBOL(dmu_request_arcbuf);
 EXPORT_SYMBOL(dmu_return_arcbuf);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
 EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 /* BEGIN CSTYLED */
 module_param(zfs_mdcomp_disable, int, 0644);
 MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
 
 module_param(zfs_nopwrite_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
 
 module_param(zfs_per_txg_dirty_frees_percent, ulong, 0644);
 MODULE_PARM_DESC(zfs_per_txg_dirty_frees_percent,
 	"percentage of dirtied blocks from frees in one TXG");
 
 module_param(zfs_dmu_offset_next_sync, int, 0644);
 MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
 	"Enable forcing txg sync to find holes");
 
 /* END CSTYLED */
 
 #endif
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 078839a3203b..bf8544507b60 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -1,953 +1,953 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/txg_impl.h>
 #include <sys/dmu_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/zil.h>
 #include <sys/callb.h>
 #include <sys/trace_txg.h>
 
 /*
  * ZFS Transaction Groups
  * ----------------------
  *
  * ZFS transaction groups are, as the name implies, groups of transactions
  * that act on persistent state. ZFS asserts consistency at the granularity of
  * these transaction groups. Each successive transaction group (txg) is
  * assigned a 64-bit consecutive identifier. There are three active
  * transaction group states: open, quiescing, or syncing. At any given time,
  * there may be an active txg associated with each state; each active txg may
  * either be processing, or blocked waiting to enter the next state. There may
  * be up to three active txgs, and there is always a txg in the open state
  * (though it may be blocked waiting to enter the quiescing state). In broad
  * strokes, transactions -- operations that change in-memory structures -- are
  * accepted into the txg in the open state, and are completed while the txg is
  * in the open or quiescing states. The accumulated changes are written to
  * disk in the syncing state.
  *
  * Open
  *
  * When a new txg becomes active, it first enters the open state. New
  * transactions -- updates to in-memory structures -- are assigned to the
  * currently open txg. There is always a txg in the open state so that ZFS can
  * accept new changes (though the txg may refuse new changes if it has hit
  * some limit). ZFS advances the open txg to the next state for a variety of
  * reasons such as it hitting a time or size threshold, or the execution of an
  * administrative action that must be completed in the syncing state.
  *
  * Quiescing
  *
  * After a txg exits the open state, it enters the quiescing state. The
  * quiescing state is intended to provide a buffer between accepting new
  * transactions in the open state and writing them out to stable storage in
  * the syncing state. While quiescing, transactions can continue their
  * operation without delaying either of the other states. Typically, a txg is
  * in the quiescing state very briefly since the operations are bounded by
  * software latencies rather than, say, slower I/O latencies. After all
  * transactions complete, the txg is ready to enter the next state.
  *
  * Syncing
  *
  * In the syncing state, the in-memory state built up during the open and (to
  * a lesser degree) the quiescing states is written to stable storage. The
  * process of writing out modified data can, in turn modify more data. For
  * example when we write new blocks, we need to allocate space for them; those
  * allocations modify metadata (space maps)... which themselves must be
  * written to stable storage. During the sync state, ZFS iterates, writing out
  * data until it converges and all in-memory changes have been written out.
  * The first such pass is the largest as it encompasses all the modified user
  * data (as opposed to filesystem metadata). Subsequent passes typically have
  * far less data to write as they consist exclusively of filesystem metadata.
  *
  * To ensure convergence, after a certain number of passes ZFS begins
  * overwriting locations on stable storage that had been allocated earlier in
  * the syncing state (and subsequently freed). ZFS usually allocates new
  * blocks to optimize for large, continuous, writes. For the syncing state to
  * converge however it must complete a pass where no new blocks are allocated
  * since each allocation requires a modification of persistent metadata.
  * Further, to hasten convergence, after a prescribed number of passes, ZFS
  * also defers frees, and stops compressing.
  *
  * In addition to writing out user data, we must also execute synctasks during
  * the syncing context. A synctask is the mechanism by which some
  * administrative activities work such as creating and destroying snapshots or
  * datasets. Note that when a synctask is initiated it enters the open txg,
  * and ZFS then pushes that txg as quickly as possible to completion of the
  * syncing state in order to reduce the latency of the administrative
  * activity. To complete the syncing state, ZFS writes out a new uberblock,
  * the root of the tree of blocks that comprise all state stored on the ZFS
  * pool. Finally, if there is a quiesced txg waiting, we signal that it can
  * now transition to the syncing state.
  */
 
 static void txg_sync_thread(void *arg);
 static void txg_quiesce_thread(void *arg);
 
 int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */
 
 /*
  * Prepare the txg subsystem.
  */
 void
 txg_init(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	int c;
 	bzero(tx, sizeof (tx_state_t));
 
 	tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
 
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
 
 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
 		    NULL);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
 			    NULL);
 			list_create(&tx->tx_cpu[c].tc_callbacks[i],
 			    sizeof (dmu_tx_callback_t),
 			    offsetof(dmu_tx_callback_t, dcb_node));
 		}
 	}
 
 	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
 
 	tx->tx_open_txg = txg;
 }
 
 /*
  * Close down the txg subsystem.
  */
 void
 txg_fini(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	int c;
 
-	ASSERT(tx->tx_threads == 0);
+	ASSERT0(tx->tx_threads);
 
 	mutex_destroy(&tx->tx_sync_lock);
 
 	cv_destroy(&tx->tx_sync_more_cv);
 	cv_destroy(&tx->tx_sync_done_cv);
 	cv_destroy(&tx->tx_quiesce_more_cv);
 	cv_destroy(&tx->tx_quiesce_done_cv);
 	cv_destroy(&tx->tx_exit_cv);
 
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
 
 		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
 			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
 		}
 	}
 
 	if (tx->tx_commit_cb_taskq != NULL)
 		taskq_destroy(tx->tx_commit_cb_taskq);
 
 	vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
 
 	bzero(tx, sizeof (tx_state_t));
 }
 
 /*
  * Start syncing transaction groups.
  */
 void
 txg_sync_start(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	mutex_enter(&tx->tx_sync_lock);
 
 	dprintf("pool %p\n", dp);
 
-	ASSERT(tx->tx_threads == 0);
+	ASSERT0(tx->tx_threads);
 
 	tx->tx_threads = 2;
 
 	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
 	    dp, 0, &p0, TS_RUN, defclsyspri);
 
 	/*
 	 * The sync thread can need a larger-than-default stack size on
 	 * 32-bit x86.  This is due in part to nested pools and
 	 * scrub_visitbp() recursion.
 	 */
 	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
 	    dp, 0, &p0, TS_RUN, defclsyspri);
 
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 static void
 txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
 {
 	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
 	mutex_enter(&tx->tx_sync_lock);
 }
 
 static void
 txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
 {
 	ASSERT(*tpp != NULL);
 	*tpp = NULL;
 	tx->tx_threads--;
 	cv_broadcast(&tx->tx_exit_cv);
 	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
 	thread_exit();
 }
 
 static void
 txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
 {
 	CALLB_CPR_SAFE_BEGIN(cpr);
 
 	if (time)
 		(void) cv_timedwait_sig(cv, &tx->tx_sync_lock,
 		    ddi_get_lbolt() + time);
 	else
 		cv_wait_sig(cv, &tx->tx_sync_lock);
 
 	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
 }
 
 /*
  * Stop syncing transaction groups.
  */
 void
 txg_sync_stop(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	dprintf("pool %p\n", dp);
 	/*
 	 * Finish off any work in progress.
 	 */
-	ASSERT(tx->tx_threads == 2);
+	ASSERT3U(tx->tx_threads, ==, 2);
 
 	/*
 	 * We need to ensure that we've vacated the deferred space_maps.
 	 */
 	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
 
 	/*
 	 * Wake all sync threads and wait for them to die.
 	 */
 	mutex_enter(&tx->tx_sync_lock);
 
-	ASSERT(tx->tx_threads == 2);
+	ASSERT3U(tx->tx_threads, ==, 2);
 
 	tx->tx_exiting = 1;
 
 	cv_broadcast(&tx->tx_quiesce_more_cv);
 	cv_broadcast(&tx->tx_quiesce_done_cv);
 	cv_broadcast(&tx->tx_sync_more_cv);
 
 	while (tx->tx_threads != 0)
 		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
 
 	tx->tx_exiting = 0;
 
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 uint64_t
 txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	tx_cpu_t *tc;
 	uint64_t txg;
 
 	/*
 	 * It appears the processor id is simply used as a "random"
 	 * number to index into the array, and there isn't any other
 	 * significance to the chosen tx_cpu. Because.. Why not use
 	 * the current cpu to index into the array?
 	 */
 	kpreempt_disable();
 	tc = &tx->tx_cpu[CPU_SEQID];
 	kpreempt_enable();
 
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
 	mutex_enter(&tc->tc_lock);
 	tc->tc_count[txg & TXG_MASK]++;
 	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
 
 	return (txg);
 }
 
 void
 txg_rele_to_quiesce(txg_handle_t *th)
 {
 	tx_cpu_t *tc = th->th_cpu;
 
 	ASSERT(!MUTEX_HELD(&tc->tc_lock));
 	mutex_exit(&tc->tc_open_lock);
 }
 
 void
 txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
 {
 	tx_cpu_t *tc = th->th_cpu;
 	int g = th->th_txg & TXG_MASK;
 
 	mutex_enter(&tc->tc_lock);
 	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
 	mutex_exit(&tc->tc_lock);
 }
 
 void
 txg_rele_to_sync(txg_handle_t *th)
 {
 	tx_cpu_t *tc = th->th_cpu;
 	int g = th->th_txg & TXG_MASK;
 
 	mutex_enter(&tc->tc_lock);
 	ASSERT(tc->tc_count[g] != 0);
 	if (--tc->tc_count[g] == 0)
 		cv_broadcast(&tc->tc_cv[g]);
 	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = NULL;	/* defensive */
 }
 
 /*
  * Blocks until all transactions in the group are committed.
  *
  * On return, the transaction group has reached a stable state in which it can
  * then be passed off to the syncing context.
  */
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	uint64_t tx_open_time;
 	int g = txg & TXG_MASK;
 	int c;
 
 	/*
 	 * Grab all tc_open_locks so nobody else can get into this txg.
 	 */
 	for (c = 0; c < max_ncpus; c++)
 		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
 	tx->tx_open_time = tx_open_time = gethrtime();
 
 	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
 
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
 	 */
 	for (c = 0; c < max_ncpus; c++)
 		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
 	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
 
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
 	 */
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
 		mutex_enter(&tc->tc_lock);
 		while (tc->tc_count[g] != 0)
 			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 		mutex_exit(&tc->tc_lock);
 	}
 
 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
 }
 
 static void
 txg_do_callbacks(list_t *cb_list)
 {
 	dmu_tx_do_callbacks(cb_list, 0);
 
 	list_destroy(cb_list);
 
 	kmem_free(cb_list, sizeof (list_t));
 }
 
 /*
  * Dispatch the commit callbacks registered on this txg to worker threads.
  *
  * If no callbacks are registered for a given TXG, nothing happens.
  * This function creates a taskq for the associated pool, if needed.
  */
 static void
 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 {
 	int c;
 	tx_state_t *tx = &dp->dp_tx;
 	list_t *cb_list;
 
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
 		/*
 		 * No need to lock tx_cpu_t at this point, since this can
 		 * only be called once a txg has been synced.
 		 */
 
 		int g = txg & TXG_MASK;
 
 		if (list_is_empty(&tc->tc_callbacks[g]))
 			continue;
 
 		if (tx->tx_commit_cb_taskq == NULL) {
 			/*
 			 * Commit callback taskq hasn't been created yet.
 			 */
 			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
 			    max_ncpus, defclsyspri, max_ncpus, max_ncpus * 2,
 			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 		}
 
 		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 		list_create(cb_list, sizeof (dmu_tx_callback_t),
 		    offsetof(dmu_tx_callback_t, dcb_node));
 
 		list_move_tail(cb_list, &tc->tc_callbacks[g]);
 
 		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 		    txg_do_callbacks, cb_list, TQ_SLEEP);
 	}
 }
 
 /*
  * Wait for pending commit callbacks of already-synced transactions to finish
  * processing.
  * Calling this function from within a commit callback will deadlock.
  */
 void
 txg_wait_callbacks(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	if (tx->tx_commit_cb_taskq != NULL)
 		taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
 }
 
 static void
 txg_sync_thread(void *arg)
 {
 	dsl_pool_t *dp = arg;
 	spa_t *spa = dp->dp_spa;
 	tx_state_t *tx = &dp->dp_tx;
 	callb_cpr_t cpr;
 	clock_t start, delta;
 
 	(void) spl_fstrans_mark();
 	txg_thread_enter(tx, &cpr);
 
 	start = delta = 0;
 	for (;;) {
 		clock_t timeout = zfs_txg_timeout * hz;
 		clock_t timer;
 		uint64_t txg;
 		txg_stat_t *ts;
 
 		/*
 		 * We sync when we're scanning, there's someone waiting
 		 * on us, or the quiesce thread has handed off a txg to
 		 * us, or we have reached our timeout.
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
 		while (!dsl_scan_active(dp->dp_scan) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 		    tx->tx_quiesced_txg == 0 &&
 		    dp->dp_dirty_total < zfs_dirty_data_sync) {
 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
 			delta = ddi_get_lbolt() - start;
 			timer = (delta > timeout ? 0 : timeout - delta);
 		}
 
 		/*
 		 * Wait until the quiesce thread hands off a txg to us,
 		 * prompting it to do so if necessary.
 		 */
 		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
 			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
 				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
 			cv_broadcast(&tx->tx_quiesce_more_cv);
 			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
 		}
 
 		if (tx->tx_exiting)
 			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
 
 		/*
 		 * Consume the quiesced txg which has been handed off to
 		 * us.  This may cause the quiescing thread to now be
 		 * able to quiesce another txg, so we must signal it.
 		 */
 		txg = tx->tx_quiesced_txg;
 		tx->tx_quiesced_txg = 0;
 		tx->tx_syncing_txg = txg;
 		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
 		ts = spa_txg_history_init_io(spa, txg, dp);
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 		mutex_exit(&tx->tx_sync_lock);
 
 		start = ddi_get_lbolt();
 		spa_sync(spa, txg);
 		delta = ddi_get_lbolt() - start;
 
 		mutex_enter(&tx->tx_sync_lock);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;
 		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
 		spa_txg_history_fini_io(spa, ts);
 		cv_broadcast(&tx->tx_sync_done_cv);
 
 		/*
 		 * Dispatch commit callbacks to worker threads.
 		 */
 		txg_dispatch_callbacks(dp, txg);
 	}
 }
 
 static void
 txg_quiesce_thread(void *arg)
 {
 	dsl_pool_t *dp = arg;
 	tx_state_t *tx = &dp->dp_tx;
 	callb_cpr_t cpr;
 
 	txg_thread_enter(tx, &cpr);
 
 	for (;;) {
 		uint64_t txg;
 
 		/*
 		 * We quiesce when there's someone waiting on us.
 		 * However, we can only have one txg in "quiescing" or
 		 * "quiesced, waiting to sync" state.  So we wait until
 		 * the "quiesced, waiting to sync" txg has been consumed
 		 * by the sync thread.
 		 */
 		while (!tx->tx_exiting &&
 		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
 		    tx->tx_quiesced_txg != 0))
 			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
 
 		if (tx->tx_exiting)
 			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
 
 		txg = tx->tx_open_txg;
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 		    txg, tx->tx_quiesce_txg_waiting,
 		    tx->tx_sync_txg_waiting);
 		mutex_exit(&tx->tx_sync_lock);
 		txg_quiesce(dp, txg);
 		mutex_enter(&tx->tx_sync_lock);
 
 		/*
 		 * Hand this txg off to the sync thread.
 		 */
 		dprintf("quiesce done, handing off txg %llu\n", txg);
 		tx->tx_quiesced_txg = txg;
 		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_broadcast(&tx->tx_quiesce_done_cv);
 	}
 }
 
 /*
  * Delay this thread by delay nanoseconds if we are still in the open
  * transaction group and there is already a waiting txg quiesing or quiesced.
  * Abort the delay if this txg stalls or enters the quiesing state.
  */
 void
 txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	hrtime_t start = gethrtime();
 
 	/* don't delay if this txg could transition to quiescing immediately */
 	if (tx->tx_open_txg > txg ||
 	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
 		return;
 
 	mutex_enter(&tx->tx_sync_lock);
 	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
 		mutex_exit(&tx->tx_sync_lock);
 		return;
 	}
 
 	while (gethrtime() - start < delay &&
 	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
 		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
 		    &tx->tx_sync_lock, delay, resolution, 0);
 	}
 
 	DMU_TX_STAT_BUMP(dmu_tx_delay);
 
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 void
 txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	ASSERT(!dsl_pool_config_held(dp));
 
 	mutex_enter(&tx->tx_sync_lock);
-	ASSERT(tx->tx_threads == 2);
+	ASSERT3U(tx->tx_threads, ==, 2);
 	if (txg == 0)
 		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
 	if (tx->tx_sync_txg_waiting < txg)
 		tx->tx_sync_txg_waiting = txg;
 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 	while (tx->tx_synced_txg < txg) {
 		dprintf("broadcasting sync more "
 		    "tx_synced=%llu waiting=%llu dp=%p\n",
 		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
 	}
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 void
 txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	ASSERT(!dsl_pool_config_held(dp));
 
 	mutex_enter(&tx->tx_sync_lock);
-	ASSERT(tx->tx_threads == 2);
+	ASSERT3U(tx->tx_threads, ==, 2);
 	if (txg == 0)
 		txg = tx->tx_open_txg + 1;
 	if (tx->tx_quiesce_txg_waiting < txg)
 		tx->tx_quiesce_txg_waiting = txg;
 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 	while (tx->tx_open_txg < txg) {
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
 	}
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 /*
  * If there isn't a txg syncing or in the pipeline, push another txg through
  * the pipeline by queiscing the open txg.
  */
 void
 txg_kick(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	ASSERT(!dsl_pool_config_held(dp));
 
 	mutex_enter(&tx->tx_sync_lock);
 	if (tx->tx_syncing_txg == 0 &&
 	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
 	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
 	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
 		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 	}
 	mutex_exit(&tx->tx_sync_lock);
 }
 
 boolean_t
 txg_stalled(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 }
 
 boolean_t
 txg_sync_waiting(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 
 	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
 	    tx->tx_quiesced_txg != 0);
 }
 
 /*
  * Verify that this txg is active (open, quiescing, syncing).  Non-active
  * txg's should not be manipulated.
  */
 void
 txg_verify(spa_t *spa, uint64_t txg)
 {
 	ASSERTV(dsl_pool_t *dp = spa_get_dsl(spa));
 	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
 		return;
 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
 	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
 	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
 }
 
 /*
  * Per-txg object lists.
  */
 void
 txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
 {
 	int t;
 
 	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	tl->tl_offset = offset;
 	tl->tl_spa = spa;
 
 	for (t = 0; t < TXG_SIZE; t++)
 		tl->tl_head[t] = NULL;
 }
 
 void
 txg_list_destroy(txg_list_t *tl)
 {
 	int t;
 
 	for (t = 0; t < TXG_SIZE; t++)
 		ASSERT(txg_list_empty(tl, t));
 
 	mutex_destroy(&tl->tl_lock);
 }
 
 boolean_t
 txg_list_empty(txg_list_t *tl, uint64_t txg)
 {
 	txg_verify(tl->tl_spa, txg);
 	return (tl->tl_head[txg & TXG_MASK] == NULL);
 }
 
 /*
  * Returns true if all txg lists are empty.
  *
  * Warning: this is inherently racy (an item could be added immediately
  * after this function returns). We don't bother with the lock because
  * it wouldn't change the semantics.
  */
 boolean_t
 txg_all_lists_empty(txg_list_t *tl)
 {
 	for (int i = 0; i < TXG_SIZE; i++) {
 		if (!txg_list_empty(tl, i)) {
 			return (B_FALSE);
 		}
 	}
 	return (B_TRUE);
 }
 
 /*
  * Add an entry to the list (unless it's already on the list).
  * Returns B_TRUE if it was actually added.
  */
 boolean_t
 txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 	boolean_t add;
 
 	txg_verify(tl->tl_spa, txg);
 	mutex_enter(&tl->tl_lock);
 	add = (tn->tn_member[t] == 0);
 	if (add) {
 		tn->tn_member[t] = 1;
 		tn->tn_next[t] = tl->tl_head[t];
 		tl->tl_head[t] = tn;
 	}
 	mutex_exit(&tl->tl_lock);
 
 	return (add);
 }
 
 /*
  * Add an entry to the end of the list, unless it's already on the list.
  * (walks list to find end)
  * Returns B_TRUE if it was actually added.
  */
 boolean_t
 txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 	boolean_t add;
 
 	txg_verify(tl->tl_spa, txg);
 	mutex_enter(&tl->tl_lock);
 	add = (tn->tn_member[t] == 0);
 	if (add) {
 		txg_node_t **tp;
 
 		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
 			continue;
 
 		tn->tn_member[t] = 1;
 		tn->tn_next[t] = NULL;
 		*tp = tn;
 	}
 	mutex_exit(&tl->tl_lock);
 
 	return (add);
 }
 
 /*
  * Remove the head of the list and return it.
  */
 void *
 txg_list_remove(txg_list_t *tl, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn;
 	void *p = NULL;
 
 	txg_verify(tl->tl_spa, txg);
 	mutex_enter(&tl->tl_lock);
 	if ((tn = tl->tl_head[t]) != NULL) {
 		p = (char *)tn - tl->tl_offset;
 		tl->tl_head[t] = tn->tn_next[t];
 		tn->tn_next[t] = NULL;
 		tn->tn_member[t] = 0;
 	}
 	mutex_exit(&tl->tl_lock);
 
 	return (p);
 }
 
 /*
  * Remove a specific item from the list and return it.
  */
 void *
 txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn, **tp;
 
 	txg_verify(tl->tl_spa, txg);
 	mutex_enter(&tl->tl_lock);
 
 	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
 		if ((char *)tn - tl->tl_offset == p) {
 			*tp = tn->tn_next[t];
 			tn->tn_next[t] = NULL;
 			tn->tn_member[t] = 0;
 			mutex_exit(&tl->tl_lock);
 			return (p);
 		}
 	}
 
 	mutex_exit(&tl->tl_lock);
 
 	return (NULL);
 }
 
 boolean_t
 txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 
 	txg_verify(tl->tl_spa, txg);
 	return (tn->tn_member[t] != 0);
 }
 
 /*
  * Walk a txg list -- only safe if you know it's not changing.
  */
 void *
 txg_list_head(txg_list_t *tl, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = tl->tl_head[t];
 
 	txg_verify(tl->tl_spa, txg);
 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
 }
 
 void *
 txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 
 	txg_verify(tl->tl_spa, txg);
 	tn = tn->tn_next[t];
 
 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(txg_init);
 EXPORT_SYMBOL(txg_fini);
 EXPORT_SYMBOL(txg_sync_start);
 EXPORT_SYMBOL(txg_sync_stop);
 EXPORT_SYMBOL(txg_hold_open);
 EXPORT_SYMBOL(txg_rele_to_quiesce);
 EXPORT_SYMBOL(txg_rele_to_sync);
 EXPORT_SYMBOL(txg_register_callbacks);
 EXPORT_SYMBOL(txg_delay);
 EXPORT_SYMBOL(txg_wait_synced);
 EXPORT_SYMBOL(txg_wait_open);
 EXPORT_SYMBOL(txg_wait_callbacks);
 EXPORT_SYMBOL(txg_stalled);
 EXPORT_SYMBOL(txg_sync_waiting);
 
 module_param(zfs_txg_timeout, int, 0644);
 MODULE_PARM_DESC(zfs_txg_timeout, "Max seconds worth of delta per txg");
 #endif
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index a60cd619838a..977035fd94f1 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1,4951 +1,4953 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vfs_opreg.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/vmsystm.h>
 #include <sys/atomic.h>
 #include <vm/pvn.h>
 #include <sys/pathname.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/sid.h>
 #include <sys/mode.h>
 #include "fs/fs_subr.h"
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_vnops.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/cred.h>
 #include <sys/attr.h>
 #include <sys/zpl.h>
+#include <sys/zil.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  *      can return EIO from the calling function.
  *
  *  (2)	iput() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call iput() within a tx then use zfs_iput_async().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		iput(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	iput(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 
 /*
  * Virus scanning is unsupported.  It would be possible to add a hook
  * here to performance the required virus scan.  This could be done
  * entirely in the kernel or potentially as an update to invoke a
  * scanning utility.
  */
 static int
 zfs_vscan(struct inode *ip, cred_t *cr, int async)
 {
 	return (0);
 }
 
 /* ARGSUSED */
 int
 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Honor ZFS_APPENDONLY file attribute */
 	if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & O_APPEND) == 0)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* Virus scan eligible files on open */
 	if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 		if (zfs_vscan(ip, cr, 0) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EACCES));
 		}
 	}
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & O_SYNC)
 		atomic_inc_32(&zp->z_sync_cnt);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /* ARGSUSED */
 int
 zfs_close(struct inode *ip, int flag, cred_t *cr)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Decrement the synchronous opens in the znode */
 	if (flag & O_SYNC)
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 		VERIFY(zfs_vscan(ip, cr, 1) == 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 /*
  * Lseek support for finding holes (cmd == SEEK_HOLE) and
  * data (cmd == SEEK_DATA). "off" is an in/out parameter.
  */
 static int
 zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
 {
 	znode_t	*zp = ITOZ(ip);
 	uint64_t noff = (uint64_t)*off; /* new offset */
 	uint64_t file_sz;
 	int error;
 	boolean_t hole;
 
 	file_sz = zp->z_size;
 	if (noff >= file_sz)  {
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (cmd == SEEK_HOLE)
 		hole = B_TRUE;
 	else
 		hole = B_FALSE;
 
 	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
 
 	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));
 
 	/* file was dirty, so fall back to using generic logic */
 	if (error == EBUSY) {
 		if (hole)
 			*off = file_sz;
 
 		return (0);
 	}
 
 	/*
 	 * We could find a hole that begins after the logical end-of-file,
 	 * because dmu_offset_next() only works on whole blocks.  If the
 	 * EOF falls mid-block, then indicate that the "virtual hole"
 	 * at the end of the file begins at the logical EOF, rather than
 	 * at the end of the last block.
 	 */
 	if (noff > file_sz) {
 		ASSERT(hole);
 		noff = file_sz;
 	}
 
 	if (noff < *off)
 		return (error);
 	*off = noff;
 	return (error);
 }
 
 int
 zfs_holey(struct inode *ip, int cmd, loff_t *off)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = zfs_holey_common(ip, cmd, off);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 #endif /* SEEK_HOLE && SEEK_DATA */
 
 #if defined(_KERNEL)
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  */
 static void
 update_pages(struct inode *ip, int64_t start, int len,
     objset_t *os, uint64_t oid)
 {
 	struct address_space *mp = ip->i_mapping;
 	struct page *pp;
 	uint64_t nbytes;
 	int64_t	off;
 	void *pb;
 
 	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		nbytes = MIN(PAGE_SIZE - off, len);
 
 		pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			pb = kmap(pp);
 			(void) dmu_read(os, oid, start+off, nbytes, pb+off,
 			    DMU_READ_PREFETCH);
 			kunmap(pp);
 
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			mark_page_accessed(pp);
 			SetPageUptodate(pp);
 			ClearPageError(pp);
 			unlock_page(pp);
 			put_page(pp);
 		}
 
 		len -= nbytes;
 		off = 0;
 	}
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	 the file is memory mapped.
  */
 static int
 mappedread(struct inode *ip, int nbytes, uio_t *uio)
 {
 	struct address_space *mp = ip->i_mapping;
 	struct page *pp;
 	znode_t *zp = ITOZ(ip);
 	int64_t	start, off;
 	uint64_t bytes;
 	int len = nbytes;
 	int error = 0;
 	void *pb;
 
 	start = uio->uio_loffset;
 	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		bytes = MIN(PAGE_SIZE - off, len);
 
 		pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			ASSERT(PageUptodate(pp));
 
 			pb = kmap(pp);
 			error = uiomove(pb + off, bytes, UIO_READ, uio);
 			kunmap(pp);
 
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			mark_page_accessed(pp);
 			unlock_page(pp);
 			put_page(pp);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 		}
 
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	return (error);
 }
 #endif /* _KERNEL */
 
 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	ip	- inode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- FSYNC flags; used to provide FRSYNC semantics.
  *			  O_DIRECT flag; used to bypass page cache.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Side Effects:
  *	inode - atime updated if byte count > 0
  */
 /* ARGSUSED */
 int
 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	ssize_t		n, nbytes;
 	int		error = 0;
 	rl_t		*rl;
 #ifdef HAVE_UIO_ZEROCOPY
 	xuio_t		*xuio = NULL;
 #endif /* HAVE_UIO_ZEROCOPY */
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 * Only do this for non-snapshots.
 	 */
 	if (zfsvfs->z_log &&
 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
 #ifdef HAVE_UIO_ZEROCOPY
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
 		int blksz = zp->z_blksz;
 		uint64_t offset = uio->uio_loffset;
 
 		xuio = (xuio_t *)uio;
 		if ((ISP2(blksz))) {
 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 			    blksz)) / blksz;
 		} else {
 			ASSERT(offset + n <= blksz);
 			nblk = 1;
 		}
 		(void) dmu_xuio_init(xuio, nblk);
 
 		if (vn_has_cached_data(ip)) {
 			/*
 			 * For simplicity, we always allocate a full buffer
 			 * even if we only expect to read a portion of a block.
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    blksz), 0, blksz);
 			}
 		}
 	}
 #endif /* HAVE_UIO_ZEROCOPY */
 
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
 			error = mappedread(ip, nbytes, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes);
 		}
 
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 
 		n -= nbytes;
 	}
 out:
 	zfs_range_unlock(rl);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	ip	- inode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND flag set if in append mode.
  *			  O_DIRECT flag; used to bypass page cache.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime|mtime updated if byte count > 0
  */
 
 /* ARGSUSED */
 int
 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	rlim64_t	limit = uio->uio_limit;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error = 0;
 	arc_buf_t	*abuf;
 	const iovec_t	*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		write_eof;
 	int		count = 0;
 	sa_bulk_attr_t	bulk[4];
 	uint64_t	mtime[2], ctime[2];
 	uint32_t	uid;
 #ifdef HAVE_UIO_ZEROCOPY
 	int		i_iov = 0;
 	const iovec_t	*iovp = uio->uio_iov;
 	ASSERTV(int	iovcnt = uio->uio_iovcnt);
 #endif
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/*
 	 * Callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfs_is_readonly(zfsvfs)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * If immutable or not appending then return EPERM
 	 */
 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 	    (uio->uio_loffset < zp->z_size))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Validate file offset
 	 */
 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
 #ifdef HAVE_UIO_ZEROCOPY
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 		xuio = (xuio_t *)uio;
 	else
 #endif
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
 		rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
 		woff = rl->r_off;
 		if (rl->r_len == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
 			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
 		/*
 		 * Note that if the file block size will change as a result of
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
 		rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
 	}
 
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
 
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
 	write_eof = (woff + n > zp->z_size);
 
 	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
 		if (xuio && abuf == NULL) {
 #ifdef HAVE_UIO_ZEROCOPY
 			ASSERT(i_iov < iovcnt);
 			ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 #endif
 		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block.  "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
 			 * a transaction.  This avoids the possibility of
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
 			if ((error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes))) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				/*
 				 * File's blocksize is already larger than the
 				 * "recordsize" property.  Only let it grow to
 				 * the next power of 2.
 				 */
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size,
 				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 			/*
 			 * If this is not a full block write, but we are
 			 * extending the file past EOF and this data starts
 			 * block-aligned, use assign_arcbuf().  Otherwise,
 			 * write via dmu_write().
 			 */
 			if (tx_bytes < max_blksz && (!write_eof ||
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
 				    /* cppcheck-suppress nullPointer */
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
 				dmu_assign_arcbuf_by_dbuf(
 				    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
 		if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
 			update_pages(ip, woff,
 			    tx_bytes, zfsvfs->z_os, zp->z_id);
 		}
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to to this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 *
 		 * Note: we don't call zfs_fuid_map_id() here because
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		uid = KUID_TO_SUID(ip->i_uid);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(cr,
 		    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
 			uint64_t newmode;
 			zp->z_mode &= ~(S_ISUID | S_ISGID);
 			ip->i_mode = newmode = zp->z_mode;
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
 			ASSERT(error == 0);
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
 		 * the file size to the specified eof. Note, there's no
 		 * concurrency during replay.
 		 */
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
 		    NULL, NULL);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
 	}
 
 	zfs_inode_update(zp);
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Drop a reference on the passed inode asynchronously. This ensures
  * that the caller will never drop the last reference on an inode in
  * the current context. Doing so while holding open a tx could result
  * in a deadlock if iput_final() re-enters the filesystem code.
  */
 void
 zfs_iput_async(struct inode *ip)
 {
 	objset_t *os = ITOZSB(ip)->z_os;
 
 	ASSERT(atomic_read(&ip->i_count) > 0);
 	ASSERT(os != NULL);
 
 	if (atomic_read(&ip->i_count) == 1)
 		VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
 		    (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
 	else
 		iput(ip);
 }
 
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *zp = zgd->zgd_private;
 
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	/*
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
 	 */
 	zfs_iput_async(ZTOI(zp));
 
 	if (error == 0 && zgd->zgd_bp)
-		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
 #ifdef DEBUG
 static int zil_fault_io = 0;
 #endif
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
 int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
-	ASSERT(zio != NULL);
-	ASSERT(size != 0);
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (SET_ERROR(ENOENT));
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
 		zfs_iput_async(ZTOI(zp));
 		return (SET_ERROR(ENOENT));
 	}
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_zilog = zfsvfs->z_log;
+	zgd->zgd_lwb = lwb;
 	zgd->zgd_private = zp;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
 		    RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			uint64_t blkoff;
 			size = zp->z_blksz;
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
 			zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
 			    size, RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			offset += blkoff;
 			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
 			error = SET_ERROR(ENOENT);
 #ifdef DEBUG
 		if (zil_fault_io) {
 			error = SET_ERROR(EIO);
 			zil_fault_io = 0;
 		}
 #endif
 		if (error == 0)
 			error = dmu_buf_hold(os, object, offset, zgd, &db,
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zfs_get_done, zgd);
 			ASSERT(error || lr->lr_length <= size);
 
 			/*
 			 * On success, we need to wait for the write I/O
 			 * initiated by dmu_sync() to complete before we can
 			 * release this dbuf.  We will finish everything up
 			 * in the zfs_get_done() callback.
 			 */
 			if (error == 0)
 				return (0);
 
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
 				error = 0;
 			}
 		}
 	}
 
 	zfs_get_done(zgd, error);
 
 	return (error);
 }
 
 /*ARGSUSED*/
 int
 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (flag & V_ACE_MASK)
 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
 	else
 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held inode reference for it.
  *
  *	IN:	dip	- inode of directory to search.
  *		nm	- name of entry to lookup.
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *		cr	- credentials of caller.
  *		direntflags - directory lookup flags
  *		realpnp - returned pathname.
  *
  *	OUT:	ipp	- inode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 int
 zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
     cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
 	znode_t *zdp = ITOZ(dip);
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	int error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 
 		if (!S_ISDIR(dip->i_mode)) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 			error = zfs_fastaccesschk_execute(zdp, cr);
 			if (!error) {
 				*ipp = dip;
 				igrab(*ipp);
 				return (0);
 			}
 			return (error);
 #ifdef HAVE_DNLC
 		} else if (!zdp->z_zfsvfs->z_norm &&
 		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
 
 			vnode_t *tvp = dnlc_lookup(dvp, nm);
 
 			if (tvp) {
 				error = zfs_fastaccesschk_execute(zdp, cr);
 				if (error) {
 					iput(tvp);
 					return (error);
 				}
 				if (tvp == DNLC_NO_VNODE) {
 					iput(tvp);
 					return (SET_ERROR(ENOENT));
 				} else {
 					*vpp = tvp;
 					return (specvp_check(vpp, cr));
 				}
 			}
 #endif /* HAVE_DNLC */
 		}
 	}
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
 
 	*ipp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
 		    B_FALSE, cr))) {
 			iput(*ipp);
 			*ipp = NULL;
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (!S_ISDIR(dip->i_mode)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENOTDIR));
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
 	if ((error == 0) && (*ipp))
 		zfs_inode_update(ITOZ(*ipp));
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the ip of the created or trunc'd file.
  *
  *	IN:	dip	- inode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		flag	- large file flag [UNUSED].
  *		vsecp	- ACL to be set
  *
  *	OUT:	ipp	- inode of created or trunc'd entry.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dip - ctime|mtime updated if new entry created
  *	 ip - ctime|mtime always, atime if new
  */
 
 /* ARGSUSED */
 int
 zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
 {
 	znode_t		*zp, *dzp = ITOZ(dip);
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	zilog_t		*zilog;
 	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	boolean_t	have_acl = B_FALSE;
 	boolean_t	waited = B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 top:
 	*ipp = NULL;
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		igrab(dip);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible igrab(zp) */
 		int zflg = 0;
 
 		if (flag & FIGNORECASE)
 			zflg |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 		    NULL, NULL);
 		if (error) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			if (strcmp(name, "..") == 0)
 				error = SET_ERROR(EISDIR);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	if (zp == NULL) {
 		uint64_t txtype;
 
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 
 		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 		    cr, vsecp, &acl_ids)) != 0)
 			goto out;
 		have_acl = B_TRUE;
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 			zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
 				waited = B_TRUE;
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 		    vsecp, acl_ids.z_fuidp, vap);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
 
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		have_acl = B_FALSE;
 
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if (S_ISDIR(ZTOI(zp)->i_mode)) {
 			error = SET_ERROR(EISDIR);
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if (S_ISREG(ZTOI(zp)->i_mode) &&
 		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
 			/* we can't hold any locks when calling zfs_freesp() */
 			if (dl) {
 				zfs_dirent_unlock(dl);
 				dl = NULL;
 			}
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 		}
 	}
 out:
 
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			iput(ZTOI(zp));
 	} else {
 		zfs_inode_update(dzp);
 		zfs_inode_update(zp);
 		*ipp = ZTOI(zp);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
 {
 	znode_t		*zp = NULL, *dzp = ITOZ(dip);
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	boolean_t	have_acl = B_FALSE;
 	boolean_t	waited = B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 top:
 	*ipp = NULL;
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		goto out;
 	}
 
 	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids)) != 0)
 		goto out;
 	have_acl = B_TRUE;
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* Add to unlinked set */
 	zp->z_unlinked = 1;
 	zfs_unlinked_add(zp, tx);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 out:
 
 	if (error) {
 		if (zp)
 			iput(ZTOI(zp));
 	} else {
 		zfs_inode_update(dzp);
 		zfs_inode_update(zp);
 		*ipp = ZTOI(zp);
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dip	- inode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dip - ctime|mtime
  *	 ip - ctime (if nlink > 0)
  */
 
 uint64_t null_xattr = 0;
 
 /*ARGSUSED*/
 int
 zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 {
 	znode_t		*zp, *dzp = ITOZ(dip);
 	znode_t		*xzp;
 	struct inode	*ip;
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	uint64_t	links;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
 	int		error;
 	int		zflg = ZEXISTS;
 	boolean_t	waited = B_FALSE;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE) {
 		zflg |= ZCILOOK;
 		pn_alloc(&realnm);
 		realnmp = &realnm;
 	}
 
 top:
 	xattr_obj = 0;
 	xzp = NULL;
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, realnmp))) {
 		if (realnmp)
 			pn_free(realnmp);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	ip = ZTOI(zp);
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (S_ISDIR(ip->i_mode)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 #ifdef HAVE_DNLC
 	if (realnmp)
 		dnlc_remove(dvp, realnmp->pn_buf);
 	else
 		dnlc_remove(dvp, name);
 #endif /* HAVE_DNLC */
 
 	mutex_enter(&zp->z_lock);
 	may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the inode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	if (may_delete_now) {
 		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
 		/* if the file is too big, only hold_free a token amount */
 		dmu_tx_hold_free(tx, zp->z_id, 0,
 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	mutex_enter(&zp->z_lock);
 	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 	mutex_exit(&zp->z_lock);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			iput(ip);
 			if (xzp)
 				iput(ZTOI(xzp));
 			goto top;
 		}
 		if (realnmp)
 			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		iput(ip);
 		if (xzp)
 			iput(ZTOI(xzp));
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		/*
 		 * Hold z_lock so that we can make sure that the ACL obj
 		 * hasn't changed.  Could have been deleted due to
 		 * zfs_sa_upgrade().
 		 */
 		mutex_enter(&zp->z_lock);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 		delete_now = may_delete_now && !toobig &&
 		    atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
 		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
 		    acl_obj;
 	}
 
 	if (delete_now) {
 		if (xattr_obj_unlinked) {
 			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = 1;
 			clear_nlink(ZTOI(xzp));
 			links = 0;
 			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 			    &links, sizeof (links), tx);
 			ASSERT3U(error,  ==,  0);
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 
 			if (zp->z_is_sa)
 				error = sa_remove(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), tx);
 			else
 				error = sa_update(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
 				    sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		/*
 		 * Add to the unlinked set because a new reference could be
 		 * taken concurrently resulting in a deferred destruction.
 		 */
 		zfs_unlinked_add(zp, tx);
 		mutex_exit(&zp->z_lock);
 	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 	}
 
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
 	if (realnmp)
 		pn_free(realnmp);
 
 	zfs_dirent_unlock(dl);
 	zfs_inode_update(dzp);
 	zfs_inode_update(zp);
 
 	if (delete_now)
 		iput(ip);
 	else
 		zfs_iput_async(ip);
 
 	if (xzp) {
 		zfs_inode_update(xzp);
 		zfs_iput_async(ZTOI(xzp));
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dip using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dip	- inode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		vsecp	- ACL to be set
  *
  *	OUT:	ipp	- inode of created directory.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dip - ctime|mtime updated
  *	ipp - ctime|mtime|atime updated
  */
 /*ARGSUSED*/
 int
 zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
     cred_t *cr, int flags, vsecattr_t *vsecp)
 {
 	znode_t		*zp, *dzp = ITOZ(dip);
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(S_ISDIR(vap->va_mode));
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (dirname == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (dzp->z_pflags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    vsecp, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 top:
 	*ipp = NULL;
 
 	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
 	    NULL, NULL))) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	*ipp = ZTOI(zp);
 
 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_inode_update(dzp);
 	zfs_inode_update(zp);
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dip	- inode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- inode of current working directory.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dip - ctime|mtime updated
  */
 /*ARGSUSED*/
 int
 zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
     int flags)
 {
 	znode_t		*dzp = ITOZ(dip);
 	znode_t		*zp;
 	struct inode	*ip;
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zflg = ZEXISTS;
 	boolean_t	waited = B_FALSE;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, NULL))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	ip = ZTOI(zp);
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
 		goto out;
 	}
 
 	if (!S_ISDIR(ip->i_mode)) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	if (ip == cwd) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			iput(ip);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		iput(ip);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
 
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 out:
 	zfs_dirent_unlock(dl);
 
 	zfs_inode_update(dzp);
 	zfs_inode_update(zp);
 	iput(ip);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * dirent buffer from the given directory cursor position.
  *
  *	IN:	ip	- inode of directory to read.
  *		dirent	- buffer for directory entries.
  *
  *	OUT:	dirent	- filler buffer of directory entries.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 int
 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	objset_t	*os;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		done = 0;
 	uint64_t	parent;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0)
 		goto out;
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if (zp->z_unlinked)
 		goto out;
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = ctx->pos;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	while (!done) {
 		uint64_t objnum;
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			if ((error = zap_cursor_retrieve(&zc, &zap))) {
 				if (error == ENOENT)
 					break;
 				else
 					goto update;
 			}
 
 			/*
 			 * Allow multiple entries provided the first entry is
 			 * the object id.  Non-zpl consumers may safely make
 			 * use of the additional space.
 			 *
 			 * XXX: This should be a feature flag for compatibility
 			 */
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers == 0) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld, "
 				    "length = %d, num = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset,
 				    zap.za_integer_length,
 				    (u_longlong_t)zap.za_num_integers);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 
 		done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name),
 		    objnum, type);
 		if (done)
 			break;
 
 		/* Prefetch znode */
 		if (prefetch) {
 			dmu_prefetch(os, objnum, 0, 0, 0,
 			    ZIO_PRIORITY_SYNC_READ);
 		}
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 		ctx->pos = offset;
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 update:
 	zap_cursor_fini(&zc);
 	if (error == ENOENT)
 		error = 0;
 out:
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 ulong_t zfs_fsync_sync_cnt = 4;
 
 int
 zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		zil_commit(zfsvfs->z_log, zp->z_id);
 		ZFS_EXIT(zfsvfs);
 	}
 	tsd_set(zfs_fsyncer_key, NULL);
 
 	return (0);
 }
 
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	ip	- inode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If ATTR_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 (always succeeds)
  */
 /* ARGSUSED */
 int
 zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int	error = 0;
 	uint64_t links;
 	uint64_t atime[2], mtime[2], ctime[2];
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	sa_bulk_attr_t bulk[3];
 	int count = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr))) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	mutex_enter(&zp->z_lock);
 	vap->va_type = vn_mode_to_vtype(zp->z_mode);
 	vap->va_mode = zp->z_mode;
 	vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
 	vap->va_nodeid = zp->z_id;
 	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
 		links = ZTOI(zp)->i_nlink + 1;
 	else
 		links = ZTOI(zp)->i_nlink;
 	vap->va_nlink = MIN(links, ZFS_LINK_MAX);
 	vap->va_size = i_size_read(ip);
 	vap->va_rdev = ip->i_rdev;
 	vap->va_seq = ip->i_generation;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    S_ISREG(ip->i_mode)) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 			uint64_t times[2];
 
 			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
 			    times, sizeof (times));
 			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
 			XVA_SET_RTN(xvap, XAT_CREATETIME);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = ip->i_generation;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 	}
 
 	ZFS_TIME_DECODE(&vap->va_atime, atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 
 	mutex_exit(&zp->z_lock);
 
 	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Get the basic file attributes and place them in the provided kstat
  * structure.  The inode is assumed to be the authoritative source
  * for most of the attributes.  However, the znode currently has the
  * authoritative atime, blksize, and block count.
  *
  *	IN:	ip	- inode of file.
  *
  *	OUT:	sp	- kstat values.
  *
  *	RETURN:	0 (always succeeds)
  */
 /* ARGSUSED */
 int
 zfs_getattr_fast(struct inode *ip, struct kstat *sp)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	uint32_t blksize;
 	u_longlong_t nblocks;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	mutex_enter(&zp->z_lock);
 
 	generic_fillattr(ip, sp);
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	sp->blksize = blksize;
 	sp->blocks = nblocks;
 
 	if (unlikely(zp->z_blksz == 0)) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		sp->blksize = zfsvfs->z_max_blksz;
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * Required to prevent NFS client from detecting different inode
 	 * numbers of snapshot root dentry before and after snapshot mount.
 	 */
 	if (zfsvfs->z_issnap) {
 		if (ip->i_sb->s_root->d_inode == ip)
 			sp->ino = ZFSCTL_INO_SNAPDIRS -
 			    dmu_objset_id(zfsvfs->z_os);
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	ip	- inode of file to be modified.
  *		vap	- new attribute values.
  *			  If ATTR_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 int
 zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	*tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2], atime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	*bulk, *xattr_bulk;
 	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & ATTR_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
 	xva_init(tmpxvattr);
 
 	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
 	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
 	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (ATTR_ATIME | ATTR_MTIME)) {
 		if (((mask & ATTR_ATIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & ATTR_MTIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			err = SET_ERROR(EOVERFLOW);
 			goto out3;
 		}
 	}
 
 top:
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfs_is_readonly(zfsvfs)) {
 		err = SET_ERROR(EROFS);
 		goto out3;
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & ATTR_SIZE) {
 		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
 		if (err)
 			goto out3;
 
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err)
 			goto out3;
 	}
 
 	if (mask & (ATTR_ATIME|ATTR_MTIME) ||
 	    ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 		int	idmask = (mask & (ATTR_UID|ATTR_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & ATTR_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & ATTR_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both ATTR_UID and ATTR_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (ATTR_UID|ATTR_GID)) &&
 		    take_owner && take_group) ||
 		    ((idmask == ATTR_UID) && take_owner) ||
 		    ((idmask == ATTR_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				(void) secpolicy_setid_clear(vap, cr);
 				trim_mask = (mask & (ATTR_UID|ATTR_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & ATTR_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((!S_ISREG(ip->i_mode) &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
 			err = SET_ERROR(EPERM);
 			goto out3;
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	if (mask & ATTR_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(ip, vap,
 			    &oldva, cr);
 			if (err)
 				goto out3;
 
 			trim_mask |= ATTR_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 		}
 		err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err)
 			goto out3;
 
 		if (trim_mask)
 			vap->va_mask |= saved_mask;
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (ATTR_UID | ATTR_GID))) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
 			if (err)
 				goto out2;
 		}
 		if (mask & ATTR_UID) {
 			new_kuid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_kuid)) {
 				if (attrzp)
 					iput(ZTOI(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			new_kgid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
 			if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_kgid)) {
 				if (attrzp)
 					iput(ZTOI(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	if (mask & ATTR_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		zfs_acl_chmod_setattr(zp, &aclp, new_mode);
 
 		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & ATTR_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_enter(&zp->z_acl_lock);
 	mutex_enter(&zp->z_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 
 		if (mask & ATTR_UID) {
 			ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
 			new_uid = zfs_uid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
 			new_gid = zfs_gid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
 			}
 		}
 		if (!(mask & ATTR_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & ATTR_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = ZTOI(zp)->i_mode = new_mode;
 		ASSERT3P(aclp, !=, NULL);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 	if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
 		zp->z_atime_dirty = 0;
 		ZFS_TIME_ENCODE(&ip->i_atime, atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &atime, sizeof (atime));
 	}
 
 	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		ZTOI(zp)->i_mtime = timespec_trunc(vap->va_mtime,
 		    ZTOI(zp)->i_sb->s_time_gran);
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
 		ZTOI(zp)->i_ctime = timespec_trunc(vap->va_ctime,
 		    ZTOI(zp)->i_sb->s_time_gran);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 	}
 
 	if (attrzp && mask) {
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
 		    sizeof (ctime));
 	}
 
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & ATTR_XVATTR)) {
 
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(S_ISREG(ip->i_mode));
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	mutex_exit(&zp->z_lock);
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 		mutex_exit(&attrzp->z_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 		if (attrzp)
 			iput(ZTOI(attrzp));
 		if (err == ERESTART)
 			goto top;
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 		if (attrzp)
 			iput(ZTOI(attrzp));
 		zfs_inode_update(zp);
 	}
 
 out2:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 out3:
 	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * 7);
 	kmem_free(bulk, sizeof (sa_bulk_attr_t) * 7);
 	kmem_free(tmpxvattr, sizeof (xvattr_t));
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Drop locks and release vnodes that were held by zfs_rename_lock().
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t *zl;
 
 	while ((zl = *zlpp) != NULL) {
 		if (zl->zl_znode != NULL)
 			zfs_iput_async(ZTOI(zl->zl_znode));
 		rw_exit(zl->zl_rwlock);
 		*zlpp = zl->zl_next;
 		kmem_free(zl, sizeof (*zl));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = ZTOZSB(zp)->z_root;
 	uint64_t	oidp = zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (oidp == szp->z_id)		/* We're a descendant of szp */
 			return (SET_ERROR(EINVAL));
 
 		if (oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
 			if (error)
 				return (error);
 			zl->zl_znode = zp;
 		}
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
 		    &oidp, sizeof (oidp));
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdip	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdip	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdip,tdip - ctime|mtime updated
  */
 /*ARGSUSED*/
 int
 zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
     cred_t *cr, int flags)
 {
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = ITOZ(sdip);
 	zfsvfs_t	*zfsvfs = ITOZSB(sdip);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr;
 	int		error = 0;
 	int		zflg = 0;
 	boolean_t	waited = B_FALSE;
 
 	if (snm == NULL || tnm == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(sdzp);
 	zilog = zfsvfs->z_log;
 
 	tdzp = ITOZ(tdip);
 	ZFS_VERIFY_ZP(tdzp);
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EXDEV));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 top:
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		/*
 		 * First compare the two name arguments without
 		 * considering any case folding.
 		 */
 		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 
 		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 		ASSERT(error == 0 || !zfsvfs->z_utf8);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 		/*
 		 * If the file system is case-folding, then we may
 		 * have some more checking to do.  A case-folding file
 		 * system is either supporting mixed case sensitivity
 		 * access or is completely case-insensitive.  Note
 		 * that the file system is always case preserving.
 		 *
 		 * In mixed sensitivity mode case sensitive behavior
 		 * is the default.  FIGNORECASE must be used to
 		 * explicitly request case insensitive behavior.
 		 *
 		 * If the source and target names provided differ only
 		 * by case (e.g., a request to rename 'tim' to 'Tim'),
 		 * we will treat this as a special case in the
 		 * case-insensitive mode: as long as the source name
 		 * is an exact match, we will allow this to proceed as
 		 * a name-change request.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 		    flags & FIGNORECASE)) &&
 		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 		    &error) == 0) {
 			/*
 			 * case preserving rename request, require exact
 			 * name matches
 			 */
 			zflg |= ZCIEXACT;
 			zflg &= ~ZCILOOK;
 		}
 	}
 
 	/*
 	 * If the source and destination directories are the same, we should
 	 * grab the z_name_lock of that directory only once.
 	 */
 	if (sdzp == tdzp) {
 		zflg |= ZHAVELOCK;
 		rw_enter(&sdzp->z_name_lock, RW_READER);
 	}
 
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 		    ZEXISTS | zflg, NULL, NULL);
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 	} else {
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, zflg, NULL, NULL);
 		serr = zfs_dirent_lock(&sdl,
 		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 		    NULL, NULL);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				iput(ZTOI(tzp));
 		}
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (serr);
 	}
 	if (terr) {
 		zfs_dirent_unlock(sdl);
 		iput(ZTOI(szp));
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (terr);
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
 		goto out;
 
 	if (S_ISDIR(ZTOI(szp)->i_mode)) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if (S_ISDIR(ZTOI(szp)->i_mode)) {
 			if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
 				error = SET_ERROR(ENOTDIR);
 				goto out;
 			}
 		} else {
 			if (S_ISDIR(ZTOI(tzp)->i_mode)) {
 				error = SET_ERROR(EISDIR);
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			iput(ZTOI(szp));
 			if (tzp)
 				iput(ZTOI(tzp));
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		iput(ZTOI(szp));
 		if (tzp)
 			iput(ZTOI(tzp));
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME |
 				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
 				    sdl->dl_name, tdzp, tdl->dl_name, szp);
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
 	}
 
 	dmu_tx_commit(tx);
 out:
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	zfs_inode_update(sdzp);
 	if (sdzp == tdzp)
 		rw_exit(&sdzp->z_name_lock);
 
 	if (sdzp != tdzp)
 		zfs_inode_update(tdzp);
 
 	zfs_inode_update(szp);
 	iput(ZTOI(szp));
 	if (tzp) {
 		zfs_inode_update(tzp);
 		iput(ZTOI(tzp));
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dip	- Directory to contain new symbolic link.
  *		link	- Name for new symlink entry.
  *		vap	- Attributes of new entry.
  *		target	- Target path of new symlink.
  *
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dip - ctime|mtime updated
  */
 /*ARGSUSED*/
 int
 zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
     struct inode **ipp, cred_t *cr, int flags)
 {
 	znode_t		*zp, *dzp = ITOZ(dip);
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(S_ISLNK(vap->va_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 top:
 	*ipp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datsets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 	mutex_exit(&zp->z_lock);
 
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 
 	zfs_inode_update(dzp);
 	zfs_inode_update(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	*ipp = ZTOI(zp);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Return, in the buffer contained in the provided uio structure,
  * the symbolic path referred to by ip.
  *
  *	IN:	ip	- inode of symbolic link
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated
  */
 /* ARGSUSED */
 int
 zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_lookup_uio(zp->z_sa_hdl,
 		    SA_ZPL_SYMLINK(zfsvfs), uio);
 	else
 		error = zfs_sa_readlink(zp, uio);
 	mutex_exit(&zp->z_lock);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Insert a new entry into directory tdip referencing sip.
  *
  *	IN:	tdip	- Directory to contain new entry.
  *		sip	- inode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	tdip - ctime|mtime updated
  *	 sip - ctime updated
  */
 /* ARGSUSED */
 int
 zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
     int flags)
 {
 	znode_t		*dzp = ITOZ(tdip);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = ITOZSB(tdip);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 	boolean_t	waited = B_FALSE;
 	boolean_t	is_tmpfile = 0;
 	uint64_t	txg;
 #ifdef HAVE_TMPFILE
 	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
 #endif
 	ASSERT(S_ISDIR(tdip->i_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (S_ISDIR(sip->i_mode)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	szp = ITOZ(sip);
 	ZFS_VERIFY_ZP(szp);
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
 	    cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	if (is_tmpfile)
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	/* unmark z_unlinked so zfs_link_create will not reject */
 	if (is_tmpfile)
 		szp->z_unlinked = 0;
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		/*
 		 * tmpfile is created to be in z_unlinkedobj, so remove it.
 		 * Also, we don't log in ZIL, be cause all previous file
 		 * operation on the tmpfile are ignored by ZIL. Instead we
 		 * always wait for txg to sync to make sure all previous
 		 * operation are sync safe.
 		 */
 		if (is_tmpfile) {
 			VERIFY(zap_remove_int(zfsvfs->z_os,
 			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
 		} else {
 			if (flags & FIGNORECASE)
 				txtype |= TX_CI;
 			zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 		}
 	} else if (is_tmpfile) {
 		/* restore z_unlinked since when linking failed */
 		szp->z_unlinked = 1;
 	}
 	txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (is_tmpfile)
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
 
 	zfs_inode_update(dzp);
 	zfs_inode_update(szp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 static void
 zfs_putpage_commit_cb(void *arg)
 {
 	struct page *pp = arg;
 
 	ClearPageError(pp);
 	end_page_writeback(pp);
 }
 
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
  *
  *	IN:	ip	- page mapped for inode.
  *		pp	- page to push (page is locked)
  *		wbc	- writeback control data
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime|mtime updated
  */
 /* ARGSUSED */
 int
 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	loff_t		offset;
 	loff_t		pgoff;
 	unsigned int	pglen;
 	rl_t		*rl;
 	dmu_tx_t	*tx;
 	caddr_t		va;
 	int		err = 0;
 	uint64_t	mtime[2], ctime[2];
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
 	struct address_space *mapping;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	ASSERT(PageLocked(pp));
 
 	pgoff = page_offset(pp);	/* Page byte-offset in file */
 	offset = i_size_read(ip);	/* File length in bytes */
 	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
 	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
 
 	/* Page is beyond end of file */
 	if (pgoff >= offset) {
 		unlock_page(pp);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/* Truncate page length to end of file */
 	if (pgoff + pglen > offset)
 		pglen = offset - pgoff;
 
 #if 0
 	/*
 	 * FIXME: Allow mmap writes past its quota.  The correct fix
 	 * is to register a page_mkwrite() handler to count the page
 	 * against its quota when it is about to be dirtied.
 	 */
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		err = EDQUOT;
 	}
 #endif
 
 	/*
 	 * The ordering here is critical and must adhere to the following
 	 * rules in order to avoid deadlocking in either zfs_read() or
 	 * zfs_free_range() due to a lock inversion.
 	 *
 	 * 1) The page must be unlocked prior to acquiring the range lock.
 	 *    This is critical because zfs_read() calls find_lock_page()
 	 *    which may block on the page lock while holding the range lock.
 	 *
 	 * 2) Before setting or clearing write back on a page the range lock
 	 *    must be held in order to prevent a lock inversion with the
 	 *    zfs_free_range() function.
 	 *
 	 * This presents a problem because upon entering this function the
 	 * page lock is already held.  To safely acquire the range lock the
 	 * page lock must be dropped.  This creates a window where another
 	 * process could truncate, invalidate, dirty, or write out the page.
 	 *
 	 * Therefore, after successfully reacquiring the range and page locks
 	 * the current page state is checked.  In the common case everything
 	 * will be as is expected and it can be written out.  However, if
 	 * the page state has changed it must be handled accordingly.
 	 */
 	mapping = pp->mapping;
 	redirty_page_for_writepage(wbc, pp);
 	unlock_page(pp);
 
 	rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
 	lock_page(pp);
 
 	/* Page mapping changed or it was no longer dirty, we're done */
 	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 		unlock_page(pp);
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/* Another process started write block if required */
 	if (PageWriteback(pp)) {
 		unlock_page(pp);
 		zfs_range_unlock(rl);
 
 		if (wbc->sync_mode != WB_SYNC_NONE)
 			wait_on_page_writeback(pp);
 
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/* Clear the dirty flag the required locks are held */
 	if (!clear_page_dirty_for_io(pp)) {
 		unlock_page(pp);
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Counterpart for redirty_page_for_writepage() above.  This page
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
 	set_page_writeback(pp);
 	unlock_page(pp);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err != 0) {
 		if (err == ERESTART)
 			dmu_tx_wait(tx);
 
 		dmu_tx_abort(tx);
 		__set_page_dirty_nobuffers(pp);
 		ClearPageError(pp);
 		end_page_writeback(pp);
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 
 	va = kmap(pp);
 	ASSERT3U(pglen, <=, PAGE_SIZE);
 	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
 	kunmap(pp);
 
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/* Preserve the mtime and ctime provided by the inode */
 	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
 	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
 	zp->z_atime_dirty = 0;
 	zp->z_seq++;
 
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
 	    zfs_putpage_commit_cb, pp);
 	dmu_tx_commit(tx);
 
 	zfs_range_unlock(rl);
 
 	if (wbc->sync_mode != WB_SYNC_NONE) {
 		/*
 		 * Note that this is rarely called under writepages(), because
 		 * writepages() normally handles the entire commit for
 		 * performance reasons.
 		 */
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Update the system attributes when the inode has been dirtied.  For the
  * moment we only update the mode, atime, mtime, and ctime.
  */
 int
 zfs_dirty_inode(struct inode *ip, int flags)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	dmu_tx_t	*tx;
 	uint64_t	mode, atime[2], mtime[2], ctime[2];
 	sa_bulk_attr_t	bulk[4];
 	int		error = 0;
 	int		cnt = 0;
 
 	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 #ifdef I_DIRTY_TIME
 	/*
 	 * This is the lazytime semantic indroduced in Linux 4.0
 	 * This flag will only be called from update_time when lazytime is set.
 	 * (Note, I_DIRTY_SYNC will also set if not lazytime)
 	 * Fortunately mtime and ctime are managed within ZFS itself, so we
 	 * only need to dirty atime.
 	 */
 	if (flags == I_DIRTY_TIME) {
 		zp->z_atime_dirty = 1;
 		goto out;
 	}
 #endif
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	mutex_enter(&zp->z_lock);
 	zp->z_atime_dirty = 0;
 
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 
 	/* Preserve the mode, mtime and ctime provided by the inode */
 	ZFS_TIME_ENCODE(&ip->i_atime, atime);
 	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
 	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
 	mode = ip->i_mode;
 
 	zp->z_mode = mode;
 
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 	mutex_exit(&zp->z_lock);
 
 	dmu_tx_commit(tx);
 out:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*ARGSUSED*/
 void
 zfs_inactive(struct inode *ip)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	uint64_t atime[2];
 	int error;
 	int need_unlock = 0;
 
 	/* Only read lock if we haven't already write locked, e.g. rollback */
 	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
 		need_unlock = 1;
 		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	}
 	if (zp->z_sa_hdl == NULL) {
 		if (need_unlock)
 			rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
 			ZFS_TIME_ENCODE(&ip->i_atime, atime);
 			mutex_enter(&zp->z_lock);
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&atime, sizeof (atime), tx);
 			zp->z_atime_dirty = 0;
 			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	zfs_zinactive(zp);
 	if (need_unlock)
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 /*
  * Bounds-check the seek operation.
  *
  *	IN:	ip	- inode seeking within
  *		ooff	- old file offset
  *		noffp	- pointer to new file offset
  *		ct	- caller context
  *
  *	RETURN:	0 if success
  *		EINVAL if new offset invalid
  */
 /* ARGSUSED */
 int
 zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
 {
 	if (S_ISDIR(ip->i_mode))
 		return (0);
 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
 }
 
 /*
  * Fill pages with data from the disk.
  */
 static int
 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	objset_t *os;
 	struct page *cur_pp;
 	u_offset_t io_off, total;
 	size_t io_len;
 	loff_t i_size;
 	unsigned page_idx;
 	int err;
 
 	os = zfsvfs->z_os;
 	io_len = nr_pages << PAGE_SHIFT;
 	i_size = i_size_read(ip);
 	io_off = page_offset(pl[0]);
 
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
 	/*
 	 * Iterate over list of pages and read each page individually.
 	 */
 	page_idx = 0;
 	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
 		caddr_t va;
 
 		cur_pp = pl[page_idx++];
 		va = kmap(cur_pp);
 		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
 		    DMU_READ_PREFETCH);
 		kunmap(cur_pp);
 		if (err) {
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Uses zfs_fillpage to read data from the file and fill the pages.
  *
  *	IN:	ip	 - inode of file to get data from.
  *		pl	 - list of pages to read
  *		nr_pages - number of pages to read
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 int
 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
 {
 	znode_t	 *zp  = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int	 err;
 
 	if (pl == NULL)
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	err = zfs_fillpage(ip, pl, nr_pages);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Check ZFS specific permissions to memory map a section of a file.
  *
  *	IN:	ip	- inode of the file to mmap
  *		off	- file offset
  *		addrp	- start address in memory region
  *		len	- length of memory region
  *		vm_flags- address flags
  *
  *	RETURN:	0 if success
  *		error code if failure
  */
 /*ARGSUSED*/
 int
 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
     unsigned long vm_flags)
 {
 	znode_t  *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((vm_flags & VM_WRITE) && (zp->z_pflags &
 	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((vm_flags & (VM_READ | VM_EXEC)) &&
 	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
 	}
 
 	if (off < 0 || len > MAXOFFSET_T - off) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENXIO));
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * convoff - converts the given data (start, whence) to the
  * given whence.
  */
 int
 convoff(struct inode *ip, flock64_t *lckdat, int  whence, offset_t offset)
 {
 	vattr_t vap;
 	int error;
 
 	if ((lckdat->l_whence == 2) || (whence == 2)) {
 		if ((error = zfs_getattr(ip, &vap, 0, CRED())))
 			return (error);
 	}
 
 	switch (lckdat->l_whence) {
 	case 1:
 		lckdat->l_start += offset;
 		break;
 	case 2:
 		lckdat->l_start += vap.va_size;
 		/* FALLTHRU */
 	case 0:
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (lckdat->l_start < 0)
 		return (SET_ERROR(EINVAL));
 
 	switch (whence) {
 	case 1:
 		lckdat->l_start -= offset;
 		break;
 	case 2:
 		lckdat->l_start -= vap.va_size;
 		/* FALLTHRU */
 	case 0:
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	lckdat->l_whence = (short)whence;
 	return (0);
 }
 
 /*
  * Free or allocate space in a file.  Currently, this function only
  * supports the `F_FREESP' command.  However, this command is somewhat
  * misnamed, as its functionality includes the ability to allocate as
  * well as free space.
  *
  *	IN:	ip	- inode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset.
  *		cr	- credentials of caller [UNUSED].
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	ip - ctime|mtime updated
  */
 /* ARGSUSED */
 int
 zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	uint64_t	off, len;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (cmd != F_FREESP) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfs_is_readonly(zfsvfs)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	if ((error = convoff(ip, bfp, 0, offset))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (bfp->l_len < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Permissions aren't checked on Solaris because on this OS
 	 * zfs_space() can only be called with an opened file handle.
 	 * On Linux we can get here through truncate_range() which
 	 * operates directly on inodes, so we need to check access rights.
 	 */
 	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	off = bfp->l_start;
 	len = bfp->l_len; /* 0 means from off to end of file */
 
 	error = zfs_freesp(zp, off, len, flag, TRUE);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*ARGSUSED*/
 int
 zfs_fid(struct inode *ip, fid_t *fidp)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	uint32_t	gen;
 	uint64_t	gen64;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		size, i, error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &gen64, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	gen = (uint32_t)gen64;
 
 	size = SHORT_FID_LEN;
 
 	zfid = (zfid_short_t *)fidp;
 
 	zfid->zf_len = size;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	if (gen == 0)
 		gen = 1;
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*ARGSUSED*/
 int
 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*ARGSUSED*/
 int
 zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	zilog_t	*zilog = zfsvfs->z_log;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 #ifdef HAVE_UIO_ZEROCOPY
 /*
  * Tunable, both must be a power of 2.
  *
  * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
  * zcr_blksz_max: if set to less than the file block size, allow loaning out of
  *		an arcbuf for a partial block read
  */
 int zcr_blksz_min = (1 << 10);	/* 1K */
 int zcr_blksz_max = (1 << 17);	/* 128K */
 
 /*ARGSUSED*/
 static int
 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int max_blksz = zfsvfs->z_max_blksz;
 	uio_t *uio = &xuio->xu_uio;
 	ssize_t size = uio->uio_resid;
 	offset_t offset = uio->uio_loffset;
 	int blksz;
 	int fullblk, i;
 	arc_buf_t *abuf;
 	ssize_t maxsize;
 	int preamble, postamble;
 
 	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	switch (ioflag) {
 	case UIO_WRITE:
 		/*
 		 * Loan out an arc_buf for write if write size is bigger than
 		 * max_blksz, and the file's block size is also max_blksz.
 		 */
 		blksz = max_blksz;
 		if (size < blksz || zp->z_blksz != blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 		/*
 		 * Caller requests buffers for write before knowing where the
 		 * write offset might be (e.g. NFS TCP write).
 		 */
 		if (offset == -1) {
 			preamble = 0;
 		} else {
 			preamble = P2PHASE(offset, blksz);
 			if (preamble) {
 				preamble = blksz - preamble;
 				size -= preamble;
 			}
 		}
 
 		postamble = P2PHASE(size, blksz);
 		size -= postamble;
 
 		fullblk = size / blksz;
 		(void) dmu_xuio_init(xuio,
 		    (preamble != 0) + fullblk + (postamble != 0));
 
 		/*
 		 * Have to fix iov base/len for partial buffers.  They
 		 * currently represent full arc_buf's.
 		 */
 		if (preamble) {
 			/* data begins in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf,
 			    blksz - preamble, preamble);
 		}
 
 		for (i = 0; i < fullblk; i++) {
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
 		}
 
 		if (postamble) {
 			/* data ends in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
 		}
 		break;
 	case UIO_READ:
 		/*
 		 * Loan out an arc_buf for read if the read size is larger than
 		 * the current file block size.  Block alignment is not
 		 * considered.  Partial arc_buf will be loaned out for read.
 		 */
 		blksz = zp->z_blksz;
 		if (blksz < zcr_blksz_min)
 			blksz = zcr_blksz_min;
 		if (blksz > zcr_blksz_max)
 			blksz = zcr_blksz_max;
 		/* avoid potential complexity of dealing with it */
 		if (blksz > max_blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		maxsize = zp->z_size - uio->uio_loffset;
 		if (size > maxsize)
 			size = maxsize;
 
 		if (size < blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 		break;
 	default:
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	uio->uio_extflg = UIO_XUIO;
 	XUIO_XUZC_RW(xuio) = ioflag;
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
 {
 	int i;
 	arc_buf_t *abuf;
 	int ioflag = XUIO_XUZC_RW(xuio);
 
 	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
 
 	i = dmu_xuio_cnt(xuio);
 	while (i-- > 0) {
 		abuf = dmu_xuio_arcbuf(xuio, i);
 		/*
 		 * if abuf == NULL, it must be a write buffer
 		 * that has been returned in zfs_write().
 		 */
 		if (abuf)
 			dmu_return_arcbuf(abuf);
 		ASSERT(abuf || ioflag == UIO_WRITE);
 	}
 
 	dmu_xuio_fini(xuio);
 	return (0);
 }
 #endif /* HAVE_UIO_ZEROCOPY */
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zfs_open);
 EXPORT_SYMBOL(zfs_close);
 EXPORT_SYMBOL(zfs_read);
 EXPORT_SYMBOL(zfs_write);
 EXPORT_SYMBOL(zfs_access);
 EXPORT_SYMBOL(zfs_lookup);
 EXPORT_SYMBOL(zfs_create);
 EXPORT_SYMBOL(zfs_tmpfile);
 EXPORT_SYMBOL(zfs_remove);
 EXPORT_SYMBOL(zfs_mkdir);
 EXPORT_SYMBOL(zfs_rmdir);
 EXPORT_SYMBOL(zfs_readdir);
 EXPORT_SYMBOL(zfs_fsync);
 EXPORT_SYMBOL(zfs_getattr);
 EXPORT_SYMBOL(zfs_getattr_fast);
 EXPORT_SYMBOL(zfs_setattr);
 EXPORT_SYMBOL(zfs_rename);
 EXPORT_SYMBOL(zfs_symlink);
 EXPORT_SYMBOL(zfs_readlink);
 EXPORT_SYMBOL(zfs_link);
 EXPORT_SYMBOL(zfs_inactive);
 EXPORT_SYMBOL(zfs_space);
 EXPORT_SYMBOL(zfs_fid);
 EXPORT_SYMBOL(zfs_getsecattr);
 EXPORT_SYMBOL(zfs_setsecattr);
 EXPORT_SYMBOL(zfs_getpage);
 EXPORT_SYMBOL(zfs_putpage);
 EXPORT_SYMBOL(zfs_dirty_inode);
 EXPORT_SYMBOL(zfs_map);
 
 /* CSTYLED */
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 module_param(zfs_read_chunk_size, long, 0644);
 MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
 #endif
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 9f4312a5a28d..5300b3408a14 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1,2394 +1,3412 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab.h>
 #include <sys/trace_zil.h>
 #include <sys/abd.h>
 
 /*
- * The zfs intent log (ZIL) saves transaction records of system calls
- * that change the file system in memory with enough information
- * to be able to replay them. These are stored in memory until
- * either the DMU transaction group (txg) commits them to the stable pool
- * and they can be discarded, or they are flushed to the stable log
- * (also in the pool) due to a fsync, O_DSYNC or other synchronous
- * requirement. In the event of a panic or power fail then those log
- * records (transactions) are replayed.
+ * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
+ * calls that change the file system. Each itx has enough information to
+ * be able to replay them after a system crash, power loss, or
+ * equivalent failure mode. These are stored in memory until either:
  *
- * There is one ZIL per file system. Its on-disk (pool) format consists
- * of 3 parts:
+ *   1. they are committed to the pool by the DMU transaction group
+ *      (txg), at which point they can be discarded; or
+ *   2. they are committed to the on-disk ZIL for the dataset being
+ *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
+ *      requirement).
  *
- * 	- ZIL header
- * 	- ZIL blocks
- * 	- ZIL records
+ * In the event of a crash or power loss, the itxs contained by each
+ * dataset's on-disk ZIL will be replayed when that dataset is first
+ * instantianted (e.g. if the dataset is a normal fileystem, when it is
+ * first mounted).
  *
- * A log record holds a system call transaction. Log blocks can
- * hold many log records and the blocks are chained together.
- * Each ZIL block contains a block pointer (blkptr_t) to the next
- * ZIL block in the chain. The ZIL header points to the first
- * block in the chain. Note there is not a fixed place in the pool
- * to hold blocks. They are dynamically allocated and freed as
- * needed from the blocks available. Figure X shows the ZIL structure:
+ * As hinted at above, there is one ZIL per dataset (both the in-memory
+ * representation, and the on-disk representation). The on-disk format
+ * consists of 3 parts:
+ *
+ * 	- a single, per-dataset, ZIL header; which points to a chain of
+ * 	- zero or more ZIL blocks; each of which contains
+ * 	- zero or more ZIL records
+ *
+ * A ZIL record holds the information necessary to replay a single
+ * system call transaction. A ZIL block can hold many ZIL records, and
+ * the blocks are chained together, similarly to a singly linked list.
+ *
+ * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
+ * block in the chain, and the ZIL header points to the first block in
+ * the chain.
+ *
+ * Note, there is not a fixed place in the pool to hold these ZIL
+ * blocks; they are dynamically allocated and freed as needed from the
+ * blocks available on the pool, though they can be preferentially
+ * allocated from a dedicated "log" vdev.
  */
 
+/*
+ * This controls the amount of time that a ZIL block (lwb) will remain
+ * "open" when it isn't "full", and it has a thread waiting for it to be
+ * committed to stable storage. Please refer to the zil_commit_waiter()
+ * function (and the comments within it) for more details.
+ */
+int zfs_commit_timeout_pct = 5;
+
 /*
  * See zil.h for more information about these fields.
  */
 zil_stats_t zil_stats = {
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *zil_ksp;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
  * Tunable parameter for debugging or performance analysis.  Setting
  * zfs_nocacheflush will cause corruption on power loss if a volatile
  * out-of-order write cache is enabled.
  */
 int zfs_nocacheflush = 0;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 unsigned long zil_slog_bulk = 768 * 1024;
 
 static kmem_cache_t *zil_lwb_cache;
+static kmem_cache_t *zil_zcw_cache;
 
 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
 
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
     sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
 
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
 	return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	zil_bp_node_t *zn;
 	void *cookie = NULL;
 
 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
 		kmem_free(zn, sizeof (zil_bp_node_t));
 
 	avl_destroy(t);
 }
 
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	const dva_t *dva;
 	zil_bp_node_t *zn;
 	avl_index_t where;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	dva = BP_IDENTITY(bp);
 
 	if (avl_find(t, dva, &where) != NULL)
 		return (SET_ERROR(EEXIST));
 
 	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 	zn->zn_dva = *dva;
 	avl_insert(t, zn, where);
 
 	return (0);
 }
 
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *zc = &bp->blk_cksum;
 
 	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
 	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
 	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
 /*
  * Read a log block and make sure it's valid.
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
     blkptr_t *nbp, void *dst, char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	if (!decrypt)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
 	    &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = abuf->b_data;
 			char *lr = (char *)(zilc + 1);
 			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
 
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, len);
 				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			char *lr = abuf->b_data;
 			uint64_t size = BP_GET_LSIZE(bp);
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				ASSERT3U(zilc->zc_nused, <=,
 				    SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, zilc->zc_nused);
 				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
 /*
  * Read a TX_WRITE log data block.
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
 		if (wbuf != NULL)
 			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		return (0);
 	}
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/*
 	 * If we are not using the resulting data, we are just checking that
 	 * it hasn't been corrupted so we don't need to waste CPU time
 	 * decompressing and decrypting it.
 	 */
 	if (wbuf == NULL)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		if (wbuf != NULL)
 			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk;
 	char *lrbuf, *lrp;
 	int error = 0;
 
 	bzero(&next_blk, sizeof (blkptr_t));
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
 		char *end = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 
 		error = parse_blk_func(zilog, &blk, arg, txg);
 		if (error != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    lrbuf, &end);
 		if (error != 0)
 			break;
 
 		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
 			if (lr->lrc_seq > claim_lr_seq)
 				goto done;
 
 			error = parse_lr_func(zilog, lr, arg, txg);
 			if (error != 0)
 				goto done;
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 	}
 done:
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
 	    (decrypt && error == EIO));
 
 	zil_bp_tree_fini(zilog);
 	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
 
 static int
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
 	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
 	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 static int
 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	int error;
 
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (lr->lr_blkptr.blk_birth >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 /* ARGSUSED */
 static int
 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
 	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 static int
 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
 	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
 	    !BP_IS_HOLE(bp))
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
+static int
+zil_lwb_vdev_compare(const void *x1, const void *x2)
+{
+	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+
+	return (AVL_CMP(v1, v2));
+}
+
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
     boolean_t fastwrite)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
 	lwb->lwb_fastwrite = fastwrite;
 	lwb->lwb_slog = slog;
+	lwb->lwb_state = LWB_STATE_CLOSED;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
-	lwb->lwb_zio = NULL;
+	lwb->lwb_write_zio = NULL;
+	lwb->lwb_root_zio = NULL;
 	lwb->lwb_tx = NULL;
+	lwb->lwb_issued_timestamp = 0;
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 		lwb->lwb_nused = sizeof (zil_chain_t);
 		lwb->lwb_sz = BP_GET_LSIZE(bp);
 	} else {
 		lwb->lwb_nused = 0;
 		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	mutex_exit(&zilog->zl_lock);
 
+	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+	ASSERT(list_is_empty(&lwb->lwb_waiters));
+	ASSERT(list_is_empty(&lwb->lwb_itxs));
+
 	return (lwb);
 }
 
+static void
+zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
+{
+	ASSERT(MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+	ASSERT(list_is_empty(&lwb->lwb_waiters));
+
+	if (lwb->lwb_state == LWB_STATE_OPENED) {
+		avl_tree_t *t = &lwb->lwb_vdev_tree;
+		void *cookie = NULL;
+		zil_vdev_node_t *zv;
+		itx_t *itx;
+
+		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
+			kmem_free(zv, sizeof (*zv));
+
+		while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
+			if (itx->itx_callback != NULL)
+				itx->itx_callback(itx->itx_callback_data);
+			list_remove(&lwb->lwb_itxs, itx);
+			zil_itx_destroy(itx);
+		}
+
+		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+
+		zio_cancel(lwb->lwb_root_zio);
+		zio_cancel(lwb->lwb_write_zio);
+
+		lwb->lwb_root_zio = NULL;
+		lwb->lwb_write_zio = NULL;
+	} else {
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+	}
+
+	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+	ASSERT(list_is_empty(&lwb->lwb_itxs));
+	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
+	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+
+	/*
+	 * Clear the zilog's field to indicate this lwb is no longer
+	 * valid, and prevent use-after-free errors.
+	 */
+	if (zilog->zl_last_lwb_opened == lwb)
+		zilog->zl_last_lwb_opened = NULL;
+
+	kmem_cache_free(zil_lwb_cache, lwb);
+}
+
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  */
 void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
+	ASSERT(spa_writeable(zilog->zl_spa));
+
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, zilog);
+
+		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
 	}
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
  */
 boolean_t
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
  */
 boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Create an on-disk intent log.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t fastwrite = FALSE;
 	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
-		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free_zil(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
 		fastwrite = TRUE;
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
-	 * Allocate a log write buffer (lwb) for the first log block.
+	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block poiner into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 
 	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 
 	return (lwb);
 }
 
 /*
- * In one tx, free all log blocks and clear the log header.
- * If keep_first is set, then we're replaying a log with no content.
- * We want to keep the first block, however, so that the first
- * synchronous transaction doesn't require a txg_wait_synced()
- * in zil_create().  We don't need to txg_wait_synced() here either
- * when keep_first is set, because both zil_create() and zil_destroy()
- * will wait for any in-progress destroys to complete.
+ * In one tx, free all log blocks and clear the log header. If keep_first
+ * is set, then we're replaying a log with no content. We want to keep the
+ * first block, however, so that the first synchronous transaction doesn't
+ * require a txg_wait_synced() in zil_create(). We don't need to
+ * txg_wait_synced() here either when keep_first is set, because both
+ * zil_create() and zil_destroy() will wait for any in-progress destroys
+ * to complete.
  */
 void
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
-	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-			ASSERT(lwb->lwb_zio == NULL);
 			if (lwb->lwb_fastwrite)
 				metaslab_fastwrite_unmark(zilog->zl_spa,
 				    &lwb->lwb_blk);
+
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
-			kmem_cache_free(zil_lwb_cache, lwb);
+			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+			zil_free_lwb(zilog, lwb);
 		}
 	} else if (!keep_first) {
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	(void) zil_parse(zilog, zil_free_log_block,
 	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
 }
 
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	uint64_t first_txg = dmu_tx_get_txg(tx);
 	zilog_t *zilog;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
 		if (!BP_IS_HOLE(&zh->zh_log))
 			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
 		BP_ZERO(&zh->zh_log);
 		if (os->os_encrypted)
 			os->os_next_write_raw = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, B_FALSE, FTAG);
 		return (0);
 	}
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg, B_FALSE);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, B_FALSE, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
 /* ARGSUSED */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	/*
 	 * Check the first block and determine if it's on a log device
 	 * which may have been removed or faulted prior to loading this
 	 * pool.  If so, there's no point in checking the rest of the log
 	 * as its content should have already been synced to the pool.
 	 */
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
 	    B_FALSE);
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
-static int
-zil_vdev_compare(const void *x1, const void *x2)
+/*
+ * When an itx is "skipped", this function is used to properly mark the
+ * waiter as "done, and signal any thread(s) waiting on it. An itx can
+ * be skipped (and not committed to an lwb) for a variety of reasons,
+ * one of them being that the itx was committed via spa_sync(), prior to
+ * it being committed to an lwb; this can happen if a thread calling
+ * zil_commit() is racing with spa_sync().
+ */
+static void
+zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
-	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
-	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+	mutex_enter(&zcw->zcw_lock);
+	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+	zcw->zcw_done = B_TRUE;
+	cv_broadcast(&zcw->zcw_cv);
+	mutex_exit(&zcw->zcw_lock);
+}
 
-	return (AVL_CMP(v1, v2));
+/*
+ * This function is used when the given waiter is to be linked into an
+ * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
+ * At this point, the waiter will no longer be referenced by the itx,
+ * and instead, will be referenced by the lwb.
+ */
+static void
+zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
+{
+	mutex_enter(&zcw->zcw_lock);
+	ASSERT(!list_link_active(&zcw->zcw_node));
+	ASSERT3P(zcw->zcw_lwb, ==, NULL);
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
+	    lwb->lwb_state == LWB_STATE_ISSUED);
+
+	list_insert_tail(&lwb->lwb_waiters, zcw);
+	zcw->zcw_lwb = lwb;
+	mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when zio_alloc_zil() fails to allocate a ZIL
+ * block, and the given waiter must be linked to the "nolwb waiters"
+ * list inside of zil_process_commit_list().
+ */
+static void
+zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
+{
+	mutex_enter(&zcw->zcw_lock);
+	ASSERT(!list_link_active(&zcw->zcw_node));
+	ASSERT3P(zcw->zcw_lwb, ==, NULL);
+	list_insert_tail(nolwb, zcw);
+	mutex_exit(&zcw->zcw_lock);
 }
 
 void
-zil_add_block(zilog_t *zilog, const blkptr_t *bp)
+zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
-	avl_tree_t *t = &zilog->zl_vdev_tree;
+	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	avl_index_t where;
 	zil_vdev_node_t *zv, zvsearch;
 	int ndvas = BP_GET_NDVAS(bp);
 	int i;
 
 	if (zfs_nocacheflush)
 		return;
 
-	ASSERT(zilog->zl_writer);
-
-	/*
-	 * Even though we're zl_writer, we still need a lock because the
-	 * zl_get_data() callbacks may have dmu_sync() done callbacks
-	 * that will run concurrently.
-	 */
-	mutex_enter(&zilog->zl_vdev_lock);
+	mutex_enter(&lwb->lwb_vdev_lock);
 	for (i = 0; i < ndvas; i++) {
 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (avl_find(t, &zvsearch, &where) == NULL) {
 			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
 			zv->zv_vdev = zvsearch.zv_vdev;
 			avl_insert(t, zv, where);
 		}
 	}
-	mutex_exit(&zilog->zl_vdev_lock);
+	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
+void
+zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
+{
+	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+}
+
+/*
+ * This function is a called after all VDEVs associated with a given lwb
+ * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
+ * as the lwb write completes, if "zfs_nocacheflush" is set.
+ *
+ * The intention is for this function to be called as soon as the
+ * contents of an lwb are considered "stable" on disk, and will survive
+ * any sudden loss of power. At this point, any threads waiting for the
+ * lwb to reach this state are signalled, and the "waiter" structures
+ * are marked "done".
+ */
 static void
-zil_flush_vdevs(zilog_t *zilog)
+zil_lwb_flush_vdevs_done(zio_t *zio)
 {
-	spa_t *spa = zilog->zl_spa;
-	avl_tree_t *t = &zilog->zl_vdev_tree;
-	void *cookie = NULL;
-	zil_vdev_node_t *zv;
-	zio_t *zio;
+	lwb_t *lwb = zio->io_private;
+	zilog_t *zilog = lwb->lwb_zilog;
+	dmu_tx_t *tx = lwb->lwb_tx;
+	zil_commit_waiter_t *zcw;
+	itx_t *itx;
+
+	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
+
+	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 
-	ASSERT(zilog->zl_writer);
+	mutex_enter(&zilog->zl_lock);
 
 	/*
-	 * We don't need zl_vdev_lock here because we're the zl_writer,
-	 * and all zl_get_data() callbacks are done.
+	 * Ensure the lwb buffer pointer is cleared before releasing the
+	 * txg. If we have had an allocation failure and the txg is
+	 * waiting to sync then we want zil_sync() to remove the lwb so
+	 * that it's not picked up as the next new one in
+	 * zil_process_commit_list(). zil_sync() will only remove the
+	 * lwb if lwb_buf is null.
 	 */
-	if (avl_numnodes(t) == 0)
-		return;
+	lwb->lwb_buf = NULL;
+	lwb->lwb_tx = NULL;
 
-	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
+	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+	lwb->lwb_root_zio = NULL;
+	lwb->lwb_state = LWB_STATE_DONE;
 
-	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
-		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL)
-			zio_flush(zio, vd);
-		kmem_free(zv, sizeof (*zv));
+	if (zilog->zl_last_lwb_opened == lwb) {
+		/*
+		 * Remember the highest committed log sequence number
+		 * for ztest. We only update this value when all the log
+		 * writes succeeded, because ztest wants to ASSERT that
+		 * it got the whole log chain.
+		 */
+		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
+	}
+
+	while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
+		list_remove(&lwb->lwb_itxs, itx);
+		zil_itx_destroy(itx);
+	}
+
+	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+		mutex_enter(&zcw->zcw_lock);
+
+		ASSERT(list_link_active(&zcw->zcw_node));
+		list_remove(&lwb->lwb_waiters, zcw);
+
+		ASSERT3P(zcw->zcw_lwb, ==, lwb);
+		zcw->zcw_lwb = NULL;
+
+		zcw->zcw_zio_error = zio->io_error;
+
+		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+		zcw->zcw_done = B_TRUE;
+		cv_broadcast(&zcw->zcw_cv);
+
+		mutex_exit(&zcw->zcw_lock);
 	}
 
+	mutex_exit(&zilog->zl_lock);
+
 	/*
-	 * Wait for all the flushes to complete.  Not all devices actually
-	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
+	 * Now that we've written this log block, we have a stable pointer
+	 * to the next block in the chain, so it's OK to let the txg in
+	 * which we allocated the next block sync.
 	 */
-	(void) zio_wait(zio);
-
-	spa_config_exit(spa, SCL_STATE, FTAG);
+	dmu_tx_commit(tx);
 }
 
 /*
- * Function called when a log block write completes
+ * This is called when an lwb write completes. This means, this specific
+ * lwb was written to disk, and all dependent lwb have also been
+ * written to disk.
+ *
+ * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
+ * the VDEVs involved in writing out this specific lwb. The lwb will be
+ * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
+ * zio completion callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
+	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
-	dmu_tx_t *tx = lwb->lwb_tx;
+	avl_tree_t *t = &lwb->lwb_vdev_tree;
+	void *cookie = NULL;
+	zil_vdev_node_t *zv;
+
+	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
 	ASSERT(!BP_IS_GANG(zio->io_bp));
 	ASSERT(!BP_IS_HOLE(zio->io_bp));
 	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
-	/*
-	 * Ensure the lwb buffer pointer is cleared before releasing
-	 * the txg. If we have had an allocation failure and
-	 * the txg is waiting to sync then we want want zil_sync()
-	 * to remove the lwb so that it's not picked up as the next new
-	 * one in zil_commit_writer(). zil_sync() will only remove
-	 * the lwb if lwb_buf is null.
-	 */
 	abd_put(zio->io_abd);
-	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+
 	mutex_enter(&zilog->zl_lock);
-	lwb->lwb_zio = NULL;
+	lwb->lwb_write_zio = NULL;
 	lwb->lwb_fastwrite = FALSE;
-	lwb->lwb_buf = NULL;
-	lwb->lwb_tx = NULL;
 	mutex_exit(&zilog->zl_lock);
 
+	if (avl_numnodes(t) == 0)
+		return;
+
 	/*
-	 * Now that we've written this log block, we have a stable pointer
-	 * to the next block in the chain, so it's OK to let the txg in
-	 * which we allocated the next block sync.
+	 * If there was an IO error, we're not going to call zio_flush()
+	 * on these vdevs, so we simply empty the tree and free the
+	 * nodes. We avoid calling zio_flush() since there isn't any
+	 * good reason for doing so, after the lwb block failed to be
+	 * written out.
 	 */
-	dmu_tx_commit(tx);
+	if (zio->io_error != 0) {
+		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
+			kmem_free(zv, sizeof (*zv));
+		return;
+	}
+
+	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
+		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
+		if (vd != NULL)
+			zio_flush(lwb->lwb_root_zio, vd);
+		kmem_free(zv, sizeof (*zv));
+	}
 }
 
 /*
- * Initialize the io for a log block.
+ * This function's purpose is to "open" an lwb such that it is ready to
+ * accept new itxs being committed to it. To do this, the lwb's zio
+ * structures are created, and linked to the lwb. This function is
+ * idempotent; if the passed in lwb has already been opened, this
+ * function is essentially a no-op.
  */
 static void
-zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	zbookmark_phys_t zb;
 	zio_priority_t prio;
 
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
+	ASSERT3P(lwb, !=, NULL);
+	EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
+	EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
+
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
-	if (zilog->zl_root_zio == NULL) {
-		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
-		    ZIO_FLAG_CANFAIL);
-	}
-
 	/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
 	mutex_enter(&zilog->zl_lock);
-	if (lwb->lwb_zio == NULL) {
+	if (lwb->lwb_root_zio == NULL) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
+
 		if (!lwb->lwb_fastwrite) {
 			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
 			lwb->lwb_fastwrite = 1;
 		}
+
 		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
-		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-		    0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, prio,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+
+		lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+		    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+
+		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+		    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+		    BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+		    prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
 		    ZIO_FLAG_FASTWRITE, &zb);
+		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+
+		lwb->lwb_state = LWB_STATE_OPENED;
+
+		/*
+		 * The zilog's "zl_last_lwb_opened" field is used to
+		 * build the lwb/zio dependency chain, which is used to
+		 * preserve the ordering of lwb completions that is
+		 * required by the semantics of the ZIL. Each new lwb
+		 * zio becomes a parent of the "previous" lwb zio, such
+		 * that the new lwb's zio cannot complete until the
+		 * "previous" lwb's zio completes.
+		 *
+		 * This is required by the semantics of zil_commit();
+		 * the commit waiters attached to the lwbs will be woken
+		 * in the lwb zio's completion callback, so this zio
+		 * dependency graph ensures the waiters are woken in the
+		 * correct order (the same order the lwbs were created).
+		 */
+		lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+		if (last_lwb_opened != NULL &&
+		    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
+			ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+			    last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+			ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+			zio_add_child(lwb->lwb_root_zio,
+			    last_lwb_opened->lwb_root_zio);
+		}
+		zilog->zl_last_lwb_opened = lwb;
 	}
 	mutex_exit(&zilog->zl_lock);
+
+	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 }
 
 /*
  * Define a limited set of intent log block sizes.
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
  * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,		/* non TX_WRITE */
     8192+4096,		/* data base */
     32*1024 + 4096, 	/* NFS writes */
     UINT64_MAX
 };
 
 /*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
 static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
 	spa_t *spa = zilog->zl_spa;
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
 	boolean_t slog;
 
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
+	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 		bp = &zilc->zc_next_blk;
 	} else {
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
 		bp = &zilc->zc_next_blk;
 	}
 
 	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
 
 	/*
 	 * Allocate the next block and save its address in this block
 	 * before writing it in order to establish the log chain.
 	 * Note that if the allocation of nlwb synced before we wrote
 	 * the block that points at it (lwb), we'd leak it if we crashed.
 	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
 	 * We dirty the dataset to ensure that zil_sync() will be called
 	 * to clean up in the event of allocation failure or I/O failure.
 	 */
+
 	tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * Since we are not going to create any new dirty data and we can even
 	 * help with clearing the existing dirty data, we should not be subject
 	 * to the dirty data based delays.
 	 * We (ab)use TXG_WAITED to bypass the delay mechanism.
 	 * One side effect from using TXG_WAITED is that dmu_tx_assign() can
 	 * fail if the pool is suspended.  Those are dramatic circumstances,
 	 * so we return NULL to signal that the normal ZIL processing is not
 	 * possible and txg_wait_synced() should be used to ensure that the data
 	 * is on disk.
 	 */
 	error = dmu_tx_assign(tx, TXG_WAITED);
 	if (error != 0) {
-		ASSERT3S(error, ==, EIO);
+		ASSERT(error == EIO || error == ERESTART);
 		dmu_tx_abort(tx);
 		return (NULL);
 	}
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	lwb->lwb_tx = tx;
 
 	/*
 	 * Log blocks are pre-allocated. Here we select the size of the next
 	 * block, based on size used in the last block.
 	 * - first find the smallest bucket that will fit the block from a
 	 *   limited set of block sizes. This is because it's faster to write
 	 *   blocks allocated from the same metaslab as they are adjacent or
 	 *   close.
 	 * - next find the maximum from the new suggested size and an array of
 	 *   previous sizes. This lessens a picket fence effect of wrongly
 	 *   guesssing the size if we have a stream of say 2k, 64k, 2k, 64k
 	 *   requests.
 	 *
 	 * Note we only write what is used, but we can't just allocate
 	 * the maximum block size because we can exhaust the available
 	 * pool log space.
 	 */
 	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
 	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
 		continue;
 	zil_blksz = zil_block_buckets[i];
 	if (zil_blksz == UINT64_MAX)
 		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
 	error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
 	if (slog) {
 		ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
 	} else {
 		ZIL_STAT_BUMP(zil_itx_metaslab_normal_count);
 		ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused);
 	}
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		/*
-		 * Allocate a new log write buffer (lwb).
+		 * Allocate a new log write block (lwb).
 		 */
 		nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
-
-		/* Record the block for later vdev flushing */
-		zil_add_block(zilog, &lwb->lwb_blk);
 	}
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		/* For Slim ZIL only write what is used. */
 		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
 		ASSERT3U(wsz, <=, lwb->lwb_sz);
-		zio_shrink(lwb->lwb_zio, wsz);
+		zio_shrink(lwb->lwb_write_zio, wsz);
 
 	} else {
 		wsz = lwb->lwb_sz;
 	}
 
 	zilc->zc_pad = 0;
 	zilc->zc_nused = lwb->lwb_nused;
 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
 	/*
 	 * clear unused data for security
 	 */
 	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
 
-	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
+	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
+	zil_lwb_add_block(lwb, &lwb->lwb_blk);
+	lwb->lwb_issued_timestamp = gethrtime();
+	lwb->lwb_state = LWB_STATE_ISSUED;
+
+	zio_nowait(lwb->lwb_root_zio);
+	zio_nowait(lwb->lwb_write_zio);
 
 	/*
 	 * If there was an allocation failure then nlwb will be null which
 	 * forces a txg_wait_synced().
 	 */
 	return (nlwb);
 }
 
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
 	lr_t *lrcb, *lrc;
 	lr_write_t *lrwb, *lrw;
 	char *lr_buf;
 	uint64_t dlen, dnow, lwb_sp, reclen, txg;
 
-	if (lwb == NULL)
-		return (NULL);
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(lwb->lwb_buf, !=, NULL);
+
+	zil_lwb_write_open(zilog, lwb);
 
-	ASSERT(lwb->lwb_buf != NULL);
+	lrc = &itx->itx_lr;
+	lrw = (lr_write_t *)lrc;
+
+	/*
+	 * A commit itx doesn't represent any on-disk state; instead
+	 * it's simply used as a place holder on the commit list, and
+	 * provides a mechanism for attaching a "commit waiter" onto the
+	 * correct lwb (such that the waiter can be signalled upon
+	 * completion of that lwb). Thus, we don't process this itx's
+	 * log record if it's a commit itx (these itx's don't have log
+	 * records), and instead link the itx's waiter onto the lwb's
+	 * list of waiters.
+	 *
+	 * For more details, see the comment above zil_commit().
+	 */
+	if (lrc->lrc_txtype == TX_COMMIT) {
+		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
+		itx->itx_private = NULL;
+		return (lwb);
+	}
 
-	lrc = &itx->itx_lr;		/* Common log record inside itx. */
-	lrw = (lr_write_t *)lrc;	/* Write log record inside itx. */
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 	} else {
 		dlen = 0;
 	}
 	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
 	txg = lrc->lrc_txg;
 
-	zil_lwb_write_init(zilog, lwb);
+	ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
 	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
-		lwb = zil_lwb_write_start(zilog, lwb);
+		lwb = zil_lwb_write_issue(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
-		zil_lwb_write_init(zilog, lwb);
+		zil_lwb_write_open(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
 		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
 		ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 	}
 
 	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
 	lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
 	lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lrc->lrc_txtype == TX_WRITE) {
 		if (txg > spa_freeze_txg(zilog->zl_spa))
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 		if (itx->itx_wr_state == WR_COPIED) {
 			ZIL_STAT_BUMP(zil_itx_copied_count);
 			ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length);
 		} else {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
 				lrcb->lrc_reclen += dnow;
 				if (lrwb->lr_length > dnow)
 					lrwb->lr_length = dnow;
 				lrw->lr_offset += dnow;
 				lrw->lr_length -= dnow;
 				ZIL_STAT_BUMP(zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zil_itx_needcopy_bytes,
 				    lrw->lr_length);
 			} else {
-				ASSERT(itx->itx_wr_state == WR_INDIRECT);
+				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				dbuf = NULL;
 				ZIL_STAT_BUMP(zil_itx_indirect_count);
 				ZIL_STAT_INCR(zil_itx_indirect_bytes,
 				    lrw->lr_length);
 			}
-			error = zilog->zl_get_data(
-			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
+
+			/*
+			 * We pass in the "lwb_write_zio" rather than
+			 * "lwb_root_zio" so that the "lwb_write_zio"
+			 * becomes the parent of any zio's created by
+			 * the "zl_get_data" callback. The vdevs are
+			 * flushed after the "lwb_write_zio" completes,
+			 * so we want to make sure that completion
+			 * callback waits for these additional zio's,
+			 * such that the vdevs used by those zio's will
+			 * be included in the lwb's vdev tree, and those
+			 * vdevs will be properly flushed. If we passed
+			 * in "lwb_root_zio" here, then these additional
+			 * vdevs may not be flushed; e.g. if these zio's
+			 * completed after "lwb_write_zio" completed.
+			 */
+			error = zilog->zl_get_data(itx->itx_private,
+			    lrwb, dbuf, lwb, lwb->lwb_write_zio);
+
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
 			}
 			if (error != 0) {
 				ASSERT(error == ENOENT || error == EEXIST ||
 				    error == EALREADY);
 				return (lwb);
 			}
 		}
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lrcb->lrc_seq = ++zilog->zl_lr_seq;
 	lwb->lwb_nused += reclen + dnow;
-	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+
+	zil_lwb_add_txg(lwb, txg);
+
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	dlen -= dnow;
 	if (dlen > 0) {
 		zilog->zl_cur_used += reclen;
 		goto cont;
 	}
 
 	return (lwb);
 }
 
 itx_t *
 zil_itx_create(uint64_t txtype, size_t lrsize)
 {
 	size_t itxsize;
 	itx_t *itx;
 
 	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;
 
 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	itx->itx_size = itxsize;
 
 	return (itx);
 }
 
 void
 zil_itx_destroy(itx_t *itx)
 {
+	IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
+	IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+
+	if (itx->itx_callback != NULL)
+		itx->itx_callback(itx->itx_callback_data);
+
 	zio_data_buf_free(itx, itx->itx_size);
 }
 
 /*
  * Free up the sync and async itxs. The itxs_t has already been detached
  * so no locks are needed.
  */
 static void
 zil_itxg_clean(itxs_t *itxs)
 {
 	itx_t *itx;
 	list_t *list;
 	avl_tree_t *t;
 	void *cookie;
 	itx_async_node_t *ian;
 
 	list = &itxs->i_sync_list;
 	while ((itx = list_head(list)) != NULL) {
-		if (itx->itx_callback != NULL)
-			itx->itx_callback(itx->itx_callback_data);
+		/*
+		 * In the general case, commit itxs will not be found
+		 * here, as they'll be committed to an lwb via
+		 * zil_lwb_commit(), and free'd in that function. Having
+		 * said that, it is still possible for commit itxs to be
+		 * found here, due to the following race:
+		 *
+		 *	- a thread calls zil_commit() which assigns the
+		 *	  commit itx to a per-txg i_sync_list
+		 *	- zil_itxg_clean() is called (e.g. via spa_sync())
+		 *	  while the waiter is still on the i_sync_list
+		 *
+		 * There's nothing to prevent syncing the txg while the
+		 * waiter is on the i_sync_list. This normally doesn't
+		 * happen because spa_sync() is slower than zil_commit(),
+		 * but if zil_commit() calls txg_wait_synced() (e.g.
+		 * because zil_create() or zil_commit_writer_stall() is
+		 * called) we will hit this case.
+		 */
+		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
+			zil_commit_waiter_skip(itx->itx_private);
+
 		list_remove(list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	cookie = NULL;
 	t = &itxs->i_async_tree;
 	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 		list = &ian->ia_list;
 		while ((itx = list_head(list)) != NULL) {
-			if (itx->itx_callback != NULL)
-				itx->itx_callback(itx->itx_callback_data);
 			list_remove(list, itx);
+			/* commit itxs should never be on the async lists. */
+			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(list);
 		kmem_free(ian, sizeof (itx_async_node_t));
 	}
 	avl_destroy(t);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (AVL_CMP(o1, o2));
 }
 
 /*
  * Remove all async itx with the given oid.
  */
 static void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t otxg, txg;
 	itx_async_node_t *ian;
 	avl_tree_t *t;
 	avl_index_t where;
 	list_t clean_list;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * Locate the object node and append its list.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
 		ian = avl_find(t, &oid, &where);
 		if (ian != NULL)
 			list_move_tail(&clean_list, &ian->ia_list);
 		mutex_exit(&itxg->itxg_lock);
 	}
 	while ((itx = list_head(&clean_list)) != NULL) {
-		if (itx->itx_callback != NULL)
-			itx->itx_callback(itx->itx_callback_data);
 		list_remove(&clean_list, itx);
+		/* commit itxs should never be on the async lists. */
+		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&clean_list);
 }
 
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Object ids can be re-instantiated in the next txg so
 	 * remove any async transactions to avoid future leaks.
 	 * This can happen if a fsync occurs on the re-instantiated
 	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
 	 * the new file data and flushes a write record for the old object.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
 		zil_remove_async(zilog, itx->itx_oid);
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
 		    KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian;
 		avl_index_t where;
 
 		ian = avl_find(t, &foid, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t),
 			    KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
-	zilog_dirty(zilog, txg);
+
+	/*
+	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
+	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
+	 * need to be careful to always dirty the ZIL using the "real"
+	 * TXG (not itxg_txg) even when the SPA is frozen.
+	 */
+	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after we
  * have written out the uberblocks (i.e. txg has been comitted) so that
  * don't inadvertently clean out in-memory log records that would be required
  * by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *clean_me;
 
+	ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
 	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT3U(itxg->itxg_txg, !=, 0);
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 	/*
 	 * Preferably start a task queue to free up the old itxs but
 	 * if taskq_dispatch can't allocate resources to do that then
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
 	if (id == TASKQID_INVALID)
 		zil_itxg_clean(clean_me);
 }
 
 /*
- * Get the list of itxs to commit into zl_itx_commit_list.
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
  */
 static void
 zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
+
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If we're adding itx records to the zl_itx_commit_list,
 		 * then the zil better be dirty in this "txg". We can assert
 		 * that here since we're holding the itxg_lock which will
 		 * prevent spa_sync from cleaning it. Once we add the itxs
 		 * to the zl_itx_commit_list we must commit it to disk even
 		 * if it's unnecessary (i.e. the txg was synced).
 		 */
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
 
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  */
 static void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t otxg, txg;
 	itx_async_node_t *ian;
 	avl_tree_t *t;
 	avl_index_t where;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If a foid is specified then find that node and append its
 		 * list. Otherwise walk the tree appending all the lists
 		 * to the sync list. We add to the end rather than the
 		 * beginning to ensure the create has happened.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
 		if (foid != 0) {
 			ian = avl_find(t, &foid, &where);
 			if (ian != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 			}
 		} else {
 			void *cookie = NULL;
 
 			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 				list_destroy(&ian->ia_list);
 				kmem_free(ian, sizeof (itx_async_node_t));
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
+/*
+ * This function will prune commit itxs that are at the head of the
+ * commit list (it won't prune past the first non-commit itx), and
+ * either: a) attach them to the last lwb that's still pending
+ * completion, or b) skip them altogether.
+ *
+ * This is used as a performance optimization to prevent commit itxs
+ * from generating new lwbs when it's unnecessary to do so.
+ */
 static void
-zil_commit_writer(zilog_t *zilog)
+zil_prune_commit_list(zilog_t *zilog)
 {
-	uint64_t txg;
 	itx_t *itx;
-	lwb_t *lwb;
-	spa_t *spa = zilog->zl_spa;
-	int error = 0;
 
-	ASSERT(zilog->zl_root_zio == NULL);
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
 
-	mutex_exit(&zilog->zl_lock);
+	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+		lr_t *lrc = &itx->itx_lr;
+		if (lrc->lrc_txtype != TX_COMMIT)
+			break;
 
-	zil_get_commit_list(zilog);
+		mutex_enter(&zilog->zl_lock);
+
+		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
+		if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
+			/*
+			 * All of the itxs this waiter was waiting on
+			 * must have already completed (or there were
+			 * never any itx's for it to wait on), so it's
+			 * safe to skip this waiter and mark it done.
+			 */
+			zil_commit_waiter_skip(itx->itx_private);
+		} else {
+			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+			itx->itx_private = NULL;
+		}
+
+		mutex_exit(&zilog->zl_lock);
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		zil_itx_destroy(itx);
+	}
+
+	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+	/*
+	 * When zio_alloc_zil() fails to allocate the next lwb block on
+	 * disk, we must call txg_wait_synced() to ensure all of the
+	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+	 * to zil_process_commit_list()) will have to call zil_create(),
+	 * and start a new ZIL chain.
+	 *
+	 * Since zil_alloc_zil() failed, the lwb that was previously
+	 * issued does not have a pointer to the "next" lwb on disk.
+	 * Thus, if another ZIL writer thread was to allocate the "next"
+	 * on-disk lwb, that block could be leaked in the event of a
+	 * crash (because the previous lwb on-disk would not point to
+	 * it).
+	 *
+	 * We must hold the zilog's zl_writer_lock while we do this, to
+	 * ensure no new threads enter zil_process_commit_list() until
+	 * all lwb's in the zl_lwb_list have been synced and freed
+	 * (which is achieved via the txg_wait_synced() call).
+	 */
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+	spa_t *spa = zilog->zl_spa;
+	list_t nolwb_itxs;
+	list_t nolwb_waiters;
+	lwb_t *lwb;
+	itx_t *itx;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
-	if (list_head(&zilog->zl_itx_commit_list) == NULL) {
-		mutex_enter(&zilog->zl_lock);
+	if (list_head(&zilog->zl_itx_commit_list) == NULL)
 		return;
-	}
 
-	if (zilog->zl_suspend) {
-		lwb = NULL;
+	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+
+	lwb = list_tail(&zilog->zl_lwb_list);
+	if (lwb == NULL) {
+		lwb = zil_create(zilog);
 	} else {
-		lwb = list_tail(&zilog->zl_lwb_list);
-		if (lwb == NULL)
-			lwb = zil_create(zilog);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
 	}
 
-	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-	for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL;
-	    itx = list_next(&zilog->zl_itx_commit_list, itx)) {
-		txg = itx->itx_lr.lrc_txg;
+	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+		lr_t *lrc = &itx->itx_lr;
+		uint64_t txg = lrc->lrc_txg;
+
 		ASSERT3U(txg, !=, 0);
 
+		if (lrc->lrc_txtype == TX_COMMIT) {
+			DTRACE_PROBE2(zil__process__commit__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		} else {
+			DTRACE_PROBE2(zil__process__normal__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		}
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+
 		/*
 		 * This is inherently racy and may result in us writing
-		 * out a log block for a txg that was just synced. This is
-		 * ok since we'll end cleaning up that log block the next
-		 * time we call zil_sync().
+		 * out a log block for a txg that was just synced. This
+		 * is ok since we'll end cleaning up that log block the
+		 * next time we call zil_sync().
 		 */
-		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
-			lwb = zil_lwb_commit(zilog, itx, lwb);
+		boolean_t synced = txg <= spa_last_synced_txg(spa);
+		boolean_t frozen = txg > spa_freeze_txg(spa);
+
+		if (!synced || frozen) {
+			if (lwb != NULL) {
+				lwb = zil_lwb_commit(zilog, itx, lwb);
+
+				if (lwb == NULL)
+					list_insert_tail(&nolwb_itxs, itx);
+				else
+					list_insert_tail(&lwb->lwb_itxs, itx);
+			} else {
+				if (lrc->lrc_txtype == TX_COMMIT) {
+					zil_commit_waiter_link_nolwb(
+					    itx->itx_private, &nolwb_waiters);
+				}
+
+				list_insert_tail(&nolwb_itxs, itx);
+			}
+		} else {
+			/*
+			 * If this is a commit itx, then there will be a
+			 * thread that is either: already waiting for
+			 * it, or soon will be waiting.
+			 *
+			 * This itx has already been committed to disk
+			 * via spa_sync() so we don't bother committing
+			 * it to an lwb. As a result, we cannot use the
+			 * lwb zio callback to signal the waiter and
+			 * mark it as done, so we must do that here.
+			 */
+			if (lrc->lrc_txtype == TX_COMMIT)
+				zil_commit_waiter_skip(itx->itx_private);
+
+			zil_itx_destroy(itx);
+		}
 	}
-	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
 
-	/* write the last block out */
-	if (lwb != NULL && lwb->lwb_zio != NULL)
-		lwb = zil_lwb_write_start(zilog, lwb);
+	if (lwb == NULL) {
+		/*
+		 * This indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this happens, we must stall
+		 * the ZIL write pipeline; see the comment within
+		 * zil_commit_writer_stall() for more details.
+		 */
+		zil_commit_writer_stall(zilog);
 
-	zilog->zl_cur_used = 0;
+		/*
+		 * Additionally, we have to signal and mark the "nolwb"
+		 * waiters as "done" here, since without an lwb, we
+		 * can't do this via zil_lwb_flush_vdevs_done() like
+		 * normal.
+		 */
+		zil_commit_waiter_t *zcw;
+		while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+			zil_commit_waiter_skip(zcw);
+			list_remove(&nolwb_waiters, zcw);
+		}
+
+		/*
+		 * And finally, we have to destroy the itx's that
+		 * couldn't be committed to an lwb; this will also call
+		 * the itx's callback if one exists for the itx.
+		 */
+		while ((itx = list_head(&nolwb_itxs)) != NULL) {
+			list_remove(&nolwb_itxs, itx);
+			zil_itx_destroy(itx);
+		}
+	} else {
+		ASSERT(list_is_empty(&nolwb_waiters));
+		ASSERT3P(lwb, !=, NULL);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+
+		/*
+		 * At this point, the ZIL block pointed at by the "lwb"
+		 * variable is in one of the following states: "closed"
+		 * or "open".
+		 *
+		 * If its "closed", then no itxs have been committed to
+		 * it, so there's no point in issueing its zio (i.e.
+		 * it's "empty").
+		 *
+		 * If its "open" state, then it contains one or more
+		 * itxs that eventually need to be committed to stable
+		 * storage. In this case we intentionally do not issue
+		 * the lwb's zio to disk yet, and instead rely on one of
+		 * the following two mechanisms for issuing the zio:
+		 *
+		 * 1. Ideally, there will be more ZIL activity occuring
+		 * on the system, such that this function will be
+		 * immeidately called again (not necessarily by the same
+		 * thread) and this lwb's zio will be issued via
+		 * zil_lwb_commit(). This way, the lwb is guaranteed to
+		 * be "full" when it is issued to disk, and we'll make
+		 * use of the lwb's size the best we can.
+		 *
+		 * 2. If there isn't sufficient ZIL activity occuring on
+		 * the system, such that this lwb's zio isn't issued via
+		 * zil_lwb_commit(), zil_commit_waiter() will issue the
+		 * lwb's zio. If this occurs, the lwb is not guaranteed
+		 * to be "full" by the time its zio is issued, and means
+		 * the size of the lwb was "too large" given the amount
+		 * of ZIL activity occuring on the system at that time.
+		 *
+		 * We do this for a couple of reasons:
+		 *
+		 * 1. To try and reduce the number of IOPs needed to
+		 * write the same number of itxs. If an lwb has space
+		 * available in it's buffer for more itxs, and more itxs
+		 * will be committed relatively soon (relative to the
+		 * latency of performing a write), then it's beneficial
+		 * to wait for these "next" itxs. This way, more itxs
+		 * can be committed to stable storage with fewer writes.
+		 *
+		 * 2. To try and use the largest lwb block size that the
+		 * incoming rate of itxs can support. Again, this is to
+		 * try and pack as many itxs into as few lwbs as
+		 * possible, without significantly impacting the latency
+		 * of each individual itx.
+		 */
+	}
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is the passed in waiter's
+ * commit itx will found in the queue just like the other non-commit
+ * itxs, such that when the entire queue is processed, the waiter will
+ * have been commited to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+	ASSERT0(zilog->zl_suspend);
+
+	mutex_enter(&zilog->zl_writer_lock);
+
+	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+		/*
+		 * It's possible that, while we were waiting to acquire
+		 * the "zl_writer_lock", another thread committed this
+		 * waiter to an lwb. If that occurs, we bail out early,
+		 * without processing any of the zilog's queue of itxs.
+		 *
+		 * On certain workloads and system configurations, the
+		 * "zl_writer_lock" can become highly contended. In an
+		 * attempt to reduce this contention, we immediately drop
+		 * the lock if the waiter has already been processed.
+		 *
+		 * We've measured this optimization to reduce CPU spent
+		 * contending on this lock by up to 5%, using a system
+		 * with 32 CPUs, low latency storage (~50 usec writes),
+		 * and 1024 threads performing sync writes.
+		 */
+		goto out;
+	}
+
+	ZIL_STAT_BUMP(zil_commit_writer_count);
+
+	zil_get_commit_list(zilog);
+	zil_prune_commit_list(zilog);
+	zil_process_commit_list(zilog);
+
+out:
+	mutex_exit(&zilog->zl_writer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_writer_lock));
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+	lwb_t *lwb = zcw->zcw_lwb;
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
 
 	/*
-	 * Wait if necessary for the log blocks to be on stable storage.
+	 * If the lwb has already been issued by another thread, we can
+	 * immediately return since there's no work to be done (the
+	 * point of this function is to issue the lwb). Additionally, we
+	 * do this prior to acquiring the zl_writer_lock, to avoid
+	 * acquiring it when it's not necessary to do so.
 	 */
-	if (zilog->zl_root_zio) {
-		error = zio_wait(zilog->zl_root_zio);
-		zilog->zl_root_zio = NULL;
-		zil_flush_vdevs(zilog);
-	}
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_DONE)
+		return;
 
-	if (error || lwb == NULL)
-		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	/*
+	 * In order to call zil_lwb_write_issue() we must hold the
+	 * zilog's "zl_writer_lock". We can't simply acquire that lock,
+	 * since we're already holding the commit waiter's "zcw_lock",
+	 * and those two locks are aquired in the opposite order
+	 * elsewhere.
+	 */
+	mutex_exit(&zcw->zcw_lock);
+	mutex_enter(&zilog->zl_writer_lock);
+	mutex_enter(&zcw->zcw_lock);
 
-	while ((itx = list_head(&zilog->zl_itx_commit_list))) {
-		txg = itx->itx_lr.lrc_txg;
-		ASSERT(txg);
+	/*
+	 * Since we just dropped and re-acquired the commit waiter's
+	 * lock, we have to re-check to see if the waiter was marked
+	 * "done" during that process. If the waiter was marked "done",
+	 * the "lwb" pointer is no longer valid (it can be free'd after
+	 * the waiter is marked "done"), so without this check we could
+	 * wind up with a use-after-free error below.
+	 */
+	if (zcw->zcw_done)
+		goto out;
 
-		if (itx->itx_callback != NULL)
-			itx->itx_callback(itx->itx_callback_data);
-		list_remove(&zilog->zl_itx_commit_list, itx);
-		zil_itx_destroy(itx);
+	ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+	/*
+	 * We've already checked this above, but since we hadn't
+	 * acquired the zilog's zl_writer_lock, we have to perform this
+	 * check a second time while holding the lock. We can't call
+	 * zil_lwb_write_issue() if the lwb had already been issued.
+	 */
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_DONE)
+		goto out;
+
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+	/*
+	 * As described in the comments above zil_commit_waiter() and
+	 * zil_process_commit_list(), we need to issue this lwb's zio
+	 * since we've reached the commit waiter's timeout and it still
+	 * hasn't been issued.
+	 */
+	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+
+	/*
+	 * Since the lwb's zio hadn't been issued by the time this thread
+	 * reached its timeout, we reset the zilog's "zl_cur_used" field
+	 * to influence the zil block size selection algorithm.
+	 *
+	 * By having to issue the lwb's zio here, it means the size of the
+	 * lwb was too large, given the incoming throughput of itxs.  By
+	 * setting "zl_cur_used" to zero, we communicate this fact to the
+	 * block size selection algorithm, so it can take this informaiton
+	 * into account, and potentially select a smaller size for the
+	 * next lwb block that is allocated.
+	 */
+	zilog->zl_cur_used = 0;
+
+	if (nlwb == NULL) {
+		/*
+		 * When zil_lwb_write_issue() returns NULL, this
+		 * indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this occurs, the ZIL write
+		 * pipeline must be stalled; see the comment within the
+		 * zil_commit_writer_stall() function for more details.
+		 *
+		 * We must drop the commit waiter's lock prior to
+		 * calling zil_commit_writer_stall() or else we can wind
+		 * up with the following deadlock:
+		 *
+		 * - This thread is waiting for the txg to sync while
+		 *   holding the waiter's lock; txg_wait_synced() is
+		 *   used within txg_commit_writer_stall().
+		 *
+		 * - The txg can't sync because it is waiting for this
+		 *   lwb's zio callback to call dmu_tx_commit().
+		 *
+		 * - The lwb's zio callback can't call dmu_tx_commit()
+		 *   because it's blocked trying to acquire the waiter's
+		 *   lock, which occurs prior to calling dmu_tx_commit()
+		 */
+		mutex_exit(&zcw->zcw_lock);
+		zil_commit_writer_stall(zilog);
+		mutex_enter(&zcw->zcw_lock);
 	}
 
-	mutex_enter(&zilog->zl_lock);
+out:
+	mutex_exit(&zilog->zl_writer_lock);
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. its primary responsibility is to block until the given "commit
+ *    waiter" is considered "done".
+ *
+ * 2. its secondary responsibility is to issue the zio for the lwb that
+ *    the given "commit waiter" is waiting on, if this function has
+ *    waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient amount of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is not other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(!MUTEX_HELD(&zilog->zl_writer_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+	ASSERT0(zilog->zl_suspend);
+
+	mutex_enter(&zcw->zcw_lock);
 
 	/*
-	 * Remember the highest committed log sequence number for ztest.
-	 * We only update this value when all the log writes succeeded,
-	 * because ztest wants to ASSERT that it got the whole log chain.
+	 * The timeout is scaled based on the lwb latency to avoid
+	 * significantly impacting the latency of each individual itx.
+	 * For more details, see the comment at the bottom of the
+	 * zil_process_commit_list() function.
 	 */
-	if (error == 0 && lwb != NULL)
-		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
+	int pct = MAX(zfs_commit_timeout_pct, 1);
+	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+	hrtime_t wakeup = gethrtime() + sleep;
+	boolean_t timedout = B_FALSE;
+
+	while (!zcw->zcw_done) {
+		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+		lwb_t *lwb = zcw->zcw_lwb;
+
+		/*
+		 * Usually, the waiter will have a non-NULL lwb field here,
+		 * but it's possible for it to be NULL as a result of
+		 * zil_commit() racing with spa_sync().
+		 *
+		 * When zil_clean() is called, it's possible for the itxg
+		 * list (which may be cleaned via a taskq) to contain
+		 * commit itxs. When this occurs, the commit waiters linked
+		 * off of these commit itxs will not be committed to an
+		 * lwb.  Additionally, these commit waiters will not be
+		 * marked done until zil_commit_waiter_skip() is called via
+		 * zil_itxg_clean().
+		 *
+		 * Thus, it's possible for this commit waiter (i.e. the
+		 * "zcw" variable) to be found in this "in between" state;
+		 * where it's "zcw_lwb" field is NULL, and it hasn't yet
+		 * been skipped, so it's "zcw_done" field is still B_FALSE.
+		 */
+		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+
+		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
+			ASSERT3B(timedout, ==, B_FALSE);
+
+			/*
+			 * If the lwb hasn't been issued yet, then we
+			 * need to wait with a timeout, in case this
+			 * function needs to issue the lwb after the
+			 * timeout is reached; responsibility (2) from
+			 * the comment above this function.
+			 */
+			clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
+			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
+			    CALLOUT_FLAG_ABSOLUTE);
+
+			if (timeleft >= 0 || zcw->zcw_done)
+				continue;
+
+			timedout = B_TRUE;
+			zil_commit_waiter_timeout(zilog, zcw);
+
+			if (!zcw->zcw_done) {
+				/*
+				 * If the commit waiter has already been
+				 * marked "done", it's possible for the
+				 * waiter's lwb structure to have already
+				 * been freed.  Thus, we can only reliably
+				 * make these assertions if the waiter
+				 * isn't done.
+				 */
+				ASSERT3P(lwb, ==, zcw->zcw_lwb);
+				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+			}
+		} else {
+			/*
+			 * If the lwb isn't open, then it must have already
+			 * been issued. In that case, there's no need to
+			 * use a timeout when waiting for the lwb to
+			 * complete.
+			 *
+			 * Additionally, if the lwb is NULL, the waiter
+			 * will soon be signalled and marked done via
+			 * zil_clean() and zil_itxg_clean(), so no timeout
+			 * is required.
+			 */
+
+			IMPLY(lwb != NULL,
+			    lwb->lwb_state == LWB_STATE_ISSUED ||
+			    lwb->lwb_state == LWB_STATE_DONE);
+			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+		}
+	}
+
+	mutex_exit(&zcw->zcw_lock);
+}
+
+static zil_commit_waiter_t *
+zil_alloc_commit_waiter(void)
+{
+	zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
+
+	cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_link_init(&zcw->zcw_node);
+	zcw->zcw_lwb = NULL;
+	zcw->zcw_done = B_FALSE;
+	zcw->zcw_zio_error = 0;
+
+	return (zcw);
+}
+
+static void
+zil_free_commit_waiter(zil_commit_waiter_t *zcw)
+{
+	ASSERT(!list_link_active(&zcw->zcw_node));
+	ASSERT3P(zcw->zcw_lwb, ==, NULL);
+	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
+	mutex_destroy(&zcw->zcw_lock);
+	cv_destroy(&zcw->zcw_cv);
+	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
- * Commit zfs transactions to stable storage.
- * If foid is 0 push out all transactions, otherwise push only those
- * for that object or might reference that object.
+ * This function is used to create a TX_COMMIT itx and assign it. This
+ * way, it will be linked into the ZIL's list of synchronous itxs, and
+ * then later committed to an lwb (or skipped) when
+ * zil_process_commit_list() is called.
+ */
+static void
+zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+	itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
+	itx->itx_sync = B_TRUE;
+	itx->itx_private = zcw;
+
+	zil_itx_assign(zilog, itx, tx);
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Commit ZFS Intent Log transactions (itxs) to stable storage.
+ *
+ * When writing ZIL transactions to the on-disk representation of the
+ * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
+ * itxs can be committed to a single lwb. Once a lwb is written and
+ * committed to stable storage (i.e. the lwb is written, and vdevs have
+ * been flushed), each itx that was committed to that lwb is also
+ * considered to be committed to stable storage.
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called, are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to waiting for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication. A better interface would allow a consumer
+ * to create and assign an itx, and then pass a reference to this itx to
+ * zil_commit(); such that zil_commit() would return as soon as that
+ * specific itx was committed to disk (instead of waiting for _all_
+ * itxs to be committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signalled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit it is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signalled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
  *
- * itxs are committed in batches. In a heavily stressed zil there will be
- * a commit writer thread who is writing out a bunch of itxs to the log
- * for a set of committing threads (cthreads) in the same batch as the writer.
- * Those cthreads are all waiting on the same cv for that batch.
+ *   1. The list of waiters for each lwb is traversed, and each commit
+ *      waiter is marked "done" and signalled, in the zio completion
+ *      callback of the lwb's zio[*].
  *
- * There will also be a different and growing batch of threads that are
- * waiting to commit (qthreads). When the committing batch completes
- * a transition occurs such that the cthreads exit and the qthreads become
- * cthreads. One of the new cthreads becomes the writer thread for the
- * batch. Any new threads arriving become new qthreads.
+ *      * Actually, the waiters are signalled in the zio completion
+ *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ *        that are sent to the vdevs upon completion of the lwb zio.
  *
- * Only 2 condition variables are needed and there's no transition
- * between the two cvs needed. They just flip-flop between qthreads
- * and cthreads.
+ *   2. When the itxs are inserted into the ZIL's queue of uncommitted
+ *      itxs, the order in which they are inserted is preserved[*]; as
+ *      itxs are added to the queue, they are added to the tail of
+ *      in-memory linked lists.
  *
- * Using this scheme we can efficiently wakeup up only those threads
- * that have been committed.
+ *      When committing the itxs to lwbs (to be written to disk), they
+ *      are committed in the same order in which the itxs were added to
+ *      the uncommitted queue's linked list(s); i.e. the linked list of
+ *      itxs to commit is traversed from head to tail, and each itx is
+ *      committed to an lwb in that order.
+ *
+ *      * To clarify:
+ *
+ *        - the order of "sync" itxs is preserved w.r.t. other
+ *          "sync" itxs, regardless of the corresponding objects.
+ *        - the order of "async" itxs is preserved w.r.t. other
+ *          "async" itxs corresponding to the same object.
+ *        - the order of "async" itxs is *not* preserved w.r.t. other
+ *          "async" itxs corresponding to different objects.
+ *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ *          versa) is *not* preserved, even for itxs that correspond
+ *          to the same object.
+ *
+ *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ *      zil_get_commit_list(), and zil_process_commit_list().
+ *
+ *   3. The lwbs represent a linked list of blocks on disk. Thus, any
+ *      lwb cannot be considered committed to stable storage, until its
+ *      "previous" lwb is also committed to stable storage. This fact,
+ *      coupled with the fact described above, means that itxs are
+ *      committed in (roughly) the order in which they were generated.
+ *      This is essential because itxs are dependent on prior itxs.
+ *      Thus, we *must not* deem an itx as being committed to stable
+ *      storage, until *all* prior itxs have also been committed to
+ *      stable storage.
+ *
+ *      To enforce this ordering of lwb zio's, while still leveraging as
+ *      much of the underlying storage performance as possible, we rely
+ *      on two fundamental concepts:
+ *
+ *          1. The creation and issuance of lwb zio's is protected by
+ *             the zilog's "zl_writer_lock", which ensures only a single
+ *             thread is creating and/or issuing lwb's at a time
+ *          2. The "previous" lwb is a child of the "current" lwb
+ *             (leveraging the zio parent-child depenency graph)
+ *
+ *      By relying on this parent-child zio relationship, we can have
+ *      many lwb zio's concurrently issued to the underlying storage,
+ *      but the order in which they complete will be the same order in
+ *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
-	uint64_t mybatch;
+	/*
+	 * We should never attempt to call zil_commit on a snapshot for
+	 * a couple of reasons:
+	 *
+	 * 1. A snapshot may never be modified, thus it cannot have any
+	 *    in-flight itxs that would have modified the dataset.
+	 *
+	 * 2. By design, when zil_commit() is called, a commit itx will
+	 *    be assigned to this zilog; as a result, the zilog will be
+	 *    dirtied. We must not dirty the zilog of a snapshot; there's
+	 *    checks in the code that enforce this invariant, and will
+	 *    cause a panic if it's not upheld.
+	 */
+	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
+	if (!spa_writeable(zilog->zl_spa)) {
+		/*
+		 * If the SPA is not writable, there should never be any
+		 * pending itxs waiting to be committed to disk. If that
+		 * weren't true, we'd skip writing those itxs out, and
+		 * would break the sematics of zil_commit(); thus, we're
+		 * verifying that truth before we return to the caller.
+		 */
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
+		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+		for (int i = 0; i < TXG_SIZE; i++)
+			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+		return;
+	}
+
+	/*
+	 * If the ZIL is suspended, we don't want to dirty it by calling
+	 * zil_commit_itx_assign() below, nor can we write out
+	 * lwbs like would be done in zil_commit_write(). Thus, we
+	 * simply rely on txg_wait_synced() to maintain the necessary
+	 * semantics, and avoid calling those functions altogether.
+	 */
+	if (zilog->zl_suspend > 0) {
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+		return;
+	}
+
 	ZIL_STAT_BUMP(zil_commit_count);
 
-	/* move the async itxs for the foid to the sync queues */
+	/*
+	 * Move the "async" itxs for the specified foid to the "sync"
+	 * queues, such that they will be later committed (or skipped)
+	 * to an lwb when zil_process_commit_list() is called.
+	 *
+	 * Since these "async" itxs must be committed prior to this
+	 * call to zil_commit returning, we must perform this operation
+	 * before we call zil_commit_itx_assign().
+	 */
 	zil_async_to_sync(zilog, foid);
 
-	mutex_enter(&zilog->zl_lock);
-	mybatch = zilog->zl_next_batch;
-	while (zilog->zl_writer) {
-		cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
-		if (mybatch <= zilog->zl_com_batch) {
-			mutex_exit(&zilog->zl_lock);
-			return;
-		}
-	}
-
-	zilog->zl_next_batch++;
-	zilog->zl_writer = B_TRUE;
-	ZIL_STAT_BUMP(zil_commit_writer_count);
-	zil_commit_writer(zilog);
-	zilog->zl_com_batch = mybatch;
-	zilog->zl_writer = B_FALSE;
+	/*
+	 * We allocate a new "waiter" structure which will initially be
+	 * linked to the commit itx using the itx's "itx_private" field.
+	 * Since the commit itx doesn't represent any on-disk state,
+	 * when it's committed to an lwb, rather than copying the its
+	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
+	 * added to the lwb's list of waiters. Then, when the lwb is
+	 * committed to stable storage, each waiter in the lwb's list of
+	 * waiters will be marked "done", and signalled.
+	 *
+	 * We must create the waiter and assign the commit itx prior to
+	 * calling zil_commit_writer(), or else our specific commit itx
+	 * is not guaranteed to be committed to an lwb prior to calling
+	 * zil_commit_waiter().
+	 */
+	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+	zil_commit_itx_assign(zilog, zcw);
 
-	/* wake up one thread to become the next writer */
-	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
+	zil_commit_writer(zilog, zcw);
+	zil_commit_waiter(zilog, zcw);
 
-	/* wake up all threads waiting for this batch to be committed */
-	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
+	if (zcw->zcw_zio_error != 0) {
+		/*
+		 * If there was an error writing out the ZIL blocks that
+		 * this thread is waiting on, then we fallback to
+		 * relying on spa_sync() to write out the data this
+		 * thread is waiting on. Obviously this has performance
+		 * implications, but the expectation is for this to be
+		 * an exceptional case, and shouldn't occur often.
+		 */
+		DTRACE_PROBE2(zil__commit__io__error,
+		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	}
 
-	mutex_exit(&zilog->zl_lock);
+	zil_free_commit_waiter(zcw);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
 
 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
 
 		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		}
 	}
 
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
 			break;
-
-		ASSERT(lwb->lwb_zio == NULL);
-
 		list_remove(&zilog->zl_lwb_list, lwb);
-		zio_free_zil(spa, txg, &lwb->lwb_blk);
-		kmem_cache_free(zil_lwb_cache, lwb);
+		zio_free(spa, txg, &lwb->lwb_blk);
+		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_head(&zilog->zl_lwb_list) == NULL)
 			BP_ZERO(&zh->zh_log);
 	}
 
 	/*
 	 * Remove fastwrite on any blocks that have been pre-allocated for
 	 * the next commit. This prevents fastwrite counter pollution by
 	 * unused, long-lived LWBs.
 	 */
 	for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
-		if (lwb->lwb_fastwrite && !lwb->lwb_zio) {
+		if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) {
 			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
 			lwb->lwb_fastwrite = 0;
 		}
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
+/* ARGSUSED */
+static int
+zil_lwb_cons(void *vbuf, void *unused, int kmflag)
+{
+	lwb_t *lwb = vbuf;
+	list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+	list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
+	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_lwb_dest(void *vbuf, void *unused)
+{
+	lwb_t *lwb = vbuf;
+	mutex_destroy(&lwb->lwb_vdev_lock);
+	avl_destroy(&lwb->lwb_vdev_tree);
+	list_destroy(&lwb->lwb_waiters);
+	list_destroy(&lwb->lwb_itxs);
+}
+
 void
 zil_init(void)
 {
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
-	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
+
+	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
+	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	zil_ksp = kstat_create("zfs", 0, "zil", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	if (zil_ksp != NULL) {
 		zil_ksp->ks_data = &zil_stats;
 		kstat_install(zil_ksp);
 	}
 }
 
 void
 zil_fini(void)
 {
+	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 
 	if (zil_ksp != NULL) {
 		kstat_delete(zil_ksp);
 		zil_ksp = NULL;
 	}
 }
 
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog;
 
 	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
-	zilog->zl_next_batch = 1;
+	zilog->zl_dirty_max_txg = 0;
+	zilog->zl_last_lwb_opened = NULL;
+	zilog->zl_last_lwb_latency = 0;
 
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zilog->zl_writer_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
-	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
-
-	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
-	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
-
-	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
-	cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
-	cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
 
 void
 zil_free(zilog_t *zilog)
 {
 	int i;
 
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
-	avl_destroy(&zilog->zl_vdev_tree);
-	mutex_destroy(&zilog->zl_vdev_lock);
-
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		/*
 		 * It's possible for an itx to be generated that doesn't dirty
 		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
 		 * callback to remove the entry. We remove those here.
 		 *
 		 * Also free up the ziltest itxs.
 		 */
 		if (zilog->zl_itxg[i].itxg_itxs)
 			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
 	}
 
+	mutex_destroy(&zilog->zl_writer_lock);
 	mutex_destroy(&zilog->zl_lock);
 
-	cv_destroy(&zilog->zl_cv_writer);
 	cv_destroy(&zilog->zl_cv_suspend);
-	cv_destroy(&zilog->zl_cv_batch[0]);
-	cv_destroy(&zilog->zl_cv_batch[1]);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
-	ASSERT(zilog->zl_get_data == NULL);
+	ASSERT3P(zilog->zl_get_data, ==, NULL);
+	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_get_data = get_data;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
-	uint64_t txg = 0;
+	uint64_t txg;
 
-	zil_commit(zilog, 0); /* commit all itx */
+	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
+		zil_commit(zilog, 0);
+	} else {
+		ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+		ASSERT0(zilog->zl_dirty_max_txg);
+		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
+	}
 
-	/*
-	 * The lwb_max_txg for the stubby lwb will reflect the last activity
-	 * for the zil.  After a txg_wait_synced() on the txg we know all the
-	 * callbacks have occurred that may clean the zil.  Only then can we
-	 * destroy the zl_clean_taskq.
-	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_tail(&zilog->zl_lwb_list);
-	if (lwb != NULL)
-		txg = lwb->lwb_max_txg;
+	if (lwb == NULL)
+		txg = zilog->zl_dirty_max_txg;
+	else
+		txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
 	mutex_exit(&zilog->zl_lock);
-	if (txg)
+
+	/*
+	 * We need to use txg_wait_synced() to wait long enough for the
+	 * ZIL to be clean, and to wait for all pending lwbs to be
+	 * written out.
+	 */
+	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	if (zilog_is_dirty(zilog))
 		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
-	 * We should have only one LWB left on the list; remove it now.
+	 * We should have only one lwb left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
-		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
-		ASSERT(lwb->lwb_zio == NULL);
+		ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+
 		if (lwb->lwb_fastwrite)
 			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-		kmem_cache_free(zil_lwb_cache, lwb);
+		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 static char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * The ZIL has work to do. Ensure that the associated encryption
 	 * key will remain mapped while we are committing the log by
 	 * grabbing a reference to it. If the key isn't loaded we have no
 	 * choice but to return an error until the wrapping key is loaded.
 	 */
 	if (os->os_encrypted && spa_keystore_create_mapping(os->os_spa,
 	    dmu_objset_ds(os), FTAG) != 0) {
 		zilog->zl_suspend--;
 		mutex_exit(&zilog->zl_lock);
 		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	zil_commit(zilog, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	if (os->os_encrypted) {
 		/*
 		 * Encrypted datasets need to wait for all data to be
 		 * synced out before removing the mapping.
 		 *
 		 * XXX: Depending on the number of datasets with
 		 * outstanding ZIL data on a given log device, this
 		 * might cause spa_offline_log() to take a long time.
 		 */
 		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 		VERIFY0(spa_keystore_remove_mapping(os->os_spa,
 		    dmu_objset_id(os), FTAG));
 	}
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 void
 zil_resume(void *cookie)
 {
 	objset_t *os = cookie;
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 typedef struct zil_replay_arg {
 	zil_replay_func_t **zr_replay;
 	void		*zr_arg;
 	boolean_t	zr_byteswap;
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 static int
 zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
 {
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 
 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
 
 	dmu_objset_name(zilog->zl_os, name);
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
 	    (u_longlong_t)lr->lrc_seq,
 	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
 
 	return (error);
 }
 
 static int
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	bcopy(lr, zr->zr_lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zilog->zl_replay_blks++;
 
 	return (0);
 }
 
 /*
  * If this dataset has a non-empty intent log, replay it and destroy it.
  */
 void
 zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	zil_replay_arg_t zr;
 
 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
 		zil_destroy(zilog, B_TRUE);
 		return;
 	}
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg, B_TRUE);
 	vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 }
 
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (zilog->zl_replay) {
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 		    zilog->zl_replaying_seq;
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /* ARGSUSED */
 int
 zil_vdev_offline(const char *osname, void *arg)
 {
 	int error;
 
 	error = zil_suspend(osname, NULL);
 	if (error != 0)
 		return (SET_ERROR(EEXIST));
 	return (0);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zil_alloc);
 EXPORT_SYMBOL(zil_free);
 EXPORT_SYMBOL(zil_open);
 EXPORT_SYMBOL(zil_close);
 EXPORT_SYMBOL(zil_replay);
 EXPORT_SYMBOL(zil_replaying);
 EXPORT_SYMBOL(zil_destroy);
 EXPORT_SYMBOL(zil_destroy_sync);
 EXPORT_SYMBOL(zil_itx_create);
 EXPORT_SYMBOL(zil_itx_destroy);
 EXPORT_SYMBOL(zil_itx_assign);
 EXPORT_SYMBOL(zil_commit);
 EXPORT_SYMBOL(zil_vdev_offline);
 EXPORT_SYMBOL(zil_claim);
 EXPORT_SYMBOL(zil_check_log_chain);
 EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
-EXPORT_SYMBOL(zil_add_block);
+EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 
 /* BEGIN CSTYLED */
 module_param(zil_replay_disable, int, 0644);
 MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay");
 
 module_param(zfs_nocacheflush, int, 0644);
 MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes");
 
 module_param(zil_slog_bulk, ulong, 0644);
 MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes slog sync writes per commit");
 /* END CSTYLED */
 #endif
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 311f79e23b93..92e5a8dd8303 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1,4488 +1,4505 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zio.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 const char *zio_type_name[ZIO_TYPES] = {
 	/*
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 kmem_cache_t *zio_cache;
 kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
 int zio_delay_max = ZIO_DELAY_MAX;
 
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  */
 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
 #else
 int zio_buf_debug_limit = 0;
 #endif
 
 static inline void __zio_execute(zio_t *zio);
 
 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 void
 zio_init(void)
 {
 	size_t c;
 	vmem_t *data_alloc_arena = NULL;
 
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
 	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 	 * for each quarter-power of 2.
 	 */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t p2 = size;
 		size_t align = 0;
 		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 
 #if defined(_ILP32) && defined(_KERNEL)
 		/*
 		 * Cache size limited to 1M on 32-bit platforms until ARC
 		 * buffers no longer require virtual address space.
 		 */
 		if (size > zfs_max_recordsize)
 			break;
 #endif
 
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 		/*
 		 * Here's the problem - on 4K native devices in userland on
 		 * Linux using O_DIRECT, buffers must be 4K aligned or I/O
 		 * will fail with EINVAL, causing zdb (and others) to coredump.
 		 * Since userland probably doesn't need optimized buffer caches,
 		 * we just force 4K alignment on everything.
 		 */
 		align = 8 * SPA_MINBLOCKSIZE;
 #else
 		if (size < PAGESIZE) {
 			align = SPA_MINBLOCKSIZE;
 		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
 			align = PAGESIZE;
 		}
 #endif
 
 		if (align != 0) {
 			char name[36];
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, NULL, cflags);
 
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL,
 			    data_alloc_arena, cflags);
 		}
 	}
 
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
 
 	lz4_init();
 }
 
 void
 zio_fini(void)
 {
 	size_t c;
 	kmem_cache_t *last_cache = NULL;
 	kmem_cache_t *last_data_cache = NULL;
 
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 #ifdef _ILP32
 		/*
 		 * Cache size limited to 1M on 32-bit platforms until ARC
 		 * buffers no longer require virtual address space.
 		 */
 		if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
 			break;
 #endif
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 		if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
 			(void) printf("zio_fini: [%d] %llu != %llu\n",
 			    (int)((c + 1) << SPA_MINBLOCKSHIFT),
 			    (long long unsigned)zio_buf_cache_allocs[c],
 			    (long long unsigned)zio_buf_cache_frees[c]);
 #endif
 		if (zio_buf_cache[c] != last_cache) {
 			last_cache = zio_buf_cache[c];
 			kmem_cache_destroy(zio_buf_cache[c]);
 		}
 		zio_buf_cache[c] = NULL;
 
 		if (zio_data_buf_cache[c] != last_data_cache) {
 			last_data_cache = zio_data_buf_cache[c];
 			kmem_cache_destroy(zio_data_buf_cache[c]);
 		}
 		zio_data_buf_cache[c] = NULL;
 	}
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	lz4_fini();
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 /*
  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_allocs[c], 1);
 #endif
 
 	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 }
 
 /*
  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
  * crashdump if the kernel panics.  This exists so that we will limit the amount
  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
  * of kernel heap dumped to disk when the kernel panics)
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 }
 
 void
 zio_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 
 static void
 zio_abd_free(void *abd, size_t size)
 {
 	abd_free((abd_t *)abd);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 void
 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
 	/*
 	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
 	 * going to get a nasty surprise when they try to access the data.
 	 */
 	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
 
 	zt->zt_orig_abd = zio->io_abd;
 	zt->zt_orig_size = zio->io_size;
 	zt->zt_bufsize = bufsize;
 	zt->zt_transform = transform;
 
 	zt->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = zt;
 
 	zio->io_abd = data;
 	zio->io_size = size;
 }
 
 void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
 
 	while ((zt = zio->io_transform_stack) != NULL) {
 		if (zt->zt_transform != NULL)
 			zt->zt_transform(zio,
 			    zt->zt_orig_abd, zt->zt_orig_size);
 
 		if (zt->zt_bufsize != 0)
 			abd_free(zio->io_abd);
 
 		zio->io_abd = zt->zt_orig_abd;
 		zio->io_size = zt->zt_orig_size;
 		zio->io_transform_stack = zt->zt_next;
 
 		kmem_free(zt, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks, decompression, and decryption
  * ==========================================================================
  */
 static void
 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type == ZIO_TYPE_READ)
 		abd_copy(data, zio->io_abd, size);
 }
 
 static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	if (zio->io_error == 0) {
 		void *tmp = abd_borrow_buf(data, size);
 		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 		    zio->io_abd, tmp, zio->io_size, size);
 		abd_return_buf_copy(data, tmp, size);
 
 		if (ret != 0)
 			zio->io_error = SET_ERROR(EIO);
 	}
 }
 
 static void
 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 	void *tmp;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(size, !=, 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	/*
 	 * Verify the cksum of MACs stored in an indirect bp. It will always
 	 * be possible to verify this since it does not require an encryption
 	 * key.
 	 */
 	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 			/*
 			 * We haven't decompressed the data yet, but
 			 * zio_crypt_do_indirect_mac_checksum() requires
 			 * decompressed data to be able to parse out the MACs
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
 			tmp = zio_buf_alloc(lsize);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 			    zio->io_abd, tmp, zio->io_size, lsize);
 			if (ret != 0) {
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
 			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
 			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 			zio_buf_free(tmp, lsize);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 		}
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/*
 	 * If this is an authenticated block, just check the MAC. It would be
 	 * nice to separate this out into its own flag, but for the moment
 	 * enum zio_flag is out of bits.
 	 */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		if (ot == DMU_OT_OBJSET) {
 			ret = spa_do_crypt_objset_mac_abd(B_FALSE, zio->io_spa,
 			    zio->io_bookmark.zb_objset, zio->io_abd, size,
 			    BP_SHOULD_BYTESWAP(bp));
 		} else {
 			zio_crypt_decode_mac_bp(bp, mac);
 			ret = spa_do_crypt_mac_abd(B_FALSE, zio->io_spa,
 			    zio->io_bookmark.zb_objset, zio->io_abd, size, mac);
 		}
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	zio_crypt_decode_params_bp(bp, salt, iv);
 
 	if (ot == DMU_OT_INTENT_LOG) {
 		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 		zio_crypt_decode_mac_zil(tmp, mac);
 		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 	} else {
 		zio_crypt_decode_mac_bp(bp, mac);
 	}
 
 	ret = spa_do_crypt_abd(B_FALSE, zio->io_spa, zio->io_bookmark.zb_objset,
 	    bp, bp->blk_birth, size, data, zio->io_abd, iv, mac, salt,
 	    &no_crypt);
 	if (no_crypt)
 		abd_copy(data, zio->io_abd, size);
 
 	if (ret != 0)
 		goto error;
 
 	return;
 
 error:
 	/* assert that the key was found unless this was speculative */
 	ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	/*
 	 * If there was a decryption / authentication error return EIO as
 	 * the io_error. If this was not a speculative zio, create an ereport.
 	 */
 	if (ret == ECKSUM) {
 		ret = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
 		}
 	} else {
 		zio->io_error = ret;
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 zio_t *
 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
 	list_t *pl = &cio->io_parent_list;
 
 	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
 	if (*zl == NULL)
 		return (NULL);
 
 	ASSERT((*zl)->zl_child == cio);
 	return ((*zl)->zl_parent);
 }
 
 zio_t *
 zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *cl = &pio->io_child_list;
 
 	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
 	if (*zl == NULL)
 		return (NULL);
 
 	ASSERT((*zl)->zl_parent == pio);
 	return ((*zl)->zl_child);
 }
 
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_link_t *zl = NULL;
 	zio_t *pio = zio_walk_parents(cio, &zl);
 
 	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 	return (pio);
 }
 
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
-	ASSERT(cio->io_child_type <= pio->io_child_type);
+	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
 	pio->io_child_count++;
 	cio->io_parent_count++;
 
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 }
 
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
 
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
 	pio->io_child_count--;
 	cio->io_parent_count--;
 
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 static boolean_t
 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 {
 	uint64_t *countp = &zio->io_children[child][wait];
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 	if (*countp != 0) {
 		zio->io_stage >>= 1;
 		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 		zio->io_stall = countp;
 		waiting = B_TRUE;
 	}
 	mutex_exit(&zio->io_lock);
 
 	return (waiting);
 }
 
 __attribute__((always_inline))
 static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		zio_taskq_type_t type =
 		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
 		/*
 		 * Dispatch the parent zio in its own taskq so that
 		 * the child can continue to make progress. This also
 		 * prevents overflowing the stack when we have deeply nested
 		 * parent-child relationships.
 		 */
 		zio_taskq_dispatch(pio, type, B_FALSE);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 		zio->io_error = zio->io_child_error[c];
 }
 
 int
 zio_bookmark_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
 	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
 		return (-1);
 	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
 		return (1);
 
 	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
 		return (-1);
 	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
 		return (1);
 
 	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
 		return (-1);
 	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
 		return (1);
 
 	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
 		return (-1);
 	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
 		return (1);
 
 	if (z1 < z2)
 		return (-1);
 	if (z1 > z2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     enum zio_flag flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage,
     enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	bzero(zio, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		zio->io_bp = (blkptr_t *)bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT)
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
 
 	return (zio);
 }
 
 static void
 zio_destroy(zio_t *zio)
 {
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
 }
 
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, enum zio_flag flags)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 
 	return (zio);
 }
 
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
 void
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 {
 	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the blk_birth and
 	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
 	 * that are in the log) to be arbitrarily large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 
 		if (vdevid >= spa->spa_root_vdev->vdev_children) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (vd == NULL) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_hole_ops) {
 			zfs_panic_recover("blkptr at %p DVA %u has hole "
 			    "VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 		if (BP_IS_GANG(bp))
 			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 		if (offset + asize > vd->vdev_asize) {
 			zfs_panic_recover("blkptr at %p DVA %u has invalid "
 			    "OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 }
 
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	zfs_blkptr_verify(spa, bp);
 
 	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 
 	return (zio);
 }
 
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *physdone, zio_done_func_t *done,
     void *private, zio_priority_t priority, enum zio_flag flags,
     const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
 	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
 	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
 	/*
 	 * Data can be NULL if we are going to call zio_write_override() to
 	 * provide the already-allocated BP.  But we may need the data to
 	 * verify a dedup hit (if requested).  In this case, don't try to
 	 * dedup (just take the already-allocated BP verbatim). Encrypted
 	 * dedup blocks need data as well so we also disable dedup in this
 	 * case.
 	 */
 	if (data == NULL &&
 	    (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
 		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 	}
 
 	return (zio);
 }
 
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 
 	return (zio);
 }
 
 void
 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 
 	/*
 	 * We must reset the io_prop to match the values that existed
 	 * when the bp was first written by dmu_sync() keeping in mind
 	 * that nopwrite and dedup are mutually exclusive.
 	 */
 	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
 
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 
 	/*
 	 * The check for EMBEDDED is a performance optimization.  We
 	 * process the free here (by ignoring it) rather than
 	 * putting it on the list and then processing it in zio_free_sync().
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 	metaslab_check_free(spa, bp);
 
 	/*
 	 * Frees that are for the currently-syncing txg, are not going to be
 	 * deferred, and which will not need to do a read (i.e. not GANG or
 	 * DEDUP), can be processed immediately.  Otherwise, put them on the
 	 * in-memory list for later processing.
 	 */
 	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 	} else {
 		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 	}
 }
 
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     enum zio_flag flags)
 {
 	zio_t *zio;
 	enum zio_stage stage = ZIO_FREE_PIPELINE;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
 	/*
 	 * GANG and DEDUP blocks can induce a read (for the gang block header,
 	 * or the DDT), so issue them asynchronously so that this thread is
 	 * not tied up.
 	 */
 	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 		stage |= ZIO_STAGE_ISSUE_ASYNC;
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 
 	return (zio);
 }
 
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	zio_t *zio;
 
 	dprintf_bp(bp, "claiming in txg %llu", txg);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
 	 * immediate writes contain committed data, but in a txg that was
 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
 	 * the intent log claims all blocks that contain immediate write data
 	 * so that the SPA knows they're in use.
 	 *
 	 * All claims *must* be resolved in the first txg -- before the SPA
 	 * starts allocating blocks -- so that nothing is allocated twice.
 	 * If txg == 0 we just verify that the block is claimable.
 	 */
 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 	ASSERT(txg == spa_first_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *private, enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
 
 	if (vd->vdev_children == 0) {
 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 
 		zio->io_cmd = cmd;
 	} else {
 		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 			    done, private, flags));
 	}
 
 	return (zio);
 }
 
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
 	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
 	    offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
 	return (zio);
 }
 
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
 	    private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
 	    offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
 	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		/*
 		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
 		 * Therefore, we must make a local copy in case the data is
 		 * being written to multiple places in parallel.
 		 */
 		abd_t *wbuf = abd_alloc_sametype(data, size);
 		abd_copy(wbuf, data, size);
 
 		zio_push_transform(zio, wbuf, size, size, NULL);
 	}
 
 	return (zio);
 }
 
 /*
  * Create a child I/O to do some work for us.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
     abd_t *data, uint64_t size, int type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	ASSERT(vd->vdev_parent ==
 	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * If we have the bp, then the child should perform the
 		 * checksum and the parent need not.  This pushes error
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 
 	/*
 	 * If we've decided to do a repair, the write is not speculative --
 	 * even if the original read was.
 	 */
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	/*
 	 * If we're creating a child I/O that is not associated with a
 	 * top-level vdev, then the child zio is not an allocating I/O.
 	 * If this is a retried I/O then we ignore it since we will
 	 * have already processed the original allocating I/O.
 	 */
 	if (flags & ZIO_FLAG_IO_ALLOCATING &&
 	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
 		ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
 
 		ASSERT(mc->mc_alloc_throttle_enabled);
 		ASSERT(type == ZIO_TYPE_WRITE);
 		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
 
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	zio->io_physdone = pio->io_physdone;
 	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 		zio->io_logical->io_phys_children++;
 
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     int type, zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *private)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 	    data, size, size, done, private, type, priority,
 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 	    vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
 	return (zio);
 }
 
 void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 	    NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
-	ASSERT(zio->io_executor == NULL);
-	ASSERT(zio->io_orig_size == zio->io_size);
-	ASSERT(size <= zio->io_size);
+	ASSERT3P(zio->io_executor, ==, NULL);
+	ASSERT3U(zio->io_orig_size, ==, zio->io_size);
+	ASSERT3U(size, <=, zio->io_size);
 
 	/*
 	 * We don't shrink for raidz because of problems with the
 	 * reconstruction when reading back less than the block size.
 	 * Note, BP_IS_RAIDZ() assumes no compression.
 	 */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	if (!BP_IS_RAIDZ(zio->io_bp)) {
 		/* we are not doing a raw write */
 		ASSERT3U(zio->io_size, ==, zio->io_lsize);
 		zio->io_orig_size = zio->io_size = zio->io_lsize = size;
 	}
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 static int
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize =
 	    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decompress);
 	}
 
 	if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
 	    BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decrypt);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		int psize = BPE_GET_PSIZE(bp);
 		void *data = abd_borrow_buf(zio->io_abd, psize);
 
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, data);
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_write_bp_init(zio_t *zio)
 {
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
 		ASSERT(bp->blk_birth != zio->io_txg);
 		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (BP_IS_EMBEDDED(bp))
 			return (ZIO_PIPELINE_CONTINUE);
 
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
 		 * has already occurred.
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
 			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		ASSERT(!zp->zp_nopwrite);
 
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
 		    !zp->zp_encrypt) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_write_compress(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_lsize;
 	uint64_t psize = zio->io_size;
 	int pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
 		return (ZIO_PIPELINE_STOP);
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	if (zio->io_children_ready != NULL) {
 		/*
 		 * Now that all our children are ready, run the callback
 		 * associated with this zio in case it wants to modify the
 		 * data to be written.
 		 */
 		ASSERT3U(zp->zp_level, >, 0);
 		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		void *cbuf = zio_buf_alloc(lsize);
 		psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
 		if (psize == 0 || psize == lsize) {
 			compress = ZIO_COMPRESS_OFF;
 			zio_buf_free(cbuf, lsize);
 		} else if (!zp->zp_dedup && !zp->zp_encrypt &&
 		    psize <= BPE_PAYLOAD_SIZE &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			zio_buf_free(cbuf, lsize);
 			bp->blk_birth = zio->io_txg;
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (ZIO_PIPELINE_CONTINUE);
 		} else {
 			/*
 			 * Round up compressed size up to the ashift
 			 * of the smallest-ashift device, and zero the tail.
 			 * This ensures that the compressed size of the BP
 			 * (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 			size_t rounded = (size_t)P2ROUNDUP(psize,
 			    1ULL << spa->spa_min_ashift);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
 				psize = lsize;
 			} else {
 				abd_t *cdata = abd_get_from_buf(cbuf, lsize);
 				abd_take_ownership_of_buf(cdata, B_TRUE);
 				abd_zero_off(cdata, psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cdata,
 				    psize, lsize, NULL);
 			}
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 
 	} else {
 		ASSERT3U(psize, !=, 0);
 
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		ASSERT(psize != 0);
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
 		if (zio->io_bp_orig.blk_birth != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			ASSERT(!zp->zp_encrypt ||
 			    DMU_OT_IS_ENCRYPTED(zp->zp_type));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_free_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		if (BP_GET_DEDUP(bp))
 			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 	int flags = (cutinline ? TQ_FRONT : 0);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
 	 * interrupt threads may all be blocked waiting for the config lock.
 	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
 	 */
 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * If this is a high priority I/O, then use the high priority taskq if
 	 * available.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_NOW &&
 	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
 	    flags, &zio->io_tqent);
 }
 
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	kthread_t *executor = zio->io_executor;
 	spa_t *spa = zio->io_spa;
 
 	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
 		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 		uint_t i;
 		for (i = 0; i < tqs->stqs_count; i++) {
 			if (taskq_member(tqs->stqs_taskq[i], executor))
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 static int
 zio_issue_async(zio_t *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (ZIO_PIPELINE_STOP);
 }
 
 void
 zio_interrupt(zio_t *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * The timeout_generic() function isn't defined in userspace, so
 	 * rather than trying to implement the function, the zio delay
 	 * functionality has been disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * If io_target_timestamp is zero, then no delay has been registered
 	 * for this IO, thus jump to the end of this function and "skip" the
 	 * delay; issuing it directly to the zio layer.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * This IO has already taken longer than the target
 			 * delay to complete, so we don't want to delay it
 			 * any longer; we "miss" the delay and issue it
 			 * directly to the zio layer. This is likely due to
 			 * the target latency being set to a value less than
 			 * the underlying hardware can satisfy (e.g. delay
 			 * set to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 		} else {
 			taskqid_t tid;
 			hrtime_t diff = zio->io_target_timestamp - now;
 			clock_t expire_at_tick = ddi_get_lbolt() +
 			    NSEC_TO_TICK(diff);
 
 			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 			    hrtime_t, now, hrtime_t, diff);
 
 			if (NSEC_TO_TICK(diff) == 0) {
 				/* Our delay is less than a jiffy - just spin */
 				zfs_sleep_until(zio->io_target_timestamp);
 			} else {
 				/*
 				 * Use taskq_dispatch_delay() in the place of
 				 * OpenZFS's timeout_generic().
 				 */
 				tid = taskq_dispatch_delay(system_taskq,
 				    (task_func_t *)zio_interrupt,
 				    zio, TQ_NOSLEEP, expire_at_tick);
 				if (tid == TASKQID_INVALID) {
 					/*
 					 * Couldn't allocate a task.  Just
 					 * finish the zio without a delay.
 					 */
 					zio_interrupt(zio);
 				}
 			}
 		}
 		return;
 	}
 #endif
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
  * dependent child I/Os; (3) the I/O issues, so we're waiting
  * for an I/O completion interrupt; (4) the I/O is delegated by
  * vdev-level caching or aggregation; (5) the I/O is deferred
  * due to vdev-level queueing; (6) the I/O is handed off to
  * another thread.  In all cases, the pipeline stops whenever
  * there's no CPU work; it never burns a thread in cv_wait_io().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 /*
  * zio_execute() is a wrapper around the static function
  * __zio_execute() so that we can force  __zio_execute() to be
  * inlined.  This reduces stack overhead which is important
  * because __zio_execute() is called recursively in several zio
  * code paths.  zio_execute() itself cannot be inlined because
  * it is externally visible.
  */
 void
 zio_execute(zio_t *zio)
 {
 	fstrans_cookie_t cookie;
 
 	cookie = spl_fstrans_mark();
 	__zio_execute(zio);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Used to determine if in the current context the stack is sized large
  * enough to allow zio_execute() to be called recursively.  A minimum
  * stack size of 16K is required to avoid needing to re-dispatch the zio.
  */
 boolean_t
 zio_execute_stack_check(zio_t *zio)
 {
 #if !defined(HAVE_LARGE_STACKS)
 	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
 
 	/* Executing in txg_sync_thread() context. */
 	if (dp && curthread == dp->dp_tx.tx_sync_thread)
 		return (B_TRUE);
 
 	/* Pool initialization outside of zio_taskq context. */
 	if (dp && spa_is_initializing(dp->dp_spa) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
 		return (B_TRUE);
 #endif /* HAVE_LARGE_STACKS */
 
 	return (B_FALSE);
 }
 
 __attribute__((always_inline))
 static inline void
 __zio_execute(zio_t *zio)
 {
 	zio->io_executor = curthread;
 
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 		int rv;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * If the current context doesn't have large enough stacks
 		 * the zio must be issued asynchronously to prevent overflow.
 		 */
 		if (zio_execute_stack_check(zio)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		zio->io_stage = stage;
 		zio->io_pipeline_trace |= zio->io_stage;
 		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
 
 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
 	}
 }
 
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 int
 zio_wait(zio_t *zio)
 {
 	int error;
 
-	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
-	ASSERT(zio->io_executor == NULL);
+	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
+	ASSERT3P(zio->io_executor, ==, NULL);
 
 	zio->io_waiter = curthread;
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL)
 		cv_wait_io(&zio->io_cv, &zio->io_lock);
 	mutex_exit(&zio->io_lock);
 
 	error = zio->io_error;
 	zio_destroy(zio);
 
 	return (error);
 }
 
 void
 zio_nowait(zio_t *zio)
 {
-	ASSERT(zio->io_executor == NULL);
+	ASSERT3P(zio->io_executor, ==, NULL);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    zio_unique_parent(zio) == NULL) {
 		zio_t *pio;
 
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 		kpreempt_disable();
 		pio = spa->spa_async_zio_root[CPU_SEQID];
 		kpreempt_enable();
 
 		zio_add_child(pio, zio);
 	}
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 	__zio_execute(zio);
 }
 
 /*
  * ==========================================================================
- * Reexecute or suspend/resume failed I/O
+ * Reexecute, cancel, or suspend/resume failed I/O
  * ==========================================================================
  */
 
 static void
 zio_reexecute(zio_t *pio)
 {
 	zio_t *cio, *cio_next;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zio_link_t *zl = NULL;
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		mutex_enter(&pio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 	}
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on it.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
 		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
 	}
 }
 
+void
+zio_cancel(zio_t *zio)
+{
+	/*
+	 * Disallow cancellation of a zio that's already been issued.
+	 */
+	VERIFY3P(zio->io_executor, ==, NULL);
+
+	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+	zio->io_done = NULL;
+
+	zio_nowait(zio);
+}
+
 void
 zio_suspend(spa_t *spa, zio_t *zio)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 
 	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
 	    "failure and has been suspended.\n", spa_name(spa));
 
 	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	if (spa->spa_suspend_zio_root == NULL)
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 
 	spa->spa_suspended = B_TRUE;
 
 	if (zio != NULL) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *pio;
 
 	/*
 	 * Reexecute all previously suspended i/o.
 	 */
 	mutex_enter(&spa->spa_suspend_lock);
 	spa->spa_suspended = B_FALSE;
 	cv_broadcast(&spa->spa_suspend_cv);
 	pio = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (pio == NULL)
 		return (0);
 
 	zio_reexecute(pio);
 	return (zio_wait(pio));
 }
 
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 	while (spa_suspended(spa))
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 static void
 zio_gang_issue_func_done(zio_t *zio)
 {
 	abd_put(zio->io_abd);
 }
 
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	if (gn != NULL)
 		return (pio);
 
 	return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
 	    BP_GET_PSIZE(bp), zio_gang_issue_func_done,
 	    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 	    &pio->io_bookmark));
 }
 
 static zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	zio_t *zio;
 
 	if (gn != NULL) {
 		abd_t *gbh_abd =
 		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
 		 * a new gang block header checksum for it; but no one will
 		 * compute a new data checksum, so we do that here.  The one
 		 * exception is the gang leader: the pipeline already computed
 		 * its data checksum because that stage precedes gang assembly.
 		 * (Presently, nothing actually uses interior data checksums;
 		 * this is just good hygiene.)
 		 */
 		if (gn != pio->io_gang_leader->io_gang_tree) {
 			abd_t *buf = abd_get_offset(data, offset);
 
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    buf, BP_GET_PSIZE(bp));
 
 			abd_put(buf);
 		}
 		/*
 		 * If we are here to damage data for testing purposes,
 		 * leave the GBH alone so that we can detect the damage.
 		 */
 		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
 		    zio_gang_issue_func_done, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	}
 
 	return (zio);
 }
 
 /* ARGSUSED */
 static zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /* ARGSUSED */
 static zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn;
 
 	ASSERT(*gnpp == NULL);
 
 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 	*gnpp = gn;
 
 	return (gn);
 }
 
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		ASSERT(gn->gn_child[g] == NULL);
 
 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(gn, sizeof (*gn));
 	*gnpp = NULL;
 }
 
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	if (gn == NULL)
 		return;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		zio_gang_tree_free(&gn->gn_child[g]);
 
 	zio_gang_node_free(gnpp);
 }
 
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
 	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_gang_tree_assemble_done, gn, gio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
 
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(zio->io_child_count == 0);
 
 	if (zio->io_error)
 		return;
 
 	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	abd_put(zio->io_abd);
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
 			continue;
 		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
     uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
 			    offset);
 			offset += BP_GET_PSIZE(gbp);
 		}
 	}
 
 	if (gn == gio->io_gang_tree)
 		ASSERT3U(gio->io_size, ==, offset);
 
 	if (zio != pio)
 		zio_nowait(zio);
 }
 
 static int
 zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 
 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
 		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
 		    0);
 	else
 		zio_gang_tree_free(&zio->io_gang_tree);
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
 	ASSERTV(zio_t *gio = zio->io_gang_leader);
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		ASSERT(DVA_GET_GANG(&pdva[d]));
 		asize = DVA_GET_ASIZE(&pdva[d]);
 		asize += DVA_GET_ASIZE(&cdva[d]);
 		DVA_SET_ASIZE(&pdva[d], asize);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 static void
 zio_write_gang_done(zio_t *zio)
 {
 	abd_put(zio->io_abd);
 }
 
 static int
 zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
 	int gbh_copies;
 	zio_prop_t zp;
 	int error;
 
 	/*
 	 * encrypted blocks need DVA[2] free so encrypted gang headers can't
 	 * have a third copy.
 	 */
 	gbh_copies = MIN(copies + 1, spa_max_replication(spa));
 	if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
 		gbh_copies = SPA_DVAS_PER_BP - 1;
 
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
 
 		flags |= METASLAB_ASYNC_ALLOC;
 		VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
 
 		/*
 		 * The logical zio has already placed a reservation for
 		 * 'copies' allocation slots but gang blocks may require
 		 * additional copies. These additional copies
 		 * (i.e. gbh_copies - copies) are guaranteed to succeed
 		 * since metaslab_class_throttle_reserve() always allows
 		 * additional reservations for gang blocks.
 		 */
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
 		    pio, flags));
 	}
 
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
 	    &pio->io_alloc_list, pio);
 	if (error) {
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
 
 			/*
 			 * If we failed to allocate the gang block header then
 			 * we remove any additional allocation reservations that
 			 * we placed here. The original reservation will
 			 * be removed when the logical I/O goes to the ready
 			 * stage.
 			 */
 			metaslab_class_throttle_unreserve(mc,
 			    gbh_copies - copies, pio);
 		}
 
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	bzero(gbh, SPA_GANGBLOCKSIZE);
 	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	/*
 	 * Create and nowait the gang children.
 	 */
 	for (int g = 0; resid != 0; resid -= lsize, g++) {
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 		zp.zp_encrypt = gio->io_prop.zp_encrypt;
 		zp.zp_byteorder = gio->io_prop.zp_byteorder;
 		bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
 		bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
 		bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
 
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
 		    lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
 
 			/*
 			 * Gang children won't throttle but we should
 			 * account for their work, so reserve an allocation
 			 * slot for them here.
 			 */
 			VERIFY(metaslab_class_throttle_reserve(mc,
 			    zp.zp_copies, cio, flags));
 		}
 		zio_nowait(cio);
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	/*
 	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
 	 */
 	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
 
 	zio_nowait(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static int
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * Check to see if the original bp and the new bp have matching
 	 * characteristics (i.e. same checksum, compression algorithms, etc).
 	 * If they don't then just continue with the pipeline which will
 	 * allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
 	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	/*
 	 * If the checksums match then reset the pipeline so that we
 	 * avoid allocating a new bp and issuing any I/O.
 	 */
 	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
 		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE);
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
 		    sizeof (uint64_t)) == 0);
 
 		*bp = *bp_orig;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		zio->io_flags |= ZIO_FLAG_NOPWRITE;
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 	ddp = ddt_phys_select(dde, bp);
 	if (zio->io_error == 0)
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
 
 	if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
 		dde->dde_repair_abd = zio->io_abd;
 	else
 		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
 }
 
 static int
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 		ddt_phys_t *ddp = dde->dde_phys;
 		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
 		blkptr_t blk;
 
 		ASSERT(zio->io_vsd == NULL);
 		zio->io_vsd = dde;
 
 		if (ddp_self == NULL)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
 				continue;
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
 			    &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    abd_alloc_for_io(zio->io_size, B_TRUE),
 			    zio->io_size, zio_ddt_child_read_done, dde,
 			    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
 			    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 		}
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	zio_nowait(zio_read(zio, zio->io_spa, bp,
 	    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		if (ddt == NULL) {
 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 		if (dde == NULL) {
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (ZIO_PIPELINE_STOP);
 		}
 		if (dde->dde_repair_abd != NULL) {
 			abd_copy(zio->io_abd, dde->dde_repair_abd,
 			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
 		zio->io_vsd = NULL;
 	}
 
 	ASSERT(zio->io_vsd == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 	boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
 
 	ASSERT(!(zio->io_bp_override && do_raw));
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 * However, we should never get a raw, override zio so in these
 	 * cases we can compare the io_abd directly. This is useful because
 	 * it allows us to do dedup verification even if we don't have access
 	 * to the original data (for instance, if the encryption keys aren't
 	 * loaded).
 	 */
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		zio_t *lio = dde->dde_lead_zio[p];
 
 		if (lio != NULL && do_raw) {
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
 		} else if (lio != NULL) {
 			return (lio->io_orig_size != zio->io_orig_size ||
 			    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 		}
 	}
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		if (ddp->ddp_phys_birth != 0 && do_raw) {
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			if (psize != zio->io_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			tmpabd = abd_alloc_for_io(psize, B_TRUE);
 
 			error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
 			    psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_RAW, &zio->io_bookmark));
 
 			if (error == 0) {
 				if (abd_cmp(tmpabd, zio->io_abd) != 0)
 					error = SET_ERROR(ENOENT);
 			}
 
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
 		} else if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(ENOENT);
 				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
 			return (error != 0);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	zio_t *pio;
 
 	if (zio->io_error)
 		return;
 
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_lead_zio[p] == zio);
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
 	zio_link_t *zl = NULL;
 	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 
 	ddt_exit(ddt);
 }
 
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[p] == zio);
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
 		zio_link_t *zl = NULL;
 		while (zio_walk_parents(zio, &zl) != NULL)
 			ddt_phys_addref(ddp);
 	} else {
 		ddt_phys_clear(ddp);
 	}
 
 	ddt_exit(ddt);
 }
 
 static void
 zio_ddt_ditto_write_done(zio_t *zio)
 {
 	int p = DDT_PHYS_DITTO;
 	ASSERTV(zio_prop_t *zp = &zio->io_prop);
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	ddt_key_t *ddk = &dde->dde_key;
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[p] == zio);
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
 		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
 		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
 		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
 		if (ddp->ddp_phys_birth != 0)
 			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
 		ddt_phys_fill(ddp, bp);
 	}
 
 	ddt_exit(ddt);
 }
 
 static int
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	int p = zp->zp_copies;
 	int ditto_copies;
 	zio_t *cio = NULL;
 	zio_t *dio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 
 	ddt_enter(ddt);
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = &dde->dde_phys[p];
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 		}
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
 	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
 
 	if (ditto_copies > ddt_ditto_copies_present(dde) &&
 	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
 		zio_prop_t czp = *zp;
 
 		czp.zp_copies = ditto_copies;
 
 		/*
 		 * If we arrived here with an override bp, we won't have run
 		 * the transform stack, so we won't have the data we need to
 		 * generate a child i/o.  So, toss the override bp and restart.
 		 * This is safe, because using the override bp is just an
 		 * optimization; and it's rare, so the cost doesn't matter.
 		 */
 		if (zio->io_bp_override) {
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			zio->io_pipeline = ZIO_WRITE_PIPELINE;
 			zio->io_bp_override = NULL;
 			BP_ZERO(bp);
 			ddt_exit(ddt);
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
 		dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
 		    NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
 	}
 
 	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
 		if (ddp->ddp_phys_birth != 0)
 			ddt_bp_fill(ddp, bp, txg);
 		if (dde->dde_lead_zio[p] != NULL)
 			zio_add_child(zio, dde->dde_lead_zio[p]);
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
 		ASSERT(bp->blk_birth == txg);
 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
 		    zio_ddt_child_write_ready, NULL, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[p] = cio;
 	}
 
 	ddt_exit(ddt);
 
 	if (cio)
 		zio_nowait(cio);
 	if (dio)
 		zio_nowait(dio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 ddt_entry_t *freedde; /* for debugging */
 
 static int
 zio_ddt_free(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
 	if (dde) {
 		ddp = ddt_phys_select(dde, bp);
 		if (ddp)
 			ddt_phys_decref(ddp);
 	}
 	ddt_exit(ddt);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 
 static zio_t *
 zio_io_to_allocate(spa_t *spa)
 {
 	zio_t *zio;
 
 	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
 
 	zio = avl_first(&spa->spa_alloc_tree);
 	if (zio == NULL)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * Try to place a reservation for this zio. If we're unable to
 	 * reserve then we throttle.
 	 */
 	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
 	    zio->io_prop.zp_copies, zio, 0)) {
 		return (NULL);
 	}
 
 	avl_remove(&spa->spa_alloc_tree, zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	return (zio);
 }
 
 static int
 zio_dva_throttle(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_t *nio;
 
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
 	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
 	    zio->io_child_type == ZIO_CHILD_GANG ||
 	    zio->io_flags & ZIO_FLAG_NODATA) {
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
 	mutex_enter(&spa->spa_alloc_lock);
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	avl_add(&spa->spa_alloc_tree, zio);
 
 	nio = zio_io_to_allocate(zio->io_spa);
 	mutex_exit(&spa->spa_alloc_lock);
 
 	if (nio == zio)
 		return (ZIO_PIPELINE_CONTINUE);
 
 	if (nio != NULL) {
 		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 		/*
 		 * We are passing control to a new zio so make sure that
 		 * it is processed by a different thread. We do this to
 		 * avoid stack overflows that can occur when parents are
 		 * throttled and children are making progress. We allow
 		 * it to go to the head of the taskq since it's already
 		 * been waiting.
 		 */
 		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
 	}
 	return (ZIO_PIPELINE_STOP);
 }
 
 void
 zio_allocate_dispatch(spa_t *spa)
 {
 	zio_t *zio;
 
 	mutex_enter(&spa->spa_alloc_lock);
 	zio = zio_io_to_allocate(spa);
 	mutex_exit(&spa->spa_alloc_lock);
 	if (zio == NULL)
 		return;
 
 	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
 	ASSERT0(zio->io_error);
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
 }
 
 static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
 		flags |= METASLAB_GANG_CHILD;
 	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
 		flags |= METASLAB_ASYNC_ALLOC;
 
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio);
 
 	if (error != 0) {
 		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
 		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
 		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_dva_free(zio_t *zio)
 {
 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_dva_claim(zio_t *zio)
 {
 	int error;
 
 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 	if (error)
 		zio->io_error = error;
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Undo an allocation.  This is used by zio_done() when an I/O fails
  * and we want to give back the block we just allocated.
  * This handles both normal blocks and gang blocks.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp))
 		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
 
 	if (gn != NULL) {
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			zio_dva_unallocate(zio, gn->gn_child[g],
 			    &gn->gn_gbh->zg_blkptr[g]);
 		}
 	}
 }
 
 /*
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
 	int error = 1;
 	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
 	if (error == 0) {
 		*slog = TRUE;
 	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
 		    &io_alloc_list, NULL);
 		if (error == 0)
 			*slog = FALSE;
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
 		BP_SET_LSIZE(new_bp, size);
 		BP_SET_PSIZE(new_bp, size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 		BP_SET_CHECKSUM(new_bp,
 		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
 		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(new_bp, 0);
 		BP_SET_DEDUP(new_bp, 0);
 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 		/*
 		 * encrypted blocks will require an IV and salt. We generate
 		 * these now since we will not be rewriting the bp at
 		 * rewrite time.
 		 */
 		if (os->os_encrypted) {
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 
 			BP_SET_CRYPT(new_bp, B_TRUE);
 			VERIFY0(spa_crypt_get_salt(spa,
 			    dmu_objset_id(os), salt));
 			VERIFY0(zio_crypt_generate_iv(iv));
 
 			zio_crypt_encode_params_bp(new_bp, salt, iv);
 		}
+	} else {
+		zfs_dbgmsg("%s: zil block allocation failure: "
+		    "size %llu, error %d", spa_name(spa), size, error);
 	}
 
 	return (error);
 }
 
 /*
  * Free an intent log block.
  */
 void
 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
 {
 	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
 	ASSERT(!BP_IS_GANG(bp));
 
 	zio_free(spa, txg, bp);
 }
 
 /*
  * ==========================================================================
  * Read and write to physical devices
  * ==========================================================================
  */
 
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  */
 static int
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 
 	zio->io_delay = 0;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	if (vd == NULL) {
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			abd_copy(abuf, zio->io_abd, zio->io_size);
 			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering with nested replication.
 	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
 	 * A is out of date, we'll read from C+D, then use the data to
 	 * resilver A+B -- but we don't actually want to resilver B, just A.
 	 * The top-level mirror has no way to know this, so instead we just
 	 * discard unnecessary repairs as we work our way down the vdev tree.
 	 * The same logic applies to any form of nested replication:
 	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
 
 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
 			return (ZIO_PIPELINE_CONTINUE);
 
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (ZIO_PIPELINE_STOP);
 
 		if (!vdev_accessible(vd, zio)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return (ZIO_PIPELINE_STOP);
 		}
 		zio->io_delay = gethrtime();
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (ZIO_PIPELINE_STOP);
 }
 
 static int
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
 
 		vdev_queue_io_done(zio);
 
 		if (zio->io_type == ZIO_TYPE_WRITE)
 			vdev_cache_write(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		if (zio->io_error) {
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
 			}
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	if (unexpected_error)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
     const abd_t *good_buf)
 {
 	/* no processing needed */
 	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
 }
 
 /*ARGSUSED*/
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
 {
 	void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
 
 	abd_copy(abd, zio->io_abd, zio->io_size);
 
 	zcr->zcr_cbinfo = zio->io_size;
 	zcr->zcr_cbdata = abd;
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_abd_free;
 }
 
 static int
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
 	 * On retry, we cut in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		zio->io_flags |= ZIO_FLAG_IO_RETRY |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	/*
 	 * If we got an error on a leaf device, convert it to ENXIO
 	 * if the device is not accessible at all.
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
 		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_cant_write = B_TRUE;
 	}
 
 	/*
 	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
 	 * attempts will ever succeed. In this case we set a persistent bit so
 	 * that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
 	    zio->io_type == ZIO_TYPE_IOCTL &&
 	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    zio->io_physdone != NULL) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
 		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
 		zio->io_physdone(zio->io_logical);
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage >>= 1;
 }
 
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage >>= 1;
 }
 
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 }
 
 /*
  * ==========================================================================
  * Encrypt and store encryption parameters
  * ==========================================================================
  */
 
 
 /*
  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
  * managing the storage of encryption parameters and passing them to the
  * lower-level encryption functions.
  */
 static int
 zio_encrypt(zio_t *zio)
 {
 	zio_prop_t *zp = &zio->io_prop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_GET_PSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	void *enc_buf = NULL;
 	abd_t *eabd = NULL;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/* the root zio already encrypted the data */
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (ZIO_PIPELINE_CONTINUE);
 
 	/* only ZIL blocks are re-encrypted on rewrite */
 	if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
 		return (ZIO_PIPELINE_CONTINUE);
 
 	if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
 		BP_SET_CRYPT(bp, B_FALSE);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/* if we are doing raw encryption set the provided encryption params */
 	if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		BP_SET_BYTEORDER(bp, zp->zp_byteorder);
 		if (ot != DMU_OT_OBJSET)
 			zio_crypt_encode_mac_bp(bp, zp->zp_mac);
 		if (DMU_OT_IS_ENCRYPTED(ot))
 			zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/* indirect blocks only maintain a cksum of the lower level MACs */
 	if (BP_GET_LEVEL(bp) > 0) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
 		    zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
 		    mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/*
 	 * Objset blocks are a special case since they have 2 256-bit MACs
 	 * embedded within them.
 	 */
 	if (ot == DMU_OT_OBJSET) {
 		ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
 		ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa,
 		    zio->io_bookmark.zb_objset, zio->io_abd, psize,
 		    BP_SHOULD_BYTESWAP(bp)));
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/* unencrypted object types are only authenticated with a MAC */
 	if (!DMU_OT_IS_ENCRYPTED(ot)) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa,
 		    zio->io_bookmark.zb_objset, zio->io_abd, psize, mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	/*
 	 * Later passes of sync-to-convergence may decide to rewrite data
 	 * in place to avoid more disk reallocations. This presents a problem
 	 * for encryption because this consitutes rewriting the new data with
 	 * the same encryption key and IV. However, this only applies to blocks
 	 * in the MOS (particularly the spacemaps) and we do not encrypt the
 	 * MOS. We assert that the zio is allocating or an intent log write
 	 * to enforce this.
 	 */
 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
 	ASSERT3U(psize, !=, 0);
 
 	enc_buf = zio_buf_alloc(psize);
 	eabd = abd_get_from_buf(enc_buf, psize);
 	abd_take_ownership_of_buf(eabd, B_TRUE);
 
 	/*
 	 * For an explanation of what encryption parameters are stored
 	 * where, see the block comment in zio_crypt.c.
 	 */
 	if (ot == DMU_OT_INTENT_LOG) {
 		zio_crypt_decode_params_bp(bp, salt, iv);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 	}
 
 	/* Perform the encryption. This should not fail */
 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, zio->io_bookmark.zb_objset, bp,
 	    zio->io_txg, psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt));
 
 	/* encode encryption metadata into the bp */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks store the MAC in the embedded checksum, so the
 		 * transform must always be applied.
 		 */
 		zio_crypt_encode_mac_zil(enc_buf, mac);
 		zio_push_transform(zio, eabd, psize, psize, NULL);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 		zio_crypt_encode_params_bp(bp, salt, iv);
 		zio_crypt_encode_mac_bp(bp, mac);
 
 		if (no_crypt) {
 			ASSERT3U(ot, ==, DMU_OT_DNODE);
 			abd_free(eabd);
 		} else {
 			zio_push_transform(zio, eabd, psize, psize, NULL);
 		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 static int
 zio_checksum_generate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
 	if (bp == NULL) {
 		/*
 		 * This is zio_write_phys().
 		 * We're either generating a label checksum, or none at all.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	} else {
 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		} else {
 			checksum = BP_GET_CHECKSUM(bp);
 		}
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
 	ASSERT(zio->io_vd != NULL);
 
 	if (bp == NULL) {
 		/*
 		 * This is zio_read_phys().
 		 * We're either verifying a label checksum, or nothing at all.
 		 */
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (ZIO_PIPELINE_CONTINUE);
 
 		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM &&
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			zfs_ereport_start_checksum(zio->io_spa,
 			    zio->io_vd, &zio->io_bookmark, zio,
 			    zio->io_offset, zio->io_size, NULL, &info);
 		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Called by RAID-Z to ensure we don't compute the checksum twice.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * ==========================================================================
  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  * ==========================================================================
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
 	int r1, r2;
 
 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
 		if (e1 == zio_error_rank[r1])
 			break;
 
 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
 		if (e2 == zio_error_rank[r2])
 			break;
 
 	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 static int
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
 		return (ZIO_PIPELINE_STOP);
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
 		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			/*
 			 * We were unable to allocate anything, unreserve and
 			 * issue the next I/O to allocate.
 			 */
 			metaslab_class_throttle_unreserve(
 			    spa_normal_class(zio->io_spa),
 			    zio->io_prop.zp_copies, zio);
 			zio_allocate_dispatch(zio->io_spa);
 		}
 	}
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
  * Update the allocation throttle accounting.
  */
 static void
 zio_dva_throttle_done(zio_t *zio)
 {
 	ASSERTV(zio_t *lio = zio->io_logical);
 	zio_t *pio = zio_unique_parent(zio);
 	vdev_t *vd = zio->io_vd;
 	int flags = METASLAB_ASYNC_ALLOC;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT(vd != NULL);
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
 	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
 
 	/*
 	 * Parents of gang children can have two flavors -- ones that
 	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
 	 * and ones that allocated the constituent blocks. The allocation
 	 * throttle needs to know the allocating parent zio so we must find
 	 * it here.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG) {
 		/*
 		 * If our parent is a rewrite gang child then our grandparent
 		 * would have been the one that performed the allocation.
 		 */
 		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
 			pio = zio_unique_parent(pio);
 		flags |= METASLAB_GANG_CHILD;
 	}
 
 	ASSERT(IO_IS_ALLOCATING(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 
 	mutex_enter(&pio->io_lock);
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
 	mutex_exit(&pio->io_lock);
 
 	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
 	    1, pio);
 
 	/*
 	 * Call into the pipeline to see if there is more work that
 	 * needs to be done. If there is work to be done it will be
 	 * dispatched to another taskq thread.
 	 */
 	zio_allocate_dispatch(zio->io_spa);
 }
 
 static int
 zio_done(zio_t *zio)
 {
 	/*
 	 * Always attempt to keep stack usage minimal here since
 	 * we can be called recurisvely up to 19 levels deep.
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
 	 * We only track child I/Os that are part of an allocating async
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV) {
 		ASSERT(spa_normal_class(
 		    zio->io_spa)->mc_alloc_throttle_enabled);
 		zio_dva_throttle_done(zio);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, verify that
 	 * we have decremented the refcounts for every I/O that was throttled.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
 		VERIFY(refcount_not_held(
 		    &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
 	}
 
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
 		ASSERT(zio->io_bp->blk_pad[0] == 0);
 		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
 			ASSERT3U(zio->io_prop.zp_copies, <=,
 			    BP_GET_NDVAS(zio->io_bp));
 			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
 			    (BP_COUNT_GANG(zio->io_bp) ==
 			    BP_GET_NDVAS(zio->io_bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			abd_t *adata = zio->io_abd;
 
 			if (asize != psize) {
 				adata = abd_alloc(asize, B_TRUE);
 				abd_copy(adata, zio->io_abd, psize);
 				abd_zero_off(adata, psize, asize - psize);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, adata);
 			zfs_ereport_free_checksum(zcr);
 
 			if (asize != psize)
 				abd_free(adata);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	/*
 	 * If this I/O is attached to a particular vdev is slow, exceeding
 	 * 30 seconds to complete, post an error described the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
 	if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
 		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
 			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
 			    zio->io_vd, &zio->io_bookmark, zio, 0, 0);
 	}
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd))
 			zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
 			    zio->io_vd, &zio->io_bookmark, zio, 0, 0);
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark);
 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
 			    NULL, &zio->io_bookmark, zio, 0, 0);
 		}
 	}
 
 	if (zio->io_error && zio == zio->io_logical) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
 
 	if (zio->io_reexecute) {
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		zl = NULL;
 		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = pio_next) {
 			zio_link_t *remove_zl = zl;
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
 			    (task_func_t *)zio_reexecute, zio, 0,
 			    &zio->io_tqent);
 		}
 		return (ZIO_PIPELINE_STOP);
 	}
 
 	ASSERT(zio->io_child_count == 0);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
 	    !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
 	    !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
 		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	zl = NULL;
 	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
 		zio_link_t *remove_zl = zl;
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (ZIO_PIPELINE_STOP);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_compress,
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_done
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.
  *
  * This is simple in every case aside from the meta-dnode object. For all other
  * objects, we traverse them in order (object 1 before object 2, and so on).
  * However, all of these objects are traversed while traversing object 0, since
  * the data it points to is the list of objects.  Thus, we need to convert to a
  * canonical representation so we can compare meta-dnode bookmarks to
  * non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
  * blocks this bookmark refers to) by multiplying their blkid by their span
  * (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
  * level + 1<<31 (any value larger than a level could ever be) for their level.
  * This causes them to always compare before a bookmark in their object
  * equivalent, compare appropriately to bookmarks in other objects, and to
  * compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/*
 	 * These variables represent the "equivalent" values for the zbookmark,
 	 * after converting zbookmarks inside the meta dnode to their
 	 * normal-object equivalents.
 	 */
 	uint64_t zb1obj, zb2obj;
 	uint64_t zb1L0, zb2L0;
 	uint64_t zb1level, zb2level;
 
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	/*
 	 * BP_SPANB calculates the span in blocks.
 	 */
 	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
 		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb1L0 = 0;
 		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
 	} else {
 		zb1obj = zb1->zb_object;
 		zb1level = zb1->zb_level;
 	}
 
 	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
 		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb2L0 = 0;
 		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
 	} else {
 		zb2obj = zb2->zb_object;
 		zb2level = zb2->zb_level;
 	}
 
 	/* Now that we have a canonical representation, do the comparison. */
 	if (zb1obj != zb2obj)
 		return (zb1obj < zb2obj ? -1 : 1);
 	else if (zb1L0 != zb2L0)
 		return (zb1L0 < zb2L0 ? -1 : 1);
 	else if (zb1level != zb2level)
 		return (zb1level > zb2level ? -1 : 1);
 	/*
 	 * This can (theoretically) happen if the bookmarks have the same object
 	 * and level, but different blkids, if the block sizes are not the same.
 	 * There is presently no way to change the indirect block sizes
 	 */
 	return (0);
 }
 
 /*
  *  This function checks the following: given that last_block is the place that
  *  our traversal stopped last time, does that guarantee that we've visited
  *  every node under subtree_root?  Therefore, we can't just use the raw output
  *  of zbookmark_compare.  We have to pass in a modified version of
  *  subtree_root; by incrementing the block id, and then checking whether
  *  last_block is before or equal to that, we can tell whether or not having
  *  visited last_block implies that all of subtree_root's children have been
  *  visited.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	zbookmark_phys_t mod_zb = *subtree_root;
 	mod_zb.zb_blkid++;
 	ASSERT(last_block->zb_level == 0);
 
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
 	 * data block size in sectors, because that variable is only used if
 	 * the bookmark refers to a block in the meta-dnode.  Since we don't
 	 * know without examining it what object it refers to, and there's no
 	 * harm in passing in this value in other cases, we always pass it in.
 	 *
 	 * We pass in 0 for the indirect block size shift because zb2 must be
 	 * level 0.  The indirect block size is only used to calculate the span
 	 * of the bookmark, but since the bookmark must be level 0, the span is
 	 * always 1, so the math works out.
 	 *
 	 * If you make changes to how the zbookmark_compare code works, be sure
 	 * to make sure that this code still works afterwards.
 	 */
 	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
 	    last_block) <= 0);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zio_type_name);
 EXPORT_SYMBOL(zio_buf_alloc);
 EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
 module_param(zio_delay_max, int, 0644);
 MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
 
 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
 
 module_param(zfs_sync_pass_deferred_free, int, 0644);
 MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
 	"Defer frees starting in this pass");
 
 module_param(zfs_sync_pass_dont_compress, int, 0644);
 MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
 	"Don't compress starting in this pass");
 
 module_param(zfs_sync_pass_rewrite, int, 0644);
 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
 	"Rewrite new bps starting in this pass");
 
 module_param(zio_dva_throttle_enabled, int, 0644);
 MODULE_PARM_DESC(zio_dva_throttle_enabled,
 	"Throttle block allocations in the ZIO pipeline");
 #endif
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 5cd190c09bdf..6ea822467b61 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1,2746 +1,2747 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  *
  * ZFS volume emulation driver.
  *
  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  * Volumes are accessed through the symbolic links named:
  *
  * /dev/<pool_name>/<dataset_name>
  *
  * Volumes are persistent through reboot and module load.  No user command
  * needs to be run before opening and using a device.
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 /*
  * Note on locking of zvol state structures.
  *
  * These structures are used to maintain internal state used to emulate block
  * devices on top of zvols. In particular, management of device minor number
  * operations - create, remove, rename, and set_snapdev - involves access to
  * these structures. The zvol_state_lock is primarily used to protect the
  * zvol_state_list. The zv->zv_state_lock is used to protect the contents
  * of the zvol_state_t structures, as well as to make sure that when the
  * time comes to remove the structure from the list, it is not in use, and
  * therefore, it can be taken off zvol_state_list and freed.
  *
  * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
  * e.g. for the duration of receive and rollback operations. This lock can be
  * held for significant periods of time. Given that it is undesirable to hold
  * mutexes for long periods of time, the following lock ordering applies:
  * - take zvol_state_lock if necessary, to protect zvol_state_list
  * - take zv_suspend_lock if necessary, by the code path in question
  * - take zv_state_lock to protect zvol_state_t
  *
  * The minor operations are issued to spa->spa_zvol_taskq queues, that are
  * single-threaded (to preserve order of minor operations), and are executed
  * through the zvol_task_cb that dispatches the specific operations. Therefore,
  * these operations are serialized per pool. Consequently, we can be certain
  * that for a given zvol, there is only one operation at a time in progress.
  * That is why one can be sure that first, zvol_state_t for a given zvol is
  * allocated and placed on zvol_state_list, and then other minor operations
  * for this zvol are going to proceed in the order of issue.
  *
  * It is also worth keeping in mind that once add_disk() is called, the zvol is
  * announced to the world, and zvol_open()/zvol_release() can be called at any
  * time. Incidentally, add_disk() itself calls zvol_open()->zvol_first_open()
  * and zvol_release()->zvol_last_close() directly as well.
  */
 
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <linux/blkdev_compat.h>
 
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
 unsigned int zvol_threads = 32;
 unsigned int zvol_request_sync = 0;
 unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
 
 static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
 static list_t zvol_state_list;
 
 #define	ZVOL_HT_SIZE	1024
 static struct hlist_head *zvol_htable;
 #define	ZVOL_HT_HEAD(hash)	(&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
 
 static struct ida zvol_ida;
 
 /*
  * The in-core state of each volume.
  */
 struct zvol_state {
 	char			zv_name[MAXNAMELEN];	/* name */
 	uint64_t		zv_volsize;		/* advertised space */
 	uint64_t		zv_volblocksize;	/* volume block size */
 	objset_t		*zv_objset;	/* objset handle */
 	uint32_t		zv_flags;	/* ZVOL_* flags */
 	uint32_t		zv_open_count;	/* open counts */
 	uint32_t		zv_changed;	/* disk changed */
 	zilog_t			*zv_zilog;	/* ZIL handle */
 	zfs_rlock_t		zv_range_lock;	/* range lock */
 	dnode_t			*zv_dn;		/* dnode hold */
 	dev_t			zv_dev;		/* device id */
 	struct gendisk		*zv_disk;	/* generic disk */
 	struct request_queue	*zv_queue;	/* request queue */
 	list_node_t		zv_next;	/* next zvol_state_t linkage */
 	uint64_t		zv_hash;	/* name hash */
 	struct hlist_node	zv_hlink;	/* hash link */
 	kmutex_t		zv_state_lock;	/* protects zvol_state_t */
 	atomic_t		zv_suspend_ref;	/* refcount for suspend */
 	krwlock_t		zv_suspend_lock;	/* suspend lock */
 };
 
 typedef enum {
 	ZVOL_ASYNC_CREATE_MINORS,
 	ZVOL_ASYNC_REMOVE_MINORS,
 	ZVOL_ASYNC_RENAME_MINORS,
 	ZVOL_ASYNC_SET_SNAPDEV,
 	ZVOL_ASYNC_SET_VOLMODE,
 	ZVOL_ASYNC_MAX
 } zvol_async_op_t;
 
 typedef struct {
 	zvol_async_op_t op;
 	char pool[MAXNAMELEN];
 	char name1[MAXNAMELEN];
 	char name2[MAXNAMELEN];
 	zprop_source_t source;
 	uint64_t value;
 } zvol_task_t;
 
 #define	ZVOL_RDONLY	0x1
 
 static uint64_t
 zvol_name_hash(const char *name)
 {
 	int i;
 	uint64_t crc = -1ULL;
 	uint8_t *p = (uint8_t *)name;
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 	for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
 	}
 	return (crc);
 }
 
 /*
  * Find a zvol_state_t given the full major+minor dev_t. If found,
  * return with zv_state_lock taken, otherwise, return (NULL) without
  * taking zv_state_lock.
  */
 static zvol_state_t *
 zvol_find_by_dev(dev_t dev)
 {
 	zvol_state_t *zv;
 
 	mutex_enter(&zvol_state_lock);
 	for (zv = list_head(&zvol_state_list); zv != NULL;
 	    zv = list_next(&zvol_state_list, zv)) {
 		mutex_enter(&zv->zv_state_lock);
 		if (zv->zv_dev == dev) {
 			mutex_exit(&zvol_state_lock);
 			return (zv);
 		}
 		mutex_exit(&zv->zv_state_lock);
 	}
 	mutex_exit(&zvol_state_lock);
 
 	return (NULL);
 }
 
 /*
  * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
  * before zv_state_lock. The mode argument indicates the mode (including none)
  * for zv_suspend_lock to be taken.
  */
 static zvol_state_t *
 zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
 {
 	zvol_state_t *zv;
 	struct hlist_node *p = NULL;
 
 	mutex_enter(&zvol_state_lock);
 	hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
 		zv = hlist_entry(p, zvol_state_t, zv_hlink);
 		mutex_enter(&zv->zv_state_lock);
 		if (zv->zv_hash == hash &&
 		    strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
 			/*
 			 * this is the right zvol, take the locks in the
 			 * right order
 			 */
 			if (mode != RW_NONE &&
 			    !rw_tryenter(&zv->zv_suspend_lock, mode)) {
 				mutex_exit(&zv->zv_state_lock);
 				rw_enter(&zv->zv_suspend_lock, mode);
 				mutex_enter(&zv->zv_state_lock);
 				/*
 				 * zvol cannot be renamed as we continue
 				 * to hold zvol_state_lock
 				 */
 				ASSERT(zv->zv_hash == hash &&
 				    strncmp(zv->zv_name, name, MAXNAMELEN)
 				    == 0);
 			}
 			mutex_exit(&zvol_state_lock);
 			return (zv);
 		}
 		mutex_exit(&zv->zv_state_lock);
 	}
 	mutex_exit(&zvol_state_lock);
 
 	return (NULL);
 }
 
 /*
  * Find a zvol_state_t given the name.
  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
  * before zv_state_lock. The mode argument indicates the mode (including none)
  * for zv_suspend_lock to be taken.
  */
 static zvol_state_t *
 zvol_find_by_name(const char *name, int mode)
 {
 	return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
 }
 
 
 /*
  * Given a path, return TRUE if path is a ZVOL.
  */
 boolean_t
 zvol_is_zvol(const char *device)
 {
 	struct block_device *bdev;
 	unsigned int major;
 
 	bdev = vdev_lookup_bdev(device);
 	if (IS_ERR(bdev))
 		return (B_FALSE);
 
 	major = MAJOR(bdev->bd_dev);
 	bdput(bdev);
 
 	if (major == zvol_major)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
 void
 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	zfs_creat_t *zct = arg;
 	nvlist_t *nvprops = zct->zct_props;
 	int error;
 	uint64_t volblocksize, volsize;
 
 	VERIFY(nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 	if (nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 
 	/*
 	 * These properties must be removed from the list so the generic
 	 * property setting step won't apply to them.
 	 */
 	VERIFY(nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 	(void) nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 
 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 	ASSERT(error == 0);
 }
 
 /*
  * ZFS_IOC_OBJSET_STATS entry point.
  */
 int
 zvol_get_stats(objset_t *os, nvlist_t *nv)
 {
 	int error;
 	dmu_object_info_t *doi;
 	uint64_t val;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 	if (error)
 		return (SET_ERROR(error));
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 	error = dmu_object_info(os, ZVOL_OBJ, doi);
 
 	if (error == 0) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 		    doi->doi_data_block_size);
 	}
 
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	return (SET_ERROR(error));
 }
 
 static void
 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 {
 	struct block_device *bdev;
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	bdev = bdget_disk(zv->zv_disk, 0);
 	if (bdev == NULL)
 		return;
 
 	set_capacity(zv->zv_disk, volsize >> 9);
 	zv->zv_volsize = volsize;
 	check_disk_size_change(zv->zv_disk, bdev);
 
 	bdput(bdev);
 }
 
 /*
  * Sanity check volume size.
  */
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 {
 	if (volsize == 0)
 		return (SET_ERROR(EINVAL));
 
 	if (volsize % blocksize != 0)
 		return (SET_ERROR(EINVAL));
 
 #ifdef _ILP32
 	if (volsize - 1 > SPEC_MAXOFFSET_T)
 		return (SET_ERROR(EOVERFLOW));
 #endif
 	return (0);
 }
 
 /*
  * Ensure the zap is flushed then inform the VFS of the capacity change.
  */
 static int
 zvol_update_volsize(uint64_t volsize, objset_t *os)
 {
 	dmu_tx_t *tx;
 	int error;
 	uint64_t txg;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (SET_ERROR(error));
 	}
 	txg = dmu_tx_get_txg(tx);
 
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 	    &volsize, tx);
 	dmu_tx_commit(tx);
 
 	txg_wait_synced(dmu_objset_pool(os), txg);
 
 	if (error == 0)
 		error = dmu_free_long_range(os,
 		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
 
 	return (error);
 }
 
 static int
 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 {
 	zvol_size_changed(zv, volsize);
 
 	/*
 	 * We should post a event here describing the expansion.  However,
 	 * the zfs_ereport_post() interface doesn't nicely support posting
 	 * events for zvols, it assumes events relate to vdevs or zios.
 	 */
 
 	return (0);
 }
 
 /*
  * Set ZFS_PROP_VOLSIZE set entry point.
  */
 int
 zvol_set_volsize(const char *name, uint64_t volsize)
 {
 	zvol_state_t *zv = NULL;
 	objset_t *os = NULL;
 	int error;
 	dmu_object_info_t *doi;
 	uint64_t readonly;
 	boolean_t owned = B_FALSE;
 
 	error = dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 	if (error != 0)
 		return (SET_ERROR(error));
 	if (readonly)
 		return (SET_ERROR(EROFS));
 
 	zv = zvol_find_by_name(name, RW_READER);
 
 	ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_READ_HELD(&zv->zv_suspend_lock)));
 
 	if (zv == NULL || zv->zv_objset == NULL) {
 		if (zv != NULL)
 			rw_exit(&zv->zv_suspend_lock);
 		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
 		    FTAG, &os)) != 0) {
 			if (zv != NULL)
 				mutex_exit(&zv->zv_state_lock);
 			return (SET_ERROR(error));
 		}
 		owned = B_TRUE;
 		if (zv != NULL)
 			zv->zv_objset = os;
 	} else {
 		os = zv->zv_objset;
 	}
 
 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 
 	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
 	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
 		goto out;
 
 	error = zvol_update_volsize(volsize, os);
 
 	if (error == 0 && zv != NULL)
 		error = zvol_update_live_volsize(zv, volsize);
 out:
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	if (owned) {
 		dmu_objset_disown(os, B_TRUE, FTAG);
 		if (zv != NULL)
 			zv->zv_objset = NULL;
 	} else {
 		rw_exit(&zv->zv_suspend_lock);
 	}
 
 	if (zv != NULL)
 		mutex_exit(&zv->zv_state_lock);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Sanity check volume block size.
  */
 int
 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
 {
 	/* Record sizes above 128k need the feature to be enabled */
 	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
 		spa_t *spa;
 		int error;
 
 		if ((error = spa_open(name, &spa, FTAG)) != 0)
 			return (error);
 
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 
 		/*
 		 * We don't allow setting the property above 1MB,
 		 * unless the tunable has been changed.
 		 */
 		if (volblocksize > zfs_max_recordsize)
 			return (SET_ERROR(EDOM));
 
 		spa_close(spa, FTAG);
 	}
 
 	if (volblocksize < SPA_MINBLOCKSIZE ||
 	    volblocksize > SPA_MAXBLOCKSIZE ||
 	    !ISP2(volblocksize))
 		return (SET_ERROR(EDOM));
 
 	return (0);
 }
 
 /*
  * Set ZFS_PROP_VOLBLOCKSIZE set entry point.
  */
 int
 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
 {
 	zvol_state_t *zv;
 	dmu_tx_t *tx;
 	int error;
 
 	zv = zvol_find_by_name(name, RW_READER);
 
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_READ_HELD(&zv->zv_suspend_lock));
 
 	if (zv->zv_flags & ZVOL_RDONLY) {
 		mutex_exit(&zv->zv_state_lock);
 		rw_exit(&zv->zv_suspend_lock);
 		return (SET_ERROR(EROFS));
 	}
 
 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 	} else {
 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
 		    volblocksize, 0, tx);
 		if (error == ENOTSUP)
 			error = SET_ERROR(EBUSY);
 		dmu_tx_commit(tx);
 		if (error == 0)
 			zv->zv_volblocksize = volblocksize;
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 	rw_exit(&zv->zv_suspend_lock);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
  * implement DKIOCFREE/free-long-range.
  */
 static int
 zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zvol_state_t *zv = arg1;
 	lr_truncate_t *lr = arg2;
 	uint64_t offset, length;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 }
 
 /*
  * Replay a TX_WRITE ZIL transaction that didn't get committed
  * after a system failure
  */
 static int
 zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zvol_state_t *zv = arg1;
 	lr_write_t *lr = arg2;
 	objset_t *os = zv->zv_objset;
 	char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 	uint64_t offset, length;
 	dmu_tx_t *tx;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 		if (length < blocksize) {
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 	} else {
 		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 		dmu_tx_commit(tx);
 	}
 
 	return (error);
 }
 
 static int
 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 {
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Callback vectors for replaying records.
  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
  */
 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* no such transaction type */
 	zvol_replay_err,	/* TX_CREATE */
 	zvol_replay_err,	/* TX_MKDIR */
 	zvol_replay_err,	/* TX_MKXATTR */
 	zvol_replay_err,	/* TX_SYMLINK */
 	zvol_replay_err,	/* TX_REMOVE */
 	zvol_replay_err,	/* TX_RMDIR */
 	zvol_replay_err,	/* TX_LINK */
 	zvol_replay_err,	/* TX_RENAME */
 	zvol_replay_write,	/* TX_WRITE */
 	zvol_replay_truncate,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
 	zvol_replay_err,	/* TX_CREATE_ATTR */
 	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL */
 	zvol_replay_err,	/* TX_MKDIR_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
 	zvol_replay_err,	/* TX_WRITE2 */
 };
 
 /*
  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
  *
  * We store data in the log buffers if it's small enough.
  * Otherwise we will later flush the data out via dmu_sync().
  */
 ssize_t zvol_immediate_write_sz = 32768;
 
 static void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
     uint64_t size, int sync)
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 		write_state = WR_INDIRECT;
 	else if (!spa_has_slogs(zilog->zl_spa) &&
 	    size >= blocksize && blocksize > zvol_immediate_write_sz)
 		write_state = WR_INDIRECT;
 	else if (sync)
 		write_state = WR_COPIED;
 	else
 		write_state = WR_NEED_COPY;
 
 	while (size) {
 		itx_t *itx;
 		lr_write_t *lr;
 		itx_wr_state_t wr_state = write_state;
 		ssize_t len = size;
 
 		if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA)
 			wr_state = WR_NEED_COPY;
 		else if (wr_state == WR_INDIRECT)
 			len = MIN(blocksize - P2PHASE(offset, blocksize), size);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
 		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 		if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
 		    offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			wr_state = WR_NEED_COPY;
 		}
 
 		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = offset;
 		lr->lr_length = len;
 		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
 		itx->itx_sync = sync;
 
 		(void) zil_itx_assign(zilog, itx, tx);
 
 		offset += len;
 		size -= len;
 	}
 }
 
 typedef struct zv_request {
 	zvol_state_t	*zv;
 	struct bio	*bio;
 	rl_t		*rl;
 } zv_request_t;
 
 static void
 uio_from_bio(uio_t *uio, struct bio *bio)
 {
 	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
 	uio->uio_skip = BIO_BI_SKIP(bio);
 	uio->uio_resid = BIO_BI_SIZE(bio);
 	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
 	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
 	uio->uio_limit = MAXOFFSET_T;
 	uio->uio_segflg = UIO_BVEC;
 }
 
 static void
 zvol_write(void *arg)
 {
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
 	zvol_state_t *zv = zvr->zv;
 	uint64_t volsize = zv->zv_volsize;
 	boolean_t sync;
 	int error = 0;
 	unsigned long start_jif;
 
 	uio_from_bio(&uio, bio);
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
 	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio.uio_loffset;
 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 
 		if (bytes > volsize - off)	/* don't write past the end */
 			bytes = volsize - off;
 
 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 
 		/* This will only fail for ENOSPC */
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			break;
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 		if (error == 0)
 			zvol_log_write(zv, tx, off, bytes, sync);
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_range_unlock(zvr->rl);
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	rw_exit(&zv->zv_suspend_lock);
 	blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
 	    start_jif);
 	BIO_END_IO(bio, -error);
 	kmem_free(zvr, sizeof (zv_request_t));
 }
 
 /*
  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
  */
 static void
 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
     boolean_t sync)
 {
 	itx_t *itx;
 	lr_truncate_t *lr;
 	zilog_t *zilog = zv->zv_zilog;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
 	lr = (lr_truncate_t *)&itx->itx_lr;
 	lr->lr_foid = ZVOL_OBJ;
 	lr->lr_offset = off;
 	lr->lr_length = len;
 
 	itx->itx_sync = sync;
 	zil_itx_assign(zilog, itx, tx);
 }
 
 static void
 zvol_discard(void *arg)
 {
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	zvol_state_t *zv = zvr->zv;
 	uint64_t start = BIO_BI_SECTOR(bio) << 9;
 	uint64_t size = BIO_BI_SIZE(bio);
 	uint64_t end = start + size;
 	boolean_t sync;
 	int error = 0;
 	dmu_tx_t *tx;
 	unsigned long start_jif;
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
 	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	if (end > zv->zv_volsize) {
 		error = SET_ERROR(EIO);
 		goto unlock;
 	}
 
 	/*
 	 * Align the request to volume block boundaries when a secure erase is
 	 * not required.  This will prevent dnode_free_range() from zeroing out
 	 * the unaligned parts which is slow (read-modify-write) and useless
 	 * since we are not freeing any space by doing so.
 	 */
 	if (!bio_is_secure_erase(bio)) {
 		start = P2ROUNDUP(start, zv->zv_volblocksize);
 		end = P2ALIGN(end, zv->zv_volblocksize);
 		size = end - start;
 	}
 
 	if (start >= end)
 		goto unlock;
 
 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 	} else {
 		zvol_log_truncate(zv, tx, start, size, B_TRUE);
 		dmu_tx_commit(tx);
 		error = dmu_free_long_range(zv->zv_objset,
 		    ZVOL_OBJ, start, size);
 	}
 unlock:
 	zfs_range_unlock(zvr->rl);
 	if (error == 0 && sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	rw_exit(&zv->zv_suspend_lock);
 	blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
 	    start_jif);
 	BIO_END_IO(bio, -error);
 	kmem_free(zvr, sizeof (zv_request_t));
 }
 
 static void
 zvol_read(void *arg)
 {
 	zv_request_t *zvr = arg;
 	struct bio *bio = zvr->bio;
 	uio_t uio;
 	zvol_state_t *zv = zvr->zv;
 	uint64_t volsize = zv->zv_volsize;
 	int error = 0;
 	unsigned long start_jif;
 
 	uio_from_bio(&uio, bio);
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
 	    &zv->zv_disk->part0);
 
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 
 		/* don't read past the end */
 		if (bytes > volsize - uio.uio_loffset)
 			bytes = volsize - uio.uio_loffset;
 
 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 	}
 	zfs_range_unlock(zvr->rl);
 
 	rw_exit(&zv->zv_suspend_lock);
 	blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
 	    start_jif);
 	BIO_END_IO(bio, -error);
 	kmem_free(zvr, sizeof (zv_request_t));
 }
 
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 {
 	zvol_state_t *zv = q->queuedata;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
 	uint64_t size = BIO_BI_SIZE(bio);
 	int rw = bio_data_dir(bio);
 	zv_request_t *zvr;
 
 	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
 		printk(KERN_INFO
 		    "%s: bad access: offset=%llu, size=%lu\n",
 		    zv->zv_disk->disk_name,
 		    (long long unsigned)offset,
 		    (long unsigned)size);
 
 		BIO_END_IO(bio, -SET_ERROR(EIO));
 		goto out;
 	}
 
 	if (rw == WRITE) {
 		boolean_t need_sync = B_FALSE;
 
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 			BIO_END_IO(bio, -SET_ERROR(EROFS));
 			goto out;
 		}
 
 		/*
 		 * To be released in the I/O function. See the comment on
 		 * zfs_range_lock below.
 		 */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/* bio marked as FLUSH need to flush before write */
 		if (bio_is_flush(bio))
 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 		/* Some requests are just for flush and nothing else. */
 		if (size == 0) {
 			rw_exit(&zv->zv_suspend_lock);
 			BIO_END_IO(bio, 0);
 			goto out;
 		}
 
 		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
 		zvr->zv = zv;
 		zvr->bio = bio;
 
 		/*
 		 * To be released in the I/O function. Since the I/O functions
 		 * are asynchronous, we take it here synchronously to make
 		 * sure overlapped I/Os are properly ordered.
 		 */
 		zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
 		    RL_WRITER);
 		/*
 		 * Sync writes and discards execute zil_commit() which may need
 		 * to take a RL_READER lock on the whole block being modified
 		 * via its zillog->zl_get_data(): to avoid circular dependency
 		 * issues with taskq threads execute these requests
 		 * synchronously here in zvol_request().
 		 */
 		need_sync = bio_is_fua(bio) ||
 		    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
 			if (zvol_request_sync || need_sync ||
 			    taskq_dispatch(zvol_taskq, zvol_discard, zvr,
 			    TQ_SLEEP) == TASKQID_INVALID)
 				zvol_discard(zvr);
 		} else {
 			if (zvol_request_sync || need_sync ||
 			    taskq_dispatch(zvol_taskq, zvol_write, zvr,
 			    TQ_SLEEP) == TASKQID_INVALID)
 				zvol_write(zvr);
 		}
 	} else {
 		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
 		zvr->zv = zv;
 		zvr->bio = bio;
 
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
 		    RL_READER);
 		if (zvol_request_sync || taskq_dispatch(zvol_taskq,
 		    zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
 			zvol_read(zvr);
 	}
 
 out:
 	spl_fstrans_unmark(cookie);
 #ifdef HAVE_MAKE_REQUEST_FN_RET_INT
 	return (0);
 #elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
 	return (BLK_QC_T_NONE);
 #endif
 }
 
 static void
 zvol_get_done(zgd_t *zgd, int error)
 {
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	if (error == 0 && zgd->zgd_bp)
-		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
 static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 {
 	zvol_state_t *zv = arg;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
 
-	ASSERT(zio != NULL);
-	ASSERT(size != 0);
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_zilog = zv->zv_zilog;
+	zgd->zgd_lwb = lwb;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
 		    RL_READER);
 		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's written out
 		 * and its checksum is being calculated that no one can change
 		 * the data. Contrarily to zfs_get_data we need not re-check
 		 * blocksize after we get the lock because it cannot be changed.
 		 */
 		size = zv->zv_volblocksize;
 		offset = P2ALIGN_TYPED(offset, size, uint64_t);
 		zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
 		    RL_READER);
 		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db != NULL);
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zvol_get_done, zgd);
 
 			if (error == 0)
 				return (0);
 		}
 	}
 
 	zvol_get_done(zgd, error);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  */
 static void
 zvol_insert(zvol_state_t *zv)
 {
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 	ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
 	list_insert_head(&zvol_state_list, zv);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 }
 
 /*
  * Simply remove the zvol from to list of zvols.
  */
 static void
 zvol_remove(zvol_state_t *zv)
 {
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 	list_remove(&zvol_state_list, zv);
 	hlist_del(&zv->zv_hlink);
 }
 
 /*
  * Setup zv after we just own the zv->objset
  */
 static int
 zvol_setup_zv(zvol_state_t *zv)
 {
 	uint64_t volsize;
 	int error;
 	uint64_t ro;
 	objset_t *os = zv->zv_objset;
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_LOCK_HELD(&zv->zv_suspend_lock));
 
 	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
 	if (error)
 		return (SET_ERROR(error));
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error)
 		return (SET_ERROR(error));
 
 	error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn);
 	if (error)
 		return (SET_ERROR(error));
 
 	set_capacity(zv->zv_disk, volsize >> 9);
 	zv->zv_volsize = volsize;
 	zv->zv_zilog = zil_open(os, zvol_get_data);
 
 	if (ro || dmu_objset_is_snapshot(os) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		set_disk_ro(zv->zv_disk, 1);
 		zv->zv_flags |= ZVOL_RDONLY;
 	} else {
 		set_disk_ro(zv->zv_disk, 0);
 		zv->zv_flags &= ~ZVOL_RDONLY;
 	}
 	return (0);
 }
 
 /*
  * Shutdown every zv_objset related stuff except zv_objset itself.
  * The is the reverse of zvol_setup_zv.
  */
 static void
 zvol_shutdown_zv(zvol_state_t *zv)
 {
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_LOCK_HELD(&zv->zv_suspend_lock));
 
 	zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
 
 	dnode_rele(zv->zv_dn, FTAG);
 	zv->zv_dn = NULL;
 
 	/*
 	 * Evict cached data
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
 	    !(zv->zv_flags & ZVOL_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 	(void) dmu_objset_evict_dbufs(zv->zv_objset);
 }
 
 /*
  * return the proper tag for rollback and recv
  */
 void *
 zvol_tag(zvol_state_t *zv)
 {
 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 	return (zv->zv_open_count > 0 ? zv : NULL);
 }
 
 /*
  * Suspend the zvol for recv and rollback.
  */
 zvol_state_t *
 zvol_suspend(const char *name)
 {
 	zvol_state_t *zv;
 
 	zv = zvol_find_by_name(name, RW_WRITER);
 
 	if (zv == NULL)
 		return (NULL);
 
 	/* block all I/O, release in zvol_resume. */
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_WRITE_HELD(&zv->zv_suspend_lock));
 
 	atomic_inc(&zv->zv_suspend_ref);
 
 	if (zv->zv_open_count > 0)
 		zvol_shutdown_zv(zv);
 
 	/*
 	 * do not hold zv_state_lock across suspend/resume to
 	 * avoid locking up zvol lookups
 	 */
 	mutex_exit(&zv->zv_state_lock);
 
 	/* zv_suspend_lock is released in zvol_resume() */
 	return (zv);
 }
 
 int
 zvol_resume(zvol_state_t *zv)
 {
 	int error = 0;
 
 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 
 	mutex_enter(&zv->zv_state_lock);
 
 	if (zv->zv_open_count > 0) {
 		VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
 		VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
 		VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
 		dmu_objset_rele(zv->zv_objset, zv);
 
 		error = zvol_setup_zv(zv);
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 
 	rw_exit(&zv->zv_suspend_lock);
 	/*
 	 * We need this because we don't hold zvol_state_lock while releasing
 	 * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
 	 * zv_suspend_lock to determine it is safe to free because rwlock is
 	 * not inherent atomic.
 	 */
 	atomic_dec(&zv->zv_suspend_ref);
 
 	return (SET_ERROR(error));
 }
 
 static int
 zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	int error, locked = 0;
 
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	/*
 	 * In all other cases the spa_namespace_lock is taken before the
 	 * bdev->bd_mutex lock.	 But in this case the Linux __blkdev_get()
 	 * function calls fops->open() with the bdev->bd_mutex lock held.
 	 * This deadlock can be easily observed with zvols used as vdevs.
 	 *
 	 * To avoid a potential lock inversion deadlock we preemptively
 	 * try to take the spa_namespace_lock().  Normally it will not
 	 * be contended and this is safe because spa_open_common() handles
 	 * the case where the caller already holds the spa_namespace_lock.
 	 *
 	 * When it is contended we risk a lock inversion if we were to
 	 * block waiting for the lock.	Luckily, the __blkdev_get()
 	 * function allows us to return -ERESTARTSYS which will result in
 	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
 	 * called again.  This process can be repeated safely until both
 	 * locks are acquired.
 	 */
 	if (!mutex_owned(&spa_namespace_lock)) {
 		locked = mutex_tryenter(&spa_namespace_lock);
 		if (!locked)
 			return (-SET_ERROR(ERESTARTSYS));
 	}
 
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, 1, zv, &os);
 	if (error)
 		goto out_mutex;
 
 	zv->zv_objset = os;
 
 	error = zvol_setup_zv(zv);
 
 	if (error) {
 		dmu_objset_disown(os, 1, zv);
 		zv->zv_objset = NULL;
 	}
 
 out_mutex:
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 	return (SET_ERROR(-error));
 }
 
 static void
 zvol_last_close(zvol_state_t *zv)
 {
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	zvol_shutdown_zv(zv);
 
 	dmu_objset_disown(zv->zv_objset, 1, zv);
 	zv->zv_objset = NULL;
 }
 
 static int
 zvol_open(struct block_device *bdev, fmode_t flag)
 {
 	zvol_state_t *zv;
 	int error = 0;
 	boolean_t drop_suspend = B_TRUE;
 
 	ASSERT(!MUTEX_HELD(&zvol_state_lock));
 
 	mutex_enter(&zvol_state_lock);
 	/*
 	 * Obtain a copy of private_data under the zvol_state_lock to make
 	 * sure that either the result of zvol free code path setting
 	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
 	 * is not called on this zv because of the positive zv_open_count.
 	 */
 	zv = bdev->bd_disk->private_data;
 	if (zv == NULL) {
 		mutex_exit(&zvol_state_lock);
 		return (SET_ERROR(-ENXIO));
 	}
 
 	mutex_enter(&zv->zv_state_lock);
 	/*
 	 * make sure zvol is not suspended during first open
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 0) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
 				drop_suspend = B_FALSE;
 			}
 		}
 	} else {
 		drop_suspend = B_FALSE;
 	}
 	mutex_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
 
 	if (zv->zv_open_count == 0) {
 		error = zvol_first_open(zv);
 		if (error)
 			goto out_mutex;
 	}
 
 	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 		error = -EROFS;
 		goto out_open_count;
 	}
 
 	zv->zv_open_count++;
 
 	check_disk_change(bdev);
 
 out_open_count:
 	if (zv->zv_open_count == 0)
 		zvol_last_close(zv);
 out_mutex:
 	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 	if (error == -ERESTARTSYS)
 		schedule();
 
 	return (SET_ERROR(error));
 }
 
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 static void
 #else
 static int
 #endif
 zvol_release(struct gendisk *disk, fmode_t mode)
 {
 	zvol_state_t *zv;
 	boolean_t drop_suspend = B_TRUE;
 
 	ASSERT(!MUTEX_HELD(&zvol_state_lock));
 
 	mutex_enter(&zvol_state_lock);
 	zv = disk->private_data;
 
 	mutex_enter(&zv->zv_state_lock);
 	ASSERT(zv->zv_open_count > 0);
 	/*
 	 * make sure zvol is not suspended during last close
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 1) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 1) {
 				rw_exit(&zv->zv_suspend_lock);
 				drop_suspend = B_FALSE;
 			}
 		}
 	} else {
 		drop_suspend = B_FALSE;
 	}
 	mutex_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
 
 	zv->zv_open_count--;
 	if (zv->zv_open_count == 0)
 		zvol_last_close(zv);
 
 	mutex_exit(&zv->zv_state_lock);
 
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 
 #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	return (0);
 #endif
 }
 
 static int
 zvol_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	int error = 0;
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	switch (cmd) {
 	case BLKFLSBUF:
 		fsync_bdev(bdev);
 		invalidate_bdev(bdev);
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
 		    !(zv->zv_flags & ZVOL_RDONLY))
 			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
 		rw_exit(&zv->zv_suspend_lock);
 		break;
 
 	case BLKZNAME:
 		mutex_enter(&zv->zv_state_lock);
 		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 		mutex_exit(&zv->zv_state_lock);
 		break;
 
 	default:
 		error = -ENOTTY;
 		break;
 	}
 
 	return (SET_ERROR(error));
 }
 
 #ifdef CONFIG_COMPAT
 static int
 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned cmd, unsigned long arg)
 {
 	return (zvol_ioctl(bdev, mode, cmd, arg));
 }
 #else
 #define	zvol_compat_ioctl	NULL
 #endif
 
 static int zvol_media_changed(struct gendisk *disk)
 {
 	zvol_state_t *zv = disk->private_data;
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	return (zv->zv_changed);
 }
 
 static int zvol_revalidate_disk(struct gendisk *disk)
 {
 	zvol_state_t *zv = disk->private_data;
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	zv->zv_changed = 0;
 	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
 
 	return (0);
 }
 
 /*
  * Provide a simple virtual geometry for legacy compatibility.  For devices
  * smaller than 1 MiB a small head and sector count is used to allow very
  * tiny devices.  For devices over 1 Mib a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
 static int
 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	sector_t sectors;
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
 	sectors = get_capacity(zv->zv_disk);
 
 	if (sectors > 2048) {
 		geo->heads = 16;
 		geo->sectors = 63;
 	} else {
 		geo->heads = 2;
 		geo->sectors = 4;
 	}
 
 	geo->start = 0;
 	geo->cylinders = sectors / (geo->heads * geo->sectors);
 
 	return (0);
 }
 
 static struct kobject *
 zvol_probe(dev_t dev, int *part, void *arg)
 {
 	zvol_state_t *zv;
 	struct kobject *kobj;
 
 	zv = zvol_find_by_dev(dev);
 	kobj = zv ? get_disk(zv->zv_disk) : NULL;
 	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
 	if (zv)
 		mutex_exit(&zv->zv_state_lock);
 
 	return (kobj);
 }
 
 #ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
 static struct block_device_operations zvol_ops = {
 	.open			= zvol_open,
 	.release		= zvol_release,
 	.ioctl			= zvol_ioctl,
 	.compat_ioctl		= zvol_compat_ioctl,
 	.media_changed		= zvol_media_changed,
 	.revalidate_disk	= zvol_revalidate_disk,
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 };
 
 #else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
 
 static int
 zvol_open_by_inode(struct inode *inode, struct file *file)
 {
 	return (zvol_open(inode->i_bdev, file->f_mode));
 }
 
 static int
 zvol_release_by_inode(struct inode *inode, struct file *file)
 {
 	return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
 }
 
 static int
 zvol_ioctl_by_inode(struct inode *inode, struct file *file,
     unsigned int cmd, unsigned long arg)
 {
 	if (file == NULL || inode == NULL)
 		return (SET_ERROR(-EINVAL));
 
 	return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
 }
 
 #ifdef CONFIG_COMPAT
 static long
 zvol_compat_ioctl_by_inode(struct file *file,
     unsigned int cmd, unsigned long arg)
 {
 	if (file == NULL)
 		return (SET_ERROR(-EINVAL));
 
 	return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
 	    file->f_mode, cmd, arg));
 }
 #else
 #define	zvol_compat_ioctl_by_inode	NULL
 #endif
 
 static struct block_device_operations zvol_ops = {
 	.open			= zvol_open_by_inode,
 	.release		= zvol_release_by_inode,
 	.ioctl			= zvol_ioctl_by_inode,
 	.compat_ioctl		= zvol_compat_ioctl_by_inode,
 	.media_changed		= zvol_media_changed,
 	.revalidate_disk	= zvol_revalidate_disk,
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 };
 #endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
 
 /*
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
 static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
 	zvol_state_t *zv;
 	uint64_t volmode;
 
 	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
 		return (NULL);
 
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
 		return (NULL);
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 
 	list_link_init(&zv->zv_next);
 
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
 	if (zv->zv_queue == NULL)
 		goto out_kmem;
 
 	blk_queue_make_request(zv->zv_queue, zvol_request);
 	blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE);
 
 	/* Limit read-ahead to a single page to prevent over-prefetching. */
 	blk_queue_set_read_ahead(zv->zv_queue, 1);
 
 	/* Disable write merging in favor of the ZIO pipeline. */
 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, zv->zv_queue);
 
 	zv->zv_disk = alloc_disk(ZVOL_MINORS);
 	if (zv->zv_disk == NULL)
 		goto out_queue;
 
 	zv->zv_queue->queuedata = zv;
 	zv->zv_dev = dev;
 	zv->zv_open_count = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
 	zfs_rlock_init(&zv->zv_range_lock);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 
 	zv->zv_disk->major = zvol_major;
 	if (volmode == ZFS_VOLMODE_DEV) {
 		/*
 		 * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
 		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
 		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
 		 * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
 		 * setting gendisk->flags accordingly.
 		 */
 		zv->zv_disk->minors = 1;
 #if defined(GENHD_FL_EXT_DEVT)
 		zv->zv_disk->flags &= ~GENHD_FL_EXT_DEVT;
 #endif
 #if defined(GENHD_FL_NO_PART_SCAN)
 		zv->zv_disk->flags |= GENHD_FL_NO_PART_SCAN;
 #endif
 	}
 	zv->zv_disk->first_minor = (dev & MINORMASK);
 	zv->zv_disk->fops = &zvol_ops;
 	zv->zv_disk->private_data = zv;
 	zv->zv_disk->queue = zv->zv_queue;
 	snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
 
 	return (zv);
 
 out_queue:
 	blk_cleanup_queue(zv->zv_queue);
 out_kmem:
 	kmem_free(zv, sizeof (zvol_state_t));
 
 	return (NULL);
 }
 
 /*
  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
  * At this time, the structure is not opened by anyone, is taken off
  * the zvol_state_list, and has its private data set to NULL.
  * The zvol_state_lock is dropped.
  */
 static void
 zvol_free(void *arg)
 {
 	zvol_state_t *zv = arg;
 
 	ASSERT(!MUTEX_HELD(&zvol_state_lock));
 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(zv->zv_open_count == 0);
 	ASSERT(zv->zv_disk->private_data == NULL);
 
 	rw_destroy(&zv->zv_suspend_lock);
 	zfs_rlock_destroy(&zv->zv_range_lock);
 
 	del_gendisk(zv->zv_disk);
 	blk_cleanup_queue(zv->zv_queue);
 	put_disk(zv->zv_disk);
 
 	ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
 
 	mutex_destroy(&zv->zv_state_lock);
 
 	kmem_free(zv, sizeof (zvol_state_t));
 }
 
 /*
  * Create a block device minor node and setup the linkage between it
  * and the specified volume.  Once this function returns the block
  * device is live and ready for use.
  */
 static int
 zvol_create_minor_impl(const char *name)
 {
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
 	uint64_t len;
 	unsigned minor = 0;
 	int error = 0;
 	int idx;
 	uint64_t hash = zvol_name_hash(name);
 
 	if (zvol_inhibit_dev)
 		return (0);
 
 	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
 
 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
 		ida_simple_remove(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
 	if (error)
 		goto out_doi;
 
 	error = dmu_object_info(os, ZVOL_OBJ, doi);
 	if (error)
 		goto out_dmu_objset_disown;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error)
 		goto out_dmu_objset_disown;
 
 	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
 	if (zv == NULL) {
 		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
 	}
 	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
 	zv->zv_volblocksize = doi->doi_data_block_size;
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
 
 	blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
 	blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
 	blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
 	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
 	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
 	blk_queue_max_discard_sectors(zv->zv_queue,
 	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
 	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
 #ifdef QUEUE_FLAG_NONROT
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
 #endif
 #ifdef QUEUE_FLAG_ADD_RANDOM
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
 #endif
 
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
 			zil_destroy(dmu_objset_zil(os), B_FALSE);
 		else
 			zil_replay(os, zv, zvol_replay_vector);
 	}
 
 	/*
 	 * When udev detects the addition of the device it will immediately
 	 * invoke blkid(8) to determine the type of content on the device.
 	 * Prefetching the blocks commonly scanned by blkid(8) will speed
 	 * up this process.
 	 */
 	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
 	if (len > 0) {
 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	if (error == 0) {
 		mutex_enter(&zvol_state_lock);
 		zvol_insert(zv);
 		mutex_exit(&zvol_state_lock);
 		add_disk(zv->zv_disk);
 	} else {
 		ida_simple_remove(&zvol_ida, idx);
 	}
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Rename a block device minor mode for the specified volume.
  */
 static void
 zvol_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	int readonly = get_disk_ro(zv->zv_disk);
 
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 
 	/* move to new hashtable entry  */
 	zv->zv_hash = zvol_name_hash(zv->zv_name);
 	hlist_del(&zv->zv_hlink);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 
 	/*
 	 * The block device's read-only state is briefly changed causing
 	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
 	 * the name change and fixes the symlinks.  This does not change
 	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
 	 * changes.  This would normally be done using kobject_uevent() but
 	 * that is a GPL-only symbol which is why we need this workaround.
 	 */
 	set_disk_ro(zv->zv_disk, !readonly);
 	set_disk_ro(zv->zv_disk, readonly);
 }
 
 typedef struct minors_job {
 	list_t *list;
 	list_node_t link;
 	/* input */
 	char *name;
 	/* output */
 	int error;
 } minors_job_t;
 
 /*
  * Prefetch zvol dnodes for the minors_job
  */
 static void
 zvol_prefetch_minors_impl(void *arg)
 {
 	minors_job_t *job = arg;
 	char *dsname = job->name;
 	objset_t *os = NULL;
 
 	job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
 	    FTAG, &os);
 	if (job->error == 0) {
 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
 		dmu_objset_disown(os, B_TRUE, FTAG);
 	}
 }
 
 /*
  * Mask errors to continue dmu_objset_find() traversal
  */
 static int
 zvol_create_snap_minor_cb(const char *dsname, void *arg)
 {
 	minors_job_t *j = arg;
 	list_t *minors_list = j->list;
 	const char *name = j->name;
 
 	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
 
 	/* skip the designated dataset */
 	if (name && strcmp(dsname, name) == 0)
 		return (0);
 
 	/* at this point, the dsname should name a snapshot */
 	if (strchr(dsname, '@') == 0) {
 		dprintf("zvol_create_snap_minor_cb(): "
 		    "%s is not a shapshot name\n", dsname);
 	} else {
 		minors_job_t *job;
 		char *n = strdup(dsname);
 		if (n == NULL)
 			return (0);
 
 		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 		job->name = n;
 		job->list = minors_list;
 		job->error = 0;
 		list_insert_tail(minors_list, job);
 		/* don't care if dispatch fails, because job->error is 0 */
 		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
 		    TQ_SLEEP);
 	}
 
 	return (0);
 }
 
 /*
  * Mask errors to continue dmu_objset_find() traversal
  */
 static int
 zvol_create_minors_cb(const char *dsname, void *arg)
 {
 	uint64_t snapdev;
 	int error;
 	list_t *minors_list = arg;
 
 	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
 
 	error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
 	if (error)
 		return (0);
 
 	/*
 	 * Given the name and the 'snapdev' property, create device minor nodes
 	 * with the linkages to zvols/snapshots as needed.
 	 * If the name represents a zvol, create a minor node for the zvol, then
 	 * check if its snapshots are 'visible', and if so, iterate over the
 	 * snapshots and create device minor nodes for those.
 	 */
 	if (strchr(dsname, '@') == 0) {
 		minors_job_t *job;
 		char *n = strdup(dsname);
 		if (n == NULL)
 			return (0);
 
 		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 		job->name = n;
 		job->list = minors_list;
 		job->error = 0;
 		list_insert_tail(minors_list, job);
 		/* don't care if dispatch fails, because job->error is 0 */
 		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
 		    TQ_SLEEP);
 
 		if (snapdev == ZFS_SNAPDEV_VISIBLE) {
 			/*
 			 * traverse snapshots only, do not traverse children,
 			 * and skip the 'dsname'
 			 */
 			error = dmu_objset_find((char *)dsname,
 			    zvol_create_snap_minor_cb, (void *)job,
 			    DS_FIND_SNAPSHOTS);
 		}
 	} else {
 		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
 		    dsname);
 	}
 
 	return (0);
 }
 
 /*
  * Create minors for the specified dataset, including children and snapshots.
  * Pay attention to the 'snapdev' property and iterate over the snapshots
  * only if they are 'visible'. This approach allows one to assure that the
  * snapshot metadata is read from disk only if it is needed.
  *
  * The name can represent a dataset to be recursively scanned for zvols and
  * their snapshots, or a single zvol snapshot. If the name represents a
  * dataset, the scan is performed in two nested stages:
  * - scan the dataset for zvols, and
  * - for each zvol, create a minor node, then check if the zvol's snapshots
  *   are 'visible', and only then iterate over the snapshots if needed
  *
  * If the name represents a snapshot, a check is performed if the snapshot is
  * 'visible' (which also verifies that the parent is a zvol), and if so,
  * a minor node for that snapshot is created.
  */
 static int
 zvol_create_minors_impl(const char *name)
 {
 	int error = 0;
 	fstrans_cookie_t cookie;
 	char *atp, *parent;
 	list_t minors_list;
 	minors_job_t *job;
 
 	if (zvol_inhibit_dev)
 		return (0);
 
 	/*
 	 * This is the list for prefetch jobs. Whenever we found a match
 	 * during dmu_objset_find, we insert a minors_job to the list and do
 	 * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
 	 * any lock because all list operation is done on the current thread.
 	 *
 	 * We will use this list to do zvol_create_minor_impl after prefetch
 	 * so we don't have to traverse using dmu_objset_find again.
 	 */
 	list_create(&minors_list, sizeof (minors_job_t),
 	    offsetof(minors_job_t, link));
 
 	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	(void) strlcpy(parent, name, MAXPATHLEN);
 
 	if ((atp = strrchr(parent, '@')) != NULL) {
 		uint64_t snapdev;
 
 		*atp = '\0';
 		error = dsl_prop_get_integer(parent, "snapdev",
 		    &snapdev, NULL);
 
 		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
 			error = zvol_create_minor_impl(name);
 	} else {
 		cookie = spl_fstrans_mark();
 		error = dmu_objset_find(parent, zvol_create_minors_cb,
 		    &minors_list, DS_FIND_CHILDREN);
 		spl_fstrans_unmark(cookie);
 	}
 
 	kmem_free(parent, MAXPATHLEN);
 	taskq_wait_outstanding(system_taskq, 0);
 
 	/*
 	 * Prefetch is completed, we can do zvol_create_minor_impl
 	 * sequentially.
 	 */
 	while ((job = list_head(&minors_list)) != NULL) {
 		list_remove(&minors_list, job);
 		if (!job->error)
 			zvol_create_minor_impl(job->name);
 		strfree(job->name);
 		kmem_free(job, sizeof (minors_job_t));
 	}
 
 	list_destroy(&minors_list);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Remove minors for specified dataset including children and snapshots.
  */
 static void
 zvol_remove_minors_impl(const char *name)
 {
 	zvol_state_t *zv, *zv_next;
 	int namelen = ((name) ? strlen(name) : 0);
 	taskqid_t t, tid = TASKQID_INVALID;
 	list_t free_list;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	list_create(&free_list, sizeof (zvol_state_t),
 	    offsetof(zvol_state_t, zv_next));
 
 	mutex_enter(&zvol_state_lock);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
 		    (strncmp(zv->zv_name, name, namelen) == 0 &&
 		    (zv->zv_name[namelen] == '/' ||
 		    zv->zv_name[namelen] == '@'))) {
 			/*
 			 * By holding zv_state_lock here, we guarantee that no
 			 * one is currently using this zv
 			 */
 
 			/* If in use, leave alone */
 			if (zv->zv_open_count > 0 ||
 			    atomic_read(&zv->zv_suspend_ref)) {
 				mutex_exit(&zv->zv_state_lock);
 				continue;
 			}
 
 			zvol_remove(zv);
 
 			/*
 			 * clear this while holding zvol_state_lock so
 			 * zvol_open won't open it
 			 */
 			zv->zv_disk->private_data = NULL;
 
 			/* Drop zv_state_lock before zvol_free() */
 			mutex_exit(&zv->zv_state_lock);
 
 			/* try parallel zv_free, if failed do it in place */
 			t = taskq_dispatch(system_taskq, zvol_free, zv,
 			    TQ_SLEEP);
 			if (t == TASKQID_INVALID)
 				list_insert_head(&free_list, zv);
 			else
 				tid = t;
 		} else {
 			mutex_exit(&zv->zv_state_lock);
 		}
 	}
 	mutex_exit(&zvol_state_lock);
 
 	/*
 	 * Drop zvol_state_lock before calling zvol_free()
 	 */
 	while ((zv = list_head(&free_list)) != NULL) {
 		list_remove(&free_list, zv);
 		zvol_free(zv);
 	}
 
 	if (tid != TASKQID_INVALID)
 		taskq_wait_outstanding(system_taskq, tid);
 }
 
 /* Remove minor for this specific volume only */
 static void
 zvol_remove_minor_impl(const char *name)
 {
 	zvol_state_t *zv = NULL, *zv_next;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	mutex_enter(&zvol_state_lock);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 		if (strcmp(zv->zv_name, name) == 0) {
 			/*
 			 * By holding zv_state_lock here, we guarantee that no
 			 * one is currently using this zv
 			 */
 
 			/* If in use, leave alone */
 			if (zv->zv_open_count > 0 ||
 			    atomic_read(&zv->zv_suspend_ref)) {
 				mutex_exit(&zv->zv_state_lock);
 				continue;
 			}
 			zvol_remove(zv);
 
 			/* clear this so zvol_open won't open it */
 			zv->zv_disk->private_data = NULL;
 
 			mutex_exit(&zv->zv_state_lock);
 			break;
 		} else {
 			mutex_exit(&zv->zv_state_lock);
 		}
 	}
 
 	/* Drop zvol_state_lock before calling zvol_free() */
 	mutex_exit(&zvol_state_lock);
 
 	if (zv != NULL)
 		zvol_free(zv);
 }
 
 /*
  * Rename minors for specified dataset including children and snapshots.
  */
 static void
 zvol_rename_minors_impl(const char *oldname, const char *newname)
 {
 	zvol_state_t *zv, *zv_next;
 	int oldnamelen, newnamelen;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	oldnamelen = strlen(oldname);
 	newnamelen = strlen(newname);
 
 	mutex_enter(&zvol_state_lock);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 
 		/* If in use, leave alone */
 		if (zv->zv_open_count > 0) {
 			mutex_exit(&zv->zv_state_lock);
 			continue;
 		}
 
 		if (strcmp(zv->zv_name, oldname) == 0) {
 			zvol_rename_minor(zv, newname);
 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
 		    (zv->zv_name[oldnamelen] == '/' ||
 		    zv->zv_name[oldnamelen] == '@')) {
 			char *name = kmem_asprintf("%s%c%s", newname,
 			    zv->zv_name[oldnamelen],
 			    zv->zv_name + oldnamelen + 1);
 			zvol_rename_minor(zv, name);
 			kmem_free(name, strlen(name + 1));
 		}
 
 		mutex_exit(&zv->zv_state_lock);
 	}
 
 	mutex_exit(&zvol_state_lock);
 }
 
 typedef struct zvol_snapdev_cb_arg {
 	uint64_t snapdev;
 } zvol_snapdev_cb_arg_t;
 
 static int
 zvol_set_snapdev_cb(const char *dsname, void *param)
 {
 	zvol_snapdev_cb_arg_t *arg = param;
 
 	if (strchr(dsname, '@') == NULL)
 		return (0);
 
 	switch (arg->snapdev) {
 		case ZFS_SNAPDEV_VISIBLE:
 			(void) zvol_create_minor_impl(dsname);
 			break;
 		case ZFS_SNAPDEV_HIDDEN:
 			(void) zvol_remove_minor_impl(dsname);
 			break;
 	}
 
 	return (0);
 }
 
 static void
 zvol_set_snapdev_impl(char *name, uint64_t snapdev)
 {
 	zvol_snapdev_cb_arg_t arg = {snapdev};
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	/*
 	 * The zvol_set_snapdev_sync() sets snapdev appropriately
 	 * in the dataset hierarchy. Here, we only scan snapshots.
 	 */
 	dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
 	spl_fstrans_unmark(cookie);
 }
 
 typedef struct zvol_volmode_cb_arg {
 	uint64_t volmode;
 } zvol_volmode_cb_arg_t;
 
 static void
 zvol_set_volmode_impl(char *name, uint64_t volmode)
 {
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	if (strchr(name, '@') != NULL)
 		return;
 
 	/*
 	 * It's unfortunate we need to remove minors before we create new ones:
 	 * this is necessary because our backing gendisk (zvol_state->zv_disk)
 	 * coule be different when we set, for instance, volmode from "geom"
 	 * to "dev" (or vice versa).
 	 * A possible optimization is to modify our consumers so we don't get
 	 * called when "volmode" does not change.
 	 */
 	switch (volmode) {
 		case ZFS_VOLMODE_NONE:
 			(void) zvol_remove_minor_impl(name);
 			break;
 		case ZFS_VOLMODE_GEOM:
 		case ZFS_VOLMODE_DEV:
 			(void) zvol_remove_minor_impl(name);
 			(void) zvol_create_minor_impl(name);
 			break;
 		case ZFS_VOLMODE_DEFAULT:
 			(void) zvol_remove_minor_impl(name);
 			if (zvol_volmode == ZFS_VOLMODE_NONE)
 				break;
 			else /* if zvol_volmode is invalid defaults to "geom" */
 				(void) zvol_create_minor_impl(name);
 			break;
 	}
 
 	spl_fstrans_unmark(cookie);
 }
 
 static zvol_task_t *
 zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
     uint64_t value)
 {
 	zvol_task_t *task;
 	char *delim;
 
 	/* Never allow tasks on hidden names. */
 	if (name1[0] == '$')
 		return (NULL);
 
 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
 	task->op = op;
 	task->value = value;
 	delim = strchr(name1, '/');
 	strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
 
 	strlcpy(task->name1, name1, MAXNAMELEN);
 	if (name2 != NULL)
 		strlcpy(task->name2, name2, MAXNAMELEN);
 
 	return (task);
 }
 
 static void
 zvol_task_free(zvol_task_t *task)
 {
 	kmem_free(task, sizeof (zvol_task_t));
 }
 
 /*
  * The worker thread function performed asynchronously.
  */
 static void
 zvol_task_cb(void *param)
 {
 	zvol_task_t *task = (zvol_task_t *)param;
 
 	switch (task->op) {
 	case ZVOL_ASYNC_CREATE_MINORS:
 		(void) zvol_create_minors_impl(task->name1);
 		break;
 	case ZVOL_ASYNC_REMOVE_MINORS:
 		zvol_remove_minors_impl(task->name1);
 		break;
 	case ZVOL_ASYNC_RENAME_MINORS:
 		zvol_rename_minors_impl(task->name1, task->name2);
 		break;
 	case ZVOL_ASYNC_SET_SNAPDEV:
 		zvol_set_snapdev_impl(task->name1, task->value);
 		break;
 	case ZVOL_ASYNC_SET_VOLMODE:
 		zvol_set_volmode_impl(task->name1, task->value);
 		break;
 	default:
 		VERIFY(0);
 		break;
 	}
 
 	zvol_task_free(task);
 }
 
 typedef struct zvol_set_prop_int_arg {
 	const char *zsda_name;
 	uint64_t zsda_value;
 	zprop_source_t zsda_source;
 	dmu_tx_t *zsda_tx;
 } zvol_set_prop_int_arg_t;
 
 /*
  * Sanity check the dataset for safe use by the sync task.  No additional
  * conditions are imposed.
  */
 static int
 zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd;
 	int error;
 
 	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
 	if (error != 0)
 		return (error);
 
 	dsl_dir_rele(dd, FTAG);
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	char dsname[MAXNAMELEN];
 	zvol_task_t *task;
 	uint64_t snapdev;
 
 	dsl_dataset_name(ds, dsname);
 	if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
 		return (0);
 	task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
 	if (task == NULL)
 		return (0);
 
 	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 	    task, TQ_SLEEP);
 	return (0);
 }
 
 /*
  * Traverse all child datasets and apply snapdev appropriately.
  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
  * dataset and read the effective "snapdev" on every child in the callback
  * function: this is because the value is not guaranteed to be the same in the
  * whole dataset hierarchy.
  */
 static void
 zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	int error;
 
 	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 	zsda->zsda_tx = tx;
 
 	error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
 	if (error == 0) {
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
 		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 		    &zsda->zsda_value, zsda->zsda_tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
 	    zsda, DS_FIND_CHILDREN);
 
 	dsl_dir_rele(dd, FTAG);
 }
 
 int
 zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
 {
 	zvol_set_prop_int_arg_t zsda;
 
 	zsda.zsda_name = ddname;
 	zsda.zsda_source = source;
 	zsda.zsda_value = snapdev;
 
 	return (dsl_sync_task(ddname, zvol_set_snapdev_check,
 	    zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Sanity check the dataset for safe use by the sync task.  No additional
  * conditions are imposed.
  */
 static int
 zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd;
 	int error;
 
 	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
 	if (error != 0)
 		return (error);
 
 	dsl_dir_rele(dd, FTAG);
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	char dsname[MAXNAMELEN];
 	zvol_task_t *task;
 	uint64_t volmode;
 
 	dsl_dataset_name(ds, dsname);
 	if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
 		return (0);
 	task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
 	if (task == NULL)
 		return (0);
 
 	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 	    task, TQ_SLEEP);
 	return (0);
 }
 
 /*
  * Traverse all child datasets and apply volmode appropriately.
  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
  * dataset and read the effective "volmode" on every child in the callback
  * function: this is because the value is not guaranteed to be the same in the
  * whole dataset hierarchy.
  */
 static void
 zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	int error;
 
 	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 	zsda->zsda_tx = tx;
 
 	error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
 	if (error == 0) {
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
 		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 		    &zsda->zsda_value, zsda->zsda_tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
 	    zsda, DS_FIND_CHILDREN);
 
 	dsl_dir_rele(dd, FTAG);
 }
 
 int
 zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
 {
 	zvol_set_prop_int_arg_t zsda;
 
 	zsda.zsda_name = ddname;
 	zsda.zsda_source = source;
 	zsda.zsda_value = volmode;
 
 	return (dsl_sync_task(ddname, zvol_set_volmode_check,
 	    zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 void
 zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
 	task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL);
 	if (task == NULL)
 		return;
 
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);
 }
 
 void
 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
 	task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
 	if (task == NULL)
 		return;
 
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);
 }
 
 void
 zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
     boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
 	task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
 	if (task == NULL)
 		return;
 
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
 		taskq_wait_id(spa->spa_zvol_taskq, id);
 }
 
 int
 zvol_init(void)
 {
 	int threads = MIN(MAX(zvol_threads, 1), 1024);
 	int i, error;
 
 	list_create(&zvol_state_list, sizeof (zvol_state_t),
 	    offsetof(zvol_state_t, zv_next));
 	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
 	ida_init(&zvol_ida);
 
 	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
 	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 	if (zvol_taskq == NULL) {
 		printk(KERN_INFO "ZFS: taskq_create() failed\n");
 		error = -ENOMEM;
 		goto out;
 	}
 
 	zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
 	    KM_SLEEP);
 	if (!zvol_htable) {
 		error = -ENOMEM;
 		goto out_taskq;
 	}
 	for (i = 0; i < ZVOL_HT_SIZE; i++)
 		INIT_HLIST_HEAD(&zvol_htable[i]);
 
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		goto out_free;
 	}
 
 	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
 	    THIS_MODULE, zvol_probe, NULL, NULL);
 
 	return (0);
 
 out_free:
 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 out_taskq:
 	taskq_destroy(zvol_taskq);
 out:
 	ida_destroy(&zvol_ida);
 	mutex_destroy(&zvol_state_lock);
 	list_destroy(&zvol_state_list);
 
 	return (SET_ERROR(error));
 }
 
 void
 zvol_fini(void)
 {
 	zvol_remove_minors_impl(NULL);
 
 	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 
 	taskq_destroy(zvol_taskq);
 	list_destroy(&zvol_state_list);
 	mutex_destroy(&zvol_state_lock);
 
 	ida_destroy(&zvol_ida);
 }
 
 /* BEGIN CSTYLED */
 module_param(zvol_inhibit_dev, uint, 0644);
 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
 
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
 MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
 
 module_param(zvol_request_sync, uint, 0644);
 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
 
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
 
 module_param(zvol_prefetch_bytes, uint, 0644);
 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 
 module_param(zvol_volmode, uint, 0644);
 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
 /* END CSTYLED */
diff --git a/scripts/zloop.sh b/scripts/zloop.sh
index a45c450247f9..afcd87f3626d 100755
--- a/scripts/zloop.sh
+++ b/scripts/zloop.sh
@@ -1,273 +1,273 @@
 #!/bin/bash
 
 #
 # CDDL HEADER START
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # CDDL HEADER END
 #
 
 #
 # Copyright (c) 2015 by Delphix. All rights reserved.
 # Copyright (C) 2016 Lawrence Livermore National Security, LLC.
 #
 
 BASE_DIR=$(dirname "$0")
 SCRIPT_COMMON=common.sh
 if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then
 	. "${BASE_DIR}/${SCRIPT_COMMON}"
 else
 	echo "Missing helper script ${SCRIPT_COMMON}" && exit 1
 fi
 
 # shellcheck disable=SC2034
 PROG=zloop.sh
 
 DEFAULTWORKDIR=/var/tmp
 DEFAULTCOREDIR=/var/tmp/zloop
 
 function usage
 {
 	echo -e "\n$0 [-t <timeout>] [ -s <vdev size> ] [-c <dump directory>]" \
 	    "[ -- [extra ztest parameters]]\n" \
 	    "\n" \
 	    "  This script runs ztest repeatedly with randomized arguments.\n" \
 	    "  If a crash is encountered, the ztest logs, any associated\n" \
 	    "  vdev files, and core file (if one exists) are moved to the\n" \
 	    "  output directory ($DEFAULTCOREDIR by default). Any options\n" \
 	    "  after the -- end-of-options marker will be passed to ztest.\n" \
 	    "\n" \
 	    "  Options:\n" \
 	    "    -t  Total time to loop for, in seconds. If not provided,\n" \
 	    "        zloop runs forever.\n" \
 	    "    -s  Size of vdev devices.\n" \
 	    "    -f  Specify working directory for ztest vdev files.\n" \
 	    "    -c  Specify a core dump directory to use.\n" \
 	    "    -h  Print this help message.\n" \
 	    "" >&2
 }
 
 function or_die
 {
 	# shellcheck disable=SC2068
 	$@
 	# shellcheck disable=SC2181
 	if [[ $? -ne 0 ]]; then
 		# shellcheck disable=SC2145
 		echo "Command failed: $@"
 		exit 1
 	fi
 }
 
 # core file helpers
 origcorepattern="$(cat /proc/sys/kernel/core_pattern)"
 coreglob="$(egrep -o '^([^|%[:space:]]*)' /proc/sys/kernel/core_pattern)*"
 
 if [[ $coreglob = "*" ]]; then
         echo "Setting core file pattern..."
         echo "core" > /proc/sys/kernel/core_pattern
         coreglob="$(egrep -o '^([^|%[:space:]]*)' \
             /proc/sys/kernel/core_pattern)*"
 fi
 
 function core_file
 {
 	# shellcheck disable=SC2012 disable=2086
         printf "%s" "$(ls -tr1 $coreglob 2> /dev/null | head -1)"
 }
 
 function core_prog
 {
 	prog=$ZTEST
 	core_id=$($GDB --batch -c "$1" | grep "Core was generated by" | \
 	    tr  \' ' ')
 	# shellcheck disable=SC2076
 	if [[ "$core_id" =~ "zdb "  ]]; then
 		prog=$ZDB
 	fi
 	printf "%s" "$prog"
 }
 
 function store_core
 {
 	core="$(core_file)"
 	if [[ $ztrc -ne 0 ]] || [[ -f "$core" ]]; then
 		df -h "$workdir" >>ztest.out
 		coreid=$(date "+zloop-%y%m%d-%H%M%S")
 		foundcrashes=$((foundcrashes + 1))
 
 		dest=$coredir/$coreid
 		or_die mkdir -p "$dest"
 		or_die mkdir -p "$dest/vdev"
 
 		echo "*** ztest crash found - moving logs to $dest"
 
 		or_die mv ztest.history "$dest/"
 		or_die mv ztest.ddt "$dest/"
 		or_die mv ztest.out "$dest/"
 		or_die mv "$workdir/ztest*" "$dest/vdev/"
 		or_die mv "$workdir/zpool.cache" "$dest/vdev/"
 
 		# check for core
 		if [[ -f "$core" ]]; then
 			coreprog=$(core_prog "$core")
 			corestatus=$($GDB --batch --quiet \
 			    -ex "set print thread-events off" \
 			    -ex "printf \"*\n* Backtrace \n*\n\"" \
 			    -ex "bt" \
 			    -ex "printf \"*\n* Libraries \n*\n\"" \
 			    -ex "info sharedlib" \
 			    -ex "printf \"*\n* Threads (full) \n*\n\"" \
 			    -ex "info threads" \
 			    -ex "printf \"*\n* Backtraces \n*\n\"" \
 			    -ex "thread apply all bt" \
 			    -ex "printf \"*\n* Backtraces (full) \n*\n\"" \
 			    -ex "thread apply all bt full" \
 			    -ex "quit" "$coreprog" "$core" | grep -v "New LWP")
 
 			# Dump core + logs to stored directory
 			echo "$corestatus" >>"$dest/status"
 			or_die mv "$core" "$dest/"
 
 			# Record info in cores logfile
 			echo "*** core @ $coredir/$coreid/$core:" | \
 			    tee -a ztest.cores
 			echo "$corestatus" | tee -a ztest.cores
 			echo "" | tee -a ztest.cores
 		fi
 		echo "continuing..."
 	fi
 }
 
 rngdpid=""
 function on_exit
 {
 	if [ -n "$rngdpid" ]; then
 		kill -9 "$rngdpid"
 	fi
 }
 trap on_exit EXIT
 
 # parse arguments
 # expected format: zloop [-t timeout] [-c coredir] [-- extra ztest args]
 coredir=$DEFAULTCOREDIR
 basedir=$DEFAULTWORKDIR
 rundir="zloop-run"
 timeout=0
 size="512m"
 while getopts ":ht:s:c:f:" opt; do
 	case $opt in
 		t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;;
 		s ) [[ $OPTARG ]] && size=$OPTARG ;;
 		c ) [[ $OPTARG ]] && coredir=$OPTARG ;;
 		f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;;
 		h ) usage
 		    exit 2
 		    ;;
 		* ) echo "Invalid argument: -$OPTARG";
 		    usage
 		    exit 1
 	esac
 done
 # pass remaining arguments on to ztest
 shift $((OPTIND - 1))
 
 # enable core dumps
 ulimit -c unlimited
 
 if [[ -f "$(core_file)" ]]; then
 	echo -n "There's a core dump here you might want to look at first... "
 	core_file
 	exit 1
 fi
 
 if [[ ! -d $coredir ]]; then
 	echo "core dump directory ($coredir) does not exist, creating it."
 	or_die mkdir -p "$coredir"
 fi
 
 if [[ ! -w $coredir ]]; then
 	echo "core dump directory ($coredir) is not writable."
 	exit 1
 fi
 
 or_die rm -f ztest.history
 or_die rm -f ztest.ddt
 or_die rm -f ztest.cores
 
 # start rngd in the background so we don't run out of entropy
 or_die read -r rngdpid < <(rngd -f -r /dev/urandom & echo $!)
 
 ztrc=0		# ztest return value
 foundcrashes=0	# number of crashes found so far
 starttime=$(date +%s)
 curtime=$starttime
 
 # if no timeout was specified, loop forever.
 while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
 	zopt="-VVVVV"
 
 	# start each run with an empty directory
 	workdir="$basedir/$rundir"
 	or_die rm -rf "$workdir"
 	or_die mkdir "$workdir"
 
 	# switch between common arrangements & fully randomized
 	if [[ $((RANDOM % 2)) -eq 0 ]]; then
 		mirrors=2
 		raidz=0
 		parity=1
 		vdevs=2
 	else
 		mirrors=$(((RANDOM % 3) * 1))
 		parity=$(((RANDOM % 3) + 1))
 		raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2)))
 		vdevs=$(((RANDOM % 3) + 3))
 	fi
 	align=$(((RANDOM % 2) * 3 + 9))
 	runtime=$((RANDOM % 100))
 	passtime=$((RANDOM % (runtime / 3 + 1) + 10))
 
 	zopt="$zopt -m $mirrors"
 	zopt="$zopt -r $raidz"
 	zopt="$zopt -R $parity"
 	zopt="$zopt -v $vdevs"
 	zopt="$zopt -a $align"
 	zopt="$zopt -T $runtime"
 	zopt="$zopt -P $passtime"
 	zopt="$zopt -s $size"
 	zopt="$zopt -f $workdir"
 
 	# shellcheck disable=SC2124
 	cmd="$ZTEST $zopt $@"
 	desc="$(date '+%m/%d %T') $cmd"
 	echo "$desc" | tee -a ztest.history
 	echo "$desc" >>ztest.out
 	$cmd >>ztest.out 2>&1
 	ztrc=$?
 	egrep '===|WARNING' ztest.out >>ztest.history
-	$ZDB -U "$workdir/zpool.cache" -DD ztest >>ztest.ddt
+	$ZDB -U "$workdir/zpool.cache" -DD ztest >>ztest.ddt 2>&1
 
 	store_core
 
 	curtime=$(date +%s)
 done
 
 echo "zloop finished, $foundcrashes crashes found"
 
 #restore core pattern
 echo "$origcorepattern" > /proc/sys/kernel/core_pattern
 
 uptime >>ztest.out
 
 if [[ $foundcrashes -gt 0 ]]; then
 	exit 1
 fi