Index: vendor-sys/openzfs/dist/META
===================================================================
--- vendor-sys/openzfs/dist/META	(revision 364927)
+++ vendor-sys/openzfs/dist/META	(revision 364928)
@@ -1,10 +1,10 @@
 Meta: 1
 Name: zfs
 Branch: 1.0
-Version: 0.8.0
-Release: 1
+Version: 2.0.0
+Release: rc1
 Release-Tags: relext
 License: CDDL
-Author: OpenZFS on Linux
-Linux-Maximum: 5.6
+Author: OpenZFS
+Linux-Maximum: 5.8
 Linux-Minimum: 3.10
Index: vendor-sys/openzfs/dist/NEWS
===================================================================
--- vendor-sys/openzfs/dist/NEWS	(revision 364927)
+++ vendor-sys/openzfs/dist/NEWS	(revision 364928)
@@ -1,3 +1,3 @@
 Descriptions of all releases can be found on github:
-https://github.com/zfsonlinux/zfs/releases
+https://github.com/openzfs/zfs/releases
Index: vendor-sys/openzfs/dist/cmd/zpool/zpool_main.c
===================================================================
--- vendor-sys/openzfs/dist/cmd/zpool/zpool_main.c	(revision 364927)
+++ vendor-sys/openzfs/dist/cmd/zpool/zpool_main.c	(revision 364928)
@@ -1,10326 +1,10329 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
 * Copyright (c) 2012 by Cyril Plisko. All rights reserved.
 * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
 * Copyright 2016 Igor Kozhukhov.
 * Copyright (c) 2017 Datto Inc.
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 * Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. 
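 * For example (an illustrative reading of the table below, not part of the
 * original source): the IOS_LATENCY entry names the nvlists required by the
 * latency (-l) view of 'zpool iostat', in the order those columns are
 * printed.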
*/ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. 
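 * For example, the { NULL } entry between "destroy" and "add" below is what
 * produces the blank line separating those command groups in the generic
 * 'zpool' usage output.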
*/ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... 
\n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); } abort(); /* NOTREACHED */ } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { FILE *fp = requested ? 
stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, 
cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. 
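 * For example, combining '-o version=28' with
 * '-o feature@async_destroy=enabled' on one command line is rejected by the
 * check below (illustrative property values; any feature@ property together
 * with 'version' triggers the same error).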
*/ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. 
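 * For example, 'zpool add tank mirror sdc sdd' (illustrative pool and device
 * names) builds an nvroot describing the new mirror via make_root_vdev() and
 * hands it to zpool_add().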
*/ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child; uint_t l2children, c; char *vname; boolean_t hadcache = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, 
"logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. 
*/ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { int error; error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat(vdev, &st) != 0)) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\"\n"), vdev, zpool_pool_state_to_name(state), name); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. 
* -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_all_pool_feat = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_all_pool_feat = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. 
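 * For example, '-t tank/tmp' (an illustrative name) fails the check below
 * because '/' is not allowed in a temporary pool name.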
*/ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. 
*/ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); ret = 0; } else { /* * Hand off to libzfs. */ spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); /* * Only features contained in props will be enabled: * remove from the nvlist every ZFS_FEATURE_DISABLED * value and add every missing ZFS_FEATURE_ENABLED if * enable_all_pool_feat is set. */ if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); } else if (enable_all_pool_feat) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_shareall(pool); zfs_commit_all_shares(); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. 
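 * For example, 'zpool destroy tank/home' (an illustrative dataset name)
 * fails to open a pool by that name and prints the hint to use
 * 'zfs destroy' instead.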
*/ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. 
*/ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static int is_blank_str(char *str) { while (str != NULL && *str != '\0') { if (!isblank(*str)) return (0); str++; } return (1); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { vdev_cmd_data_t *data; int i, j; char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) printf(" "); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) printf(" "); val = data->lines[j]; printf("%s", val ? 
val : ""); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Print out configuration state as requested by status_callback. 
*/ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; char *type; char *path = NULL; char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (cb->cb_literal) { printf(" "); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); printf(" "); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); printf(" "); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); printf(" "); printf_color(rcolor, "%5s", rbuf); printf(" "); printf_color(wcolor, "%5s", wbuf); printf(" "); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); else printf(" %5s", rbuf); } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O 
failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. 
*/ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. 
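 * For example (illustrative case), a mirrored slog appears here as a single
 * top-level vdev in the "logs" class, and its leaf devices are then printed
 * by the recursive call in print_status_config()/print_import_config().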
*/ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static void show_import(nvlist_t *config) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; uint64_t hostid = 0; char *msgid; char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: 
printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported features are " "not enabled on the pool.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. 
" "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%lx)\n\tbefore it " "can be safely imported.\n"), hostname, (unsigned long) hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. 
*/ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( - " see: https://zfsonlinux.org/msg/%s\n"), msgid); + " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), + msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. 
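/*
 * A simplified, standalone analogue of the zfs_force_import_required()
 * decision above: a forced import is needed when the pool was not cleanly
 * exported by this host, or when multihost (MMP) reports the pool as
 * anything other than inactive.  The demo_* enums and helper are
 * illustrative stand-ins for the real nvlist/libzfs lookups.
 */
#include <stdio.h>
#include <stdint.h>

typedef enum { DEMO_POOL_EXPORTED, DEMO_POOL_ACTIVE } demo_pool_state_t;
typedef enum { DEMO_MMP_INACTIVE, DEMO_MMP_ACTIVE } demo_mmp_state_t;

static int
demo_force_required(demo_pool_state_t state, uint64_t pool_hostid,
    uint64_t system_hostid, demo_mmp_state_t mmp_state)
{
	/* Pool last used by another system and not exported: force needed. */
	if (state != DEMO_POOL_EXPORTED && pool_hostid != system_hostid)
		return (1);
	/* Multihost says the pool is (or may be) in use elsewhere. */
	if (mmp_state != DEMO_MMP_INACTIVE)
		return (1);
	return (0);
}

int
main(void)
{
	printf("exported pool, same host:   %d\n",
	    demo_force_required(DEMO_POOL_EXPORTED, 0x1234, 0x1234,
	    DEMO_MMP_INACTIVE));
	printf("active pool, other host:    %d\n",
	    demo_force_required(DEMO_POOL_ACTIVE, 0x1234, 0xbeef,
	    DEMO_MMP_INACTIVE));
	printf("multihost active elsewhere: %d\n",
	    demo_force_required(DEMO_POOL_EXPORTED, 0x1234, 0x1234,
	    DEMO_MMP_ACTIVE));
	return (0);
}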
*/ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; zpool_handle_t *zhp; char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%lx)\nExport the pool on the other system, " "then run 'zpool import'.\n"), name, hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { char *hostname = ""; uint64_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%lx) at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, (unsigned long)hostid, ctime((time_t *)×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. 
*/ if (flags & ZFS_IMPORT_LOAD_KEYS) { ret = zfs_crypto_attempt_load_keys(g_zfs, name); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (ret); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] * * -c Read pool information from a cachefile instead of searching * devices. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. 
This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind 
= B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } found_config = NULL; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. 
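/*
 * Sketch of the "numeric GUID or pool name?" test used above: attempt a
 * full-string strtoull() conversion and fall back to treating the
 * argument as a pool name when errno is set or characters are left over.
 * The default argument string here is illustrative.
 */
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	const char *arg = (argc > 1) ? argv[1] : "1234567890123";
	char *endptr;
	uint64_t guid;

	errno = 0;
	guid = strtoull(arg, &endptr, 10);
	if (errno != 0 || *endptr != '\0')
		printf("'%s' looks like a pool name\n", arg);
	else
		printf("'%s' parsed as guid %" PRIu64 "\n", arg, guid);
	return (0);
}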
*/ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir; envdup = strdup(env); dir = strtok(envdup, ":"); while (dir != NULL) { if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = dir; dir = strtok(NULL, ":"); } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ err = 0; elem = NULL; first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) == 0); if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { show_import(config); } } else if (searchname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, searchname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), searchname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == searchguid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (argc != 0 && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : argv[1], mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. 
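/*
 * Sketch of the ZPOOL_IMPORT_PATH handling above: duplicate the
 * environment value (strtok() modifies its argument) and split it on
 * ':' into individual search directories.  The DEMO_PATH variable name
 * and fallback value are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char *env = getenv("DEMO_PATH");
	char *envdup, *dir;

	if (env == NULL)
		env = "/dev/disk/by-id:/dev/disk/by-path:/dev";

	envdup = strdup(env);
	if (envdup == NULL)
		return (1);

	for (dir = strtok(envdup, ":"); dir != NULL; dir = strtok(NULL, ":"))
		printf("search dir: %s\n", dir);

	free(envdup);
	return (0);
}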
*/ if (argc == 0 && first) (void) fprintf(stderr, gettext("no pools available to import\n")); error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. 
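/*
 * Sketch of the label-table pattern above: arrays keyed by an enum via
 * C99 designated initializers, terminated by a sentinel entry, and
 * measured by walking until the sentinel.  The demo_* enum and labels
 * below are illustrative only.
 */
#include <stdio.h>

typedef struct demo_label {
	const char *name;
	unsigned int columns;
} demo_label_t;

enum demo_type { DEMO_DEFAULT = 0, DEMO_LATENCY = 1, DEMO_COUNT };

static const demo_label_t demo_labels[][4] = {
	[DEMO_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {NULL}},
	[DEMO_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {NULL}},
};

static unsigned int
demo_label_len(const demo_label_t *labels)
{
	unsigned int i = 0;

	while (labels[i].name != NULL)
		i++;
	return (i);
}

int
main(void)
{
	for (int t = 0; t < DEMO_COUNT; t++)
		printf("row %d has %u labels\n", t,
		    demo_label_len(demo_labels[t]));
	return (0);
}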
* For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. 
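/*
 * Sketch of the label-centering arithmetic above: a header such as
 * "operations" is centered over the combined width of its data columns
 * (column widths plus the two-space gaps between them).  The widths and
 * demo_* names used below are illustrative.
 */
#include <stdio.h>
#include <string.h>

static void
demo_center_label(const char *name, unsigned int column_width,
    unsigned int columns)
{
	unsigned int slen = strlen(name);
	int rw_column_width = (column_width * columns) + (2 * (columns - 1));
	int text_start = (int)(rw_column_width / columns) -
	    (int)(slen / columns);
	int spaces_to_end;

	if (text_start < 0)
		text_start = 0;
	spaces_to_end = rw_column_width - text_start - slen;
	if (spaces_to_end < 0)
		spaces_to_end = 0;

	printf("  %*s%s%*s", text_start, "", name, spaces_to_end, "");
}

int
main(void)
{
	demo_center_label("operations", 5, 2);
	demo_center_label("bandwidth", 5, 2);
	printf("\n");
	printf("  %*s  %*s  %*s  %*s\n", 5, "read", 5, "write",
	    5, "read", 5, "write");
	return (0);
}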
*/ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. 
*/ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. 
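/*
 * Sketch of the stat_array normalization described above: scalar and
 * array-valued statistics are both presented as (pointer, count) so the
 * printing code has a single path, with a scalar backed by the struct's
 * own __data field.  The demo_stat input type is an illustrative
 * stand-in for the real nvpair type inspection.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_stat_array {
	uint64_t *data;
	unsigned int count;
	uint64_t __data;	/* backing store when the stat is a scalar */
};

typedef struct demo_stat {
	int is_array;
	uint64_t scalar;
	uint64_t *array;
	unsigned int array_len;
} demo_stat_t;

static void
demo_to_stat_array(const demo_stat_t *in, struct demo_stat_array *out)
{
	if (in->is_array) {
		out->data = in->array;
		out->count = in->array_len;
	} else {
		out->__data = in->scalar;
		out->data = &out->__data;
		out->count = 1;
	}
}

int
main(void)
{
	uint64_t histo[4] = { 1, 2, 3, 4 };
	demo_stat_t scalar = { 0, 42, NULL, 0 };
	demo_stat_t array = { 1, 0, histo, 4 };
	struct demo_stat_array sa;

	demo_to_stat_array(&scalar, &sa);
	printf("scalar: count=%u first=%llu\n", sa.count,
	    (unsigned long long)sa.data[0]);
	demo_to_stat_array(&array, &sa);
	printf("array:  count=%u first=%llu\n", sa.count,
	    (unsigned long long)sa.data[0]);
	return (0);
}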
*/ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. 
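/*
 * Sketch of the histogram bucket labelling above: extended iostats use
 * power-of-two buckets, labelled either by the bucket's inclusive upper
 * latency bound ((1 << (j + 1)) - 1 nanoseconds) or by its starting
 * request size (1 << j bytes, beginning at bucket 9, i.e. 512 bytes).
 * The bucket ranges printed here are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Request-size style: start at bucket 9 (512 bytes). */
	for (unsigned int j = 9; j < 14; j++)
		printf("req size bucket %2u starts at %llu bytes\n", j,
		    (unsigned long long)(1ULL << j));

	/* Latency style: label with the bucket's inclusive upper bound. */
	for (unsigned int j = 0; j < 5; j++)
		printf("latency bucket %2u ends at %llu ns\n", j,
		    (unsigned long long)((1ULL << (j + 1)) - 1));
	return (0);
}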
The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = 
ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. 
*/ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. 
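/*
 * Sketch of the scaling step above: vdev stats are cumulative counters
 * stamped with a hires timestamp, so an interval's rate is
 * (new - old) * (NANOSEC / tdelta).  The sample counter values and the
 * DEMO_NANOSEC name are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

#define	DEMO_NANOSEC	1000000000ULL

int
main(void)
{
	/* Two snapshots of a cumulative read-op counter. */
	uint64_t old_ops = 1000, new_ops = 1750;
	uint64_t old_ts = 5 * DEMO_NANOSEC, new_ts = 10 * DEMO_NANOSEC;
	uint64_t tdelta = new_ts - old_ts;
	double scale = (tdelta == 0) ? 1.0 : (double)DEMO_NANOSEC / tdelta;

	printf("%.1f read ops/sec over %.1f seconds\n",
	    (double)(new_ops - old_ops) * scale,
	    (double)tdelta / DEMO_NANOSEC);
	return (0);
}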
*/ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. 
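/*
 * Sketch of the trailing "[interval [count]]" parsing above: the last
 * argument is tried as a number first; if the argument before it is also
 * numeric, the pair becomes interval and count, and anything non-numeric
 * is left for pool/vdev name handling.  demo_parse_tail() and
 * demo_is_number() are illustrative helpers, not the real functions.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int
demo_is_number(const char *s, float *out)
{
	char *end;

	errno = 0;
	*out = strtof(s, &end);
	return (errno == 0 && *end == '\0' && end != s);
}

static void
demo_parse_tail(int *argc, char **argv, float *interval, unsigned long *count)
{
	float v;

	*interval = 0;
	*count = 0;
	if (*argc > 0 && demo_is_number(argv[*argc - 1], &v)) {
		*interval = v;
		(*argc)--;
		if (*argc > 0 && demo_is_number(argv[*argc - 1], &v)) {
			*count = (unsigned long)*interval;
			*interval = v;
			(*argc)--;
		}
	}
}

int
main(void)
{
	char *args[] = { "tank", "2", "5" };
	int argc = 3;
	float interval;
	unsigned long count;

	demo_parse_tail(&argc, args, &interval, &count);
	printf("pools left: %d, interval=%.1f, count=%lu\n",
	    argc, interval, count);
	return (0);
}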
*/ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. 
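 *
 * The output takes the form "<name> (<kind>)", one entry per line, e.g.
 * (hypothetical names):
 *	sdb (vdev in this pool)
 *	sdc (vdev in another pool)
 *	foo (unknown)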
*/ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, "-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { sprintf(fullpath, "%s/%s", dirpath, ent->d_name); /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. 
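 *
 * The search path returned by zpool_get_cmd_search_path() is treated as a
 * colon-separated list of directories; each directory is scanned in turn
 * and only regular, user-executable files are listed.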
*/ static void print_zpool_script_list(char *subcommand) { char *dir, *sp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; dir = strtok(sp, ":"); while (dir != NULL) { print_zpool_dir_scripts(dir); dir = strtok(NULL, ":"); } free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. 
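 *
 * Example invocations (illustrative; 'tank' is a hypothetical pool):
 *
 *	zpool iostat                  one summary line per imported pool
 *	zpool iostat -v tank 5        per-vdev statistics every 5 seconds
 *	zpool iostat -l tank 5 10     add average latency, stop after 10 samples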
*/ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. 
*/ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. 
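 *
 * (When stdout is a terminal and -n was not given, the header is also
 * reprinted roughly once per screenful of output.)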
*/ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. 
*/ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; snprintf(propval, sizeof (propval), "%-*s", (int)width, str); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. 
Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { char *bias = NULL; char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. 
*/ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char 
*poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. 
The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; 
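	/*
	 * Any remaining arguments name the devices to be split off (no more
	 * than one per top-level mirror, per the restrictions above); they
	 * are passed through to split_mirror_vdev() below.
	 */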
argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. 
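 *
 * Example (illustrative; 'tank' and 'sdb' are hypothetical names):
 *
 *	zpool offline -t tank sdb     take 'sdb' offline; the offline state
 *	                              does not survive a reboot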
*/ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? 
argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. 
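 *
 * Example invocations (illustrative; 'tank' is a hypothetical pool):
 *
 *	zpool scrub tank              start a scrub of 'tank'
 *	zpool scrub -p tank           pause the in-progress scrub
 *	zpool scrub -w tank           start a scrub and block until it completes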
*/ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "spw")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. 
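 *
 * Example invocations (illustrative; 'tank' is a hypothetical pool):
 *
 *	zpool trim -r 500m tank       trim at a rate of roughly 500MB/s
 *	zpool trim -s tank            suspend; 'zpool trim tank' restarts it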
*/ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { (void) fprintf(stderr, gettext("invalid value for rate\n")); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed scrub status. 
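 *
 * A finished scrub is summarized in a single line of the form (values
 * are illustrative only):
 *
 *	scrub repaired 0B in 00:12:34 with 0 errors on <date>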
*/ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
((total - issued) / issue_rate) : UINT64_MAX; secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total, total_buf, sizeof (total_buf)); zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ if (pause == 0) { (void) printf(gettext("\t%s scanned at %s/s, " "%s issued at %s/s, %s total\n"), scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est = vrs->vrs_bytes_est; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. 
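 * A completed rebuild is summarized as, e.g. (values illustrative only):
 *	resilvered (mirror-0) 1.2T in 03:14:15 with 0 errors on <date>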
*/ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / MAX(scan_rate, 1), time_buf); (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " "%s total\n"), bytes_scanned_buf, scan_rate_buf, bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (scan_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. 
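 *
 * (While any child vdev is actively rebuilding, 'rebuild_end_time' is
 * set to 0.)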
*/ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub) print_scan_scrub_resilver_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); (void) printf(gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. 
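 * DSS_FINISHED and DSS_CANCELED each get a one-line summary; any other
 * state is treated as an in-progress evacuation (DSS_SCANNING) and is
 * reported with a copy rate and, unless the copy is very slow, an estimate.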
*/ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext(" %s copied out of %s at %s/s, " "%.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext(" %s memory used for " "removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. 
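 * The resulting line looks roughly like this (illustrative values only):
 *     checkpoint: created Sat Jan  4 12:00:00 2020, consumes 1.50G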
*/ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... - * see: https://zfsonlinux.org/msg/ZFS-xxxx-01 + * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. 
*/ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); printf(" "); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. 
Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. 
Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported features are " "not enabled on the pool. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. 
Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. 
No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); - printf(gettext(" https://zfsonlinux.org/msg/%s\n"), msgid); + printf(gettext( + " https://openzfs.github.io/openzfs-docs/msg/%s\n"), + msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. 
* -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to 
%d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!nvlist_exists(enabled, fguid)) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. 
After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. 
*/ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
" (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported features " "enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; 
verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[30] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. 
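 * -l  Display the user, hostname, and zone for each record (long format).
 * -i  Display internally logged ZFS events and ioctls as well.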
*/ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? 
str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? 
ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources. 
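 * The defaults match the -o documentation above: name, property, value,
 * and source.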
*/ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, set_callback, &cb); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); 
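/*
 * Count this vdev's own remaining bytes only while the matching activity
 * is active on it, then recurse into any child vdevs and add their totals.
 */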
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. 
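 * (i.e. at least 6 characters for the value plus 2 separating spaces, as
 * computed below).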
*/ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) printf("\n"); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); else zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 
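/*
 * Illustrative sketch (not part of the import): a simplified stand-in for the
 * zfs_nicenum() call above, showing how a raw byte count becomes the short
 * human-readable form printed in each column.  Rounding and suffix selection
 * here are approximate and do not match libzfs output exactly.
 */
#include <stdio.h>
#include <stdint.h>

static void
nicebytes(uint64_t n, char *buf, size_t buflen)
{
	static const char *sfx[] = { "B", "K", "M", "G", "T", "P" };
	double v = (double)n;
	int i = 0;

	while (v >= 1024.0 && i < 5) {
		v /= 1024.0;
		i++;
	}
	(void) snprintf(buf, buflen, i == 0 ? "%.0f%s" : "%.2f%s", v, sfx[i]);
}

int
main(void)
{
	char buf[32];

	nicebytes(1536ULL * 1024 * 1024, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints 1.50G */
	return (0);
}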
0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; char c; char *value; int i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': { static char *col_subopts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", NULL }; /* Reset activities array */ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } wd.wd_enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. 
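/*
 * Illustrative sketch (not part of the import): building an absolute deadline
 * from a fractional-second interval and waiting on a condition variable, the
 * same arithmetic wait_status_thread() uses above.  Compile with -lpthread
 * (and -lm for floor()); the wait simply times out because nothing signals
 * the condition variable.
 */
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define	NANOSEC	1000000000L

int
main(void)
{
	pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	double interval = 1.5;		/* seconds, may be fractional */
	struct timespec deadline;

	(void) clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += (time_t)floor(interval);
	long nanos = deadline.tv_nsec +
	    (long)((interval - floor(interval)) * NANOSEC);
	if (nanos >= NANOSEC) {		/* carry into the seconds field */
		deadline.tv_sec++;
		deadline.tv_nsec = nanos - NANOSEC;
	} else {
		deadline.tv_nsec = nanos;
	}

	(void) pthread_mutex_lock(&mtx);
	int ret = pthread_cond_timedwait(&cv, &mtx, &deadline);
	(void) pthread_mutex_unlock(&mtx);
	(void) printf("wait returned %d (ETIMEDOUT expected)\n", ret);
	return (0);
}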
*/ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. 
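/*
 * Illustrative sketch (not part of the import): duplicating argv the way
 * main() does above, so getsubopt()/strtok()-style parsers can scribble on
 * the copies while the original argument strings stay intact.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(int argc, char **argv)
{
	char **newargv = malloc((argc + 1) * sizeof (newargv[0]));

	if (newargv == NULL)
		return (1);
	for (int i = 0; i < argc; i++)
		newargv[i] = strdup(argv[i]);
	newargv[argc] = NULL;		/* keep the NULL terminator */

	/* ... hand newargv to a destructive parser here ... */
	(void) printf("copied %d arguments\n", argc);

	for (int i = 0; i < argc; i++)
		free(newargv[i]);
	free(newargv);
	return (0);
}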
*/ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } Index: vendor-sys/openzfs/dist/include/libzfs_impl.h =================================================================== --- vendor-sys/openzfs/dist/include/libzfs_impl.h (revision 364927) +++ vendor-sys/openzfs/dist/include/libzfs_impl.h (revision 364928) @@ -1,261 +1,262 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018 Datto Inc. * Copyright 2020 Joyent, Inc. */ #ifndef _LIBZFS_IMPL_H #define _LIBZFS_IMPL_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct libzfs_handle { int libzfs_error; int libzfs_fd; FILE *libzfs_mnttab; zpool_handle_t *libzfs_pool_handles; uu_avl_pool_t *libzfs_ns_avlpool; uu_avl_t *libzfs_ns_avl; uint64_t libzfs_ns_gen; int libzfs_desc_active; char libzfs_action[1024]; char libzfs_desc[1024]; int libzfs_printerr; int libzfs_storeerr; /* stuff error messages into buffer */ boolean_t libzfs_mnttab_enable; /* * We need a lock to handle the case where parallel mount * threads are populating the mnttab cache simultaneously. The * lock only protects the integrity of the avl tree, and does * not protect the contents of the mnttab entries themselves. */ pthread_mutex_t libzfs_mnttab_cache_lock; avl_tree_t libzfs_mnttab_cache; int libzfs_pool_iter; char libzfs_chassis_id[256]; boolean_t libzfs_prop_debug; regex_t libzfs_urire; + uint64_t libzfs_max_nvlist; }; struct zfs_handle { libzfs_handle_t *zfs_hdl; zpool_handle_t *zpool_hdl; char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; zfs_type_t zfs_type; /* type including snapshot */ zfs_type_t zfs_head_type; /* type excluding snapshot */ dmu_objset_stats_t zfs_dmustats; nvlist_t *zfs_props; nvlist_t *zfs_user_props; nvlist_t *zfs_recvd_props; boolean_t zfs_mntcheck; char *zfs_mntopts; uint8_t *zfs_props_table; }; /* * This is different from checking zfs_type, because it will also catch * snapshots of volumes. */ #define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) struct zpool_handle { libzfs_handle_t *zpool_hdl; zpool_handle_t *zpool_next; char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; int zpool_state; size_t zpool_config_size; nvlist_t *zpool_config; nvlist_t *zpool_old_config; nvlist_t *zpool_props; diskaddr_t zpool_start_block; }; typedef enum { PROTO_NFS = 0, PROTO_SMB = 1, PROTO_END = 2 } zfs_share_proto_t; /* * The following can be used as a bitmask and any new values * added must preserve that capability. 
*/ typedef enum { SHARED_NOT_SHARED = 0x0, SHARED_NFS = 0x2, SHARED_SMB = 0x4 } zfs_share_type_t; typedef int (*zfs_uri_handler_fn_t)(struct libzfs_handle *, const char *, const char *, zfs_keyformat_t, boolean_t, uint8_t **, size_t *); typedef struct zfs_uri_handler { const char *zuh_scheme; zfs_uri_handler_fn_t zuh_handler; } zfs_uri_handler_t; #define CONFIG_BUF_MINSIZE 262144 int zfs_error(libzfs_handle_t *, int, const char *); int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); void zfs_error_aux(libzfs_handle_t *, const char *, ...); void *zfs_alloc(libzfs_handle_t *, size_t); void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); char *zfs_asprintf(libzfs_handle_t *, const char *, ...); char *zfs_strdup(libzfs_handle_t *, const char *); int no_memory(libzfs_handle_t *); int zfs_standard_error(libzfs_handle_t *, int, const char *); int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); int zpool_standard_error(libzfs_handle_t *, int, const char *); int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, nvlist_t *, char **, uint64_t *, const char *); int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type); /* * Use this changelist_gather() flag to force attempting mounts * on each change node regardless of whether or not it is currently * mounted. */ #define CL_GATHER_MOUNT_ALWAYS 1 /* * changelist_gather() flag to force it to iterate on mounted datasets only */ #define CL_GATHER_ITER_MOUNTED 2 typedef struct prop_changelist prop_changelist_t; int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); void zcmd_free_nvlists(zfs_cmd_t *); int changelist_prefix(prop_changelist_t *); int changelist_postfix(prop_changelist_t *); void changelist_rename(prop_changelist_t *, const char *, const char *); void changelist_remove(prop_changelist_t *, const char *); void changelist_free(prop_changelist_t *); prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); int changelist_haszonedchild(prop_changelist_t *); void remove_mountpoint(zfs_handle_t *); int create_parents(libzfs_handle_t *, char *, int); boolean_t isa_child_of(const char *dataset, const char *parent); zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, nvlist_t *props); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying); void namespace_clear(libzfs_handle_t *); extern int zfs_parse_options(char *, zfs_share_proto_t); extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); typedef struct { zfs_prop_t p_prop; char *p_name; int p_share_err; int p_unshare_err; } proto_table_t; typedef struct differ_info { zfs_handle_t *zhp; char 
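/*
 * Illustrative sketch (not part of the import): why zfs_share_type_t values
 * are distinct bits, as the comment above requires.  A dataset shared over
 * both protocols is represented by OR-ing the flags, and each protocol can
 * then be tested independently.
 */
#include <stdio.h>

#define	SHARED_NOT_SHARED	0x0
#define	SHARED_NFS		0x2
#define	SHARED_SMB		0x4

int
main(void)
{
	int shared = SHARED_NFS | SHARED_SMB;	/* shared both ways */

	if (shared & SHARED_NFS)
		(void) printf("shared over NFS\n");
	if (shared & SHARED_SMB)
		(void) printf("shared over SMB\n");
	if (shared == SHARED_NOT_SHARED)
		(void) printf("not shared\n");
	return (0);
}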
*fromsnap; char *frommnt; char *tosnap; char *tomnt; char *ds; char *dsmnt; char *tmpsnap; char errbuf[1024]; boolean_t isclone; boolean_t scripted; boolean_t classify; boolean_t timestamped; uint64_t shares; int zerr; int cleanupfd; int outputfd; int datafd; } differ_info_t; extern proto_table_t proto_table[PROTO_END]; extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags); extern int do_unmount(const char *mntpt, int flags); extern int zfs_mount_delegation_check(void); extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto); extern int unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, zfs_share_proto_t proto); extern boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, zprop_source_t *source, int flags); extern zfs_share_type_t is_shared(const char *mountpoint, zfs_share_proto_t proto); extern int libzfs_load_module(void); extern int zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg); extern int find_shares_object(differ_info_t *di); extern void libzfs_set_pipe_max(int infd); extern void zfs_commit_proto(zfs_share_proto_t *); #ifdef __cplusplus } #endif #endif /* _LIBZFS_IMPL_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/acl_impl.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/acl_impl.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/acl_impl.h (revision 364928) @@ -1,61 +1,59 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ACL_IMPL_H #define _SYS_ACL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif /* * acl flags * * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED * flags can also be stored in this field. 
*/ #define ACL_IS_TRIVIAL 0x10000 #define ACL_IS_DIR 0x20000 typedef enum acl_type { ACLENT_T = 0, ACE_T = 1 } zfs_acl_type_t; struct acl_info { zfs_acl_type_t acl_type; /* style of acl */ int acl_cnt; /* number of acl entries */ int acl_entry_size; /* sizeof acl entry */ int acl_flags; /* special flags about acl */ void *acl_aclp; /* the acl */ }; #ifdef __cplusplus } #endif #endif /* _SYS_ACL_IMPL_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cmn_err.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cmn_err.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/cmn_err.h (revision 364928) @@ -1,100 +1,98 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_CMN_ERR_H #define _SYS_CMN_ERR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #if !defined(_ASM) #include #endif #ifdef __cplusplus extern "C" { #endif /* Common error handling severity levels */ #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ #define CE_WARN 2 /* warning */ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ #ifndef _ASM /*PRINTFLIKE2*/ extern void cmn_err(int, const char *, ...) __KPRINTFLIKE(2); #pragma rarely_called(cmn_err) extern void vzcmn_err(zoneid_t, int, const char *, __va_list) __KVPRINTFLIKE(3); #pragma rarely_called(vzcmn_err) extern void vcmn_err(int, const char *, __va_list) __KVPRINTFLIKE(2); #pragma rarely_called(vcmn_err) /*PRINTFLIKE3*/ extern void zcmn_err(zoneid_t, int, const char *, ...) __KPRINTFLIKE(3); #pragma rarely_called(zcmn_err) extern void vzprintf(zoneid_t, const char *, __va_list) __KVPRINTFLIKE(2); #pragma rarely_called(vzprintf) /*PRINTFLIKE2*/ extern void zprintf(zoneid_t, const char *, ...) __KPRINTFLIKE(2); #pragma rarely_called(zprintf) extern void vuprintf(const char *, __va_list) __KVPRINTFLIKE(1); #pragma rarely_called(vuprintf) /*PRINTFLIKE1*/ extern void panic(const char *, ...) 
__KPRINTFLIKE(1) __NORETURN; #pragma rarely_called(panic) extern void vpanic(const char *, __va_list) __KVPRINTFLIKE(1) __NORETURN; #pragma rarely_called(vpanic) #endif /* !_ASM */ #ifdef __cplusplus } #endif #endif /* _SYS_CMN_ERR_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/extdirent.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/extdirent.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/extdirent.h (revision 364928) @@ -1,73 +1,71 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_EXTDIRENT_H #define _SYS_EXTDIRENT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #include #include /* * Extended file-system independent directory entry. This style of * dirent provides additional informational flag bits for each * directory entry. This dirent will be returned instead of the * standard dirent if a VOP_READDIR() requests dirent flags via * V_RDDIR_ENTFLAGS, and if the file system supports the flags. */ typedef struct edirent { ino64_t ed_ino; /* "inode number" of entry */ off64_t ed_off; /* offset of disk directory entry */ uint32_t ed_eflags; /* per-entry flags */ unsigned short ed_reclen; /* length of this record */ char ed_name[1]; /* name of file */ } edirent_t; #define EDIRENT_RECLEN(namelen) \ ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7) #define EDIRENT_NAMELEN(reclen) \ ((reclen) - (offsetof(edirent_t, ed_name[0]))) /* * Extended entry flags * Extended entries include a bitfield of extra information * regarding that entry. */ #define ED_CASE_CONFLICT 0x10 /* Disconsidering case, entry is not unique */ /* * Extended flags accessor function */ #define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT) #ifdef __cplusplus } #endif #endif /* _SYS_EXTDIRENT_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list.h (revision 364928) @@ -1,67 +1,65 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
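/*
 * Illustrative sketch (not part of the import): the rounding performed by
 * EDIRENT_RECLEN() above.  A simplified record type with standard fixed-width
 * fields stands in for edirent_t so the example is self-contained; the
 * arithmetic (header size + name + NUL, rounded up to 8 bytes) is the same.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct fake_edirent {
	uint64_t ed_ino;
	uint64_t ed_off;
	uint32_t ed_eflags;
	uint16_t ed_reclen;
	char ed_name[1];
} fake_edirent_t;

#define	FAKE_RECLEN(namelen) \
	((offsetof(fake_edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~7)

int
main(void)
{
	const char *name = "example.txt";

	(void) printf("header %zu bytes, name %zu bytes -> record %zu bytes\n",
	    offsetof(fake_edirent_t, ed_name[0]), strlen(name),
	    (size_t)FAKE_RECLEN(strlen(name)));
	return (0);
}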
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_LIST_H #define _SYS_LIST_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus extern "C" { #endif typedef struct list_node list_node_t; typedef struct list list_t; void list_create(list_t *, size_t, size_t); void list_destroy(list_t *); void list_insert_after(list_t *, void *, void *); void list_insert_before(list_t *, void *, void *); void list_insert_head(list_t *, void *); void list_insert_tail(list_t *, void *); void list_remove(list_t *, void *); void *list_remove_head(list_t *); void *list_remove_tail(list_t *); void list_move_tail(list_t *, list_t *); void *list_head(list_t *); void *list_tail(list_t *); void *list_next(list_t *, void *); void *list_prev(list_t *, void *); int list_is_empty(list_t *); void list_link_init(list_node_t *); void list_link_replace(list_node_t *, list_node_t *); int list_link_active(list_node_t *); #ifdef __cplusplus } #endif #endif /* _SYS_LIST_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list_impl.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list_impl.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/list_impl.h (revision 364928) @@ -1,53 +1,51 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_LIST_IMPL_H #define _SYS_LIST_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus extern "C" { #endif struct list_node { struct list_node *list_next; struct list_node *list_prev; }; struct list { size_t list_size; size_t list_offset; struct list_node list_head; }; #ifdef __cplusplus } #endif #endif /* _SYS_LIST_IMPL_H */ Index: vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zmod.h =================================================================== --- vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zmod.h (revision 364927) +++ vendor-sys/openzfs/dist/include/os/freebsd/spl/sys/zmod.h (revision 364928) @@ -1,68 +1,66 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
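/*
 * Illustrative sketch (not part of the import): how the embedded-node list
 * interface declared above is used.  list_create() takes the containing
 * structure's size and the offset of its list_node_t so the list code can map
 * nodes back to objects.  Assumes linking against the OpenZFS/illumos list
 * implementation that provides these functions; they are not part of libc.
 */
#include <stddef.h>
#include <stdio.h>
#include <sys/list.h>

typedef struct item {
	int i_value;
	list_node_t i_node;		/* embedded linkage */
} item_t;

int
main(void)
{
	list_t lst;
	item_t a = { .i_value = 1 }, b = { .i_value = 2 };

	list_create(&lst, sizeof (item_t), offsetof(item_t, i_node));
	list_insert_tail(&lst, &a);
	list_insert_tail(&lst, &b);

	for (item_t *it = list_head(&lst); it != NULL;
	    it = list_next(&lst, it))
		(void) printf("%d\n", it->i_value);

	while (list_remove_head(&lst) != NULL)
		;			/* drain the list before destroying */
	list_destroy(&lst);
	return (0);
}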
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZMOD_H #define _ZMOD_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif /* * zmod - RFC-1950-compatible decompression routines * * This file provides the public interfaces to zmod, an in-kernel RFC 1950 * decompression library. More information about the implementation of these * interfaces can be found in the usr/src/uts/common/zmod/ directory. */ #define Z_OK 0 #define Z_STREAM_END 1 #define Z_NEED_DICT 2 #define Z_ERRNO (-1) #define Z_STREAM_ERROR (-2) #define Z_DATA_ERROR (-3) #define Z_MEM_ERROR (-4) #define Z_BUF_ERROR (-5) #define Z_VERSION_ERROR (-6) #define Z_NO_COMPRESSION 0 #define Z_BEST_SPEED 1 #define Z_BEST_COMPRESSION 9 #define Z_DEFAULT_COMPRESSION (-1) extern int z_uncompress(void *, size_t *, const void *, size_t); extern int z_compress(void *, size_t *, const void *, size_t); extern int z_compress_level(void *, size_t *, const void *, size_t, int); extern const char *z_strerror(int); #ifdef __cplusplus } #endif #endif /* _ZMOD_H */ Index: vendor-sys/openzfs/dist/include/sys/fs/zfs.h =================================================================== --- vendor-sys/openzfs/dist/include/sys/fs/zfs.h (revision 364927) +++ vendor-sys/openzfs/dist/include/sys/fs/zfs.h (revision 364928) @@ -1,1589 +1,1589 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. 
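/*
 * Illustrative sketch (not part of the import): z_compress()/z_uncompress()
 * declared above follow the same calling convention as zlib's
 * compress()/uncompress() (dest, &destlen, src, srclen, returning Z_OK on
 * success).  Since zmod is an in-kernel library, this standalone round trip
 * uses userland zlib instead; link with -lz.
 */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int
main(void)
{
	const char msg[] = "zmod wraps RFC 1950 (zlib) compression";
	Bytef packed[256], unpacked[256];
	uLongf packedlen = sizeof (packed), unpackedlen = sizeof (unpacked);

	if (compress(packed, &packedlen, (const Bytef *)msg,
	    sizeof (msg)) != Z_OK)
		return (1);
	if (uncompress(unpacked, &unpackedlen, packed, packedlen) != Z_OK)
		return (1);
	(void) printf("%lu -> %lu -> %lu bytes: %s\n",
	    (unsigned long)sizeof (msg), (unsigned long)packedlen,
	    (unsigned long)unpackedlen, (const char *)unpacked);
	return (0);
}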
*/ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, 
ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. Properties must be registered in zfs_prop_init(). */ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_visible(zfs_prop_t prop); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); boolean_t zfs_prop_encryption_key_param(zfs_prop_t); boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. 
*/ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_setonce(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. 
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * The incrementing pool version number has been replaced by pool feature * flags. For more details, see zfeature.c. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk 
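/*
 * Illustrative sketch (not part of the import): what SPA_VERSION_IS_SUPPORTED
 * accepts.  Legacy numbered versions run 1..28, 5000 is the feature-flags
 * version, and anything in between was never shipped, so it is rejected.
 */
#include <stdio.h>
#include <stdint.h>

#define	SPA_VERSION_INITIAL		1ULL
#define	SPA_VERSION_BEFORE_FEATURES	28ULL
#define	SPA_VERSION_FEATURES		5000ULL
#define	SPA_VERSION			SPA_VERSION_FEATURES

#define	SPA_VERSION_IS_SUPPORTED(v) \
	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))

int
main(void)
{
	uint64_t probes[] = { 1, 28, 29, 4999, 5000, 5001 };

	for (size_t i = 0; i < sizeof (probes) / sizeof (probes[0]); i++)
		(void) printf("version %llu: %s\n",
		    (unsigned long long)probes[i],
		    SPA_VERSION_IS_SUPPORTED(probes[i]) ? "supported" :
		    "unsupported");
	return (0);
}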
format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Persistent L2ARC version */ #define L2ARC_PERSISTENT_VERSION_1 1ULL #define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 #define L2ARC_PERSISTENT_VERSION_STRING "1" /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with ":" * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently. 
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define 
ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. 
This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* vdev initialize state */ #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* vdev TRIM state */ #define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ "org.zfsonlinux:next_offset_to_trim" #define VDEV_LEAF_ZAP_TRIM_STATE \ "org.zfsonlinux:vdev_trim_state" #define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ "org.zfsonlinux:vdev_trim_action_time" #define VDEV_LEAF_ZAP_TRIM_RATE \ "org.zfsonlinux:vdev_trim_rate" #define VDEV_LEAF_ZAP_TRIM_PARTIAL \ "org.zfsonlinux:vdev_trim_partial" #define VDEV_LEAF_ZAP_TRIM_SECURE \ "org.zfsonlinux:vdev_trim_secure" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. 
*/ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. 
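 *
 * For example (an illustration, not an additional interface): reads issued
 * to a vdev are counted in vs_ops[ZIO_TYPE_READ] and their bytes in
 * vs_bytes[ZIO_TYPE_READ] of the vdev_stat_t defined further down.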
*/ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ uint64_t vrs_end_time; /* time_t */ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ uint64_t vrs_bytes_issued; /* read bytes issued */ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ uint64_t vrs_bytes_est; /* total bytes to scan */ uint64_t vrs_errors; /* scanning errors */ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ } vdev_rebuild_stat_t; /* - * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering - * of this enum must be maintained to ensure the errata identifiers map to - * the correct documentation. New errata may only be appended to the list - * and must contain corresponding documentation at the above link. + * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. + * The ordering of this enum must be maintained to ensure the errata identifiers + * map to the correct documentation. New errata may only be appended to the + * list and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and user land as an nvlist uint64 array. * * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in * order to keep subsequent members at their known fixed offsets. 
When * adding a new field it must be added to the end the structure. */ #define VS_ZIO_TYPES 6 typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* supported by device */ uint64_t vs_trim_bytes_done; /* bytes trimmed */ uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ } vdev_stat_t; /* BEGIN CSTYLED */ #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof (uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) /* END CSTYLED */ /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist. */ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in * before this. */ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? 
MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * TRIM functions. */ typedef enum pool_trim_func { POOL_TRIM_START, POOL_TRIM_CANCEL, POOL_TRIM_SUSPEND, POOL_TRIM_FUNCS } pool_trim_func_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; typedef enum { VDEV_TRIM_NONE, VDEV_TRIM_ACTIVE, VDEV_TRIM_CANCELED, VDEV_TRIM_SUSPENDED, VDEV_TRIM_COMPLETE, } vdev_trim_state_t; typedef enum { VDEV_REBUILD_NONE, VDEV_REBUILD_ACTIVE, VDEV_REBUILD_CANCELED, VDEV_REBUILD_COMPLETE, } vdev_rebuild_state_t; /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* * Core features - 81/128 numbers reserved. 
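 *
 * As a worked example derived from the per-entry comments below (Linux
 * numbering): ZFS_IOC_FIRST is ('Z' << 8) == 0x5a00, so ZFS_IOC_POOL_CREATE
 * is 0x5a00 and ZFS_IOC_WAIT_FS is 0x5a54; on FreeBSD the same range simply
 * starts at 0.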
*/ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), #endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_RECV_NEW, /* 0x5a46 */ ZFS_IOC_POOL_SYNC, /* 0x5a47 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ ZFS_IOC_WAIT_FS, /* 0x5a54 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
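 *
 * For example (again derived from the comments below): the platform range
 * begins at ZFS_IOC_FIRST + 0x80, so on Linux ZFS_IOC_EVENTS_NEXT works out
 * to 0x5a81.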
*/ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ ZFS_IOC_SET_BOOTENV, /* 0x87 (Linux) */ ZFS_IOC_GET_BOOTENV, /* 0x88 (Linux) */ ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. * * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, ZFS_ERR_WRONG_PARENT, ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; typedef enum { ZPOOL_WAIT_CKPT_DISCARD, ZPOOL_WAIT_FREE, ZPOOL_WAIT_INITIALIZE, ZPOOL_WAIT_REPLACE, ZPOOL_WAIT_REMOVE, ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; typedef enum { ZFS_WAIT_DELETEQ, ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. 
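 *
 * A minimal sketch of the request nvlist, pairing these keys with the
 * pool_trim_func_t values defined earlier (illustrative only; the
 * authoritative construction is done by the callers of the ioctl):
 *
 *	ZPOOL_TRIM_COMMAND -> POOL_TRIM_START, POOL_TRIM_CANCEL or
 *	                      POOL_TRIM_SUSPEND
 *	ZPOOL_TRIM_VDEVS   -> nvlist of vdevs to operate on
 *	ZPOOL_TRIM_RATE    -> requested rate, if any (uint64)
 *	ZPOOL_TRIM_SECURE  -> whether a secure TRIM was requested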
*/ #define ZPOOL_TRIM_COMMAND "trim_command" #define ZPOOL_TRIM_VDEVS "trim_vdevs" #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ #define ZPOOL_WAIT_ACTIVITY "wait_activity" #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_WAIT_FS. */ #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_FINISH * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. 
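 *
 * For example (a hypothetical consumer, not something defined here): a
 * zed-style script handling ESC_ZFS_RESILVER_FINISH would read the
 * ZFS_EV_POOL_NAME, ZFS_EV_POOL_GUID and ZFS_EV_RESILVER_TYPE members
 * listed above from the event payload.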
*/ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ Index: vendor-sys/openzfs/dist/lib/libspl/include/os/freebsd/sys/mount.h =================================================================== --- vendor-sys/openzfs/dist/lib/libspl/include/os/freebsd/sys/mount.h (revision 364927) +++ vendor-sys/openzfs/dist/lib/libspl/include/os/freebsd/sys/mount.h (revision 364928) @@ -1,108 +1,104 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBSPL_SYS_MOUNT_H #define _LIBSPL_SYS_MOUNT_H #undef _SYS_MOUNT_H_ #include_next #include #include #include -/* - * Some old glibc headers don't define BLKGETSIZE64 - * and we don't want to require the kernel headers - */ #if !defined(BLKGETSIZE64) -#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#define BLKGETSIZE64 DIOCGMEDIASIZE #endif /* * Some old glibc headers don't correctly define MS_DIRSYNC and * instead use the enum name S_WRITE. When using these older * headers define MS_DIRSYNC to be S_WRITE. */ #if !defined(MS_DIRSYNC) #define MS_DIRSYNC S_WRITE #endif /* * Some old glibc headers don't correctly define MS_POSIXACL and * instead leave it undefined. When using these older headers define * MS_POSIXACL to the reserved value of (1<<16). */ #if !defined(MS_POSIXACL) #define MS_POSIXACL (1<<16) #endif #define MS_NOSUID MNT_NOSUID #define MS_NOEXEC MNT_NOEXEC #define MS_NODEV 0 #define S_WRITE 0 #define MS_BIND 0 #define MS_REMOUNT 0 #define MS_SYNCHRONOUS MNT_SYNCHRONOUS #define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) #define MS_OWNER (MS_NOSUID|MS_NODEV) #define MS_GROUP (MS_NOSUID|MS_NODEV) #define MS_COMMENT 0 /* * Older glibc headers did not define all the available * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the * kernel back to 2.4.11 so we define them correctly if they are missing. 
*/ #ifdef MNT_FORCE #define MS_FORCE MNT_FORCE #else #define MS_FORCE 0x00000001 #endif /* MNT_FORCE */ #ifdef MNT_DETACH #define MS_DETACH MNT_DETACH #else #define MS_DETACH 0x00000002 #endif /* MNT_DETACH */ /* * Overlay mount is default in Linux, but for solaris/zfs * compatibility, MS_OVERLAY is defined to explicitly have the user * provide a flag (-O) to mount over a non empty directory. */ #define MS_OVERLAY 0x00000004 /* * MS_CRYPT indicates that encryption keys should be loaded if they are not * already available. This is not defined in glibc, but it is never seen by * the kernel so it will not cause any problems. */ #define MS_CRYPT 0x00000008 #endif /* _LIBSPL_SYS_MOUNT_H */ Index: vendor-sys/openzfs/dist/lib/libzfs/libzfs_sendrecv.c =================================================================== --- vendor-sys/openzfs/dist/lib/libzfs/libzfs_sendrecv.c (revision 364927) +++ vendor-sys/openzfs/dist/lib/libzfs/libzfs_sendrecv.c (revision 364928) @@ -1,5170 +1,5202 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019 Datto Inc. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #include #include #include static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, const char *, nvlist_t *); static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_estimate; int pa_verbosity; } progress_arg_t; static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. 
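 *
 * A minimal usage sketch (assuming "fsavl" was built by fsavl_create()
 * below and "guid" came from the stream being received):
 *
 *	char *snapname;
 *	nvlist_t *nvfs = fsavl_find(fsavl, guid, &snapname);
 *	if (nvfs != NULL)
 *		... snapname now points at the short snapshot name ...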
*/ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = guid; /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ if (avl_find(fsavl, fn, NULL) == NULL) avl_add(fsavl, fn); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; nvlist_t *snapholds; /* user holds */ /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t raw; boolean_t doall; boolean_t replicate; boolean_t verbose; boolean_t backup; boolean_t seenfrom; boolean_t seento; boolean_t holds; /* were holds requested with send -h */ boolean_t props; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * "snapholds" -> { name (lastname) -> { holdname -> crtime } } * * "origin" -> number (guid) (if clone) * "is_encroot" -> boolean * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; boolean_t isfromsnap, istosnap, istosnapwithnofrom; snapname = strrchr(zhp->zfs_name, '@')+1; isfromsnap = (sd->fromsnap != NULL && strcmp(sd->fromsnap, snapname) == 0); istosnap = (sd->tosnap != NULL && 
(strcmp(sd->tosnap, snapname) == 0)); istosnapwithnofrom = (istosnap && sd->fromsnap == NULL); if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, sd->tosnap); } zfs_close(zhp); return (0); } VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. */ if (isfromsnap || (sd->parent_fromsnap_guid == 0 && istosnap)) { sd->parent_fromsnap_guid = guid; } if (!sd->recursive) { if (!sd->seenfrom && isfromsnap) { sd->seenfrom = B_TRUE; zfs_close(zhp); return (0); } if ((sd->seento || !sd->seenfrom) && !istosnapwithnofrom) { zfs_close(zhp); return (0); } if (istosnap) sd->seento = B_TRUE; } VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, sd->backup, nv); VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); nvlist_free(nv); if (sd->holds) { nvlist_t *holds = fnvlist_alloc(); int err = lzc_get_holds(zhp->zfs_name, &holds); if (err == 0) { VERIFY(0 == nvlist_add_nvlist(sd->snapholds, snapname, holds)); } fnvlist_free(holds); } zfs_close(zhp); return (0); } static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) { nvlist_t *props = NULL; nvpair_t *elem = NULL; if (received_only) props = zfs_get_recvd_props(zhp); else props = zhp->zfs_props; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } verify(nvpair_value_nvlist(elem, &propnv) == 0); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. 
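 *
 * Put differently: when a source string is present it must name either
 * this dataset or ZPROP_SOURCE_VAL_RECVD for the property to be kept;
 * values inherited from an ancestor are skipped.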
*/ if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; verify(nvlist_lookup_string(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_string(nv, propname, value)); } else { uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_uint64(nv, propname, value)); } } } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs = NULL, *nv = NULL; int rv = 0; uint64_t min_txg = 0, max_txg = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * on the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - skip sending the current dataset if it was created later than * the parent tosnap * - return error if the current dataset was created earlier than * the parent tosnap */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = EZFS_NOENT; } goto out; } nvfs = fnvlist_alloc(); fnvlist_add_string(nvfs, "name", zhp->zfs_name); fnvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid); if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } fnvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid); zfs_close(origin); } /* iterate over props */ if (sd->props || sd->backup || sd->recursive) { nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); } if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { boolean_t encroot; /* determine if this dataset is an encryption root */ if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { rv = -1; goto out; } if (encroot) fnvlist_add_boolean(nvfs, "is_encroot"); /* * Encrypted datasets can only be sent with properties if * the raw flag is specified because the receive side doesn't * currently have a mechanism for recursively asking the user * for new encryption parameters. */ if (!sd->raw) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s: encrypted dataset %s may not " "be sent with properties without the raw flag\n"), sd->fsname, sd->tosnap, zhp->zfs_name); rv = -1; goto out; } } if (nv != NULL) fnvlist_add_nvlist(nvfs, "props", nv); /* iterate over snaps, and set sd->parent_fromsnap_guid */ sd->parent_fromsnap_guid = 0; sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) VERIFY(0 == nvlist_alloc(&sd->snapholds, NV_UNIQUE_NAME, 0)); /* * If this is a "doall" send, a replicate send or we're just trying * to gather a list of previous snapshots, iterate through all the * snaps in the txg range. Otherwise just look at the one we're * interested in. 
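 *
 * The txg bounds below are only narrowed for non-replicate sends: a known
 * fromsnap/tosnap creation txg limits the sorted iteration so snapshots
 * outside the requested range are never visited.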
*/ if (sd->doall || sd->replicate || sd->tosnap == NULL) { if (!sd->replicate && fromsnap_txg != 0) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sd->tosnap); if (sd->fromsnap != NULL) sd->seenfrom = B_TRUE; snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) (void) send_iterate_snap(snap, sd); } fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); if (sd->holds) fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); fnvlist_free(sd->parent_snaps); fnvlist_free(sd->snapprops); fnvlist_free(sd->snapholds); + /* Do not allow the size of the properties list to exceed the limit */ + if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > + zhp->zfs_hdl->libzfs_max_nvlist) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "warning: cannot send %s@%s: the size of the list of " + "snapshots and properties is too large to be received " + "successfully.\n" + "Select a smaller number of snapshots to send.\n"), + zhp->zfs_name, sd->tosnap); + rv = EZFS_NOSPC; + goto out; + } /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); fnvlist_add_nvlist(sd->fss, guidstring, nvfs); /* iterate over children */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; fnvlist_free(nv); fnvlist_free(nvfs); zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, boolean_t replicate, boolean_t verbose, boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.raw = raw; sd.doall = doall; sd.replicate = replicate; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { nvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { nvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; int verbosity; uint64_t size; } send_dump_data_t; static int zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, enum lzc_send_flags flags, 
uint64_t *spacep) { libzfs_handle_t *hdl = zhp->zfs_hdl; int error; assert(snapname != NULL); error = lzc_send_space(snapname, from, flags, spacep); if (error != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), snapname); switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, error, errbuf)); } } return (0); } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); if (fromsnap && fromsnap[0] != '\0') { VERIFY(0 == nvlist_add_string(thisdbg, "fromsnap", fromsnap)); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); if (debugnv) { VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); } nvlist_free(thisdbg); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv) VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); nvlist_free(thisdbg); return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. 
*/ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } int zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, uint64_t *blocks_visited) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = fd; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return (errno); if (bytes_written != NULL) *bytes_written = zc.zc_cookie; if (blocks_visited != NULL) *blocks_visited = zc.zc_objset_type; return (0); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_handle_t *zhp = pa->pa_zhp; uint64_t bytes; uint64_t blocks; char buf[16]; time_t t; struct tm *tm; boolean_t firstloop = B_TRUE; /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { int err; (void) sleep(1); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) return ((void *)0); return ((void *)(uintptr_t)err); } if (firstloop && !pa->pa_parsable) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? "BYTES" : " SENT", pa->pa_verbosity >= 2 ? " BLOCKS " : "", zhp->zfs_name); firstloop = B_FALSE; } (void) time(&t); tm = localtime(&t); if (pa->pa_verbosity >= 2 && pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, (u_longlong_t)bytes, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_verbosity >= 2) { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %8llu %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, (u_longlong_t)bytes, zhp->zfs_name); } else { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, zhp->zfs_name); } } } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, "incremental\t%s\t%s", fromsnap, tosnap); } else { (void) fprintf(fout, "full\t%s", tosnap); } } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } } if (parsable) { (void) fprintf(fout, "\t%llu", (longlong_t)size); } else if (size != 0) { char buf[16]; zfs_nicebytes(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); } (void) fprintf(fout, "\n"); } static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? 
stdout : stderr; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (sdd->raw) flags |= LZC_SEND_FLAG_RAW; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, thissnap, &snapprops)); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; if (sdd->prevsnap[0] != '\0') { (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); *(strchr(fromds, '@') + 1) = '\0'; (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); } if (zfs_send_space(zhp, zhp->zfs_name, sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) { size = 0; /* cannot estimate send space */ } else { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); } sdd->size += size; } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
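 *
 * The thread runs send_progress_thread() above, printing one status line
 * per second; it is cancelled and joined after dump_ioctl() returns, and a
 * non-zero, non-cancelled exit status is turned into a send error.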
*/ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = sdd->verbosity; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { void *status = NULL; (void) pthread_cancel(tid); (void) pthread_join(tid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited nonzero")); return (zfs_standard_error(zhp->zfs_hdl, error, errbuf)); } } } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } static int dump_filesystem(zfs_handle_t *zhp, void *arg) { int rv = 0; send_dump_data_t *sdd = arg; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = {"\0"}; uint64_t min_txg = 0, max_txg = 0; (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } if (sdd->replicate && sdd->fromsnap) { /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } } sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; /* * Iterate through all snapshots and process the ones we will be * sending. If we only have a "from" and "to" snapshot to deal * with, we can avoid iterating through all the other snapshots. 
*/ if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { if (!sdd->replicate && sdd->fromsnap != NULL) min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->fromsnap); if (!sdd->replicate && sdd->tosnap != NULL) max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->tosnap); rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; if (!sdd->seenfrom) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->fromsnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } if (rv == 0) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->tosnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } } if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } static int dump_filesystems(zfs_handle_t *rzhp, void *arg) { send_dump_data_t *sdd = arg; nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; VERIFY(0 == nvlist_lookup_nvlist(origin_nv, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, snapname, &snapprops)); VERIFY(0 == nvlist_add_boolean( snapprops, "is_clone_origin")); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* parent has not been sent; skip this one */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. 
*/ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); VERIFY(nvlist_add_boolean(fslist, "sent") == 0); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* clean out the sent flags in case we reuse this fss */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread, i; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. */ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* convert hexadecimal representation to binary */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* verify checksum */ zio_cksum_t cksum; fletcher_4_native_varsize(compressed, len, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* uncompress */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* unpack nvlist */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } static enum lzc_send_flags lzc_flags_from_sendflags(const sendflags_t *flags) { enum lzc_send_flags lzc_flags = 0; if (flags->largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, const char *redactbook, char *errbuf) { uint64_t size; FILE *fout = flags->dryrun ? 
stdout : stderr; progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_TRUE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, redactbook, fd, &size); if (flags->progress) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited " "nonzero")); return (zfs_standard_error(zhp->zfs_hdl, error, errbuf)); } } if (err != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } send_print_verbose(fout, zhp->zfs_name, from, size, flags->parsable); if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); } else { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } return (0); } static boolean_t redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } static boolean_t redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, const uint64_t *snaps2, uint64_t num_snaps2) { if (num_snaps1 != num_snaps2) return (B_FALSE); for (int i = 0; i < num_snaps1; i++) { if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) return (B_FALSE); } return (B_TRUE); } /* * Check that the list of redaction snapshots in the bookmark matches the send * we're resuming, and return whether or not it's complete. * * Note that the caller needs to free the contents of *bookname with free() if * this function returns successfully. 
*/ static int find_redact_book(libzfs_handle_t *hdl, const char *path, const uint64_t *redact_snap_guids, int num_redact_snaps, char **bookname) { char errbuf[1024]; int error = 0; nvlist_t *props = fnvlist_alloc(); nvlist_t *bmarks; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); fnvlist_add_boolean(props, "redact_complete"); fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); error = lzc_get_bookmarks(path, props, &bmarks); nvlist_free(props); if (error != 0) { if (error == ESRCH) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nonexistent redaction bookmark provided")); } else if (error == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset to be sent no longer exists")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unknown error: %s"), strerror(error)); } return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } nvpair_t *pair; for (pair = nvlist_next_nvpair(bmarks, NULL); pair; pair = nvlist_next_nvpair(bmarks, pair)) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); uint_t len = 0; uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, ZPROP_VALUE, &len); if (redact_snaps_equal(redact_snap_guids, num_redact_snaps, bmarksnaps, len)) { break; } } if (pair == NULL) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no appropriate redaction bookmark exists")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } char *name = nvpair_name(pair); nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); boolean_t complete = fnvlist_lookup_boolean_value(vallist, ZPROP_VALUE); if (!complete) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incomplete redaction bookmark provided")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } *bookname = strndup(name, ZFS_MAX_DATASET_NAME_LEN); ASSERT3P(*bookname, !=, NULL); fnvlist_free(bmarks); return (0); } static int zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { char errbuf[1024]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? 
stdout : stderr; uint64_t *redact_snap_guids = NULL; int num_redact_snaps = 0; char *redact_book = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress || nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw || nvlist_exists(resume_nvl, "rawok")) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved || nvlist_exists(resume_nvl, "savedok")) lzc_flags |= LZC_SEND_FLAG_SAVED; if (flags->saved) { (void) strcpy(name, toname); } else { error = guid_to_name(hdl, toname, toguid, B_FALSE, name); if (error != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot " "used in the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no " "longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { num_redact_snaps = -1; } if (fromguid != 0) { if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } redact_snap_guids = NULL; if (nvlist_lookup_uint64_array(resume_nvl, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, (uint_t *)&num_redact_snaps) == 0) { char path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(path, toname, sizeof (path)); char *at = strchr(path, '@'); ASSERT3P(at, !=, NULL); *at = '\0'; if ((error = find_redact_book(hdl, path, redact_snap_guids, num_redact_snaps, &redact_book)) != 0) { return (error); } } if (flags->verbosity != 0) { /* * Some of these may have come from the resume token, set them * here for size estimate purposes. */ sendflags_t tmpflags = *flags; if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) tmpflags.largeblock = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_COMPRESS) tmpflags.compress = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) tmpflags.embed_data = B_TRUE; error = estimate_size(zhp, fromname, outfd, &tmpflags, resumeobj, resumeoff, bytes, redact_book, errbuf); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { if (redact_book != NULL) free(redact_book); zfs_close(zhp); return (error); } } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff, redact_book); if (redact_book != NULL) free(redact_book); if (flags->progress) { void *status = NULL; (void) pthread_cancel(tid); (void) pthread_join(tid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited nonzero")); return (zfs_standard_error(hdl, error, errbuf)); } } char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source could not be found")); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } else { if (redact_book != NULL) free(redact_book); } zfs_close(zhp); return (error); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { int ret; char errbuf[1024]; nvlist_t *resume_nvl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); nvlist_free(resume_nvl); return (ret); } int zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, const char *resume_token) { int ret; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; uint64_t saved_guid = 0, resume_guid = 0; uint64_t obj = 0, off = 0, bytes = 0; char token_buf[ZFS_MAXPROPLEN]; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "saved send failed")); ret = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (ret != 0) goto out; saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); if (saved_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } /* * If a resume token is provided we use the object and offset * from that instead of the default, which starts from the * beginning. 
*/ if (resume_token != NULL) { resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &resume_guid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(saved_nvl, "toguid", &saved_guid)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset's resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (resume_guid != saved_guid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token does not match dataset")); ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); goto out; } } (void) nvlist_remove_all(saved_nvl, "object"); fnvlist_add_uint64(saved_nvl, "object", obj); (void) nvlist_remove_all(saved_nvl, "offset"); fnvlist_add_uint64(saved_nvl, "offset", off); (void) nvlist_remove_all(saved_nvl, "bytes"); fnvlist_add_uint64(saved_nvl, "bytes", bytes); (void) nvlist_remove_all(saved_nvl, "toname"); fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); out: nvlist_free(saved_nvl); nvlist_free(resume_nvl); return (ret); } /* * This function informs the target system that the recursive send is complete. * The record is also expected in the case of a send -p. */ static int send_conclusion_record(int fd, zio_cksum_t *zc) { dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; if (write(fd, &drr, sizeof (drr)) == -1) { return (errno); } return (0); } /* * This function is responsible for sending the records that contain the * necessary information for the target system's libzfs to be able to set the * properties of the filesystem being received, or to be able to prepare for * a recursive receive. * * The "zhp" argument is the handle of the snapshot we are sending * (the "tosnap"). The "from" argument is the short snapshot name (the part * after the @) of the incremental source. 
*/ static int send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, boolean_t gather_props, boolean_t recursive, boolean_t verbose, boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, nvlist_t **fssp, avl_tree_t **fsavlp) { int err = 0; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { {0} }; int featureflags = 0; /* name of filesystem/volume that contains snapshot we are sending */ char tofs[ZFS_MAX_DATASET_NAME_LEN]; /* short name of snap we are sending */ char *tosnap = ""; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } if (holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); char *at = strchr(tofs, '@'); if (at != NULL) { *at = '\0'; tosnap = at + 1; } if (gather_props) { nvlist_t *hdrnv = fnvlist_alloc(); nvlist_t *fss = NULL; if (from != NULL) fnvlist_add_string(hdrnv, "fromsnap", from); fnvlist_add_string(hdrnv, "tosnap", tosnap); if (!recursive) fnvlist_add_boolean(hdrnv, "not_recursive"); if (raw) { VERIFY0(nvlist_add_boolean(hdrnv, "raw")); } if ((err = gather_nvlist(zhp->zfs_hdl, tofs, from, tosnap, recursive, raw, doall, replicate, verbose, backup, holds, props, &fss, fsavlp)) != 0) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } + /* + * Do not allow the size of the properties list to exceed + * the limit + */ + if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > + zhp->zfs_hdl->libzfs_max_nvlist) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " + "the size of the list of snapshots and properties " + "is too large to be received successfully.\n" + "Select a smaller number of snapshots to send.\n"), + zhp->zfs_name); + return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, + errbuf)); + } fnvlist_add_nvlist(hdrnv, "fss", fss); VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0)); if (fssp != NULL) { *fssp = fss; } else { nvlist_free(fss); } nvlist_free(hdrnv); } if (!dryrun) { dmu_replay_record_t drr = { 0 }; /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); if (snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, fd); free(packbuf); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } err = send_conclusion_record(fd, &zc); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } } return (0); } /* * Generate a send stream. The "zhp" argument is the filesystem/volume * that contains the snapshot to send. The "fromsnap" argument is the * short name (the part after the '@') of the snapshot that is the * incremental source to send from (if non-NULL). The "tosnap" argument * is the short name of the snapshot to send. 
* * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; pthread_t tid = 0; int pipefd[2]; int featureflags = 0; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { uint64_t version; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } if (flags->holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; if (flags->replicate || flags->doall || flags->props || flags->holds || flags->backup) { char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), "%s@%s", zhp->zfs_name, tosnap) >= sizeof (full_tosnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, full_tosnap_name, ZFS_TYPE_SNAPSHOT); if (tosnap == NULL) { err = -1; goto err_out; } err = send_prelim_records(tosnap, fromsnap, outfd, flags->replicate || flags->props || flags->holds, flags->replicate, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->backup, flags->holds, flags->props, flags->doall, &fss, &fsavl); zfs_close(tosnap); if (err != 0) goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.raw = flags->raw; sdd.holds = flags->holds; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. 
*/ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicebytes(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. */ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (tid != 0) { if (err != 0) (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props || flags->backup || flags->holds)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ err = send_conclusion_record(outfd, NULL); if (err != 0) return (zfs_standard_error(zhp->zfs_hdl, err, errbuf)); } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); nvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); if (tid != 0) { (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } return (err); } static zfs_handle_t * name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { char dirname[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(dirname, '@'); if (c != NULL) *c = '\0'; return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); } /* * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either * an earlier snapshot in the same filesystem, or a snapshot before later's * origin, or it's origin's origin, etc. */ static boolean_t snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) { boolean_t ret; uint64_t later_txg = (later->zfs_type == ZFS_TYPE_FILESYSTEM || later->zfs_type == ZFS_TYPE_VOLUME ? 
UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); if (earlier_txg >= later_txg) return (B_FALSE); zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, earlier->zfs_name); zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, later->zfs_name); if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_TRUE); } char clonename[ZFS_MAX_DATASET_NAME_LEN]; if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_FALSE); } zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, ZFS_TYPE_DATASET); uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); /* * If "earlier" is exactly the origin, then * snapshot_is_before(earlier, origin) will return false (because * they're the same). */ if (origin_txg == earlier_txg && strcmp(origin->zfs_name, earlier->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); zfs_close(origin); return (B_TRUE); } zfs_close(earlier_dir); zfs_close(later_dir); ret = snapshot_is_before(earlier, origin); zfs_close(origin); return (ret); } /* * The "zhp" argument is the handle of the dataset to send (typically a * snapshot). The "from" argument is the full name of the snapshot or * bookmark that is the incremental source. */ int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; int orig_fd = fd; pthread_t ptid; progress_arg_t pa = { 0 }; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), name); if (from != NULL && strchr(from, '@')) { zfs_handle_t *from_zhp = zfs_open(hdl, from, ZFS_TYPE_DATASET); if (from_zhp == NULL) return (-1); if (!snapshot_is_before(from_zhp, zhp)) { zfs_close(from_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } zfs_close(from_zhp); } if (redactbook != NULL) { char bookname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *redact_snaps; zfs_handle_t *book_zhp; char *at, *pound; int dsnamelen; pound = strchr(redactbook, '#'); if (pound != NULL) redactbook = pound + 1; at = strchr(name, '@'); if (at == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot do a redacted send to a filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } dsnamelen = at - name; if (snprintf(bookname, sizeof (bookname), "%.*s#%s", dsnamelen, name, redactbook) >= sizeof (bookname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid bookmark name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); if (book_zhp == NULL) return (-1); if (nvlist_lookup_nvlist(book_zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snaps) != 0 || redact_snaps == NULL) { zfs_close(book_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a redaction bookmark")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } zfs_close(book_zhp); } /* * Send fs properties */ if (flags->props || flags->holds || flags->backup) { /* * Note: the header generated by send_prelim_records() * assumes that the incremental source is in the same * filesystem/volume as the target (which is a requirement * when doing "zfs send -R"). But that isn't always the * case here (e.g. 
send from snap in origin, or send from * bookmark). We pass from=NULL, which will omit this * information from the prelim records; it isn't used * when receiving this type of stream. */ err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->backup, flags->holds, flags->props, flags->doall, NULL, NULL); if (err != 0) return (err); } /* * Perform size estimate if verbose was specified. */ if (flags->verbosity != 0) { err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, errbuf); if (err != 0) return (err); } if (flags->dryrun) return (0); /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); if (flags->progress) { void *status = NULL; if (err != 0) (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited " "nonzero")); return (zfs_standard_error(hdl, error, errbuf)); } } if (flags->props || flags->holds || flags->backup) { /* Write the final end record. */ err = send_conclusion_record(orig_fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFAULT: case EFBIG: case EINVAL: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; - assert(ilen <= SPA_MAXBLOCKSIZE); - do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) fletcher_4_incremental_byteswap(buf, ilen, zc); else fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (buf == NULL) return 
(ENOMEM); + if (len > hdl->libzfs_max_nvlist) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); + return (ENOMEM); + } + err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } /* * Returns the grand origin (origin of origin of origin...) of a given handle. * If this dataset is not a clone, it simply returns a copy of the original * handle. */ static zfs_handle_t * recv_open_grand_origin(zfs_handle_t *zhp) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *ozhp = zfs_handle_dup(zhp); while (ozhp != NULL) { if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) break; (void) zfs_close(ozhp); ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); } return (ozhp); } static int recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) { int err; zfs_handle_t *ozhp = NULL; /* * Attempt to rename the dataset. If it fails with EACCES we have * attempted to rename the dataset outside of its encryption root. * Force the dataset to become an encryption root and try again. */ err = lzc_rename(name, newname); if (err == EACCES) { ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = ENOENT; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = lzc_rename(name, newname); } out: if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp = NULL; zfs_handle_t *zhp = NULL; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (clp == NULL) { err = -1; goto out; } err = changelist_prefix(clp); if (err) goto out; if (tryname) { (void) strcpy(newname, tryname); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); out: if (clp != NULL) changelist_free(clp); if (zhp != NULL) zfs_close(zhp); return (err); } static int recv_promote(libzfs_handle_t *hdl, const char *fsname, const char *origin_fsname, recvflags_t *flags) { int err; zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL, *ozhp = NULL; if (flags->verbose) (void) printf("promoting %s\n", fsname); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); /* * Attempt to promote the dataset. 
If it fails with EACCES the * promotion would cause this dataset to leave its encryption root. * Force the origin to become an encryption root and try again. */ err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (err == EACCES) { zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = -1; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); } out: if (zhp != NULL) zfs_close(zhp); if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. */ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; uint64_t *redact_snap_guids; uint64_t num_redact_snaps; } guid_to_name_data_t; static boolean_t redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) { uint64_t *bmark_snaps; uint_t bmark_num_snaps; nvlist_t *nvl; if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) return (B_FALSE); nvl = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, &bmark_num_snaps); if (bmark_num_snaps != gtnd->num_redact_snaps) return (B_FALSE); int i = 0; for (; i < bmark_num_snaps; i++) { int j = 0; for (; j < bmark_num_snaps; j++) { if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) break; } if (j == bmark_num_snaps) break; } return (i == bmark_num_snaps); } static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. 
In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. * * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or * -1, then redact_snap_guids will be an array of the guids of the snapshots the * redaction bookmark was created with. If num_redact_snaps is -1, then we will * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the * given guid. Note that a redaction bookmark can be returned if * num_redact_snaps == -1. */ static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; gtnd.redact_snap_guids = redact_snap_guids; gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, >nd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, -1, name)); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. 
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname = NULL, *snapname = NULL; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } /* * This function reestablishes the hierarchy of encryption roots after a * recursive incremental receive has completed. This must be done after the * second call to recv_incremental_replication() has renamed and promoted all * sent datasets to their final locations in the dataset hierarchy. */ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv, avl_tree_t *stream_avl) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; nvlist_t *snaps, *props, *stream_nvfs = NULL; nvpair_t *snapel = NULL; boolean_t is_encroot, is_clone, stream_encroot; char *cp; char *stream_keylocation = NULL; char keylocation[MAXNAMELEN]; char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; VERIFY(0 == nvpair_value_nvlist(fselem, &stream_nvfs)); VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "snaps", &snaps)); VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "props", &props)); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ err = ENOENT; while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; VERIFY(0 == nvpair_value_uint64(snapel, &guid)); err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; } if (err != 0) continue; cp = strchr(fsname, '@'); if (cp != NULL) *cp = '\0'; zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, * force it to become one. Fixup the keylocation if necessary. */ if (stream_encroot) { if (!is_clone && !is_encroot) { err = lzc_change_key(fsname, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } VERIFY(0 == nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &stream_keylocation)); /* * Refresh the properties in case the call to * lzc_change_key() changed the value. 
*/ zfs_refresh_properties(zhp); err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, keylocation, sizeof (keylocation), NULL, NULL, 0, B_TRUE); if (err != 0) { zfs_close(zhp); goto error; } if (strcmp(keylocation, stream_keylocation) != 0) { err = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), stream_keylocation); if (err != 0) { zfs_close(zhp); goto error; } } } /* * If the dataset is not flagged as an encryption root and is * currently an encryption root, force it to inherit from its * parent. The root of a raw send should never be * force-inherited. */ if (!stream_encroot && is_encroot && strcmp(top_zfs, fsname) != 0) { err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } zfs_close(zhp); } return (0); error: return (err); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", &parent_fromsnap_guid)); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! */ nvlist_t *origin_nvfs; char *origin_fsname; origin_nvfs = fsavl_find(local_avl, originguid, NULL); VERIFY(0 == nvlist_lookup_string(origin_nvfs, "name", &origin_fsname)); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); nvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. 
*/ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = {"\0"}; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, props) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } VERIFY(0 == nvlist_lookup_string(stream_nvfs, "name", &stream_fsname)); VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. */ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. 
*/ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; VERIFY(0 == nvlist_lookup_string(parent, "name", &pname)); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { VERIFY(0 == nvlist_add_boolean(renamed, newname)); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); nvlist_free(local_nv); nvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain || error != 0); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive, raw; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. 
*/ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { VERIFY(0 == nvlist_alloc(&renamed, NV_UNIQUE_NAME, 0)); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } nvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. 
*/ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } if (raw && softerr == 0 && *top_zfs != NULL) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv, stream_avl); } out: fsavl_destroy(stream_avl); nvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); uint64_t payload_size; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); drr->drr_u.drr_object.drr_raw_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_raw_bonuslen); } payload_size = DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); + assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); drr->drr_u.drr_spill.drr_compressed_size = BSWAP_64(drr->drr_u.drr_spill. drr_compressed_size); } payload_size = DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); free(buf); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? 
"checksum mismatch" : "incomplete stream"))); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Prepare a new nvlist of properties that are to override (-o) or be excluded * (-x) from the received dataset * recvprops: received properties from the send stream * cmdprops: raw input properties from command line * origprops: properties, both locally-set and received, currently set on the * target dataset if it exists, NULL otherwise. * oxprops: valid output override (-o) and excluded (-x) properties */ static int zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs, boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops, nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out, uint_t *wkeylen_out, const char *errbuf) { nvpair_t *nvp; nvlist_t *oprops, *voprops; zfs_handle_t *zhp = NULL; zpool_handle_t *zpool_hdl = NULL; char *cp; int ret = 0; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; if (nvlist_empty(cmdprops)) return (0); /* No properties to override or exclude */ *oxprops = fnvlist_alloc(); oprops = fnvlist_alloc(); strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN); /* * Get our dataset handle. The target dataset may not exist yet. 
*/ if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) { zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET); if (zhp == NULL) { ret = -1; goto error; } } /* open the zpool handle */ cp = strchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; zpool_hdl = zpool_open(hdl, namebuf); if (zpool_hdl == NULL) { ret = -1; goto error; } /* restore namebuf to match fsname for later use */ if (cp != NULL) *cp = '/'; /* * first iteration: process excluded (-x) properties now and gather * added (-o) properties to be later processed by zfs_valid_proplist() */ nvp = NULL; while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); /* "origin" is processed separately, don't handle it here */ if (prop == ZFS_PROP_ORIGIN) continue; /* * we're trying to override or exclude a property that does not * make sense for this type of dataset, but we don't want to * fail if the receive is recursive: this comes in handy when * the send stream contains, for instance, a child ZVOL and * we're trying to receive it with "-o atime=on" */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { if (recursive) continue; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' does not apply to datasets of this " "type"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* raw streams can't override encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set or excluded for raw streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* incremental streams can only exclude encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && !newfs && nvpair_type(nvp) != DATA_TYPE_BOOLEAN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set for incremental streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } switch (nvpair_type(nvp)) { case DATA_TYPE_BOOLEAN: /* -x property */ /* * DATA_TYPE_BOOLEAN is the way we're asked to "exclude" * a property: this is done by forcing an explicit * inherit on the destination so the effective value is * not the one we received from the send stream. * We do this only if the property is not already * locally-set, in which case its value will take * priority over the received anyway. */ if (nvlist_exists(origprops, name)) { nvlist_t *attrs; char *source = NULL; attrs = fnvlist_lookup_nvlist(origprops, name); if (nvlist_lookup_string(attrs, ZPROP_SOURCE, &source) == 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } /* * We can't force an explicit inherit on non-inheritable * properties: if we're asked to exclude this kind of * values we remove them from "recvprops" input nvlist. 
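 *
 * For example (property names are illustrative only): excluding an
 * inheritable property with "-x compression" lands in *oxprops and
 * becomes an explicit inherit on the destination, while excluding a
 * non-inheritable one such as "-x quota" simply drops the received
 * value from recvprops.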
*/ if (!zfs_prop_inheritable(prop) && !zfs_prop_user(name) && /* can be inherited too */ nvlist_exists(recvprops, name)) fnvlist_remove(recvprops, name); else fnvlist_add_nvpair(*oxprops, nvp); break; case DATA_TYPE_STRING: /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be a string or boolean"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (toplevel) { /* convert override strings properties to native */ if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET, oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) { ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * zfs_crypto_create() requires the parent name. Get it * by truncating the fsname copy stored in namebuf. */ cp = strrchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; if (!raw && zfs_crypto_create(hdl, namebuf, voprops, NULL, B_FALSE, wkeydata_out, wkeylen_out) != 0) { fnvlist_free(voprops); ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto error; } /* second pass: process "-o" properties */ fnvlist_merge(*oxprops, voprops); fnvlist_free(voprops); } else { /* override props on child dataset are inherited */ nvp = NULL; while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); fnvlist_add_boolean(*oxprops, name); } } error: if (zhp != NULL) zfs_close(zhp); if (zpool_hdl != NULL) zpool_close(zpool_hdl); fnvlist_free(oprops); return (ret); } /* * Restores a backup of tosnap from the file descriptor specified by infd. */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { time_t begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; boolean_t newprops = B_FALSE; uint64_t read_bytes = 0; uint64_t errflags = 0; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; nvlist_t *snapholds_nvlist = NULL; zprop_errflags_t prop_errflags; nvlist_t *prop_errors = NULL; boolean_t recursive; char *snapname = NULL; char destsnap[MAXPATHLEN * 2]; char origin[MAXNAMELEN]; char name[MAXPATHLEN]; char tmp_keylocation[MAXNAMELEN]; nvlist_t *rcvprops = NULL; /* props received from the send stream */ nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */ nvlist_t *origprops = NULL; /* original props (if destination exists) */ zfs_type_t type; boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; begin_time = time(NULL); bzero(origin, MAXNAMELEN); bzero(tmp_keylocation, MAXNAMELEN); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); /* Did the user request holds be skipped via zfs recv -k? 
*/ boolean_t holds = flags->holds && !flags->skipholds; if (stream_avl != NULL) { char *keylocation = NULL; nvlist_t *lookup = NULL; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { VERIFY(0 == nvlist_alloc(&rcvprops, NV_UNIQUE_NAME, 0)); newprops = B_TRUE; } /* * The keylocation property may only be set on encryption roots, * but this dataset might not become an encryption root until * recv_fix_encryption_hierarchy() is called. That function * will fixup the keylocation anyway, so we temporarily unset * the keylocation for now to avoid any errors from the receive * ioctl. */ err = nvlist_lookup_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (err == 0) { strcpy(tmp_keylocation, keylocation); (void) nvlist_remove_all(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); } if (flags->canmountoff) { VERIFY(0 == nvlist_add_uint64(rcvprops, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } else if (newprops) { /* nothing in rcvprops, eliminate it */ nvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { VERIFY(0 == nvlist_lookup_nvlist(lookup, snapname, &snapprops_nvlist)); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { VERIFY(0 == nvlist_lookup_nvlist(lookup, snapname, &snapholds_nvlist)); } } } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = malloc(len + 2); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). 
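 *
 * For illustration, assuming a stream generated from "tank/x/a@s1"
 * (so sendfs is "tank/x/a") received into "pool/dst", the resulting
 * destination snapshot would be:
 *     no -d or -e:              pool/dst@s1
 *     -d (isprefix):            pool/dst/x/a@s1  (pool name chopped)
 *     -e (istail):              pool/dst/a@s1    (only the tail kept)
 *     exact "pool/dst@new":     pool/dst@new     (stream name chopped
 *                                                 entirely)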
*/ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || strchr(sendfs, '/') == NULL); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot. */ (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); free(cp); if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } /* * Determine the name of the origin snapshot. */ if (originsnap) { (void) strlcpy(origin, originsnap, sizeof (origin)); if (flags->verbose) (void) printf("using provided clone origin %s\n", origin); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, destsnap, drrb->drr_fromguid, B_FALSE, origin) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), destsnap); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } if (flags->verbose) (void) printf("found clone origin %s\n", origin); } if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_DEDUP)) { (void) fprintf(stderr, gettext("ERROR: \"zfs receive\" no longer supports " "deduplicated send streams. Use\n" "the \"zstream redup\" command to convert this stream " "to a regular,\n" "non-deduplicated stream.\n")); err = zfs_error(hdl, EZFS_NOTSUP, errbuf); goto out; } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW; boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strcpy(name, destsnap); cp = strrchr(name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(destsnap, '/')); if (guid_to_name(hdl, name, parent_snapguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strcat(destsnap, suffix); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. 
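 *
 * Otherwise guid_to_name() below may still locate the filesystem under
 * a different name, for instance if it was renamed locally since the
 * previous receive, and destsnap is rewritten to point at it.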
*/ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(destsnap, '@')); if (guid_to_name(hdl, name, drrb->drr_fromguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strcat(destsnap, snap); } } } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp; boolean_t encrypted; (void) strcpy(zc.zc_name, name); /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; if (!flags->force) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && strrchr(name, '/') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s is the root dataset\n" "cannot overwrite with a ZVOL"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" "cannot overwrite with a ZVOL"), zc.zc_name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } if ((zhp = zfs_open(hdl, name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { err = -1; goto out; } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } /* * Raw sends can not be performed as an incremental on top * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same * time. The -F flag may still be used for deleting * intermediate snapshots that would otherwise prevent the * receive from working. 
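 *
 * Concretely (command lines for illustration only): a raw incremental
 * such as "zfs send -w -i ... | zfs recv pool/plain" on top of an
 * existing unencrypted dataset is rejected below, as is using -F to
 * replace an encrypted dataset, or to replace an unencrypted one with
 * a raw (encrypted) full stream.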
*/ encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF; if (!stream_wantsnewfs && !encrypted && raw) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot perform raw receive on top of " "existing unencrypted dataset")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (stream_wantsnewfs && flags->force && ((raw && !encrypted) || encrypted)) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive -F cannot be used to destroy an " "encrypted filesystem or overwrite an " "unencrypted one with an encrypted one")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; goto out; } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); err = -1; goto out; } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; /* we want to know if we're zoned when validating -o|-x props */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); /* may need this info later, get it now we have zhp around */ if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) hastoken = B_TRUE; /* gather existing properties on destination */ origprops = fnvlist_alloc(); fnvlist_merge(origprops, zhp->zfs_props); fnvlist_merge(origprops, zhp->zfs_user_props); zfs_close(zhp); } else { zfs_handle_t *zhp; /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ cp = strrchr(name, '/'); if (!stream_wantsnewfs || cp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), name); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. */ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, destsnap, strlen(tosnap)) != 0) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } /* validate parent */ zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent '%s' is not a filesystem"), name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); zfs_close(zhp); goto out; } zfs_close(zhp); newfs = B_TRUE; *cp = '/'; } if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); } if (flags->dryrun) { void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); /* * We have read the DRR_BEGIN record, but we have * not yet read the payload. For non-dryrun sends * this will be done by the kernel, so we must * emulate that here, before attempting to read * more records. 
*/ err = recv_read(hdl, infd, buf, drr->drr_payloadlen, flags->byteswap, NULL); free(buf); if (err != 0) goto out; err = recv_skip(hdl, infd, flags->byteswap); goto out; } /* * If this is the top-level dataset, record it so we can use it * for recursive operations later. */ if (top_zfs != NULL && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { toplevel = B_TRUE; if (*top_zfs == NULL) *top_zfs = zfs_strdup(hdl, name); } if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { type = ZFS_TYPE_FILESYSTEM; } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type: 0x%d"), drrb->drr_type); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; /* * When sending with properties (zfs send -p), the encryption property * is not included because it is a SETONCE property and therefore * treated as read only. However, we are always able to determine its * value because raw sends will include it in the DRR_BDEGIN payload * and non-raw sends with properties are not allowed for encrypted * datasets. Therefore, if this is a non-raw properties stream, we can * infer that the value should be ZIO_CRYPT_OFF and manually add that * to the received properties. */ if (stream_wantsnewfs && !raw && rcvprops != NULL && !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { if (oxprops == NULL) oxprops = fnvlist_alloc(); fnvlist_add_uint64(oxprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); ioctl_errno = ioctl_err; prop_errflags = errflags; if (err == 0) { nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. 
*/ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), name); zfs_setprop_error(hdl, prop, intval, tbuf); } } } if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc = {"\0"}; (void) strcpy(zc.zc_name, destsnap); zc.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } if (err == 0 && snapholds_nvlist) { nvpair_t *pair; nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { VERIFY(0 == nvlist_add_string(holds, destsnap, nvpair_name(pair))); } (void) lzc_hold(holds, cleanup_fd, &errors); nvlist_free(snapholds_nvlist); nvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(destsnap, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. */ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); nvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", destsnap); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(destsnap, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), destsnap); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key does not match " "existing key")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "inherited key must be loaded")); } (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); break; case EEXIST: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), destsnap); *cp = '@'; break; case EINVAL: if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); } else if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: recv_ecksum_set_aux(hdl, destsnap, flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental send 
stream requires -L " "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded."), name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. See errata %u at " - "https://zfsonlinux.org/msg/ZFS-8000-ER."), + "https://openzfs.github.io/openzfs-docs/msg/" + "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid mismatch. See the 'zfs receive' " "man page section\n discussing the limitations " "of raw encrypted send streams.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Spill block flag missing for raw send.\n" "The zfs software on the sending system must " "be updated.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case EBUSY: if (hastoken) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s contains " "partially-complete state from " "\"zfs receive -s\"."), name); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } /* fallthru */ default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) flags->domount = B_TRUE; if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) { err = -1; goto out; } if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = read_bytes; time_t delta = time(NULL) - begin_time; if (delta == 0) delta = 1; zfs_nicebytes(bytes, buf1, sizeof (buf1)); zfs_nicebytes(bytes/delta, buf2, sizeof (buf1)); (void) printf("received %s stream in %lld seconds (%s/sec)\n", buf1, (longlong_t)delta, buf2); } err = 0; out: if (prop_errors != NULL) nvlist_free(prop_errors); if (tmp_keylocation[0] != '\0') { VERIFY(0 == nvlist_add_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation)); } if (newprops) nvlist_free(rcvprops); nvlist_free(oxprops); nvlist_free(origprops); return (err); } /* * Check properties we were asked to override (both -o|-x) */ static boolean_t zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, const char *errbuf) { nvpair_t *nvp; zfs_prop_t prop; const char *name; nvp = NULL; while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { name = nvpair_name(nvp); prop = zfs_name_to_prop(name); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), name); return (B_FALSE); } continue; } /* * "origin" is readonly but is used to receive datasets as * clones so we 
don't raise an error here */ if (prop == ZFS_PROP_ORIGIN) continue; /* encryption params have their own verification later */ if (prop == ZFS_PROP_ENCRYPTION || zfs_prop_encryption_key_param(prop)) continue; /* * cannot override readonly, set-once and other specific * settable properties */ if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION || prop == ZFS_PROP_VOLSIZE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), name); return (B_FALSE); } } return (B_TRUE); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { { 0 } }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* check cmdline props, raise an error if they cannot be received */ if (!zfs_receive_checkprops(hdl, cmdprops, errbuf)) { return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. */ bzero(&zcksum, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = %lx"), featureflags); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } /* Holds feature is set once in the compound stream header. 
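 *
 * The flag is recorded in the compound stream header, so latch it into
 * the caller-visible flags here, before dispatching below to
 * zfs_receive_one() for a plain DMU_SUBSTREAM or to
 * zfs_receive_package() for a DMU_COMPOUNDSTREAM (e.g. "zfs send -R").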
*/ if (featureflags & DMU_BACKUP_FEATURE_HOLDS) flags->holds = B_TRUE; if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, finalsnap, cmdprops)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cmdprops)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; struct stat sb; char *originsnap = NULL; /* * The only way fstat can fail is if we do not have a valid file * descriptor. */ if (fstat(infd, &sb) == -1) { perror("fstat"); return (-2); } /* * It is not uncommon for gigabytes to be processed in zfs receive. * Speculatively increase the buffer size if supported by the platform. */ if (S_ISFIFO(sb.st_mode)) libzfs_set_pipe_max(infd); if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, NULL, props); if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { err = -1; goto out; } else { if (zhp->zfs_type == ZFS_TYPE_VOLUME) { zfs_close(zhp); goto out; } clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) { err = -1; goto out; } /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); if (err != 0) err = -1; } } out: if (top_zfs) free(top_zfs); return (err); } Index: vendor-sys/openzfs/dist/lib/libzfs/libzfs_util.c =================================================================== --- vendor-sys/openzfs/dist/lib/libzfs/libzfs_util.c (revision 364927) +++ vendor-sys/openzfs/dist/lib/libzfs/libzfs_util.c (revision 364928) @@ -1,2092 +1,2102 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation * * Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * Internal utility routines for the ZFS library. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfs_prop.h" #include "zfeature_common.h" #include #include /* * We only care about the scheme in order to match the scheme * with the handler. Each handler should validate the full URI * as necessary. */ #define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" int libzfs_errno(libzfs_handle_t *hdl) { return (hdl->libzfs_error); } const char * libzfs_error_action(libzfs_handle_t *hdl) { return (hdl->libzfs_action); } const char * libzfs_error_description(libzfs_handle_t *hdl) { if (hdl->libzfs_desc[0] != '\0') return (hdl->libzfs_desc); switch (hdl->libzfs_error) { case EZFS_NOMEM: return (dgettext(TEXT_DOMAIN, "out of memory")); case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); case EZFS_PROPNONINHERIT: return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); case EZFS_PROPSPACE: return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); case EZFS_BADTYPE: return (dgettext(TEXT_DOMAIN, "operation not applicable to " "datasets of this type")); case EZFS_BUSY: return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); case EZFS_EXISTS: return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); case EZFS_NOENT: return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: return (dgettext(TEXT_DOMAIN, "unable to restore to " "destination")); case EZFS_BADBACKUP: return (dgettext(TEXT_DOMAIN, "backup failed")); case EZFS_BADTARGET: return (dgettext(TEXT_DOMAIN, "invalid target vdev")); case EZFS_NODEVICE: return (dgettext(TEXT_DOMAIN, "no such device in pool")); case EZFS_BADDEV: return (dgettext(TEXT_DOMAIN, "invalid device")); case EZFS_NOREPLICAS: return (dgettext(TEXT_DOMAIN, "no valid replicas")); case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: return (dgettext(TEXT_DOMAIN, "unsupported version or " "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); case 
EZFS_BADPATH: return (dgettext(TEXT_DOMAIN, "must be an absolute path")); case EZFS_CROSSTARGET: return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " "pools")); case EZFS_ZONED: return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: return (dgettext(TEXT_DOMAIN, "umount failed")); case EZFS_UNSHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); case EZFS_SHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "share(1M) failed")); case EZFS_UNSHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "smb remove share failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "smb add share failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); case EZFS_FAULT: return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " "spare")); case EZFS_INVALCONFIG: return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); case EZFS_RECURSIVE: return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); case EZFS_POOL_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of pool")); case EZFS_POOL_INVALARG: return (dgettext(TEXT_DOMAIN, "invalid argument for " "this pool operation")); case EZFS_NAMETOOLONG: return (dgettext(TEXT_DOMAIN, "dataset name is too long")); case EZFS_OPENFAILED: return (dgettext(TEXT_DOMAIN, "open failed")); case EZFS_NOCAP: return (dgettext(TEXT_DOMAIN, "disk capacity information could not be retrieved")); case EZFS_LABELFAILED: return (dgettext(TEXT_DOMAIN, "write of label failed")); case EZFS_BADWHO: return (dgettext(TEXT_DOMAIN, "invalid user/group")); case EZFS_BADPERM: return (dgettext(TEXT_DOMAIN, "invalid permission")); case EZFS_BADPERMSET: return (dgettext(TEXT_DOMAIN, "invalid permission set name")); case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_VDEVNOTSUP: return (dgettext(TEXT_DOMAIN, "vdev specification is not " "supported")); case EZFS_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this dataset")); case EZFS_IOC_NOTSUPPORTED: return (dgettext(TEXT_DOMAIN, "operation not supported by " "zfs kernel module")); case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); case EZFS_REFTAG_RELE: return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); case EZFS_REFTAG_HOLD: return (dgettext(TEXT_DOMAIN, "tag already exists on this " "dataset")); case EZFS_TAGTOOLONG: return (dgettext(TEXT_DOMAIN, "tag too long")); case EZFS_PIPEFAILED: return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); 
case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " "use 'zpool scrub -s' to cancel current scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); case EZFS_NO_PENDING: return (dgettext(TEXT_DOMAIN, "operation is not " "in progress")); case EZFS_CHECKPOINT_EXISTS: return (dgettext(TEXT_DOMAIN, "checkpoint exists")); case EZFS_DISCARDING_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "currently discarding " "checkpoint")); case EZFS_NO_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); case EZFS_DEVRM_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "device removal in progress")); case EZFS_VDEV_TOO_BIG: return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); case EZFS_ACTIVE_POOL: return (dgettext(TEXT_DOMAIN, "pool is imported on a " "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); case EZFS_TOOMANY: return (dgettext(TEXT_DOMAIN, "argument list too long")); case EZFS_INITIALIZING: return (dgettext(TEXT_DOMAIN, "currently initializing")); case EZFS_NO_INITIALIZE: return (dgettext(TEXT_DOMAIN, "there is no active " "initialization")); case EZFS_WRONG_PARENT: return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); case EZFS_TRIMMING: return (dgettext(TEXT_DOMAIN, "currently trimming")); case EZFS_NO_TRIM: return (dgettext(TEXT_DOMAIN, "there is no active trim")); case EZFS_TRIM_NOTSUP: return (dgettext(TEXT_DOMAIN, "trim operations are not " "supported by this device")); case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); case EZFS_REBUILDING: return (dgettext(TEXT_DOMAIN, "currently sequentially " "resilvering")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: assert(hdl->libzfs_error == 0); return (dgettext(TEXT_DOMAIN, "no error")); } } /*PRINTFLIKE2*/ void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), fmt, ap); hdl->libzfs_desc_active = 1; va_end(ap); } static void zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), fmt, ap); hdl->libzfs_error = error; if (hdl->libzfs_desc_active) hdl->libzfs_desc_active = 0; else hdl->libzfs_desc[0] = '\0'; if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " "error: %s: %s\n"), hdl->libzfs_action, libzfs_error_description(hdl)); abort(); } (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, libzfs_error_description(hdl)); if (error == EZFS_NOMEM) exit(1); } } int zfs_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); zfs_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { switch (error) { case EPERM: case EACCES: zfs_verror(hdl, EZFS_PERM, fmt, ap); return (-1); case ECANCELED: zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); return (-1); case EIO: zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); case EFAULT: zfs_verror(hdl, EZFS_FAULT, fmt, ap); return (-1); case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); } return (0); } int zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENXIO: case ENODEV: case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset does not exist")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_WRONG_PARENT: zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; default: zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } va_end(ap); return (-1); } void zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, char *errbuf) { switch (err) { case ENOSPC: /* * For quotas and reservations, ENOSPC indicates * something different; setting a quota or reservation * doesn't use any disk space. 
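 *
 * For example (names are illustrative), "zfs set quota=1G pool/fs" on
 * a dataset that already uses more than 1G fails with "size is less
 * than current used or reserved space" rather than a generic
 * out-of-space error.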
*/ switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is less than current used or " "reserved space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is greater than available space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; default: (void) zfs_standard_error(hdl, err, errbuf); break; } break; case EBUSY: (void) zfs_standard_error(hdl, EBUSY, errbuf); break; case EROFS: (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); break; case E2BIG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value too long")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool and or dataset must be upgraded to set this " "property or value")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case ERANGE: if (prop == ZFS_PROP_COMPRESSION || prop == ZFS_PROP_DNODESIZE || prop == ZFS_PROP_RECORDSIZE) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "bootable datasets")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else if (prop == ZFS_PROP_CHECKSUM || prop == ZFS_PROP_DEDUP) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "root pools")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EINVAL: if (prop == ZPROP_INVAL) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case ZFS_ERR_BADPROP: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case EACCES: if (prop == ZFS_PROP_KEYLOCATION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation may only be set on encryption roots")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EOVERFLOW: /* * This platform can't address a volume this big. */ #ifdef _ILP32 if (prop == ZFS_PROP_VOLSIZE) { (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; } #endif /* FALLTHROUGH */ default: (void) zfs_standard_error(hdl, err, errbuf); } } int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENODEV: zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool or dataset")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; /* There is no pending operation to cancel */ case ENOTACTIVE: zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); break; case ENXIO: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is currently unavailable")); zfs_verror(hdl, EZFS_BADDEV, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); break; case EINVAL: zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "block size out of range or does not match")); zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_CHECKPOINT_EXISTS: zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); break; case ZFS_ERR_DISCARDING_CHECKPOINT: zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); break; case ZFS_ERR_NO_CHECKPOINT: zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); break; case ZFS_ERR_DEVRM_IN_PROGRESS: zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_RESILVER_IN_PROGRESS: zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); break; case ZFS_ERR_REBUILD_IN_PROGRESS: zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; default: zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } va_end(ap); return (-1); } /* * Display an out of memory error message and abort the current program. */ int no_memory(libzfs_handle_t *hdl) { return (zfs_error(hdl, EZFS_NOMEM, "internal error")); } /* * A safe form of malloc() which will die if the allocation fails. */ void * zfs_alloc(libzfs_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) no_memory(hdl); return (data); } /* * A safe form of asprintf() which will die if the allocation fails. */ /*PRINTFLIKE2*/ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) 
{ va_list ap; char *ret; int err; va_start(ap, fmt); err = vasprintf(&ret, fmt, ap); va_end(ap); if (err < 0) (void) no_memory(hdl); return (ret); } /* * A safe form of realloc(), which also zeroes newly allocated space. */ void * zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) { void *ret; if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); return (NULL); } bzero((char *)ret + oldsize, (newsize - oldsize)); return (ret); } /* * A safe form of strdup() which will die if the allocation fails. */ char * zfs_strdup(libzfs_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) no_memory(hdl); return (ret); } void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { hdl->libzfs_printerr = printerr; } /* * Read lines from an open file descriptor and store them in an array of * strings until EOF. lines[] will be allocated and populated with all the * lines read. All newlines are replaced with NULL terminators for * convenience. lines[] must be freed after use with libzfs_free_str_array(). * * Returns the number of lines read. */ static int libzfs_read_stdout_from_fd(int fd, char **lines[]) { FILE *fp; int lines_cnt = 0; size_t len = 0; char *line = NULL; char **tmp_lines = NULL, **tmp; char *nl = NULL; int rc; fp = fdopen(fd, "r"); if (fp == NULL) return (0); while (1) { rc = getline(&line, &len, fp); if (rc == -1) break; tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); if (tmp == NULL) { /* Return the lines we were able to process */ break; } tmp_lines = tmp; /* Terminate newlines */ if ((nl = strchr(line, '\n')) != NULL) *nl = '\0'; tmp_lines[lines_cnt] = line; lines_cnt++; line = NULL; } fclose(fp); *lines = tmp_lines; return (lines_cnt); } static int libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, char **lines[], int *lines_cnt) { pid_t pid; int error, devnull_fd; int link[2]; /* * Setup a pipe between our child and parent process if we're * reading stdout. */ if ((lines != NULL) && pipe(link) == -1) return (-EPIPE); pid = vfork(); if (pid == 0) { /* Child process */ devnull_fd = open("/dev/null", O_WRONLY); if (devnull_fd < 0) _exit(-1); if (!(flags & STDOUT_VERBOSE) && (lines == NULL)) (void) dup2(devnull_fd, STDOUT_FILENO); else if (lines != NULL) { /* Save the output to lines[] */ dup2(link[1], STDOUT_FILENO); close(link[0]); close(link[1]); } if (!(flags & STDERR_VERBOSE)) (void) dup2(devnull_fd, STDERR_FILENO); close(devnull_fd); if (flags & NO_DEFAULT_PATH) { if (env == NULL) execv(path, argv); else execve(path, argv, env); } else { if (env == NULL) execvp(path, argv); else execvpe(path, argv, env); } _exit(-1); } else if (pid > 0) { /* Parent process */ int status; while ((error = waitpid(pid, &status, 0)) == -1 && errno == EINTR) { } if (error < 0 || !WIFEXITED(status)) return (-1); if (lines != NULL) { close(link[1]); *lines_cnt = libzfs_read_stdout_from_fd(link[0], lines); } return (WEXITSTATUS(status)); } return (-1); } int libzfs_run_process(const char *path, char *argv[], int flags) { return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL)); } /* * Run a command and store its stdout lines in an array of strings (lines[]). * lines[] is allocated and populated for you, and the number of lines is set in * lines_cnt. lines[] must be freed after use with libzfs_free_str_array(). * All newlines (\n) in lines[] are terminated for convenience. 
*/ int libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt)); } /* * Same as libzfs_run_process_get_stdout(), but run without $PATH set. This * means that *path needs to be the full path to the executable. */ int libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH, lines, lines_cnt)); } /* * Free an array of strings. Free both the strings contained in the array and * the array itself. */ void libzfs_free_str_array(char **strs, int count) { while (--count >= 0) free(strs[count]); free(strs); } /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. * * Returns 0 otherwise. */ int libzfs_envvar_is_set(char *envvar) { char *env = getenv(envvar); if (env && (strtoul(env, NULL, 0) > 0 || (!strncasecmp(env, "YES", 3) && strnlen(env, 4) == 3) || (!strncasecmp(env, "ON", 2) && strnlen(env, 3) == 2))) return (1); return (0); } libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; int error; + char *env; error = libzfs_load_module(); if (error) { errno = error; return (NULL); } if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { return (NULL); } if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { free(hdl); return (NULL); } if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL)) < 0) { free(hdl); return (NULL); } #ifdef HAVE_SETMNTENT if ((hdl->libzfs_mnttab = setmntent(MNTTAB, "r")) == NULL) { #else if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { #endif (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); (void) fclose(hdl->libzfs_mnttab); free(hdl); return (NULL); } zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); libzfs_mnttab_init(hdl); fletcher_4_init(); if (getenv("ZFS_PROP_DEBUG") != NULL) { hdl->libzfs_prop_debug = B_TRUE; + } + if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) { + if ((error = zfs_nicestrtonum(hdl, env, + &hdl->libzfs_max_nvlist))) { + errno = error; + return (NULL); + } + } else { + hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4); } /* * For testing, remove some settable properties and features */ if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) { zprop_desc_t *proptbl; proptbl = zpool_prop_get_table(); proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE; proptbl = zfs_prop_get_table(); proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE; zfeature_info_t *ftbl = spa_feature_table; ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } return (hdl); } void libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); if (hdl->libzfs_mnttab) #ifdef HAVE_SETMNTENT (void) endmntent(hdl->libzfs_mnttab); #else (void) fclose(hdl->libzfs_mnttab); #endif zpool_free_handles(hdl); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); regfree(&hdl->libzfs_urire); fletcher_4_fini(); free(hdl); } libzfs_handle_t * zpool_get_handle(zpool_handle_t *zhp) { return (zhp->zpool_hdl); } libzfs_handle_t * zfs_get_handle(zfs_handle_t *zhp) { return (zhp->zfs_hdl); } zpool_handle_t * zfs_get_pool_handle(const zfs_handle_t *zhp) { return (zhp->zpool_hdl); } /* * Given a name, determine whether or not it's a valid path * (starts with '/' or "./"). If so, walk the mnttab trying * to match the device number. 
If not, treat the path as an * fs/vol/snap/bkmark name. */ zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* * It's not a valid path, assume it's a name of type 'argtype'. */ return (zfs_open(hdl, path, argtype)); } /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) return (NULL); if (getextmntent(path, &entry, &statbuf) != 0) return (NULL); if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); if (zc->zc_nvlist_dst == 0) return (-1); return (0); } /* * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was * filled in by the kernel to indicate the actual required size. */ int zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); if (zc->zc_nvlist_dst == 0) return (-1); return (0); } /* * Called to free the src and dst nvlists stored in the command structure. */ void zcmd_free_nvlists(zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_conf = 0; zc->zc_nvlist_src = 0; zc->zc_nvlist_dst = 0; } static int zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, nvlist_t *nvl) { char *packed; size_t len; verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0); if ((packed = zfs_alloc(hdl, len)) == NULL) return (-1); verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); *outnv = (uint64_t)(uintptr_t)packed; *outlen = len; return (0); } int zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, &zc->zc_nvlist_conf_size, nvl)); } int zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, &zc->zc_nvlist_src_size, nvl)); } /* * Unpacks an nvlist from the ZFS ioctl command structure. */ int zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) { if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, nvlp, 0) != 0) return (no_memory(hdl)); return (0); } /* * ================================================================ * API shared by zfs and zpool property management * ================================================================ */ static void zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) { zprop_list_t *pl = cbp->cb_proplist; int i; char *title; size_t len; cbp->cb_first = B_FALSE; if (cbp->cb_scripted) return; /* * Start with the length of the column headers. 
*/ cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); /* first property is always NAME */ assert(cbp->cb_proplist->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * 'PROPERTY' column */ if (pl->pl_prop != ZPROP_INVAL) { const char *propname = (type == ZFS_TYPE_POOL) ? zpool_prop_to_name(pl->pl_prop) : zfs_prop_to_name(pl->pl_prop); len = strlen(propname); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } else { len = strlen(pl->pl_user_prop); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } /* * 'VALUE' column. The first property is always the 'name' * property that was tacked on either by /sbin/zfs's * zfs_do_get() or when calling zprop_expand_list(), so we * ignore its width. If the user specified the name property * to display, then it will be later in the list in any case. */ if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; /* 'RECEIVED' column. */ if (pl != cbp->cb_proplist && pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; /* * 'NAME' and 'SOURCE' columns */ if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME : ZFS_PROP_NAME) && pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + strlen(dgettext(TEXT_DOMAIN, "inherited from")); } } /* * Now go through and print the headers. */ for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); break; case GET_COL_PROPERTY: title = dgettext(TEXT_DOMAIN, "PROPERTY"); break; case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; case GET_COL_RECVD: title = dgettext(TEXT_DOMAIN, "RECEIVED"); break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; default: title = NULL; } if (title != NULL) { if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], title); } } (void) printf("\n"); } /* * Display a single line of output, according to the settings in the callback * structure. */ void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, const char *source, const char *recvd_value) { int i; const char *str = NULL; char buf[128]; /* * Ignore those source types that the user has chosen to ignore. 
*/ if ((sourcetype & cbp->cb_sources) == 0) return; if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; break; case GET_COL_PROPERTY: str = propname; break; case GET_COL_VALUE: str = value; break; case GET_COL_SOURCE: switch (sourcetype) { case ZPROP_SRC_NONE: str = "-"; break; case ZPROP_SRC_DEFAULT: str = "default"; break; case ZPROP_SRC_LOCAL: str = "local"; break; case ZPROP_SRC_TEMPORARY: str = "temporary"; break; case ZPROP_SRC_INHERITED: (void) snprintf(buf, sizeof (buf), "inherited from %s", source); str = buf; break; case ZPROP_SRC_RECEIVED: str = "received"; break; default: str = NULL; assert(!"unhandled zprop_source_t"); } break; case GET_COL_RECVD: str = (recvd_value == NULL ? "-" : recvd_value); break; default: continue; } if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); } (void) printf("\n"); } /* * Given a numeric suffix, convert the value into a number of bits that the * resulting value must be shifted. */ static int str2shift(libzfs_handle_t *hdl, const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Allow 'G' = 'GB' = 'GiB', case-insensitively. * However, 'BB' and 'BiB' are disallowed. */ if (buf[1] == '\0' || (toupper(buf[0]) != 'B' && ((toupper(buf[1]) == 'B' && buf[2] == '\0') || (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && buf[3] == '\0')))) return (10 * i); if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Convert a string of the form '100G' into a real number. Used when setting * properties or creating a volume. 'buf' is used to place an extended error * message for the caller to use. */ int zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) { char *end; int shift; *num = 0; /* Check to see if this looks like a number. */ if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad numeric value '%s'"), value); return (-1); } /* Rely on strtoull() to process the numeric portion. */ errno = 0; *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit * in a 64-bit value. */ if (errno == ERANGE) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } /* * If we have a decimal value, then do the computation with floating * point arithmetic. Otherwise, use standard arithmetic. */ if (*end == '.') { double fval = strtod(value, &end); if ((shift = str2shift(hdl, end)) == -1) return (-1); fval *= pow(2, shift); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. 
*/ if (fval >= (double)UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num = (uint64_t)fval; } else { if ((shift = str2shift(hdl, end)) == -1) return (-1); /* Check for overflow */ if (shift >= 64 || (*num << shift) >> shift != *num) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num <<= shift; } return (0); } /* * Given a propname=value nvpair to set, parse any numeric properties * (index, boolean, etc) if they are specified as strings and add the * resulting nvpair to the returned nvlist. * * At the DSL layer, all properties are either 64-bit numbers or strings. * We want the user to be able to ignore this fact and specify properties * as native values (numbers, for example) or as strings (to simplify * command line utilities). This also handles converting index types * (compression, checksum, etc) from strings to their on-disk index. */ int zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp, const char *errbuf) { data_type_t datatype = nvpair_type(elem); zprop_type_t proptype; const char *propname; char *value; boolean_t isnone = B_FALSE; boolean_t isauto = B_FALSE; int err = 0; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); propname = zpool_prop_to_name(prop); } else { proptype = zfs_prop_get_type(prop); propname = zfs_prop_to_name(prop); } /* * Convert any properties to the internal DSL value types. */ *svalp = NULL; *ivalp = 0; switch (proptype) { case PROP_TYPE_STRING: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } err = nvpair_value_string(elem, svalp); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is invalid"), nvpair_name(elem)); goto error; } if (strlen(*svalp) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is too long"), nvpair_name(elem)); goto error; } break; case PROP_TYPE_NUMBER: if (datatype == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &value); if (strcmp(value, "none") == 0) { isnone = B_TRUE; } else if (strcmp(value, "auto") == 0) { isauto = B_TRUE; } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { goto error; } } else if (datatype == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, ivalp); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), nvpair_name(elem)); goto error; } /* * Quota special: force 'none' and don't allow 0. */ if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable quota/refquota")); goto error; } /* * Special handling for "*_limit=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_DATASET) && isnone && (prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT)) { *ivalp = UINT64_MAX; } /* * Special handling for setting 'refreservation' to 'auto'. Use * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). * 'auto' is only allowed on volumes. 
*/ if (isauto) { switch (prop) { case ZFS_PROP_REFRESERVATION: if ((type & ZFS_TYPE_VOLUME) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s=auto' only allowed on " "volumes"), nvpair_name(elem)); goto error; } *ivalp = UINT64_MAX; break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'auto' is invalid value for '%s'"), nvpair_name(elem)); goto error; } } break; case PROP_TYPE_INDEX: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, &value); if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname, zprop_values(prop, type)); goto error; } break; default: abort(); } /* * Add the result to our return set of properties. */ if (*svalp != NULL) { if (nvlist_add_string(ret, propname, *svalp) != 0) { (void) no_memory(hdl); return (-1); } } else { if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { (void) no_memory(hdl); return (-1); } } return (0); error: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); return (-1); } static int addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, zfs_type_t type) { int prop; zprop_list_t *entry; prop = zprop_name_to_prop(propname, type); if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE)) prop = ZPROP_INVAL; /* * When no property table entry can be found, return failure if * this is a pool property or if this isn't a user-defined * dataset property, */ if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) return (-1); entry->pl_prop = prop; if (prop == ZPROP_INVAL) { if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { free(entry); return (-1); } entry->pl_width = strlen(propname); } else { entry->pl_width = zprop_width(prop, &entry->pl_fixed, type); } *listp = entry; return (0); } /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded * by zprop_expand_list(). */ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { *listp = NULL; /* * If 'all' is specified, return a NULL list. */ if (strcmp(props, "all") == 0) return (0); /* * If no props were specified, return an error. */ if (props[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no properties specified")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } /* * It would be nice to use getsubopt() here, but the inclusion of column * aliases makes this more effort than it's worth. */ while (*props != '\0') { size_t len; char *p; char c; if ((p = strchr(props, ',')) == NULL) { len = strlen(props); p = props + len; } else { len = p - props; } /* * Check for empty options. */ if (len == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty property name")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } /* * Check all regular property names. 
*/ c = props[len]; props[len] = '\0'; if (strcmp(props, "space") == 0) { static char *spaceprops[] = { "name", "avail", "used", "usedbysnapshots", "usedbydataset", "usedbyrefreservation", "usedbychildren", NULL }; int i; for (i = 0; spaceprops[i]; i++) { if (addlist(hdl, spaceprops[i], listp, type)) return (-1); listp = &(*listp)->pl_next; } } else { if (addlist(hdl, props, listp, type)) return (-1); listp = &(*listp)->pl_next; } props = p; if (c == ',') props++; } return (0); } void zprop_free_list(zprop_list_t *pl) { zprop_list_t *next; while (pl != NULL) { next = pl->pl_next; free(pl->pl_user_prop); free(pl); pl = next; } } typedef struct expand_data { zprop_list_t **last; libzfs_handle_t *hdl; zfs_type_t type; } expand_data_t; static int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; expand_data_t *edp = cb; if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL) return (ZPROP_INVAL); entry->pl_prop = prop; entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); entry->pl_all = B_TRUE; *(edp->last) = entry; edp->last = &entry->pl_next; return (ZPROP_CONT); } int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) { zprop_list_t *entry; zprop_list_t **last; expand_data_t exp; if (*plp == NULL) { /* * If this is the very first time we've been called for an 'all' * specification, expand the list to include all native * properties. */ last = plp; exp.last = last; exp.hdl = hdl; exp.type = type; if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, B_FALSE, type) == ZPROP_INVAL) return (-1); /* * Add 'name' to the beginning of the list, which is handled * specially. */ if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) return (-1); entry->pl_prop = (type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME; entry->pl_width = zprop_width(entry->pl_prop, &entry->pl_fixed, type); entry->pl_all = B_TRUE; entry->pl_next = *plp; *plp = entry; } return (0); } int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { return (zprop_iter_common(func, cb, show_all, ordered, type)); } /* * Fill given version buffer with zfs userland version */ void zfs_version_userland(char *version, int len) { (void) strlcpy(version, ZFS_META_ALIAS, len); } /* * Prints both zfs userland and kernel versions * Returns 0 on success, and -1 on error (with errno set) */ int zfs_version_print(void) { char zver_userland[128]; char zver_kernel[128]; zfs_version_userland(zver_userland, sizeof (zver_userland)); (void) printf("%s\n", zver_userland); if (zfs_version_kernel(zver_kernel, sizeof (zver_kernel)) == -1) { fprintf(stderr, "zfs_version_kernel() failed: %s\n", strerror(errno)); return (-1); } (void) printf("zfs-kmod-%s\n", zver_kernel); return (0); } /* * Return 1 if the user requested ANSI color output, and our terminal supports * it. Return 0 for no color. */ static int use_color(void) { static int use_color = -1; char *term; /* * Optimization: * * For each zpool invocation, we do a single check to see if we should * be using color or not, and cache that value for the lifetime of the * the zpool command. That makes it cheap to call use_color() when * we're printing with color. We assume that the settings are not going * to change during the invocation of a zpool command (the user isn't * going to change the ZFS_COLOR value while zpool is running, for * example). */ if (use_color != -1) { /* * We've already figured out if we should be using color or * not. Return the cached value. 
*/ return (use_color); } term = getenv("TERM"); /* * The user sets the ZFS_COLOR env var set to enable zpool ANSI color * output. However if NO_COLOR is set (https://no-color.org/) then * don't use it. Also, don't use color if terminal doesn't support * it. */ if (libzfs_envvar_is_set("ZFS_COLOR") && !libzfs_envvar_is_set("NO_COLOR") && isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 && strcmp("unknown", term) != 0) { /* Color supported */ use_color = 1; } else { use_color = 0; } return (use_color); } /* * color_start() and color_end() are used for when you want to colorize a block * of text. For example: * * color_start(ANSI_RED_FG) * printf("hello"); * printf("world"); * color_end(); */ void color_start(char *color) { if (use_color()) printf("%s", color); } void color_end(void) { if (use_color()) printf(ANSI_RESET); } /* printf() with a color. If color is NULL, then do a normal printf. */ int printf_color(char *color, char *format, ...) { va_list aptr; int rc; if (color) color_start(color); va_start(aptr, format); rc = vprintf(format, aptr); va_end(aptr); if (color) color_end(); return (rc); } Index: vendor-sys/openzfs/dist/man/man1/arcstat.1 =================================================================== --- vendor-sys/openzfs/dist/man/man1/arcstat.1 (revision 364927) +++ vendor-sys/openzfs/dist/man/man1/arcstat.1 (revision 364928) @@ -1,502 +1,502 @@ .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version .\" 1.0 of the CDDL. .\" .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" .\" .\" Copyright 2014 Adam Stevko. All rights reserved. .\" Copyright (c) 2015 by Delphix. All rights reserved. .\" Copyright (c) 2020 by AJ Jordan. All rights reserved. .\" -.TH ARCSTAT 1 "May 7, 2020" +.TH ARCSTAT 1 "Aug 24, 2020" OpenZFS .SH NAME arcstat \- report ZFS ARC and L2ARC statistics .SH SYNOPSIS .LP .nf \fBarcstat\fR [\fB-hvx\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]] .fi .SH DESCRIPTION .LP The \fBarcstat\fR utility print various ZFS ARC and L2ARC statistics in vmstat-like fashion. 
.sp .sp .LP The \fBarcstat\fR command reports the following information: .sp .ne 2 .\" .sp .ne 1 .na \fBc \fR .ad .RS 14n ARC target size .RE .sp .ne 2 .na \fBdh% \fR .ad .RS 14n Demand data hit percentage .RE .sp .ne 2 .na \fBdm% \fR .ad .RS 14n Demand data miss percentage .RE .sp .ne 2 .na \fBmfu \fR .ad .RS 14n MFU list hits per second .RE .sp .ne 2 .na \fBmh% \fR .ad .RS 14n Metadata hit percentage .RE .sp .ne 2 .na \fBmm% \fR .ad .RS 14n Metadata miss percentage .RE .sp .ne 2 .na \fBmru \fR .ad .RS 14n MRU list hits per second .RE .sp .ne 2 .na \fBph% \fR .ad .RS 14n Prefetch hits percentage .RE .sp .ne 2 .na \fBpm% \fR .ad .RS 14n Prefetch miss percentage .RE .sp .ne 2 .na \fBdhit \fR .ad .RS 14n Demand data hits per second .RE .sp .ne 2 .na \fBdmis \fR .ad .RS 14n Demand data misses per second .RE .sp .ne 2 .na \fBhit% \fR .ad .RS 14n ARC hit percentage .RE .sp .ne 2 .na \fBhits \fR .ad .RS 14n ARC reads per second .RE .sp .ne 2 .na \fBmfug \fR .ad .RS 14n MFU ghost list hits per second .RE .sp .ne 2 .na \fBmhit \fR .ad .RS 14n Metadata hits per second .RE .sp .ne 2 .na \fBmiss \fR .ad .RS 14n ARC misses per second .RE .sp .ne 2 .na \fBmmis \fR .ad .RS 14n Metadata misses per second .RE .sp .ne 2 .na \fBmrug \fR .ad .RS 14n MRU ghost list hits per second .RE .sp .ne 2 .na \fBphit \fR .ad .RS 14n Prefetch hits per second .RE .sp .ne 2 .na \fBpmis \fR .ad .RS 14n Prefetch misses per second .RE .sp .ne 2 .na \fBread \fR .ad .RS 14n Total ARC accesses per second .RE .sp .ne 2 .na \fBtime \fR .ad .RS 14n Time .RE .sp .ne 2 .na \fBsize \fR .ad .RS 14n ARC size .RE .sp .ne 2 .na \fBarcsz \fR .ad .RS 14n Alias for \fBsize\fR .RE .sp .ne 2 .na \fBdread \fR .ad .RS 14n Demand data accesses per second .RE .sp .ne 2 .na \fBeskip \fR .ad .RS 14n evict_skip per second .RE .sp .ne 2 .na \fBmiss% \fR .ad .RS 14n ARC miss percentage .RE .sp .ne 2 .na \fBmread \fR .ad .RS 14n Metadata accesses per second .RE .sp .ne 2 .na \fBpread \fR .ad .RS 14n Prefetch accesses per second .RE .sp .ne 2 .na \fBl2hit% \fR .ad .RS 14n L2ARC access hit percentage .RE .sp .ne 2 .na \fBl2hits \fR .ad .RS 14n L2ARC hits per second .RE .sp .ne 2 .na \fBl2miss \fR .ad .RS 14n L2ARC misses per second .RE .sp .ne 2 .na \fBl2read \fR .ad .RS 14n Total L2ARC accesses per second .RE .sp .ne 2 .na \fBl2size \fR .ad .RS 14n Size of the L2ARC .RE .sp .ne 2 .na \fBmtxmis \fR .ad .RS 14n mutex_miss per second .RE .sp .ne 2 .na \fBl2bytes \fR .ad .RS 14n Bytes read per second from the L2ARC .RE .sp .ne 2 .na \fBl2miss% \fR .ad .RS 14n L2ARC access miss percentage .RE .sp .ne 2 .na \fBl2asize \fR .ad .RS 14n Actual (compressed) size of the L2ARC .RE .sp .ne 2 .na \fBgrow \fR .ad .RS 14n ARC grow disabled .RE .sp .ne 2 .na \fBneed \fR .ad .RS 14n ARC reclaim needed .RE .sp .ne 2 .na \fBfree \fR .ad .RS 14n The ARC's idea of how much free memory there is, which includes evictable memory in the page cache. Since the ARC tries to keep \fBavail\fR above zero, \fBavail\fR is usually more instructive to observe than \fBfree\fR. .RE .sp .ne 2 .na \fBavail \fR .ad .RS 14n The ARC's idea of how much free memory is available to it, which is a bit less than \fBfree\fR. May temporarily be negative, in which case the ARC will reduce the target size \fBc\fR. .RE .\" .SH OPTIONS .LP The following options are supported: .sp .ne 2 .na \fB\fB-f\fR\fR .ad .RS 12n Display only specific fields. See \fBDESCRIPTION\fR for supported statistics. .RE .sp .ne 2 .na \fB\fB-h\fR\fR .ad .RS 12n Display help message. 
.RE .sp .ne 2 .na \fB\fB-o\fR\fR .ad .RS 12n Report statistics to a file instead of the standard output. .RE .sp .ne 2 .na \fB\fB-s\fR\fR .ad .RS 12n Display data with a specified separator (default: 2 spaces). .RE .sp .ne 2 .na \fB\fB-x\fR\fR .ad .RS 12n Print extended stats (same as -f time,mfu,mru,mfug,mrug,eskip,mtxmis,dread,pread,read). .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .RS 12n Show field headers and definitions .RE .SH OPERANDS .LP The following operands are supported: .sp .ne 2 .na \fB\fIcount\fR\fR .ad .RS 12n Display only \fIcount\fR reports. .RE .sp .ne 2 .na \fB\fIinterval\fR\fR .ad .RS 12n Specify the sampling interval in seconds. .RE .SH AUTHORS .LP arcstat was originally written in Perl by Neelakanth Nadgir and supported only ZFS ARC statistics. Mike Harsch updated it to support L2ARC statistics. John Hixson ported it to Python for FreeNAS over some beer, after which many individuals from the OpenZFS community continued to maintain and improve it. Index: vendor-sys/openzfs/dist/man/man1/cstyle.1 =================================================================== --- vendor-sys/openzfs/dist/man/man1/cstyle.1 (revision 364927) +++ vendor-sys/openzfs/dist/man/man1/cstyle.1 (revision 364928) @@ -1,167 +1,167 @@ .\" Copyright 2009 Sun Microsystems, Inc. All rights reserved. .\" Use is subject to license terms. .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" -.TH cstyle 1 "28 March 2005" +.TH CSTYLE 1 "Aug 24, 2020" OpenZFS .SH NAME .I cstyle \- check for some common stylistic errors in C source files .SH SYNOPSIS \fBcstyle [-chpvCP] [-o constructs] [file...]\fP .LP .SH DESCRIPTION .IX "OS-Net build tools" "cstyle" "" "\fBcstyle\fP" .LP .I cstyle inspects C source files (*.c and *.h) for common stylistic errors. It attempts to check for the cstyle documented in \fIhttp://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf\fP. Note that there is much in that document that .I cannot be checked for; just because your code is \fBcstyle(1)\fP clean does not mean that you've followed Sun's C style. \fICaveat emptor\fP. .LP .SH OPTIONS .LP The following options are supported: .TP 4 .B \-c Check continuation line indentation inside of functions. Sun's C style states that all statements must be indented to an appropriate tab stop, and any continuation lines after them must be indented \fIexactly\fP four spaces from the start line. This option enables a series of checks designed to find continuation line problems within functions only. The checks have some limitations; see CONTINUATION CHECKING, below. .LP .TP 4 .B \-h Performs heuristic checks that are sometimes wrong. Not generally used. .LP .TP 4 .B \-p Performs some of the more picky checks. 
Includes ANSI #else and #endif rules, and tries to detect spaces after casts. Used as part of the putback checks. .LP .TP 4 .B \-v Verbose output; includes the text of the line of error, and, for \fB-c\fP, the first statement in the current continuation block. .LP .TP 4 .B \-C Ignore errors in header comments (i.e. block comments starting in the first column). Not generally used. .LP .TP 4 .B \-P Check for use of non-POSIX types. Historically, types like "u_int" and "u_long" were used, but they are now deprecated in favor of the POSIX types uint_t, ulong_t, etc. This detects any use of the deprecated types. Used as part of the putback checks. .LP .TP 4 .B \-o \fIconstructs\fP Allow a comma-separated list of additional constructs. Available constructs include: .LP .TP 10 .B doxygen Allow doxygen-style block comments (\fB/**\fP and \fB/*!\fP) .LP .TP 10 .B splint Allow splint-style lint comments (\fB/*@...@*/\fP) .LP .SH NOTES .LP The cstyle rule for the OS/Net consolidation is that all new files must be \fB-pP\fP clean. For existing files, the following invocations are run against both the old and new files: .LP .TP 4 \fBcstyle file\fB .LP .TP 4 \fBcstyle -p file\fB .LP .TP 4 \fBcstyle -pP file\fB .LP If the old file gave no errors for one of the invocations, the new file must also give no errors. This way, files can only become more clean. .LP .SH CONTINUATION CHECKING .LP The continuation checker is a reasonably simple state machine that knows something about how C is laid out, and can match parenthesis, etc. over multiple lines. It does have some limitations: .LP .TP 4 .B 1. Preprocessor macros which cause unmatched parenthesis will confuse the checker for that line. To fix this, you'll need to make sure that each branch of the #if statement has balanced parenthesis. .LP .TP 4 .B 2. Some \fBcpp\fP macros do not require ;s after them. Any such macros *must* be ALL_CAPS; any lower case letters will cause bad output. .LP The bad output will generally be corrected after the next \fB;\fP, \fB{\fP, or \fB}\fP. .LP Some continuation error messages deserve some additional explanation .LP .TP 4 .B multiple statements continued over multiple lines A multi-line statement which is not broken at statement boundaries. For example: .RS 4 .HP 4 if (this_is_a_long_variable == another_variable) a = .br b + c; .LP Will trigger this error. Instead, do: .HP 8 if (this_is_a_long_variable == another_variable) .br a = b + c; .RE .LP .TP 4 .B empty if/for/while body not on its own line For visibility, empty bodies for if, for, and while statements should be on their own line. For example: .RS 4 .HP 4 while (do_something(&x) == 0); .LP Will trigger this error. Instead, do: .HP 8 while (do_something(&x) == 0) .br ; .RE Index: vendor-sys/openzfs/dist/man/man1/raidz_test.1 =================================================================== --- vendor-sys/openzfs/dist/man/man1/raidz_test.1 (revision 364927) +++ vendor-sys/openzfs/dist/man/man1/raidz_test.1 (revision 364928) @@ -1,97 +1,97 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. 
.\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2016 Gvozden Nešković. All rights reserved. .\" -.TH raidz_test 1 "2016" "ZFS on Linux" "User Commands" +.TH RAIDZ_TEST 1 "Aug 24, 2020" OpenZFS .SH NAME \fBraidz_test\fR \- raidz implementation verification and benchmarking tool .SH SYNOPSIS .LP .BI "raidz_test " .SH DESCRIPTION .LP This manual page documents briefly the \fBraidz_test\fR command. .LP Purpose of this tool is to run all supported raidz implementation and verify results of all methods. Tool also contains a parameter sweep option where all parameters affecting RAIDZ block are verified (like ashift size, data offset, data size, etc...). The tool also supports a benchmarking mode using -B option. .SH OPTION .HP .BI "\-h" "" .IP Print a help summary. .HP .BI "\-a" " ashift (default: 9)" .IP Ashift value. .HP .BI "\-o" " zio_off_shift" " (default: 0)" .IP Zio offset for raidz block. Offset value is 1 << (zio_off_shift) .HP .BI "\-d" " raidz_data_disks" " (default: 8)" .IP Number of raidz data disks to use. Additional disks for parity will be used during testing. .HP .BI "\-s" " zio_size_shift" " (default: 19)" .IP Size of data for raidz block. Size is 1 << (zio_size_shift). .HP .BI "\-S(weep)" .IP Sweep parameter space while verifying the raidz implementations. This option will exhaust all most of valid values for -a -o -d -s options. Runtime using this option will be long. .HP .BI "\-t(imeout)" .IP Wall time for sweep test in seconds. The actual runtime could be longer. .HP .BI "\-B(enchmark)" .IP This options starts the benchmark mode. All implementations are benchmarked using increasing per disk data size. Results are given as throughput per disk, measured in MiB/s. .HP .BI "\-v(erbose)" .IP Increase verbosity. .HP .BI "\-T(est the test)" .IP Debugging option. When this option is specified tool is supposed to fail all tests. This is to check if tests would properly verify bit-exactness. .HP .BI "\-D(ebug)" .IP Debugging option. Specify to attach gdb when SIGSEGV or SIGABRT are received. .HP .SH "SEE ALSO" .BR "ztest (1)" .SH "AUTHORS" vdev_raidz, created for ZFS on Linux by Gvozden Nešković Index: vendor-sys/openzfs/dist/man/man1/zhack.1 =================================================================== --- vendor-sys/openzfs/dist/man/man1/zhack.1 (revision 364927) +++ vendor-sys/openzfs/dist/man/man1/zhack.1 (revision 364928) @@ -1,98 +1,98 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH zhack 1 "2013 MAR 16" "ZFS on Linux" "User Commands" +.TH ZHACK 1 "Aug 24, 2020" OpenZFS .SH NAME zhack \- libzpool debugging tool .SH DESCRIPTION This utility pokes configuration changes directly into a ZFS pool, which is dangerous and can cause data corruption. .SH SYNOPSIS .LP .BI "zhack [\-c " "cachefile" "] [\-d " "dir" "] <" "subcommand" "> [" "arguments" "]" .SH OPTIONS .HP .BI "\-c" " cachefile" .IP Read the \fIpool\fR configuration from the \fIcachefile\fR, which is /etc/zfs/zpool.cache by default. .HP .BI "\-d" " dir" .IP Search for \fIpool\fR members in the \fIdir\fR path. Can be specified more than once. .SH SUBCOMMANDS .LP .BI "feature stat " "pool" .IP List feature flags. .LP .BI "feature enable [\-d " "description" "] [\-r] " "pool guid" .IP Add a new feature to \fIpool\fR that is uniquely identified by \fIguid\fR, which is specified in the same form as a zfs(8) user property. .IP The \fIdescription\fR is a short human readable explanation of the new feature. .IP The \fB\-r\fR switch indicates that \fIpool\fR can be safely opened in read-only mode by a system that does not have the \fIguid\fR feature. .LP .BI "feature ref [\-d|\-m] " "pool guid" .IP Increment the reference count of the \fIguid\fR feature in \fIpool\fR. .IP The \fB\-d\fR switch decrements the reference count of the \fIguid\fR feature in \fIpool\fR. .IP The \fB\-m\fR switch indicates that the \fIguid\fR feature is now required to read the pool MOS. .SH EXAMPLES .LP .nf # zhack feature stat tank for_read_obj: org.illumos:lz4_compress = 0 for_write_obj: com.delphix:async_destroy = 0 com.delphix:empty_bpobj = 0 descriptions_obj: com.delphix:async_destroy = Destroy filesystems asynchronously. com.delphix:empty_bpobj = Snapshots use less space. org.illumos:lz4_compress = LZ4 compression algorithm support. .LP # zhack feature enable -d 'Predict future disk failures.' \\ tank com.example:clairvoyance .LP # zhack feature ref tank com.example:clairvoyance .SH AUTHORS This man page was written by Darik Horn . .SH SEE ALSO .BR zfs (8), .BR zpool-features (5), .BR ztest (1) Index: vendor-sys/openzfs/dist/man/man1/ztest.1 =================================================================== --- vendor-sys/openzfs/dist/man/man1/ztest.1 (revision 364927) +++ vendor-sys/openzfs/dist/man/man1/ztest.1 (revision 364928) @@ -1,179 +1,179 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Oracle and/or its affiliates. 
All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. .\" -.TH ztest 1 "2009 NOV 01" "ZFS on Linux" "User Commands" +.TH ZTEST 1 "Aug 24, 2020" OpenZFS .SH NAME \fBztest\fR \- was written by the ZFS Developers as a ZFS unit test. .SH SYNOPSIS .LP .BI "ztest " .SH DESCRIPTION .LP This manual page documents briefly the \fBztest\fR command. .LP \fBztest\fR was written by the ZFS Developers as a ZFS unit test. The tool was developed in tandem with the ZFS functionality and was executed nightly as one of the many regression test against the daily build. As features were added to ZFS, unit tests were also added to \fBztest\fR. In addition, a separate test development team wrote and executed more functional and stress tests. .LP By default \fBztest\fR runs for ten minutes and uses block files (stored in /tmp) to create pools rather than using physical disks. Block files afford \fBztest\fR its flexibility to play around with zpool components without requiring large hardware configurations. However, storing the block files in /tmp may not work for you if you have a small tmp directory. .LP By default is non-verbose. This is why entering the command above will result in \fBztest\fR quietly executing for 5 minutes. The -V option can be used to increase the verbosity of the tool. Adding multiple -V option is allowed and the more you add the more chatty \fBztest\fR becomes. .LP After the \fBztest\fR run completes, you should notice many ztest.* files lying around. Once the run completes you can safely remove these files. Note that you shouldn't remove these files during a run. You can re-use these files in your next \fBztest\fR run by using the -E option. .SH OPTIONS .HP .BI "\-?" "" .IP Print a help summary. .HP .BI "\-v" " vdevs" " (default: 5) .IP Number of vdevs. .HP .BI "\-s" " size_of_each_vdev" " (default: 64M)" .IP Size of each vdev. .HP .BI "\-a" " alignment_shift" " (default: 9) (use 0 for random)" .IP Used alignment in test. .HP .BI "\-m" " mirror_copies" " (default: 2)" .IP Number of mirror copies. .HP .BI "\-r" " raidz_disks" " (default: 4)" .IP Number of raidz disks. .HP .BI "\-R" " raidz_parity" " (default: 1)" .IP Raidz parity. .HP .BI "\-d" " datasets" " (default: 7)" .IP Number of datasets. .HP .BI "\-t" " threads" " (default: 23)" .IP Number of threads. .HP .BI "\-g" " gang_block_threshold" " (default: 32K)" .IP Gang block threshold. .HP .BI "\-i" " initialize_pool_i_times" " (default: 1)" .IP Number of pool initialisations. .HP .BI "\-k" " kill_percentage" " (default: 70%)" .IP Kill percentage. .HP .BI "\-p" " pool_name" " (default: ztest)" .IP Pool name. .HP .BI "\-V(erbose)" .IP Verbose (use multiple times for ever more blather). .HP .BI "\-E(xisting)" .IP Use existing pool (use existing pool instead of creating new one). .HP .BI "\-T" " time" " (default: 300 sec)" .IP Total test run time. .HP .BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs) .IP Injected failure rate. .HP .BI "\-G" .IP Dump zfs_dbgmsg buffer before exiting. .SH "EXAMPLES" .LP To override /tmp as your location for block files, you can use the -f option: .IP ztest -f / .LP To get an idea of what ztest is actually testing try this: .IP ztest -f / -VVV .LP Maybe you'd like to run ztest for longer? To do so simply use the -T option and specify the runlength in seconds like so: .IP ztest -f / -V -T 120 .SH "ENVIRONMENT VARIABLES" .TP .B "ZFS_HOSTID=id" Use \fBid\fR instead of the SPL hostid to identify this host. 
Intended for use with ztest, but this environment variable will affect any utility which uses libzpool, including \fBzpool(8)\fR. Since the kernel is unaware of this setting results with utilities other than ztest are undefined. .TP .B "ZFS_STACK_SIZE=stacksize" Limit the default stack size to \fBstacksize\fR bytes for the purpose of detecting and debugging kernel stack overflows. This value defaults to \fB32K\fR which is double the default \fB16K\fR Linux kernel stack size. In practice, setting the stack size slightly higher is needed because differences in stack usage between kernel and user space can lead to spurious stack overflows (especially when debugging is enabled). The specified value will be rounded up to a floor of PTHREAD_STACK_MIN which is the minimum stack required for a NULL procedure in user space. By default the stack size is limited to 256K. .SH "SEE ALSO" .BR "spl-module-parameters (5)" "," .BR "zpool (1)" "," .BR "zfs (1)" "," .BR "zdb (1)" "," .SH "AUTHOR" This manual page was transferred to asciidoc by Michael Gebetsroither from http://opensolaris.org/os/community/zfs/ztest/ Index: vendor-sys/openzfs/dist/man/man5/spl-module-parameters.5 =================================================================== --- vendor-sys/openzfs/dist/man/man5/spl-module-parameters.5 (revision 364927) +++ vendor-sys/openzfs/dist/man/man5/spl-module-parameters.5 (revision 364928) @@ -1,326 +1,326 @@ '\" te .\" .\" Copyright 2013 Turbo Fredriksson . All rights reserved. .\" -.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017" +.TH SPL-MODULE-PARAMETERS 5 "Aug 24, 2020" OpenZFS .SH NAME spl\-module\-parameters \- SPL module parameters .SH DESCRIPTION .sp .LP Description of the different parameters to the SPL module. .SS "Module parameters" .sp .LP .sp .ne 2 .na \fBspl_kmem_cache_expire\fR (uint) .ad .RS 12n Cache expiration is part of default Illumos cache behavior. The idea is that objects in magazines which have not been recently accessed should be returned to the slabs periodically. This is known as cache aging and when enabled objects will be typically returned after 15 seconds. .sp On the other hand Linux slabs are designed to never move objects back to the slabs unless there is memory pressure. This is possible because under Linux the cache will be notified when memory is low and objects can be released. .sp By default only the Linux method is enabled. It has been shown to improve responsiveness on low memory systems and not negatively impact the performance of systems with more memory. This policy may be changed by setting the \fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled concurrently. .sp 0x01 - Aging (Illumos), 0x02 - Low memory (Linux) .sp Default value: \fB0x02\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_kmem_threads\fR (uint) .ad .RS 12n The number of threads created for the spl_kmem_cache task queue. This task queue is responsible for allocating new slabs for use by the kmem caches. For the majority of systems and workloads only a small number of threads are required. .sp Default value: \fB4\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_reclaim\fR (uint) .ad .RS 12n When this is set it prevents Linux from being able to rapidly reclaim all the memory held by the kmem caches. This may be useful in circumstances where it's preferable that Linux reclaim memory from some other subsystem first. Setting this will increase the likelihood out of memory events on a memory constrained system. 
.sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_obj_per_slab\fR (uint) .ad .RS 12n The preferred number of objects per slab in the cache. In general, a larger value will increase the caches memory footprint while decreasing the time required to perform an allocation. Conversely, a smaller value will minimize the footprint and improve cache reclaim time but individual allocations may take longer. .sp Default value: \fB8\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_obj_per_slab_min\fR (uint) .ad .RS 12n The minimum number of objects allowed per slab. Normally slabs will contain \fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very large objects it's desirable to only have a few, or even just one, object per slab. .sp Default value: \fB1\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_max_size\fR (uint) .ad .RS 12n The maximum size of a kmem cache slab in MiB. This effectively limits the maximum cache object size to \fBspl_kmem_cache_max_size\fR / \fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with object sized larger than this limit. .sp Default value: \fB32 (64-bit) or 4 (32-bit)\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_slab_limit\fR (uint) .ad .RS 12n For small objects the Linux slab allocator should be used to make the most efficient use of the memory. However, large objects are not supported by the Linux slab and therefore the SPL implementation is preferred. This value is used to determine the cutoff between a small and large object. .sp Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated using the Linux slab allocator, large objects use the SPL allocator. A cutoff of 16K was determined to be optimal for architectures using 4K pages. .sp Default value: \fB16,384\fR .RE .sp .ne 2 .na \fBspl_kmem_alloc_warn\fR (uint) .ad .RS 12n As a general rule kmem_alloc() allocations should be small, preferably just a few pages since they must by physically contiguous. Therefore, a rate limited warning will be printed to the console for any kmem_alloc() which exceeds a reasonable threshold. .sp The default warning threshold is set to eight pages but capped at 32K to accommodate systems using large pages. This value was selected to be small enough to ensure the largest allocations are quickly noticed and fixed. But large enough to avoid logging any warnings when a allocation size is larger than optimal but not a serious concern. Since this value is tunable, developers are encouraged to set it lower when testing so any new largish allocations are quickly caught. These warnings may be disabled by setting the threshold to zero. .sp Default value: \fB32,768\fR .RE .sp .ne 2 .na \fBspl_kmem_alloc_max\fR (uint) .ad .RS 12n Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. Allocations which are marginally smaller than this limit may succeed but should still be avoided due to the expense of locating a contiguous range of free pages. Therefore, a maximum kmem size with reasonable safely margin of 4x is set. Kmem_alloc() allocations larger than this maximum will quickly fail. Vmem_alloc() allocations less than or equal to this value will use kmalloc(), but shift to vmalloc() when exceeding this value. .sp Default value: \fBKMALLOC_MAX_SIZE/4\fR .RE .sp .ne 2 .na \fBspl_kmem_cache_magazine_size\fR (uint) .ad .RS 12n Cache magazines are an optimization designed to minimize the cost of allocating memory. They do this by keeping a per-cpu cache of recently freed objects, which can then be reallocated without taking a lock. 
This can improve performance on highly contended caches. However, because objects in magazines will prevent otherwise empty slabs from being immediately released this may not be ideal for low memory machines. .sp For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a maximum magazine size. When this value is set to 0 the magazine size will be automatically determined based on the object size. Otherwise magazines will be limited to 2-256 objects per magazine (i.e per cpu). Magazines may never be entirely disabled in this implementation. .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_hostid\fR (ulong) .ad .RS 12n The system hostid, when set this can be used to uniquely identify a system. By default this value is set to zero which indicates the hostid is disabled. It can be explicitly enabled by placing a unique non-zero value in \fB/etc/hostid/\fR. .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_hostid_path\fR (charp) .ad .RS 12n The expected path to locate the system hostid when specified. This value may be overridden for non-standard configurations. .sp Default value: \fB/etc/hostid\fR .RE .sp .ne 2 .na \fBspl_panic_halt\fR (uint) .ad .RS 12n -Cause a kernel panic on assertion failures. When not enabled, the thread is +Cause a kernel panic on assertion failures. When not enabled, the thread is halted to facilitate further debugging. .sp Set to a non-zero value to enable. .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_taskq_kick\fR (uint) .ad .RS 12n Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will scan all the taskqs. If any of them have a pending task more than 5 seconds old, it will kick it to spawn more threads. This can be used if you find a rare deadlock occurs because one or more taskqs didn't spawn a thread when it should. .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_taskq_thread_bind\fR (int) .ad .RS 12n Bind taskq threads to specific CPUs. When enabled all taskq threads will be distributed evenly over the available CPUs. By default, this behavior is disabled to allow the Linux scheduler the maximum flexibility to determine where a thread should run. .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBspl_taskq_thread_dynamic\fR (int) .ad .RS 12n Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag will by default create only a single thread. New threads will be created on demand up to a maximum allowed number to facilitate the completion of outstanding tasks. Threads which are no longer needed will be promptly destroyed. By default this behavior is enabled but it can be disabled to aid performance analysis or troubleshooting. .sp Default value: \fB1\fR .RE .sp .ne 2 .na \fBspl_taskq_thread_priority\fR (int) .ad .RS 12n Allow newly created taskq threads to set a non-default scheduler priority. When enabled the priority specified when a taskq is created will be applied to all threads created by that taskq. When disabled all threads will use the default Linux kernel thread priority. By default, this behavior is enabled. .sp Default value: \fB1\fR .RE .sp .ne 2 .na \fBspl_taskq_thread_sequential\fR (int) .ad .RS 12n The number of items a taskq worker thread must handle without interruption before requesting a new worker thread be spawned. This is used to control how quickly taskqs ramp up the number of threads processing the queue. Because Linux thread creation and destruction are relatively inexpensive a small default value has been selected. 
This means that normally threads will be created aggressively which is desirable. Increasing this value will result in a slower thread creation rate which may be preferable for some configurations. .sp Default value: \fB4\fR .RE .sp .ne 2 .na \fBspl_max_show_tasks\fR (uint) .ad .RS 12n The maximum number of tasks per pending list in each taskq shown in /proc/spl/{taskq,taskq-all}. Write 0 to turn off the limit. The proc file will walk the lists with lock held, reading it could cause a lock up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. .sp Default value: \fB512\fR .RE Index: vendor-sys/openzfs/dist/man/man5/vdev_id.conf.5 =================================================================== --- vendor-sys/openzfs/dist/man/man5/vdev_id.conf.5 (revision 364927) +++ vendor-sys/openzfs/dist/man/man5/vdev_id.conf.5 (revision 364928) @@ -1,222 +1,222 @@ -.TH vdev_id.conf 5 +.TH VDEV_ID.CONF 5 "Aug 24, 2020" OpenZFS .SH NAME vdev_id.conf \- Configuration file for vdev_id .SH DESCRIPTION .I vdev_id.conf is the configuration file for .BR vdev_id (8). It controls the default behavior of .BR vdev_id (8) while it is mapping a disk device name to an alias. .PP The .I vdev_id.conf file uses a simple format consisting of a keyword followed by one or more values on a single line. Any line not beginning with a recognized keyword is ignored. Comments may optionally begin with a hash character. The following keywords and values are used. .TP \fIalias\fR Maps a device link in the /dev directory hierarchy to a new device name. The udev rule defining the device link must have run prior to .BR vdev_id (8). A defined alias takes precedence over a topology-derived name, but the two naming methods can otherwise coexist. For example, one might name drives in a JBOD with the sas_direct topology while naming an internal L2ARC device with an alias. \fIname\fR - the name of the link to the device that will by created in /dev/disk/by-vdev. \fIdevlink\fR - the name of the device link that has already been defined by udev. This may be an absolute path or the base filename. .TP \fIchannel\fR [pci_slot] Maps a physical path to a channel name (typically representing a single disk enclosure). .TP \fIenclosure_symlinks\fR Additionally create /dev/by-enclosure symlinks to the disk enclosure sg devices using the naming scheme from vdev_id.conf. \fIenclosure_symlinks\fR is only allowed for sas_direct mode. .TP \fIenclosure_symlinks_prefix\fR Specify the prefix for the enclosure symlinks in the form of: /dev/by-enclosure/- Defaults to "enc" if not specified. .TP \fIpci_slot\fR - specifies the PCI SLOT of the HBA hosting the disk enclosure being mapped, as found in the output of .BR lspci (8). This argument is not used in sas_switch mode. \fIport\fR - specifies the numeric identifier of the HBA or SAS switch port connected to the disk enclosure being mapped. \fIname\fR - specifies the name of the channel. .TP \fIslot\fR [channel] Maps a disk slot number as reported by the operating system to an alternative slot number. If the \fIchannel\fR parameter is specified then the mapping is only applied to slots in the named channel, otherwise the mapping is applied to all channels. The first-specified \fIslot\fR rule that can match a slot takes precedence. Therefore a channel-specific mapping for a given slot should generally appear before a generic mapping for the same slot. 
In this way a custom mapping may be applied to a particular channel and a default mapping applied to the others. .TP \fImultipath\fR Specifies whether .BR vdev_id (8) will handle only dm-multipath devices. If set to "yes" then .BR vdev_id (8) will examine the first running component disk of a dm-multipath device as listed by the .BR multipath (8) command to determine the physical path. .TP \fItopology\fR Identifies a physical topology that governs how physical paths are mapped to channels. \fIsas_direct\fR - in this mode a channel is uniquely identified by a PCI slot and a HBA port number \fIsas_switch\fR - in this mode a channel is uniquely identified by a SAS switch port number .TP \fIphys_per_port\fR Specifies the number of PHY devices associated with a SAS HBA port or SAS switch port. .BR vdev_id (8) internally uses this value to determine which HBA or switch port a device is connected to. The default is 4. .TP \fIslot\fR Specifies from which element of a SAS identifier the slot number is taken. The default is bay. \fIbay\fR - read the slot number from the bay identifier. \fIphy\fR - read the slot number from the phy identifier. \fIport\fR - use the SAS port as the slot number. \fIid\fR - use the scsi id as the slot number. \fIlun\fR - use the scsi lun as the slot number. \fIses\fR - use the SCSI Enclosure Services (SES) enclosure device slot number, as reported by .BR sg_ses (8). This is intended for use only on systems where \fIbay\fR is unsupported, noting that \fIport\fR and \fIid\fR may be unstable across disk replacement. .SH EXAMPLES A non-multipath configuration with direct-attached SAS enclosures and an arbitrary slot re-mapping. .P .nf multipath no topology sas_direct phys_per_port 4 slot bay # PCI_SLOT HBA PORT CHANNEL NAME channel 85:00.0 1 A channel 85:00.0 0 B channel 86:00.0 1 C channel 86:00.0 0 D # Custom mapping for Channel A # Linux Mapped # Slot Slot Channel slot 1 7 A slot 2 10 A slot 3 3 A slot 4 6 A # Default mapping for B, C, and D slot 1 4 slot 2 2 slot 3 1 slot 4 3 .fi .P A SAS-switch topology. Note that the .I channel keyword takes only two arguments in this example. .P .nf topology sas_switch # SWITCH PORT CHANNEL NAME channel 1 A channel 2 B channel 3 C channel 4 D .fi .P A multipath configuration. Note that channel names have multiple definitions - one per physical path. .P .nf multipath yes # PCI_SLOT HBA PORT CHANNEL NAME channel 85:00.0 1 A channel 85:00.0 0 B channel 86:00.0 1 A channel 86:00.0 0 B .fi .P A configuration with enclosure_symlinks enabled. .P .nf multipath yes enclosure_symlinks yes # PCI_ID HBA PORT CHANNEL NAME channel 05:00.0 1 U channel 05:00.0 0 L channel 06:00.0 1 U channel 06:00.0 0 L .fi In addition to the disks symlinks, this configuration will create: .P .nf /dev/by-enclosure/enc-L0 /dev/by-enclosure/enc-L1 /dev/by-enclosure/enc-U0 /dev/by-enclosure/enc-U1 .fi .P A configuration using device link aliases. .P .nf # by-vdev # name fully qualified or base name of device link alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca alias d2 wwn-0x5000c5002def789e .fi .P .SH FILES .TP .I /etc/zfs/vdev_id.conf The configuration file for .BR vdev_id (8). .SH SEE ALSO .BR vdev_id (8) Index: vendor-sys/openzfs/dist/man/man5/zfs-events.5 =================================================================== --- vendor-sys/openzfs/dist/man/man5/zfs-events.5 (revision 364927) +++ vendor-sys/openzfs/dist/man/man5/zfs-events.5 (revision 364928) @@ -1,965 +1,965 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. 
.\" Portions Copyright 2018 by Richard Elling .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] -.TH ZFS-EVENTS 5 "Oct 24, 2018" +.TH ZFS-EVENTS 5 "Aug 24, 2020" OpenZFS .SH NAME zfs\-events \- Events created by the ZFS filesystem. .SH DESCRIPTION .sp .LP Description of the different events generated by the ZFS stack. .sp Most of these don't have any description. The events generated by ZFS have never been publicly documented. What is here is intended as a starting point to provide documentation for all possible events. .sp To view all events created since the loading of the ZFS infrastructure (i.e, "the module"), run .P .nf \fBzpool events\fR .fi .P to get a short list, and .P .nf \fBzpool events -v\fR .fi .P to get a full detail of the events and what information is available about it. .sp This man page lists the different subclasses that are issued in the case of an event. The full event name would be \fIereport.fs.zfs.SUBCLASS\fR, but we only list the last part here. .SS "EVENTS (SUBCLASS)" .sp .LP .sp .ne 2 .na \fBchecksum\fR .ad .RS 12n Issued when a checksum error has been detected. .RE .sp .ne 2 .na \fBio\fR .ad .RS 12n Issued when there is an I/O error in a vdev in the pool. .RE .sp .ne 2 .na \fBdata\fR .ad .RS 12n Issued when there have been data errors in the pool. .RE .sp .ne 2 .na \fBdeadman\fR .ad .RS 12n Issued when an I/O is determined to be "hung", this can be caused by lost completion events due to flaky hardware or drivers. See the \fBzfs_deadman_failmode\fR module option description for additional information regarding "hung" I/O detection and configuration. .RE .sp .ne 2 .na \fBdelay\fR .ad .RS 12n Issued when a completed I/O exceeds the maximum allowed time specified by the \fBzio_slow_io_ms\fR module option. This can be an indicator of problems with the underlying storage device. The number of delay events is ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter. .RE .sp .ne 2 .na \fBconfig.sync\fR .ad .RS 12n Issued every time a vdev change have been done to the pool. .RE .sp .ne 2 .na \fBzpool\fR .ad .RS 12n Issued when a pool cannot be imported. .RE .sp .ne 2 .na \fBzpool.destroy\fR .ad .RS 12n Issued when a pool is destroyed. .RE .sp .ne 2 .na \fBzpool.export\fR .ad .RS 12n Issued when a pool is exported. .RE .sp .ne 2 .na \fBzpool.import\fR .ad .RS 12n Issued when a pool is imported. .RE .sp .ne 2 .na \fBzpool.reguid\fR .ad .RS 12n Issued when a REGUID (new unique identifier for the pool have been regenerated) have been detected. .RE .sp .ne 2 .na \fBvdev.unknown\fR .ad .RS 12n Issued when the vdev is unknown. Such as trying to clear device errors on a vdev that have failed/been kicked from the system/pool and is no longer available. 
.RE .sp .ne 2 .na \fBvdev.open_failed\fR .ad .RS 12n Issued when a vdev could not be opened (because it didn't exist for example). .RE .sp .ne 2 .na \fBvdev.corrupt_data\fR .ad .RS 12n Issued when corrupt data have been detected on a vdev. .RE .sp .ne 2 .na \fBvdev.no_replicas\fR .ad .RS 12n Issued when there are no more replicas to sustain the pool. This would lead to the pool being \fIDEGRADED\fR. .RE .sp .ne 2 .na \fBvdev.bad_guid_sum\fR .ad .RS 12n Issued when a missing device in the pool have been detected. .RE .sp .ne 2 .na \fBvdev.too_small\fR .ad .RS 12n Issued when the system (kernel) have removed a device, and ZFS notices that the device isn't there any more. This is usually followed by a \fBprobe_failure\fR event. .RE .sp .ne 2 .na \fBvdev.bad_label\fR .ad .RS 12n Issued when the label is OK but invalid. .RE .sp .ne 2 .na \fBvdev.bad_ashift\fR .ad .RS 12n Issued when the ashift alignment requirement has increased. .RE .sp .ne 2 .na \fBvdev.remove\fR .ad .RS 12n Issued when a vdev is detached from a mirror (or a spare detached from a vdev where it have been used to replace a failed drive - only works if the original drive have been readded). .RE .sp .ne 2 .na \fBvdev.clear\fR .ad .RS 12n Issued when clearing device errors in a pool. Such as running \fBzpool clear\fR on a device in the pool. .RE .sp .ne 2 .na \fBvdev.check\fR .ad .RS 12n Issued when a check to see if a given vdev could be opened is started. .RE .sp .ne 2 .na \fBvdev.spare\fR .ad .RS 12n Issued when a spare have kicked in to replace a failed device. .RE .sp .ne 2 .na \fBvdev.autoexpand\fR .ad .RS 12n Issued when a vdev can be automatically expanded. .RE .sp .ne 2 .na \fBio_failure\fR .ad .RS 12n Issued when there is an I/O failure in a vdev in the pool. .RE .sp .ne 2 .na \fBprobe_failure\fR .ad .RS 12n Issued when a probe fails on a vdev. This would occur if a vdev have been kicked from the system outside of ZFS (such as the kernel have removed the device). .RE .sp .ne 2 .na \fBlog_replay\fR .ad .RS 12n Issued when the intent log cannot be replayed. The can occur in the case of a missing or damaged log device. .RE .sp .ne 2 .na \fBresilver.start\fR .ad .RS 12n Issued when a resilver is started. .RE .sp .ne 2 .na \fBresilver.finish\fR .ad .RS 12n Issued when the running resilver have finished. .RE .sp .ne 2 .na \fBscrub.start\fR .ad .RS 12n Issued when a scrub is started on a pool. .RE .sp .ne 2 .na \fBscrub.finish\fR .ad .RS 12n Issued when a pool has finished scrubbing. .RE .sp .ne 2 .na \fBscrub.abort\fR .ad .RS 12n Issued when a scrub is aborted on a pool. .RE .sp .ne 2 .na \fBscrub.resume\fR .ad .RS 12n Issued when a scrub is resumed on a pool. .RE .sp .ne 2 .na \fBscrub.paused\fR .ad .RS 12n Issued when a scrub is paused on a pool. .RE .sp .ne 2 .na \fBbootfs.vdev.attach\fR .ad .RS 12n .RE .SS "PAYLOADS" .sp .LP This is the payload (data, information) that accompanies an event. .sp For .BR zed (8), these are set to uppercase and prefixed with \fBZEVENT_\fR. .sp .ne 2 .na \fBpool\fR .ad .RS 12n Pool name. .RE .sp .ne 2 .na \fBpool_failmode\fR .ad .RS 12n Failmode - \fBwait\fR, \fBcontinue\fR or \fBpanic\fR. See .BR zpool (8) (\fIfailmode\fR property) for more information. .RE .sp .ne 2 .na \fBpool_guid\fR .ad .RS 12n The GUID of the pool. .RE .sp .ne 2 .na \fBpool_context\fR .ad .RS 12n The load state for the pool (0=none, 1=open, 2=import, 3=tryimport, 4=recover 5=error). 
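As an illustrative sketch only (the zedlet name and log file are hypothetical; the variable names follow the \fBZEVENT_\fR convention described above), a minimal zedlet could record the pool and subclass of every event:
.P
.nf
#!/bin/sh
# /etc/zfs/zed.d/all-log-example.sh (hypothetical zedlet; must be executable)
# zed exports each payload member as an uppercase ZEVENT_* variable.
echo "$(date): ${ZEVENT_SUBCLASS} on pool ${ZEVENT_POOL:-<none>}" \
    >> /var/log/zfs-events.log
.fi
.P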
.RE .sp .ne 2 .na \fBvdev_guid\fR .ad .RS 12n The GUID of the vdev in question (the vdev failing or operated upon with \fBzpool clear\fR etc). .RE .sp .ne 2 .na \fBvdev_type\fR .ad .RS 12n Type of vdev - \fBdisk\fR, \fBfile\fR, \fBmirror\fR etc. See .BR zpool (8) under \fBVirtual Devices\fR for more information on possible values. .RE .sp .ne 2 .na \fBvdev_path\fR .ad .RS 12n Full path of the vdev, including any \fI-partX\fR. .RE .sp .ne 2 .na \fBvdev_devid\fR .ad .RS 12n ID of vdev (if any). .RE .sp .ne 2 .na \fBvdev_fru\fR .ad .RS 12n Physical FRU location. .RE .sp .ne 2 .na \fBvdev_state\fR .ad .RS 12n State of vdev (0=uninitialized, 1=closed, 2=offline, 3=removed, 4=failed to open, 5=faulted, 6=degraded, 7=healthy). .RE .sp .ne 2 .na \fBvdev_ashift\fR .ad .RS 12n The ashift value of the vdev. .RE .sp .ne 2 .na \fBvdev_complete_ts\fR .ad .RS 12n The time the last I/O completed for the specified vdev. .RE .sp .ne 2 .na \fBvdev_delta_ts\fR .ad .RS 12n The time since the last I/O completed for the specified vdev. .RE .sp .ne 2 .na \fBvdev_spare_paths\fR .ad .RS 12n List of spares, including full path and any \fI-partX\fR. .RE .sp .ne 2 .na \fBvdev_spare_guids\fR .ad .RS 12n GUID(s) of spares. .RE .sp .ne 2 .na \fBvdev_read_errors\fR .ad .RS 12n How many read errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBvdev_write_errors\fR .ad .RS 12n How many write errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBvdev_cksum_errors\fR .ad .RS 12n How many checksum errors that have been detected on the vdev. .RE .sp .ne 2 .na \fBparent_guid\fR .ad .RS 12n GUID of the vdev parent. .RE .sp .ne 2 .na \fBparent_type\fR .ad .RS 12n Type of parent. See \fBvdev_type\fR. .RE .sp .ne 2 .na \fBparent_path\fR .ad .RS 12n Path of the vdev parent (if any). .RE .sp .ne 2 .na \fBparent_devid\fR .ad .RS 12n ID of the vdev parent (if any). .RE .sp .ne 2 .na \fBzio_objset\fR .ad .RS 12n The object set number for a given I/O. .RE .sp .ne 2 .na \fBzio_object\fR .ad .RS 12n The object number for a given I/O. .RE .sp .ne 2 .na \fBzio_level\fR .ad .RS 12n The indirect level for the block. Level 0 is the lowest level and includes data blocks. Values > 0 indicate metadata blocks at the appropriate level. .RE .sp .ne 2 .na \fBzio_blkid\fR .ad .RS 12n The block ID for a given I/O. .RE .sp .ne 2 .na \fBzio_err\fR .ad .RS 12n The errno for a failure when handling a given I/O. The errno is compatible with \fBerrno\fR(3) with the value for EBADE (0x34) used to indicate ZFS checksum error. .RE .sp .ne 2 .na \fBzio_offset\fR .ad .RS 12n The offset in bytes of where to write the I/O for the specified vdev. .RE .sp .ne 2 .na \fBzio_size\fR .ad .RS 12n The size in bytes of the I/O. .RE .sp .ne 2 .na \fBzio_flags\fR .ad .RS 12n The current flags describing how the I/O should be handled. See the \fBI/O FLAGS\fR section for the full list of I/O flags. .RE .sp .ne 2 .na \fBzio_stage\fR .ad .RS 12n The current stage of the I/O in the pipeline. See the \fBI/O STAGES\fR section for a full list of all the I/O stages. .RE .sp .ne 2 .na \fBzio_pipeline\fR .ad .RS 12n The valid pipeline stages for the I/O. See the \fBI/O STAGES\fR section for a full list of all the I/O stages. .RE .sp .ne 2 .na \fBzio_delay\fR .ad .RS 12n The time elapsed (in nanoseconds) waiting for the block layer to complete the I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is therefore solely a measure of the block layer performance. 
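For example (a rough sketch using the payload names listed here), the block-layer service time can be compared with the end-to-end time straight from the verbose event output:
.P
.nf
\fBzpool events -v\fR | grep -E 'zio_(delay|delta)'
.fi
.P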
.RE .sp .ne 2 .na \fBzio_timestamp\fR .ad .RS 12n The time when a given I/O was submitted. .RE .sp .ne 2 .na \fBzio_delta\fR .ad .RS 12n The time required to service a given I/O. .RE .sp .ne 2 .na \fBprev_state\fR .ad .RS 12n The previous state of the vdev. .RE .sp .ne 2 .na \fBcksum_expected\fR .ad .RS 12n The expected checksum value for the block. .RE .sp .ne 2 .na \fBcksum_actual\fR .ad .RS 12n The actual checksum value for an errant block. .RE .sp .ne 2 .na \fBcksum_algorithm\fR .ad .RS 12n Checksum algorithm used. See \fBzfs\fR(8) for more information on checksum algorithms available. .RE .sp .ne 2 .na \fBcksum_byteswap\fR .ad .RS 12n Whether or not the data is byteswapped. .RE .sp .ne 2 .na \fBbad_ranges\fR .ad .RS 12n [start, end) pairs of corruption offsets. Offsets are always aligned on a 64-bit boundary, and can include some gaps of non-corruption. (See \fBbad_ranges_min_gap\fR) .RE .sp .ne 2 .na \fBbad_ranges_min_gap\fR .ad .RS 12n In order to bound the size of the \fBbad_ranges\fR array, gaps of non-corruption less than or equal to \fBbad_ranges_min_gap\fR bytes have been merged with adjacent corruption. Always at least 8 bytes, since corruption is detected on a 64-bit word basis. .RE .sp .ne 2 .na \fBbad_range_sets\fR .ad .RS 12n This array has one element per range in \fBbad_ranges\fR. Each element contains the count of bits in that range which were clear in the good data and set in the bad data. .RE .sp .ne 2 .na \fBbad_range_clears\fR .ad .RS 12n This array has one element per range in \fBbad_ranges\fR. Each element contains the count of bits for that range which were set in the good data and clear in the bad data. .RE .sp .ne 2 .na \fBbad_set_bits\fR .ad .RS 12n If this field exists, it is an array of: (bad data & ~(good data)); that is, the bits set in the bad data which are cleared in the good data. Each element corresponds a byte whose offset is in a range in \fBbad_ranges\fR, and the array is ordered by offset. Thus, the first element is the first byte in the first \fBbad_ranges\fR range, and the last element is the last byte in the last \fBbad_ranges\fR range. .RE .sp .ne 2 .na \fBbad_cleared_bits\fR .ad .RS 12n Like \fBbad_set_bits\fR, but contains: (good data & ~(bad data)); that is, the bits set in the good data which are cleared in the bad data. .RE .sp .ne 2 .na \fBbad_set_histogram\fR .ad .RS 12n If this field exists, it is an array of counters. Each entry counts bits set in a particular bit of a big-endian uint64 type. The first entry counts bits set in the high-order bit of the first byte, the 9th byte, etc, and the last entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. This information is useful for observing a stuck bit in a parallel data path, such as IDE or parallel SCSI. .RE .sp .ne 2 .na \fBbad_cleared_histogram\fR .ad .RS 12n If this field exists, it is an array of counters. Each entry counts bit clears in a particular bit of a big-endian uint64 type. The first entry counts bits clears of the high-order bit of the first byte, the 9th byte, etc, and the last entry counts clears of the low-order bit of the 8th byte, the 16th byte, etc. This information is useful for observing a stuck bit in a parallel data path, such as IDE or parallel SCSI. .RE .SS "I/O STAGES" .sp .LP The ZFS I/O pipeline is comprised of various stages which are defined below. The individual stages are used to construct these basic I/O operations: Read, Write, Free, Claim, and Ioctl. 
These stages may be set on an event to describe the life cycle of a given I/O. .TS tab(:); l l l . Stage:Bit Mask:Operations _:_:_ ZIO_STAGE_OPEN:0x00000001:RWFCI ZIO_STAGE_READ_BP_INIT:0x00000002:R---- ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- ZIO_STAGE_ENCRYPT:0x00000040:-W--- ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- ZIO_STAGE_NOP_WRITE:0x00000100:-W--- ZIO_STAGE_DDT_READ_START:0x00000200:R---- ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- ZIO_STAGE_DDT_WRITE:0x00000800:-W--- ZIO_STAGE_DDT_FREE:0x00001000:--F-- ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- ZIO_STAGE_DVA_FREE:0x00020000:--F-- ZIO_STAGE_DVA_CLAIM:0x00040000:---C- ZIO_STAGE_READY:0x00080000:RWFCI ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- ZIO_STAGE_DONE:0x01000000:RWFCI .TE .SS "I/O FLAGS" .sp .LP Every I/O in the pipeline contains a set of flags which describe its function and are used to govern its behavior. These flags will be set in an event as an \fBzio_flags\fR payload entry. .TS tab(:); l l . Flag:Bit Mask _:_ ZIO_FLAG_DONT_AGGREGATE:0x00000001 ZIO_FLAG_IO_REPAIR:0x00000002 ZIO_FLAG_SELF_HEAL:0x00000004 ZIO_FLAG_RESILVER:0x00000008 ZIO_FLAG_SCRUB:0x00000010 ZIO_FLAG_SCAN_THREAD:0x00000020 ZIO_FLAG_PHYSICAL:0x00000040 ZIO_FLAG_CANFAIL:0x00000080 ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 ZIO_FLAG_IO_ALLOCATING:0x00004000 ZIO_FLAG_IO_RETRY:0x00008000 ZIO_FLAG_PROBE:0x00010000 ZIO_FLAG_TRYHARD:0x00020000 ZIO_FLAG_OPTIONAL:0x00040000 ZIO_FLAG_DONT_QUEUE:0x00080000 ZIO_FLAG_DONT_PROPAGATE:0x00100000 ZIO_FLAG_IO_BYPASS:0x00200000 ZIO_FLAG_IO_REWRITE:0x00400000 ZIO_FLAG_RAW_COMPRESS:0x00800000 ZIO_FLAG_RAW_ENCRYPT:0x01000000 ZIO_FLAG_GANG_CHILD:0x02000000 ZIO_FLAG_DDT_CHILD:0x04000000 ZIO_FLAG_GODFATHER:0x08000000 ZIO_FLAG_NOPWRITE:0x10000000 ZIO_FLAG_REEXECUTED:0x20000000 ZIO_FLAG_DELEGATED:0x40000000 ZIO_FLAG_FASTWRITE:0x80000000 .TE Index: vendor-sys/openzfs/dist/man/man5/zfs-module-parameters.5 =================================================================== --- vendor-sys/openzfs/dist/man/man5/zfs-module-parameters.5 (revision 364927) +++ vendor-sys/openzfs/dist/man/man5/zfs-module-parameters.5 (revision 364928) @@ -1,4084 +1,4098 @@ '\" te .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] -.TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019" +.TH ZFS-MODULE-PARAMETERS 5 "Aug 24, 2020" OpenZFS .SH NAME zfs\-module\-parameters \- ZFS module parameters .SH DESCRIPTION .sp .LP Description of the different parameters to the ZFS module. .SS "Module parameters" .sp .LP .sp .ne 2 .na \fBdbuf_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the dbuf cache. The target size is determined by the MIN versus \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size. The behavior of the dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fBULONG_MAX\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_max_bytes\fR (ulong) .ad .RS 12n Maximum size in bytes of the metadata dbuf cache. The target size is determined by the MIN versus \fB1/2^dbuf_metadata_cache_shift\fR (1/64) of the target ARC size. The behavior of the metadata dbuf cache and its associated settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. .sp Default value: \fBULONG_MAX\fR. .RE .sp .ne 2 .na \fBdbuf_cache_hiwater_pct\fR (uint) .ad .RS 12n The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted directly. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_lowater_pct\fR (uint) .ad .RS 12n The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops evicting dbufs. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBdbuf_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction of the target ARC size. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBdbuf_metadata_cache_shift\fR (int) .ad .RS 12n Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR, to a log2 fraction of the target ARC size. .sp Default value: \fB6\fR. .RE .sp .ne 2 .na \fBdmu_object_alloc_chunk_shift\fR (int) .ad .RS 12n dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. .sp Default value: \fB7\fR (128). .RE .sp .ne 2 .na \fBdmu_prefetch_max\fR (int) .ad .RS 12n Limit the amount we can prefetch with one call to this amount (in bytes). This helps to limit the amount of memory that can be used by prefetching. .sp Default value: \fB134,217,728\fR (128MB). .RE .sp .ne 2 .na \fBignore_hole_birth\fR (int) .ad .RS 12n This is an alias for \fBsend_holes_without_birth_time\fR. .RE .sp .ne 2 .na \fBl2arc_feed_again\fR (int) .ad .RS 12n Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBl2arc_feed_min_ms\fR (ulong) .ad .RS 12n Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only applicable in related situations. .sp Default value: \fB200\fR. .RE .sp .ne 2 .na \fBl2arc_feed_secs\fR (ulong) .ad .RS 12n Seconds between L2ARC writing .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBl2arc_headroom\fR (ulong) .ad .RS 12n How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of \fBl2arc_write_max\fR. ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to \fB0\fR allowing the full length of ARC lists to be searched for cacheable content. 
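On Linux these module parameters are exposed under /sys/module/zfs/parameters and may also be set as module options; on FreeBSD most of them surface as \fBvfs.zfs\fR sysctls. A minimal sketch, using \fBl2arc_headroom\fR as the example (values shown are illustrative, not recommendations):
.P
.nf
# runtime: scan the full ARC lists for L2ARC-eligible content
echo 0 > /sys/module/zfs/parameters/l2arc_headroom

# persistent across module reloads
echo "options zfs l2arc_headroom=0" >> /etc/modprobe.d/zfs.conf
.fi
.P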
.sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBl2arc_headroom_boost\fR (ulong) .ad .RS 12n Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being successfully compressed before writing. A value of \fB100\fR disables this feature. .sp Default value: \fB200\fR%. .RE .sp .ne 2 .na +\fBl2arc_meta_percent\fR (int) +.ad +.RS 12n +Percent of ARC size allowed for L2ARC-only headers. +Since L2ARC buffers are not evicted on memory pressure, too large amount of +headers on system with irrationaly large L2ARC can render it slow or unusable. +This parameter limits L2ARC writes and rebuild to achieve it. +.sp +Default value: \fB33\fR%. +.RE + +.sp +.ne 2 +.na \fBl2arc_trim_ahead\fR (ulong) .ad .RS 12n Trims ahead of the current write size (\fBl2arc_write_max\fR) on L2ARC devices by this percentage of write size if we have filled the device. If set to \fB100\fR we TRIM twice the space required to accommodate upcoming writes. A minimum of 64MB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is invalid upon importing a pool or onlining a cache device. A value of \fB0\fR disables TRIM on L2ARC altogether and is the default as it can put significant stress on the underlying storage devices. This will vary depending of how well the specific device handles these commands. .sp Default value: \fB0\fR%. .RE .sp .ne 2 .na \fBl2arc_noprefetch\fR (int) .ad .RS 12n Do not write buffers to L2ARC if they were prefetched but not used by applications. .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBl2arc_norw\fR (int) .ad .RS 12n No reads during writes. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBl2arc_write_boost\fR (ulong) .ad .RS 12n Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount while they remain cold. .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBl2arc_write_max\fR (ulong) .ad .RS 12n Max write bytes per interval. .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBl2arc_rebuild_enabled\fR (int) .ad .RS 12n Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be disabled if there are problems importing a pool or attaching an L2ARC device (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBl2arc_rebuild_blocks_min_l2size\fR (ulong) .ad .RS 12n Min size (in bytes) of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the L2ARC (persistent L2ARC). Rationale: for L2ARC devices less than 1GB, the amount of data l2arc_evict() evicts is significant compared to the amount of restored L2ARC data. In this case do not write log blocks in L2ARC in order not to waste space. .sp Default value: \fB1,073,741,824\fR (1GB). .RE .sp .ne 2 .na \fBmetaslab_aliquot\fR (ulong) .ad .RS 12n Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to a top-level vdev before moving on to the next one. .sp Default value: \fB524,288\fR. .RE .sp .ne 2 .na \fBmetaslab_bias_enabled\fR (int) .ad .RS 12n Enable metaslab group biasing based on its vdev's over- or under-utilization relative to the pool. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. 
.RE .sp .ne 2 .na \fBmetaslab_force_ganging\fR (ulong) .ad .RS 12n Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. .sp Default value: \fB16,777,217\fR. .RE .sp .ne 2 .na \fBzfs_keep_log_spacemaps_at_export\fR (int) .ad .RS 12n Prevent log spacemaps from being destroyed during pool exports and destroys. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_metaslab_segment_weight_enabled\fR (int) .ad .RS 12n Enable/disable segment-based metaslab selection. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_metaslab_switch_threshold\fR (int) .ad .RS 12n When using segment-based metaslab selection, continue allocating from the active metaslab until \fBzfs_metaslab_switch_threshold\fR worth of buckets have been exhausted. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBmetaslab_debug_load\fR (int) .ad .RS 12n Load all metaslabs during pool import. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_debug_unload\fR (int) .ad .RS 12n Prevent metaslabs from being unloaded. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBmetaslab_fragmentation_factor_enabled\fR (int) .ad .RS 12n Enable use of the fragmentation metric in computing metaslab weights. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_df_max_search\fR (int) .ad .RS 12n Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see >100,000 iterations and metaslab_block_picker() becomes the performance limiting factor on high-performance storage. With the default setting of 16MB, we typically see less than 500 iterations, even with very fragmented, ashift=9 pools. The maximum number of iterations possible is: \fBmetaslab_df_max_search / (2 * (1< physical sector size on new top-level vdevs. .sp Default value: \fBASHIFT_MAX\fR (16). .RE .sp .ne 2 .na \fBzfs_vdev_min_auto_ashift\fR (ulong) .ad .RS 12n Minimum ashift used when creating new top-level vdevs. .sp Default value: \fBASHIFT_MIN\fR (9). .RE .sp .ne 2 .na \fBzfs_vdev_min_ms_count\fR (int) .ad .RS 12n Minimum number of metaslabs to create in a top-level vdev. .sp Default value: \fB16\fR. .RE .sp .ne 2 .na \fBvdev_validate_skip\fR (int) .ad .RS 12n Skip label validation steps during pool import. Changing is not recommended unless you know what you are doing and are recovering a damaged label. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_ms_count_limit\fR (int) .ad .RS 12n Practical upper limit of total metaslabs per top-level vdev. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBmetaslab_preload_enabled\fR (int) .ad .RS 12n Enable metaslab group preloading. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_lba_weighting_enabled\fR (int) .ad .RS 12n Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth as is typically the case on a modern constant angular velocity disk drive. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBmetaslab_unload_delay\fR (int) .ad .RS 12n After a metaslab is used, we keep it loaded for this many txgs, to attempt to reduce unnecessary reloading. Note that both this many txgs and \fBmetaslab_unload_delay_ms\fR milliseconds must pass before unloading will occur. .sp Default value: \fB32\fR. 
.RE .sp .ne 2 .na \fBmetaslab_unload_delay_ms\fR (int) .ad .RS 12n After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note that both this many milliseconds and \fBmetaslab_unload_delay\fR txgs must pass before unloading will occur. .sp Default value: \fB600000\fR (ten minutes). .RE .sp .ne 2 .na \fBsend_holes_without_birth_time\fR (int) .ad .RS 12n When set, the hole_birth optimization will not be used, and all holes will always be sent on zfs send. This is useful if you suspect your datasets are affected by a bug in hole_birth. .sp Use \fB1\fR for on (default) and \fB0\fR for off. .RE .sp .ne 2 .na \fBspa_config_path\fR (charp) .ad .RS 12n SPA config file .sp Default value: \fB/etc/zfs/zpool.cache\fR. .RE .sp .ne 2 .na \fBspa_asize_inflation\fR (int) .ad .RS 12n Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration. Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. .sp Default value: \fB24\fR. .RE .sp .ne 2 .na \fBspa_load_print_vdev_tree\fR (int) .ad .RS 12n Whether to print the vdev tree in the debugging message buffer during pool import. Use 0 to disable and 1 to enable. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBspa_load_verify_data\fR (int) .ad .RS 12n Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR) import. Use 0 to disable and 1 to enable. An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is set to 0, the traversal skips non-metadata blocks. It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBspa_load_verify_metadata\fR (int) .ad .RS 12n Whether to traverse blocks during an "extreme rewind" (\fB-X\fR) pool import. Use 0 to disable and 1 to enable. An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is set to 0, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBspa_load_verify_shift\fR (int) .ad .RS 12n Sets the maximum number of bytes to consume during pool import to the log2 fraction of the target ARC size. .sp Default value: \fB4\fR. .RE .sp .ne 2 .na \fBspa_slop_shift\fR (int) .ad .RS 12n Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in the pool to be consumed. This ensures that we don't run the pool completely out of space, due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return ENOSPC. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBvdev_removal_max_span\fR (int) .ad .RS 12n During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space (in bytes) which will be included as "unnecessary" data in a chunk of copied data. 
The default value here was chosen to align with \fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing regular reads (but there's no reason it has to be the same). .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzap_iterate_prefetch\fR (int) .ad .RS 12n If this is set, when we start iterating over a ZAP object, zfs will prefetch the entire object (all leaf blocks). However, this is limited by \fBdmu_prefetch_max\fR. .sp Use \fB1\fR for on (default) and \fB0\fR for off. .RE .sp .ne 2 .na \fBzfetch_array_rd_sz\fR (ulong) .ad .RS 12n If prefetching is enabled, disable prefetching for reads larger than this size. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfetch_max_distance\fR (uint) .ad .RS 12n Max bytes to prefetch per stream (default 8MB). .sp Default value: \fB8,388,608\fR. .RE .sp .ne 2 .na \fBzfetch_max_streams\fR (uint) .ad .RS 12n Max number of streams per zfetch (prefetch streams per file). .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfetch_min_sec_reap\fR (uint) .ad .RS 12n Min time before an active prefetch stream can be reclaimed .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_abd_scatter_enabled\fR (int) .ad .RS 12n Enables ARC from using scatter/gather lists and forces all allocations to be linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_abd_scatter_max_order\fR (iunt) .ad .RS 12n Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. Default value is specified by the kernel itself. .sp Default value: \fB10\fR at the time of this writing. .RE .sp .ne 2 .na \fBzfs_abd_scatter_min_size\fR (uint) .ad .RS 12n This is the minimum allocation size that will use scatter (page-based) ABD's. Smaller allocations will use linear ABD's. .sp Default value: \fB1536\fR (512B and 1KB allocations will be linear). .RE .sp .ne 2 .na \fBzfs_arc_dnode_limit\fR (ulong) .ad .RS 12n When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. This value acts as a ceiling to the amount of dnode metadata, and defaults to 0 which indicates that a percent which is based on \fBzfs_arc_dnode_limit_percent\fR of the ARC meta buffers that may be used for dnodes. See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather than in response to overall demand for non-metadata. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_dnode_limit_percent\fR (ulong) .ad .RS 12n Percentage that can be consumed by dnodes of ARC meta buffers. .sp See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a higher priority if set to nonzero value. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBzfs_arc_dnode_reduce_percent\fR (ulong) .ad .RS 12n Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR. .sp Default value: \fB10\fR% of the number of dnodes in the ARC. .RE .sp .ne 2 .na \fBzfs_arc_average_blocksize\fR (int) .ad .RS 12n The ARC's buffer hash table is sized based on the assumption of an average block size of \fBzfs_arc_average_blocksize\fR (default 8K). This works out to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers. 
For configurations with a known larger average block size this value can be increased to reduce the memory footprint. .sp Default value: \fB8192\fR. .RE .sp .ne 2 .na \fBzfs_arc_eviction_pct\fR (int) .ad .RS 12n When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this percent of the requested amount of data to be evicted. For example, by default for every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. Since this is above 100%, it ensures that progress is made towards getting \fBarc_size\fR under \fBarc_c\fR. Since this is finite, it ensures that allocations can still happen, even during the potentially long time that \fBarc_size\fR is more than \fBarc_c\fR. .sp Default value: \fB200\fR. .RE .sp .ne 2 .na \fBzfs_arc_evict_batch_limit\fR (int) .ad .RS 12n Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_arc_grow_retry\fR (int) .ad .RS 12n If set to a non zero value, it will replace the arc_grow_retry value with this value. The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_lotsfree_percent\fR (int) .ad .RS 12n Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to 0 will disable the throttle. .sp Default value: \fB10\fR%. .RE .sp .ne 2 .na \fBzfs_arc_max\fR (ulong) .ad .RS 12n Max size of ARC in bytes. If set to 0 then the max size of ARC is determined by the amount of system memory installed. For Linux, 1/2 of system memory will be used as the limit. For FreeBSD, the larger of all system memory - 1GB or 5/8 of system memory will be used as the limit. This value must be at least 67108864 (64 megabytes). .sp This value can be changed dynamically with some caveats. It cannot be set back to 0 while running and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_adjust_restarts\fR (ulong) .ad .RS 12n The number of restart passes to make while scanning the ARC attempting the free buffers in order to stay below the \fBzfs_arc_meta_limit\fR. This value should not need to be tuned but is available to facilitate performance analysis. .sp Default value: \fB4096\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_limit\fR (ulong) .ad .RS 12n The maximum allowed size in bytes that meta data buffers are allowed to consume in the ARC. When this limit is reached meta data buffers will be reclaimed even if the overall arc_c_max has not been reached. This value defaults to 0 which indicates that a percent which is based on \fBzfs_arc_meta_limit_percent\fR of the ARC may be used for meta data. .sp This value my be changed dynamically except that it cannot be set back to 0 for a specific percent of the ARC; it must be set to an explicit value. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_limit_percent\fR (ulong) .ad .RS 12n Percentage of ARC buffers that can be used for meta data. See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a higher priority if set to nonzero value. .sp Default value: \fB75\fR%. 
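Several of the ARC limits above, including \fBzfs_arc_max\fR, can be adjusted on a live system; as a sketch (the 4 GiB figure is purely an example, and the caveats noted for \fBzfs_arc_max\fR still apply):
.P
.nf
# cap the ARC at 4 GiB on a running system
echo 4294967296 > /sys/module/zfs/parameters/zfs_arc_max

# confirm the new target maximum (c_max) from the ARC kstats
awk '$1 == "c_max" {print $3}' /proc/spl/kstat/zfs/arcstats
.fi
.P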
.RE .sp .ne 2 .na \fBzfs_arc_meta_min\fR (ulong) .ad .RS 12n The minimum allowed size in bytes that meta data buffers may consume in the ARC. This value defaults to 0 which disables a floor on the amount of the ARC devoted meta data. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_prune\fR (int) .ad .RS 12n The number of dentries and inodes to be scanned looking for entries which can be dropped. This may be required when the ARC reaches the \fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers in the ARC. Increasing this value will cause to dentry and inode caches to be pruned more aggressively. Setting this value to 0 will disable pruning the inode and dentry caches. .sp Default value: \fB10,000\fR. .RE .sp .ne 2 .na \fBzfs_arc_meta_strategy\fR (int) .ad .RS 12n Define the strategy for ARC meta data buffer eviction (meta reclaim strategy). A value of 0 (META_ONLY) will evict only the ARC meta data buffers. A value of 1 (BALANCED) indicates that additional data buffers may be evicted if that is required to in order to evict the required number of meta data buffers. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_arc_min\fR (ulong) .ad .RS 12n Min size of ARC in bytes. If set to 0 then arc_c_min will default to consuming the larger of 32M or 1/32 of total system memory. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_min_prefetch_ms\fR (int) .ad .RS 12n Minimum time prefetched blocks are locked in the ARC, specified in ms. A value of \fB0\fR will default to 1000 ms. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_min_prescient_prefetch_ms\fR (int) .ad .RS 12n Minimum time "prescient prefetched" blocks are locked in the ARC, specified in ms. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. A value of \fB0\fR will default to 6000 ms. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_max_missing_tvds\fR (int) .ad .RS 12n Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). .sp Default value: \fB0\fR .RE .sp .ne 2 .na \fBzfs_max_nvlist_src_size\fR (ulong) .ad .RS 12n Maximum size in bytes allowed to be passed as zc_nvlist_src_size for ioctls on /dev/zfs. This prevents a user from causing the kernel to allocate an excessive amount of memory. When the limit is exceeded, the ioctl fails with EINVAL and a description of the error is sent to the zfs-dbgmsg log. This parameter should not need to be touched under normal circumstances. On FreeBSD, the default is based on the system limit on user wired memory. On Linux, the default is \fBKMALLOC_MAX_SIZE\fR . .sp Default value: \fB0\fR (kernel decides) .RE .sp .ne 2 .na \fBzfs_multilist_num_sublists\fR (int) .ad .RS 12n To allow more fine-grained locking, each ARC state contains a series of lists for both data and meta data objects. Locking is performed at the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .sp Default value: \fB4\fR or the number of online CPUs, whichever is greater .RE .sp .ne 2 .na \fBzfs_arc_overflow_shift\fR (int) .ad .RS 12n The ARC size is considered to be overflowing if it exceeds the current ARC target size (arc_c) by a threshold determined by this parameter. The threshold is calculated as a fraction of arc_c using the formula "arc_c >> \fBzfs_arc_overflow_shift\fR". 
The default value of 8 causes the ARC to be considered to be overflowing if it exceeds the target size by 1/256th (0.3%) of the target size. When the ARC is overflowing, new buffer allocations are stalled until the reclaim thread catches up and the overflow condition no longer exists. .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_min_shift\fR (int) .ad .RS 12n If set to a non zero value, this will update arc_p_min_shift (default 4) with the new value. arc_p_min_shift is used to shift of arc_c for calculating both min and max max arc_p .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_p_dampener_disable\fR (int) .ad .RS 12n Disable arc_p adapt dampener .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_arc_shrink_shift\fR (int) .ad .RS 12n If set to a non zero value, this will update arc_shrink_shift (default 7) with the new value. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_arc_pc_percent\fR (uint) .ad .RS 12n Percent of pagecache to reclaim arc to This tunable allows ZFS arc to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows arc to be reclaimed down to zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This only operates during memory pressure/reclaim. .sp Default value: \fB0\fR% (disabled). .RE .sp .ne 2 .na \fBzfs_arc_shrinker_limit\fR (int) .ad .RS 12n This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about 4x this for one allocation attempt. .sp The default limit of 10,000 (in practice, 160MB per allocation attempt with 4K pages) limits the amount of time spent attempting to reclaim ARC memory to less than 100ms per allocation attempt, even with a small average compressed block size of ~8KB. .sp The parameter can be set to 0 (zero) to disable the limit. .sp This parameter only applies on Linux. .sp Default value: \fB10,000\fR. .RE .sp .ne 2 .na \fBzfs_arc_sys_free\fR (ulong) .ad .RS 12n The target number of bytes the ARC should leave as free memory on the system. Defaults to the larger of 1/64 of physical memory or 512K. Setting this option to a non-zero value will override the default. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_autoimport_disable\fR (int) .ad .RS 12n Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR). .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_checksum_events_per_second\fR (uint) .ad .RS 12n Rate limit checksum events to this many per second. Note that this should not be set below the zed thresholds (currently 10 checksums over 10 sec) or else zed may not trigger any action. .sp Default value: 20 .RE .sp .ne 2 .na \fBzfs_commit_timeout_pct\fR (int) .ad .RS 12n This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). .sp Default value: \fB5\fR%. 
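For instance, a system that should never import pools automatically at module load can combine \fBzfs_autoimport_disable\fR (described above) with a manual, cache-file driven import later. A sketch, using the default cache file path mentioned in this page:
.P
.nf
# do not import pools from the cache file at module load
echo "options zfs zfs_autoimport_disable=1" >> /etc/modprobe.d/zfs.conf

# later, import everything recorded in the cache file by hand
\fBzpool import\fR -c /etc/zfs/zpool.cache -a
.fi
.P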
.RE .sp .ne 2 .na \fBzfs_condense_indirect_commit_entry_delay_ms\fR (int) .ad .RS 12n Vdev indirection layer (used for device removal) sleeps for this many milliseconds during mapping generation. Intended for use with the test suite to throttle vdev removal speed. .sp Default value: \fB0\fR (no throttle). .RE .sp .ne 2 .na \fBzfs_condense_indirect_vdevs_enable\fR (int) .ad .RS 12n Enable condensing indirect vdev mappings. When set to a non-zero value, attempt to condense indirect vdev mappings if the mapping uses more than \fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_condense_max_obsolete_bytes\fR (ulong) .ad .RS 12n Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes (see \fBfBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB1,073,741,824\fR. .RE .sp .ne 2 .na \fBzfs_condense_min_mapping_bytes\fR (ulong) .ad .RS 12n Minimum size vdev mapping to attempt to condense (see \fBzfs_condense_indirect_vdevs_enable\fR). .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzfs_dbgmsg_enable\fR (int) .ad .RS 12n Internally ZFS keeps a small log to facilitate debugging. By default the log is disabled, to enable it set this option to 1. The contents of the log can be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to this proc file clears the log. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_dbgmsg_maxsize\fR (int) .ad .RS 12n The maximum size in bytes of the internal ZFS debug log. .sp Default value: \fB4M\fR. .RE .sp .ne 2 .na \fBzfs_dbuf_state_index\fR (int) .ad .RS 12n This feature is currently unused. It is normally used for controlling what reporting is available under /proc/spl/kstat/zfs. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_deadman_enabled\fR (int) .ad .RS 12n When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR milliseconds, or when an individual I/O takes longer than \fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is invoked as described by the \fBzfs_deadman_failmode\fR module option. By default the deadman is enabled and configured to \fBwait\fR which results in "hung" I/Os only being logged. The deadman is automatically disabled when a pool gets suspended. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_deadman_failmode\fR (charp) .ad .RS 12n Controls the failure behavior when the deadman detects a "hung" I/O. Valid values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. .sp \fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a "deadman" event will be posted describing that I/O. .sp \fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it to the I/O pipeline if possible. .sp \fBpanic\fR - Panic the system. This can be used to facilitate an automatic fail-over to a properly configured fail-over partner. .sp Default value: \fBwait\fR. .RE .sp .ne 2 .na \fBzfs_deadman_checktime_ms\fR (int) .ad .RS 12n Check time in milliseconds. This defines the frequency at which we check for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior. .sp Default value: \fB60,000\fR. 
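Putting the \fBzfs_dbgmsg_enable\fR description above into practice, the debug log can be enabled, read, and cleared as follows (a sketch assuming the Linux module parameter path):
.P
.nf
# enable the internal debug log
echo 1 > /sys/module/zfs/parameters/zfs_dbgmsg_enable

# read the accumulated messages, then clear them
cat /proc/spl/kstat/zfs/dbgmsg
echo 0 > /proc/spl/kstat/zfs/dbgmsg
.fi
.P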
.RE .sp .ne 2 .na \fBzfs_deadman_synctime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. .sp Default value: \fB600,000\fR. .RE .sp .ne 2 .na \fBzfs_deadman_ziotime_ms\fR (ulong) .ad .RS 12n Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the I/O remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR milliseconds until the I/O completes. .sp Default value: \fB300,000\fR. .RE .sp .ne 2 .na \fBzfs_dedup_prefetch\fR (int) .ad .RS 12n Enable prefetching dedup-ed blks .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_delay_min_dirty_percent\fR (int) .ad .RS 12n Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of \fBzfs_dirty_data_max\fR. This value should be >= zfs_vdev_async_write_active_max_dirty_percent. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB60\fR%. .RE .sp .ne 2 .na \fBzfs_delay_scale\fR (int) .ad .RS 12n This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .sp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between 10x and 1/10th this number. .sp See the section "ZFS TRANSACTION DELAY". .sp Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. .sp Default value: \fB500,000\fR. .RE .sp .ne 2 .na \fBzfs_disable_ivset_guid_check\fR (int) .ad .RS 12n Disables requirement for IVset guids to be present and match when doing a raw receive of encrypted datasets. Intended for users whose pools were created with ZFS on Linux pre-release versions and now have compatibility issues. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_key_max_salt_uses\fR (ulong) .ad .RS 12n Maximum number of uses of a single salt value before generating a new one for encrypted datasets. The default value is also the maximum that will be accepted. .sp Default value: \fB400,000,000\fR. .RE .sp .ne 2 .na \fBzfs_object_mutex_size\fR (uint) .ad .RS 12n Size of the znode hashtable used for holds. Due to the need to hold locks on objects that may not exist yet, kernel mutexes are not created per-object and instead a hashtable is used where collisions will result in objects waiting when there is not actually contention on the same object. .sp Default value: \fB64\fR. .RE .sp .ne 2 .na \fBzfs_slow_io_events_per_second\fR (int) .ad .RS 12n Rate limit delay zevents (which report slow I/Os) to this many per second. .sp Default value: 20 .RE .sp .ne 2 .na \fBzfs_unflushed_max_mem_amt\fR (ulong) .ad .RS 12n Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory (in bytes). .sp Default value: \fB1,073,741,824\fR (1GB). .RE .sp .ne 2 .na \fBzfs_unflushed_max_mem_ppm\fR (ulong) .ad .RS 12n Percentage of the overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap. (value is calculated over 1000000 for finer granularity). 
.sp Default value: \fB1000\fR (which is divided by 1000000, resulting in a limit of \fB0.1\fR% of memory). .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_max\fR (ulong) .ad .RS 12n Describes the maximum number of log spacemap blocks allowed for each pool. The default value of 262144 means that the space in all the log spacemaps can add up to no more than 262144 blocks (which means 32GB of logical space before compression and ditto blocks, assuming that blocksize is 128k). .sp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active, which means that we flush metaslabs less often and thus decrease the number of I/Os for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the more often metaslabs are flushed, destroying log blocks sooner as they become obsolete, which leaves fewer blocks to be read during import time after a crash. .sp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. .sp Default value: \fB262144\fR (256K). .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_min\fR (ulong) .ad .RS 12n If the number of metaslabs is small and our incoming rate is high, we could get into a situation where we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. .sp Default value: \fB1000\fR. .RE .sp .ne 2 .na \fBzfs_unflushed_log_block_pct\fR (ulong) .ad .RS 12n Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of metaslabs in the pool. .sp Default value: \fB400\fR (read as \fB400\fR% - meaning that the number of log spacemap blocks is capped at 4 times the number of metaslabs in the pool). .RE .sp .ne 2 .na \fBzfs_unlink_suspend_progress\fR (uint) .ad .RS 12n When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite to facilitate testing. .sp Use \fB0\fR (default) to allow progress and \fB1\fR to pause progress. .RE .sp .ne 2 .na \fBzfs_delete_blocks\fR (ulong) .ad .RS 12n This is used to define a large file for the purposes of deletion. Files containing more than \fBzfs_delete_blocks\fR blocks will be deleted asynchronously while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an unlink(2) system call at the expense of a longer delay before the freed space is available. .sp Default value: \fB20,480\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max\fR (int) .ad .RS 12n Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over \fBzfs_dirty_data_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. This parameter takes precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR% of physical RAM. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_max_percent\fR (int) .ad .RS 12n Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB25\fR%. .RE .sp .ne 2 .na \fBzfs_dirty_data_max_percent\fR (int) .ad .RS 12n Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this one. See the section "ZFS TRANSACTION DELAY". .sp Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR. .RE .sp .ne 2 .na \fBzfs_dirty_data_sync_percent\fR (int) .ad .RS 12n Start syncing out a transaction group if there's at least this much dirty data as a percentage of \fBzfs_dirty_data_max\fR. This should be less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR. .sp Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. .RE .sp .ne 2 .na \fBzfs_fallocate_reserve_percent\fR (uint) .ad .RS 12n Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not run out of space. Instead, fallocate() space preallocation only checks that sufficient space is currently available in the pool or the user's project quota allocation, and then creates a sparse file of the requested size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR to allow additional space for indirect blocks and other internal metadata. Setting this value to 0 disables support for fallocate(2) and returns EOPNOTSUPP for fallocate() space preallocation again. .sp Default value: \fB110\fR% .RE .sp .ne 2 .na \fBzfs_fletcher_4_impl\fR (string) .ad .RS 12n Select a fletcher 4 implementation. .sp Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, \fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR. All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction set extensions to be available and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR results in the original, CPU based calculation, being used. Selecting any option other than \fBfastest\fR and \fBscalar\fR results in vector instructions from the respective CPU instruction set being used. .sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_free_bpobj_enabled\fR (int) .ad .RS 12n Enable/disable the processing of the free_bpobj object. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_async_block_max_blocks\fR (ulong) .ad .RS 12n Maximum number of blocks freed in a single txg. .sp Default value: \fBULONG_MAX\fR (unlimited). .RE .sp .ne 2 .na \fBzfs_max_async_dedup_frees\fR (ulong) .ad .RS 12n Maximum number of dedup blocks freed in a single txg. .sp Default value: \fB100,000\fR. 
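.sp
Stepping back to the dirty data limits described earlier in this section, the
following sketch (illustrative names only, not the actual kernel code) shows
roughly how the effective limit is derived from physical memory and the
tunables above:
.sp
.nf
/*
 * Hypothetical sketch: derive the effective dirty data limit.  An
 * explicit zfs_dirty_data_max wins, otherwise a percentage of RAM is
 * used, and the result is capped at zfs_dirty_data_max_max.
 */
static unsigned long long
effective_dirty_data_max(unsigned long long physmem_bytes,
    unsigned long long zfs_dirty_data_max,	/* 0 means "not set" */
    int zfs_dirty_data_max_percent,		/* default 10 */
    unsigned long long zfs_dirty_data_max_max)	/* default 25% of RAM */
{
	unsigned long long limit;

	if (zfs_dirty_data_max != 0)
		limit = zfs_dirty_data_max;
	else
		limit = physmem_bytes * zfs_dirty_data_max_percent / 100;

	/* Never exceed the module-load-time cap. */
	if (limit > zfs_dirty_data_max_max)
		limit = zfs_dirty_data_max_max;
	return (limit);
}
.fi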
.RE .sp .ne 2 .na \fBzfs_override_estimate_recordsize\fR (ulong) .ad .RS 12n Record size calculation override for zfs send estimates. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_max_active\fR (int) .ad .RS 12n Maximum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_read_min_active\fR (int) .ad .RS 12n Minimum asynchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) .ad .RS 12n When the pool has more than \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_max_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB60\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) .ad .RS 12n When the pool has less than \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use \fBzfs_vdev_async_write_min_active\fR to limit active async writes. If the dirty data is between min and max, the active I/O limit is linearly interpolated. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB30\fR%. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_max_active\fR (int) .ad .RS 12n Maximum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_async_write_min_active\fR (int) .ad .RS 12n Minimum asynchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of 2 was chosen as a compromise. A value of 3 has been shown to improve resilver performance further at a cost of further increasing latency. .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_initializing_max_active\fR (int) .ad .RS 12n Maximum initializing I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_initializing_min_active\fR (int) .ad .RS 12n Minimum initializing I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_max_active\fR (int) .ad .RS 12n The maximum number of I/Os active to each device. Ideally, this will be >= the sum of each queue's max_active. It must be at least the sum of each queue's min_active. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_vdev_rebuild_max_active\fR (int) .ad .RS 12n Maximum sequential resilver I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_vdev_rebuild_min_active\fR (int) .ad .RS 12n Minimum sequential resilver I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_removal_max_active\fR (int) .ad .RS 12n Maximum removal I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_removal_min_active\fR (int) .ad .RS 12n Minimum removal I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_max_active\fR (int) .ad .RS 12n Maximum scrub I/Os active to each device. 
See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scrub_min_active\fR (int) .ad .RS 12n Minimum scrub I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_read_max_active\fR (int) .ad .RS 12n Maximum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_read_min_active\fR (int) .ad .RS 12n Minimum synchronous read I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_max_active\fR (int) .ad .RS 12n Maximum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_sync_write_min_active\fR (int) .ad .RS 12n Minimum synchronous write I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_vdev_trim_max_active\fR (int) .ad .RS 12n Maximum trim/discard I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_vdev_trim_min_active\fR (int) .ad .RS 12n Minimum trim/discard I/Os active to each device. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_queue_depth_pct\fR (int) .ad .RS 12n Maximum number of queued allocations per top-level vdev expressed as a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. It allows for dynamic allocation distribution when devices are imbalanced as fuller devices will tend to be slower than empty devices. See also \fBzio_dva_throttle_enabled\fR. .sp Default value: \fB1000\fR%. .RE .sp .ne 2 .na \fBzfs_expire_snapshot\fR (int) .ad .RS 12n Seconds to expire .zfs/snapshot .sp Default value: \fB300\fR. .RE .sp .ne 2 .na \fBzfs_admin_snapshot\fR (int) .ad .RS 12n Allow the creation, removal, or renaming of entries in the .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled this functionality works both locally and over NFS exports which have the 'no_root_squash' option set. This functionality is disabled by default. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_flags\fR (int) .ad .RS 12n Set additional debugging flags. The following flags may be bitwise-or'd together. .sp .TS box; rB lB lB lB r l. Value Symbolic Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. _ 2 ZFS_DEBUG_DBUF_VERIFY * Enable extra dbuf verifications. _ 4 ZFS_DEBUG_DNODE_VERIFY * Enable extra dnode verifications. _ 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. _ 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. _ 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. _ 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. _ 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-core range_trees. _ 512 ZFS_DEBUG_SET_ERROR Enable SET_ERROR and dprintf entries in the debug log. _ 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. _ 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 
_ 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable zfs_dbgmsgs for metaslab loading and flushing. .TE .sp * Requires debug build. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_leak_on_eio\fR (int) .ad .RS 12n If destroy encounters an EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the EIO, permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. The default, "stalling" behavior is useful if the storage partially fails (i.e. some but not all i/os fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. However, note that this case is actually fairly rare. Typically pools either (a) fail completely (but perhaps temporarily, e.g. a top-level vdev going offline), or (b) have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). In case (a), this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In case (b), because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. Therefore, it is reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_free_min_time_ms\fR (int) .ad .RS 12n During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum of this much time will be spent working on freeing blocks per txg. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_obsolete_min_time_ms\fR (int) .ad .RS 12n Similar to \fBzfs_free_min_time_ms\fR but for cleanup of old indirection records for removed vdevs. .sp Default value: \fB500\fR. .RE .sp .ne 2 .na \fBzfs_immediate_write_sz\fR (long) .ad .RS 12n Largest data block to write to zil. Larger blocks will be treated as if the dataset being written to had the property setting \fBlogbias=throughput\fR. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_initialize_value\fR (ulong) .ad .RS 12n Pattern written to vdev free space by \fBzpool initialize\fR. .sp Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). .RE .sp .ne 2 .na \fBzfs_initialize_chunk_size\fR (ulong) .ad .RS 12n Size of writes used by \fBzpool initialize\fR. This option is used by the test suite to facilitate testing. .sp Default value: \fB1,048,576\fR .RE .sp .ne 2 .na \fBzfs_livelist_max_entries\fR (ulong) .ad .RS 12n The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. .sp Default value: \fB500,000\fR. .RE .sp .ne 2 .na \fBzfs_livelist_min_percent_shared\fR (int) .ad .RS 12n If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. 
This is in place because once a clone has been overwritten enough livelists no long give us a benefit. .sp Default value: \fB75\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_new_alloc\fR (int) .ad .RS 12n Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_sync_cancel\fR (int) .ad .RS 12n Incremented each time livelist condensing is canceled while in spa_livelist_condense_sync. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_sync_pause\fR (int) .ad .RS 12n When set, the livelist condense process pauses indefinitely before executing the synctask - spa_livelist_condense_sync. This option is used by the test suite to trigger race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_zthr_cancel\fR (int) .ad .RS 12n Incremented each time livelist condensing is canceled while in spa_livelist_condense_cb. This option is used by the test suite to track race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_livelist_condense_zthr_pause\fR (int) .ad .RS 12n When set, the livelist condense process pauses indefinitely before executing the open context condensing work in spa_livelist_condense_cb. This option is used by the test suite to trigger race conditions. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_instrlimit\fR (ulong) .ad .RS 12n The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. .sp Default value: \fB100,000,000\fR. .RE .sp .ne 2 .na \fBzfs_lua_max_memlimit\fR (ulong) .ad .RS 12n The maximum memory limit that can be set for a ZFS channel program, specified in bytes. .sp Default value: \fB104,857,600\fR. .RE .sp .ne 2 .na \fBzfs_max_dataset_nesting\fR (int) .ad .RS 12n The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. .sp Default value: \fB50\fR. .RE .sp .ne 2 .na \fBzfs_max_log_walking\fR (ulong) .ad .RS 12n The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_max_logsm_summary_length\fR (ulong) .ad .RS 12n Maximum number of rows allowed in the summary of the spacemap log. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_max_recordsize\fR (int) .ad .RS 12n We currently support block sizes from 512 bytes to 16MB. The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on i/o latency, and also potentially on the memory allocator. Therefore, we do not allow the recordsize to be set larger than zfs_max_recordsize (default 1MB). Larger blocks can be created by changing this tunable, and pools with larger blocks can always be imported and used, regardless of this setting. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_allow_redacted_dataset_mount\fR (int) .ad .RS 12n Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. .sp Default value: \fB0\fR. 
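.sp
Relating to \fBzfs_max_dataset_nesting\fR above, nesting depth is essentially
the number of \fB/\fR separators in a dataset name. A minimal sketch of such a
check, illustrative only and not the actual kernel code:
.sp
.nf
/* Hypothetical sketch: count dataset nesting depth from its name. */
static int
dataset_nesting_depth(const char *name)
{
	int depth = 0;
	int i;

	for (i = 0; name[i] != 0; i++) {
		if (name[i] == '/')
			depth++;
	}
	return (depth);
}

/* Creation would be refused when depth exceeds zfs_max_dataset_nesting. */
.fi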
.RE .sp .ne 2 .na \fBzfs_min_metaslabs_to_flush\fR (ulong) .ad .RS 12n Minimum number of metaslabs to flush per dirty TXG .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_metaslab_fragmentation_threshold\fR (int) .ad .RS 12n Allow metaslabs to keep their active state as long as their fragmentation percentage is less than or equal to this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. .sp Default value: \fB70\fR. .RE .sp .ne 2 .na \fBzfs_mg_fragmentation_threshold\fR (int) .ad .RS 12n Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. .sp Default value: \fB95\fR. .RE .sp .ne 2 .na \fBzfs_mg_noalloc_threshold\fR (int) .ad .RS 12n Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of 0 disables the feature and causes all metaslab groups to be eligible for allocations. This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old \fBzfs_mg_alloc_failures\fR facility. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_ddt_data_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place DDT data into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_user_indirect_is_special\fR (int) .ad .RS 12n If enabled, ZFS will place user data (both file and zvol) indirect blocks into the special allocation class. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_multihost_history\fR (int) .ad .RS 12n Historical statistics for the last N multihost updates will be available in \fB/proc/spl/kstat/zfs//multihost\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_multihost_interval\fR (ulong) .ad .RS 12n Used to control the frequency of multihost writes which are performed when the \fBmultihost\fR pool property is on. This is one factor used to determine the length of the activity check during import. .sp The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR milliseconds. On average a multihost write will be issued for each leaf vdev every \fBzfs_multihost_interval\fR milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. .sp Default value: \fB1000\fR. .RE .sp .ne 2 .na \fBzfs_multihost_import_intervals\fR (uint) .ad .RS 12n Used to control the duration of the activity test on import. Smaller values of \fBzfs_multihost_import_intervals\fR will reduce the import time but increase the risk of failing to detect an active pool. 
The total activity check time is never allowed to drop below one second. .sp On import the activity check waits a minimum amount of time determined by \fBzfs_multihost_interval * zfs_multihost_import_intervals\fR, or the same product computed on the host which last had the pool imported (whichever is greater). The activity check time may be further extended if the value of mmp delay found in the best uberblock indicates actual multihost updates happened at longer intervals than \fBzfs_multihost_interval\fR. A minimum value of \fB100ms\fR is enforced. .sp A value of 0 is ignored and treated as if it was set to 1. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_multihost_fail_intervals\fR (uint) .ad .RS 12n Controls the behavior of the pool when multihost write failures or delays are detected. .sp When \fBzfs_multihost_fail_intervals = 0\fR, multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .sp When \fBzfs_multihost_fail_intervals > 0\fR, the pool will be suspended if \fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds pass without a successful mmp write. This guarantees the activity test will see mmp writes if the pool is imported. A value of 1 is ignored and treated as if it was set to 2. This is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_no_scrub_io\fR (int) .ad .RS 12n Set for no scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_no_scrub_prefetch\fR (int) .ad .RS 12n Set to disable block prefetching for scrubs. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nocacheflush\fR (int) .ad .RS 12n Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_nopwrite_enabled\fR (int) .ad .RS 12n Enable NOP writes .sp Use \fB1\fR for yes (default) and \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_dmu_offset_next_sync\fR (int) .ad .RS 12n Enable forcing txg sync to find holes. When enabled forces ZFS to act like prior versions when SEEK_HOLE or SEEK_DATA flags are used, which when a dnode is dirty causes txg's to be synced so that this data can be found. .sp Use \fB1\fR for yes and \fB0\fR to disable (default). .RE .sp .ne 2 .na \fBzfs_pd_bytes_max\fR (int) .ad .RS 12n The number of bytes which should be prefetched during a pool traversal (eg: \fBzfs send\fR or other data crawling operations) .sp Default value: \fB52,428,800\fR. .RE .sp .ne 2 .na \fBzfs_per_txg_dirty_frees_percent \fR (ulong) .ad .RS 12n Tunable to control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. A value of zero will disable this throttle. .sp Default value: \fB5\fR, set to \fB0\fR to disable. .RE .sp .ne 2 .na \fBzfs_prefetch_disable\fR (int) .ad .RS 12n This tunable disables predictive prefetch. Note that it leaves "prescient" prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, prescient prefetch never issues i/os that end up not being needed, so it can't hurt performance. 
.sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_checksum_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for sha256 checksums. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_compress_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for gzip compression. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_qat_encrypt_disable\fR (int) .ad .RS 12n This tunable disables qat hardware acceleration for AES-GCM encryption. It may be set after the zfs modules have been loaded to initialize the qat hardware as long as support is compiled in and the qat driver is present. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_read_chunk_size\fR (long) .ad .RS 12n Bytes to read per chunk .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_read_history\fR (int) .ad .RS 12n Historical statistics for the last N reads will be available in \fB/proc/spl/kstat/zfs//reads\fR .sp Default value: \fB0\fR (no data is kept). .RE .sp .ne 2 .na \fBzfs_read_history_hits\fR (int) .ad .RS 12n Include cache hits in read history .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_rebuild_max_segment\fR (ulong) .ad .RS 12n Maximum read segment size to issue when sequentially resilvering a top-level vdev. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_reconstruct_indirect_combinations_max\fR (int) .ad .RS 12na If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. .sp Default value: \fB4096\fR. .RE .sp .ne 2 .na \fBzfs_recover\fR (int) .ad .RS 12n Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_removal_ignore_errors\fR (int) .ad .RS 12n .sp Ignore hard IO errors during device removal. When set, if a device encounters a hard IO error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_removal_suspend_progress\fR (int) .ad .RS 12n .sp This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_remove_max_segment\fR (int) .ad .RS 12n .sp The largest contiguous segment that we will attempt to allocate when removing a device. This can be no larger than 16MB. 
If there is a performance problem with attempting to allocate large blocks, consider decreasing this. .sp Default value: \fB16,777,216\fR (16MB). .RE .sp .ne 2 .na \fBzfs_resilver_disable_defer\fR (int) .ad .RS 12n Disables the \fBresilver_defer\fR feature, causing an operation that would start a resilver to restart one in progress immediately. .sp Default value: \fB0\fR (feature enabled). .RE .sp .ne 2 .na \fBzfs_resilver_min_time_ms\fR (int) .ad .RS 12n Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between txg flushes. .sp Default value: \fB3,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_ignore_errors\fR (int) .ad .RS 12n If set to a nonzero value, remove the DTL (dirty time list) upon completion of a pool scan (scrub) even if there were unrepairable errors. It is intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scrub_min_time_ms\fR (int) .ad .RS 12n Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between txg flushes. .sp Default value: \fB1,000\fR. .RE .sp .ne 2 .na \fBzfs_scan_checkpoint_intval\fR (int) .ad .RS 12n To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/Os to disk. The frequency of this flushing is determined by the \fBzfs_scan_checkpoint_intval\fR tunable. .sp Default value: \fB7200\fR seconds (every 2 hours). .RE .sp .ne 2 .na \fBzfs_scan_fill_weight\fR (int) .ad .RS 12n This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. .sp Default value: \fB3\fR. .RE .sp .ne 2 .na \fBzfs_scan_issue_strategy\fR (int) .ad .RS 12n Determines the order that data will be verified while scrubbing or resilvering. If set to \fB1\fR, data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This may improve scrub performance if the pool's data is very fragmented. If set to \fB2\fR, the largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. If set to \fB0\fR, zfs will use strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a checkpoint. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_legacy\fR (int) .ad .RS 12n A value of 0 indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. A value of 1 indicates that the legacy algorithm will be used where I/O is initiated as soon as it is discovered. Changing this value to 0 will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_max_ext_gap\fR (int) .ad .RS 12n Indicates the largest gap in bytes between scrub / resilver I/Os that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. .sp Default value: \fB2097152 (2 MB)\fR.
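.sp
A minimal sketch (illustrative only, not the actual kernel code) of the sorting
decision this gap controls:
.sp
.nf
/*
 * Hypothetical sketch: two scan extents are treated as one sequential
 * run if the gap between them is at most zfs_scan_max_ext_gap bytes.
 */
static int
scan_extents_are_sequential(unsigned long long prev_end,
    unsigned long long next_start, unsigned long long zfs_scan_max_ext_gap)
{
	if (next_start < prev_end)
		return (1);	/* overlapping extents */
	return (next_start - prev_end <= zfs_scan_max_ext_gap);
}
.fi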
.RE .sp .ne 2 .na \fBzfs_scan_mem_lim_fact\fR (int) .ad .RS 12n Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. .sp Default value: \fB20\fR which is 5% of RAM (1/20). .RE .sp .ne 2 .na \fBzfs_scan_mem_lim_soft_fact\fR (int) .ad .RS 12n The fraction of the hard limit used to determined the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. .sp Default value: \fB20\fR which is 5% of the hard limit (1/20). .RE .sp .ne 2 .na \fBzfs_scan_strict_mem_lim\fR (int) .ad .RS 12n Enforces tight memory limits on pool scans when a sequential scan is in progress. When disabled the memory limit may be exceeded by fast disks. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_suspend_progress\fR (int) .ad .RS 12n Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_scan_vdev_limit\fR (int) .ad .RS 12n Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. .sp Default value: \fB41943040\fR. .RE .sp .ne 2 .na \fBzfs_send_corrupt_data\fR (int) .ad .RS 12n Allow sending of corrupt data (ignore read/checksum errors when sending data) .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_send_unmodified_spill_blocks\fR (int) .ad .RS 12n Include unmodified spill blocks in the send stream. Under certain circumstances previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards compatible stream which will recreate a spill block if it was incorrectly removed. .sp Use \fB1\fR for yes (default) and \fB0\fR for no. .RE .sp .ne 2 .na \fBzfs_send_no_prefetch_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs send\fR internal queues. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_send_no_prefetch_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in \fBzfs send\fR's internal queues. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_send_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs send\fR prefetch queue. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_send_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed that will be prefetched by \fBzfs send\fR. This value must be at least twice the maximum block size in use. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_recv_queue_ff\fR (int) .ad .RS 12n The fill fraction of the \fBzfs receive\fR queue. The fill fraction controls the timing with which internal threads are woken up. .sp Default value: \fB20\fR. .RE .sp .ne 2 .na \fBzfs_recv_queue_length\fR (int) .ad .RS 12n The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value must be at least twice the maximum block size in use. 
.sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_recv_write_batch_size\fR (int) .ad .RS 12n The maximum amount of data (in bytes) that \fBzfs receive\fR will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of 32MB .sp Default value: \fB1MB\fR. .RE .sp .ne 2 .na \fBzfs_override_estimate_recordsize\fR (ulong) .ad .RS 12n Setting this variable overrides the default logic for estimating block sizes when doing a zfs send. The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_deferred_free\fR (int) .ad .RS 12n Flushing of data to disk is done in passes. Defer frees starting in this pass .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_spa_discard_memory_limit\fR (int) .ad .RS 12n Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. .sp Default value: \fB16,777,216\fR. .RE .sp .ne 2 .na \fBzfs_special_class_metadata_reserve_pct\fR (int) .ad .RS 12n Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool meta data as the special vdevs approach capacity. .sp Default value: \fB25\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_dont_compress\fR (int) .ad .RS 12n -Starting in this sync pass, we disable compression (including of metadata). +Starting in this sync pass, we disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .sp The original intent was that disabling compression would help the sync passes to converge. However, in practice disabling compression increases the average number of sync passes, because when we turn compression off, a lot of block's size will change and thus we have to re-allocate (not overwrite) them. It also increases the number of 128KB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The 128K allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy 128K allocations. .sp Default value: \fB8\fR. .RE .sp .ne 2 .na \fBzfs_sync_pass_rewrite\fR (int) .ad .RS 12n Rewrite new block pointers starting in this pass .sp Default value: \fB2\fR. .RE .sp .ne 2 .na \fBzfs_sync_taskq_batch_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_sync_taskq. The default value of 75% will create a maximum of one thread per cpu. .sp Default value: \fB75\fR%. .RE .sp .ne 2 .na \fBzfs_trim_extent_bytes_max\fR (uint) .ad .RS 12n Maximum size of TRIM command. Ranges larger than this will be split in to chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being issued to the device. .sp Default value: \fB134,217,728\fR. .RE .sp .ne 2 .na \fBzfs_trim_extent_bytes_min\fR (uint) .ad .RS 12n Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped unless they're part of a larger range which was broken in to chunks. This is done because it's common for these small TRIMs to negatively impact overall performance. 
This value can be set to 0 to TRIM all unallocated space. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_trim_metaslab_skip\fR (uint) .ad .RS 12n Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages, an increasing fraction of the pool's metaslabs will be initialized, progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_trim_queue_limit\fR (uint) .ad .RS 12n Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by the \fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module options. .sp Default value: \fB10\fR. .RE .sp .ne 2 .na \fBzfs_trim_txg_batch\fR (uint) .ad .RS 12n The number of transaction groups worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .sp Increasing this value will allow frees to be aggregated for a longer time. This will result in larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default value of 32 was determined to be a reasonable compromise. .sp Default value: \fB32\fR. .RE .sp .ne 2 .na \fBzfs_txg_history\fR (int) .ad .RS 12n Historical statistics for the last N txgs will be available in \fB/proc/spl/kstat/zfs//txgs\fR .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_txg_timeout\fR (int) .ad .RS 12n Flush dirty data to disk at least every N seconds (maximum txg duration). .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregate_trim\fR (int) .ad .RS 12n Allow TRIM I/Os to be aggregated. This is normally not helpful because the extents to be trimmed will already have been aggregated by the metaslab. This option is provided for debugging and performance analysis. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregation_limit\fR (int) .ad .RS 12n Max vdev I/O aggregation size. .sp Default value: \fB1,048,576\fR. .RE .sp .ne 2 .na \fBzfs_vdev_aggregation_limit_non_rotating\fR (int) .ad .RS 12n Max vdev I/O aggregation size for non-rotating media. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_bshift\fR (int) .ad .RS 12n Shift size to inflate reads to. .sp Default value: \fB16\fR (effectively 65536). .RE .sp .ne 2 .na \fBzfs_vdev_cache_max\fR (int) .ad .RS 12n Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR size (default 64k). .sp Default value: \fB16384\fR. .RE .sp .ne 2 .na \fBzfs_vdev_cache_size\fR (int) .ad .RS 12n Total size of the per-disk cache in bytes. .sp Currently this feature is disabled as it has been found to not be helpful for performance and in some cases harmful. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation used to select the least busy mirror member, applied when an I/O immediately follows its predecessor on rotational vdevs. .sp Default value: \fB0\fR.
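.sp
A rough sketch of how this increment, together with the related
\fBzfs_vdev_mirror_*\fR parameters described below, feeds into mirror member
selection; illustrative only, not the actual kernel code:
.sp
.nf
/*
 * Hypothetical sketch: bump a rotational mirror member's load score.
 * A lower resulting score makes that member more likely to be chosen.
 */
static int
mirror_rotating_load_inc(int io_follows_predecessor, int io_is_local,
    int zfs_vdev_mirror_rotating_inc,		/* default 0 */
    int zfs_vdev_mirror_rotating_seek_inc)	/* default 5 */
{
	if (io_follows_predecessor)
		return (zfs_vdev_mirror_rotating_inc);
	if (io_is_local)	/* within zfs_vdev_mirror_rotating_seek_offset */
		return (zfs_vdev_mirror_rotating_seek_inc / 2);
	return (zfs_vdev_mirror_rotating_seek_inc);
}
.fi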
.RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within this that are not immediately following the previous I/O are incremented by half. .sp Default value: \fB5\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_rotating_seek_offset\fR (int) .ad .RS 12n The maximum distance for the last queued I/O in which the balancing algorithm considers an I/O to have locality. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/Os do not immediately follow one another. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) .ad .RS 12n A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O lacks locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within this that are not immediately following the previous I/O are incremented by half. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzfs_vdev_read_gap_limit\fR (int) .ad .RS 12n Aggregate read I/O operations if the gap on-disk between them is within this threshold. .sp Default value: \fB32,768\fR. .RE .sp .ne 2 .na \fBzfs_vdev_write_gap_limit\fR (int) .ad .RS 12n Aggregate write I/O over gap .sp Default value: \fB4,096\fR. .RE .sp .ne 2 .na \fBzfs_vdev_raidz_impl\fR (string) .ad .RS 12n Parameter for selecting raidz parity implementation to use. Options marked (always) below may be selected on module load as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. Once the module is loaded, the content of /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options with the currently selected one enclosed in []. Possible options are: fastest - (always) implementation selected using built-in benchmark original - (always) original raidz implementation scalar - (always) scalar raidz implementation sse2 - implementation using SSE2 instruction set (64bit x86 only) ssse3 - implementation using SSSE3 instruction set (64bit x86 only) avx2 - implementation using AVX2 instruction set (64bit x86 only) avx512f - implementation using AVX512F instruction set (64bit x86 only) avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only) aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only) aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only) powerpc_altivec - implementation using Altivec (PowerPC only) .sp Default value: \fBfastest\fR. .RE .sp .ne 2 .na \fBzfs_vdev_scheduler\fR (charp) .ad .RS 12n \fBDEPRECATED\fR: This option exists for compatibility with older user configurations. It does nothing except print a warning to the kernel log if set. .sp .RE .sp .ne 2 .na \fBzfs_zevent_cols\fR (int) .ad .RS 12n When zevents are logged to the console use this as the word wrap width. .sp Default value: \fB80\fR. 
.RE .sp .ne 2 .na \fBzfs_zevent_console\fR (int) .ad .RS 12n Log events to the console .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzfs_zevent_len_max\fR (int) .ad .RS 12n Max event queue length. A value of 0 will result in a calculated value which increases with the number of CPUs in the system (minimum 64 events). Events in the queue can be viewed with the \fBzpool events\fR command. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_maxalloc\fR (int) .ad .RS 12n The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. .sp Default value: \fB1048576\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_minalloc\fR (int) .ad .RS 12n The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. .sp Default value: \fB1024\fR. .RE .sp .ne 2 .na \fBzfs_zil_clean_taskq_nthr_pct\fR (int) .ad .RS 12n This controls the number of threads used by the dp_zil_clean_taskq. The default value of 100% will create a maximum of one thread per cpu. .sp Default value: \fB100\fR%. .RE .sp .ne 2 .na \fBzil_maxblocksize\fR (int) .ad .RS 12n This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this (typically to 36KB) can improve performance. .sp Default value: \fB131072\fR (128KB). .RE .sp .ne 2 .na \fBzil_nocacheflush\fR (int) .ad .RS 12n Disable the cache flush commands that are normally sent to the disk(s) by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzil_replay_disable\fR (int) .ad .RS 12n Disable intent logging replay. Can be disabled for recovery from corrupted ZIL .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzil_slog_bulk\fR (ulong) .ad .RS 12n Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. .sp Default value: \fB786,432\fR. .RE .sp .ne 2 .na \fBzio_deadman_log_all\fR (int) .ad .RS 12n If non-zero, the zio deadman will produce debugging messages (see \fBzfs_dbgmsg_enable\fR) for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive; typically conditions in which a thread in the zio pipeline is looping indefinitely. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_decompress_fail_fraction\fR (int) .ad .RS 12n If non-zero, this value represents the denominator of the probability that zfs should induce a decompression failure. For instance, for a 5% decompression failure rate, this value should be set to 20. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_slow_io_ms\fR (int) .ad .RS 12n When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to complete is marked as a slow I/O. Each slow I/O causes a delay zevent. Slow I/O counters can be seen with "zpool status -s". .sp Default value: \fB30,000\fR. .RE .sp .ne 2 .na \fBzio_dva_throttle_enabled\fR (int) .ad .RS 12n Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution when devices are imbalanced. 
When enabled, the maximum number of pending allocations per top-level vdev is limited by \fBzfs_vdev_queue_depth_pct\fR. .sp Default value: \fB1\fR. .RE .sp .ne 2 .na \fBzio_requeue_io_start_cut_in_line\fR (int) .ad .RS 12n Prioritize requeued I/O .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzio_taskq_batch_pct\fR (uint) .ad .RS 12n Percentage of online CPUs (or CPU cores, etc) which will run a worker thread for I/O. These workers are responsible for I/O work such as compression and checksum calculations. Fractional number of CPUs will be rounded down. .sp The default value of 75 was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when high compression is enabled. .sp Default value: \fB75\fR. .RE .sp .ne 2 .na \fBzvol_inhibit_dev\fR (uint) .ad .RS 12n Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. .sp Use \fB1\fR for yes and \fB0\fR for no (default). .RE .sp .ne 2 .na \fBzvol_major\fR (uint) .ad .RS 12n Major number for zvol block devices .sp Default value: \fB230\fR. .RE .sp .ne 2 .na \fBzvol_max_discard_blocks\fR (ulong) .ad .RS 12n Discard (aka TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the \fBvolblocksize\fR property of a zvol. .sp Default value: \fB16,384\fR. .RE .sp .ne 2 .na \fBzvol_prefetch_bytes\fR (uint) .ad .RS 12n When adding a zvol to the system prefetch \fBzvol_prefetch_bytes\fR from the start and end of the volume. Prefetching these regions of the volume is desirable because they are likely to be accessed immediately by \fBblkid(8)\fR or by the kernel scanning for a partition table. .sp Default value: \fB131,072\fR. .RE .sp .ne 2 .na \fBzvol_request_sync\fR (uint) .ad .RS 12n When processing I/O requests for a zvol submit them synchronously. This effectively limits the queue depth to 1 for each I/O submitter. When set to 0 requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controller by \fBzvol_threads\fR. .sp Default value: \fB0\fR. .RE .sp .ne 2 .na \fBzvol_threads\fR (uint) .ad .RS 12n Max number of threads which can handle zvol I/O requests concurrently. .sp Default value: \fB32\fR. .RE .sp .ne 2 .na \fBzvol_volmode\fR (uint) .ad .RS 12n Defines zvol block devices behaviour when \fBvolmode\fR is set to \fBdefault\fR. Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none). .sp Default value: \fB1\fR. .RE .SH ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. The I/O scheduler determines when and in what order those operations are issued. The I/O scheduler divides operations into five I/O classes prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, \fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums must not exceed the aggregate maximum. If the sum of the per-queue maximums exceeds the aggregate maximum, then the number of active I/Os may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will be issued regardless of whether all per-queue minimums have been met. .sp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers. 
Further, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .sp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O is queued or an operation completes, the I/O scheduler looks for new operations to issue. .sp In general, smaller max_active's will lead to lower latency of synchronous operations. Larger max_active's may lead to higher overall throughput, depending on underlying storage. .sp The ratio of the queues' max_actives determines the balance of performance between reads, writes, and scrubs. E.g., increasing \fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .sp All I/O classes have a fixed maximum number of outstanding operations except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write I/Os according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of concurrent operations also stabilizes the response time of operations from other -- and in particular synchronous -- queues. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there's more dirty data in the pool. .sp Async Writes .sp The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points. .nf | o---------| <-- zfs_vdev_async_write_max_active ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- zfs_vdev_async_write_min_active 0|_______^______|_________| 0% | | 100% of zfs_dirty_data_max | | | `-- zfs_vdev_async_write_active_max_dirty_percent `--------- zfs_vdev_async_write_active_min_dirty_percent .fi Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .sp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. 
In this case, we must further throttle incoming writes, as described in the next section. .SH ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .sp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .sp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .sp The minimum time for a transaction to take is calculated as: .nf min_time = zfs_delay_scale * (dirty - min) / (max - dirty) min_time is then capped at 100 milliseconds. .fi .sp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .sp .nf delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | zfs_delay_scale ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- zfs_dirty_data_max -> 100% .fi .sp Note that since the delay is added to the outstanding time remaining on the most recent transaction, the delay is effectively the inverse of IOPS. Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first 3/4 of the curve yield relatively small differences in the amount of delay. .sp The effects can be easier to understand when the amount of delay is represented on a log scale: .sp .nf delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + zfs_delay_scale ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- zfs_dirty_data_max -> 100% .fi .sp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the backend storage, and then by changing the value of \fBzfs_delay_scale\fR to increase the steepness of the curve. Index: vendor-sys/openzfs/dist/man/man5/zpool-features.5 =================================================================== --- vendor-sys/openzfs/dist/man/man5/zpool-features.5 (revision 364927) +++ vendor-sys/openzfs/dist/man/man5/zpool-features.5 (revision 364928) @@ -1,985 +1,985 @@ '\" te .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. 
.\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude -.TH ZPOOL-FEATURES 5 "Jun 8, 2018" +.TH ZPOOL-FEATURES 5 "Aug 24, 2020" OpenZFS .SH NAME zpool\-features \- ZFS pool feature descriptions .SH DESCRIPTION .sp .LP ZFS pool on\-disk format versions are specified via "features" which replace the old on\-disk format numbers (the last supported on\-disk format number is 28). To enable a feature on a pool use the \fBupgrade\fR subcommand of the zpool(8) command, or set the \fBfeature@\fR\fIfeature_name\fR property to \fBenabled\fR. .sp .LP The pool format does not affect file system version compatibility or the ability to send file systems between pools. .sp .LP Since most features can be enabled independently of each other the on\-disk format of the pool is specified by the set of all features marked as \fBactive\fR on the pool. If the pool was created by another software version this set may include unsupported features. .SS "Identifying features" .sp .LP Every feature has a GUID of the form \fIcom.example:feature_name\fR. The reversed DNS name ensures that the feature's GUID is unique across all ZFS implementations. When unsupported features are encountered on a pool they will be identified by their GUIDs. Refer to the documentation for the ZFS implementation that created the pool for information about those features. .sp .LP Each supported feature also has a short name. By convention a feature's short name is the portion of its GUID which follows the ':' (e.g. \fIcom.example:feature_name\fR would have the short name \fIfeature_name\fR), however a feature's short name may differ across ZFS implementations if following the convention would result in name conflicts. .SS "Feature states" .sp .LP Features can be in one of three states: .sp .ne 2 .na \fBactive\fR .ad .RS 12n This feature's on\-disk format changes are in effect on the pool. Support for this feature is required to import the pool in read\-write mode. If this feature is not read-only compatible, support is also required to import the pool in read\-only mode (see "Read\-only compatibility"). .RE .sp .ne 2 .na \fBenabled\fR .ad .RS 12n An administrator has marked this feature as enabled on the pool, but the feature's on\-disk format changes have not been made yet. The pool can still be imported by software that does not support this feature, but changes may be made to the on\-disk format at any time which will move the feature to the \fBactive\fR state. Some features may support returning to the \fBenabled\fR state after becoming \fBactive\fR. See feature\-specific documentation for details. 
.RE .sp .ne 2 .na \fBdisabled\fR .ad .RS 12n This feature's on\-disk format changes have not been made and will not be made unless an administrator moves the feature to the \fBenabled\fR state. Features cannot be disabled once they have been enabled. .RE .sp .LP The state of supported features is exposed through pool properties of the form \fIfeature@short_name\fR. .SS "Read\-only compatibility" .sp .LP Some features may make on\-disk format changes that do not interfere with other software's ability to read from the pool. These features are referred to as "read\-only compatible". If all unsupported features on a pool are read\-only compatible, the pool can be imported in read\-only mode by setting the \fBreadonly\fR property during import (see zpool(8) for details on importing pools). .SS "Unsupported features" .sp .LP For each unsupported feature enabled on an imported pool a pool property named \fIunsupported@feature_name\fR will indicate why the import was allowed despite the unsupported feature. Possible values for this property are: .sp .ne 2 .na \fBinactive\fR .ad .RS 12n The feature is in the \fBenabled\fR state and therefore the pool's on\-disk format is still compatible with software that does not support this feature. .RE .sp .ne 2 .na \fBreadonly\fR .ad .RS 12n The feature is read\-only compatible and the pool has been imported in read\-only mode. .RE .SS "Feature dependencies" .sp .LP Some features depend on other features being enabled in order to function properly. Enabling a feature will automatically enable any features it depends on. .SH FEATURES .sp .LP The following features are supported on this system: .sp .ne 2 .na \fBallocation_classes\fR .ad .RS 4n .TS l l . GUID org.zfsonlinux:allocation_classes READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature enables support for separate allocation classes. This feature becomes \fBactive\fR when a dedicated allocation class vdev (dedup or special) is created with the \fBzpool create\fR or \fBzpool add\fR subcommands. With device removal, it can be returned to the \fBenabled\fR state if all the dedicated allocation class vdevs are removed. .RE .sp .ne 2 .na \fBasync_destroy\fR .ad .RS 4n .TS l l . GUID com.delphix:async_destroy READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE Destroying a file system requires traversing all of its data in order to return its used space to the pool. Without \fBasync_destroy\fR the file system is not fully removed until all space has been reclaimed. If the destroy operation is interrupted by a reboot or power outage the next attempt to open the pool will need to complete the destroy operation synchronously. When \fBasync_destroy\fR is enabled the file system's data will be reclaimed by a background process, allowing the destroy operation to complete without traversing the entire file system. The background process is able to resume interrupted destroys after the pool has been opened, eliminating the need to finish interrupted destroys as part of the open operation. The amount of space remaining to be reclaimed by the background process is available through the \fBfreeing\fR property. This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. .RE .sp .ne 2 .na \fBbookmarks\fR .ad .RS 4n .TS l l . GUID com.delphix:bookmarks READ\-ONLY COMPATIBLE yes DEPENDENCIES extensible_dataset .TE This feature enables use of the \fBzfs bookmark\fR subcommand. This feature is \fBactive\fR while any bookmarks exist in the pool. 
All bookmarks in the pool can be listed by running \fBzfs list -t bookmark -r \fIpoolname\fR\fR. .RE .sp .ne 2 .na \fBbookmark_v2\fR .ad .RS 4n .TS l l . GUID com.datto:bookmark_v2 READ\-ONLY COMPATIBLE no DEPENDENCIES bookmark, extensible_dataset .TE This feature enables the creation and management of larger bookmarks which are needed for other features in ZFS. This feature becomes \fBactive\fR when a v2 bookmark is created and will be returned to the \fBenabled\fR state when all v2 bookmarks are destroyed. .RE .sp .ne 2 .na \fBbookmark_written\fR .ad .RS 4n .TS l l . GUID com.delphix:bookmark_written READ\-ONLY COMPATIBLE no DEPENDENCIES bookmark, extensible_dataset, bookmark_v2 .TE This feature enables additional bookmark accounting fields, enabling the written# property (space written since a bookmark) and estimates of send stream sizes for incrementals from bookmarks. This feature becomes \fBactive\fR when a bookmark is created and will be returned to the \fBenabled\fR state when all bookmarks with these fields are destroyed. .RE .sp .ne 2 .na \fBdevice_rebuild\fR .ad .RS 4n .TS l l . GUID org.openzfs:device_rebuild READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature enables the ability for the \fBzpool attach\fR and \fBzpool replace\fR subcommands to perform sequential reconstruction (instead of healing reconstruction) when resilvering. Sequential reconstruction resilvers a device in LBA order without immediately verifying the checksums. Once complete a scrub is started which then verifies the checksums. This approach allows full redundancy to be restored to the pool in the minimum amount of time. This two phase approach will take longer than a healing resilver when the time to verify the checksums is included. However, unless there is additional pool damage no checksum errors should be reported by the scrub. This feature is incompatible with raidz configurations. This feature becomes \fBactive\fR while a sequential resilver is in progress, and returns to \fBenabled\fR when the resilver completes. .RE .sp .ne 2 .na \fBdevice_removal\fR .ad .RS 4n .TS l l . GUID com.delphix:device_removal READ\-ONLY COMPATIBLE no DEPENDENCIES none .TE This feature enables the \fBzpool remove\fR subcommand to remove top-level vdevs, evacuating them to reduce the total size of the pool. This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used on a top-level vdev, and will never return to being \fBenabled\fR. .RE .sp .ne 2 .na \fBedonr\fR .ad .RS 4n .TS l l . GUID org.illumos:edonr READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite (if compression is also enabled, an overwrite of a block whose checksum matches the data being written will be ignored). In an abundance of caution, Edon-R requires verification when used with dedup: \fBzfs set dedup=edonr,verify\fR. See \fBzfs\fR(8). Edon-R is a very high-performance hash algorithm that was part of the NIST SHA-3 competition. It provides extremely high hash performance (over 350% faster than SHA-256), but was not selected because of its unsuitability as a general purpose secure hash algorithm. This implementation utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key (stored on the pool) before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool. 
When the \fBedonr\fR feature is set to \fBenabled\fR, the administrator can turn on the \fBedonr\fR checksum on any dataset using the \fBzfs set checksum=edonr\fR. See zfs(8). This feature becomes \fBactive\fR once a \fBchecksum\fR property has been set to \fBedonr\fR, and will return to being \fBenabled\fR once all filesystems that have ever had their checksum set to \fBedonr\fR are destroyed. FreeBSD does not support the \fBedonr\fR feature. .RE .sp .ne 2 .na \fBembedded_data\fR .ad .RS 4n .TS l l . GUID com.delphix:embedded_data READ\-ONLY COMPATIBLE no DEPENDENCIES none .TE This feature improves the performance and compression ratio of highly-compressible blocks. Blocks whose contents can compress to 112 bytes or smaller can take advantage of this feature. When this feature is enabled, the contents of highly-compressible blocks are stored in the block "pointer" itself (a misnomer in this case, as it contains the compressed data, rather than a pointer to its location on disk). Thus the space of the block (one sector, typically 512 bytes or 4KB) is saved, and no additional i/o is needed to read and write the data block. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fR. .RE .sp .ne 2 .na \fBempty_bpobj\fR .ad .RS 4n .TS l l . GUID com.delphix:empty_bpobj READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature increases the performance of creating and using a large number of snapshots of a single filesystem or volume, and also reduces the disk space required. When there are many snapshots, each snapshot uses many Block Pointer Objects (bpobj's) to track blocks associated with that snapshot. However, in common use cases, most of these bpobj's are empty. This feature allows us to create each bpobj on-demand, thus eliminating the empty bpobjs. This feature is \fBactive\fR while there are any filesystems, volumes, or snapshots which were created after enabling this feature. .RE .sp .ne 2 .na \fBenabled_txg\fR .ad .RS 4n .TS l l . GUID com.delphix:enabled_txg READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE Once this feature is enabled ZFS records the transaction group number in which new features are enabled. This has no user-visible impact, but other features may depend on this feature. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fB. .RE .sp .ne 2 .na \fBencryption\fR .ad .RS 4n .TS l l . GUID com.datto:encryption READ\-ONLY COMPATIBLE no DEPENDENCIES bookmark_v2, extensible_dataset .TE This feature enables the creation and management of natively encrypted datasets. This feature becomes \fBactive\fR when an encrypted dataset is created and will be returned to the \fBenabled\fR state when all datasets that use this feature are destroyed. .RE .sp .ne 2 .na \fBextensible_dataset\fR .ad .RS 4n .TS l l . GUID com.delphix:extensible_dataset READ\-ONLY COMPATIBLE no DEPENDENCIES none .TE This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. This feature will be \fBactive\fR when the first dependent feature uses it, and will be returned to the \fBenabled\fR state when all datasets that use this feature are destroyed. .RE .sp .ne 2 .na \fBfilesystem_limits\fR .ad .RS 4n .TS l l . GUID com.joyent:filesystem_limits READ\-ONLY COMPATIBLE yes DEPENDENCIES extensible_dataset .TE This feature enables filesystem and snapshot limits. 
These limits can be used to control how many filesystems and/or snapshots can be created at the point in the tree on which the limits are set. This feature is \fBactive\fR once either of the limit properties has been set on a dataset. Once activated the feature is never deactivated. .RE .sp .ne 2 .na \fBhole_birth\fR .ad .RS 4n .TS l l . GUID com.delphix:hole_birth READ\-ONLY COMPATIBLE no DEPENDENCIES enabled_txg .TE This feature has/had bugs, the result of which is that, if you do a \fBzfs send -i\fR (or \fB-R\fR, since it uses \fB-i\fR) from an affected dataset, the receiver will not see any checksum or other errors, but the resulting destination snapshot will not match the source. Its use by \fBzfs send -i\fR has been disabled by default. See the \fBsend_holes_without_birth_time\fR module parameter in zfs-module-parameters(5). This feature improves performance of incremental sends (\fBzfs send -i\fR) and receives for objects with many holes. The most common case of hole-filled objects is zvols. An incremental send stream from snapshot \fBA\fR to snapshot \fBB\fR contains information about every block that changed between \fBA\fR and \fBB\fR. Blocks which did not change between those snapshots can be identified and omitted from the stream using a piece of metadata called the 'block birth time', but birth times are not recorded for holes (blocks filled only with zeroes). Since holes created after \fBA\fR cannot be distinguished from holes created before \fBA\fR, information about every hole in the entire filesystem or zvol is included in the send stream. For workloads where holes are rare this is not a problem. However, when incrementally replicating filesystems or zvols with many holes (for example a zvol formatted with another filesystem) a lot of time will be spent sending and receiving unnecessary information about holes that already exist on the receiving side. Once the \fBhole_birth\fR feature has been enabled the block birth times of all new holes will be recorded. Incremental sends between snapshots created after this feature is enabled will use this new metadata to avoid sending information about holes that already exist on the receiving side. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fB. .RE .sp .ne 2 .na \fBlarge_blocks\fR .ad .RS 4n .TS l l . GUID org.open-zfs:large_blocks READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE The \fBlarge_block\fR feature allows the record size on a dataset to be set larger than 128KB. This feature becomes \fBactive\fR once a dataset contains a file with a block size larger than 128KB, and will return to being \fBenabled\fR once all filesystems that have ever had their recordsize larger than 128KB are destroyed. .RE .sp .ne 2 .na \fBlarge_dnode\fR .ad .RS 4n .TS l l . GUID org.zfsonlinux:large_dnode READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE The \fBlarge_dnode\fR feature allows the size of dnodes in a dataset to be set larger than 512B. This feature becomes \fBactive\fR once a dataset contains an object with a dnode larger than 512B, which occurs as a result of setting the \fBdnodesize\fR dataset property to a value other than \fBlegacy\fR. The feature will return to being \fBenabled\fR once all filesystems that have ever contained a dnode larger than 512B are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. 
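.sp
For example (the pool \fItank\fR and dataset \fItank/fs\fR are assumed), large
dnodes are requested by setting the \fBdnodesize\fR property to a value other
than \fBlegacy\fR; the feature becomes \fBactive\fR once such a dnode is
actually written:
.sp
.nf
# zfs set dnodesize=auto tank/fs
# zpool get feature@large_dnode tank
.fi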
.RE .sp .ne 2 .na \fB\fBlivelist\fR\fR .ad .RS 4n .TS l l . GUID com.delphix:livelist READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. All blocks allocated and freed after a clone is created are tracked by the the clone's livelist which is referenced during the deletion of the clone. The feature is activated when a clone is created and remains active until all clones have been destroyed. .RE .sp .ne 2 .na \fBlog_spacemap\fR .ad .RS 4n .TS l l . GUID com.delphix:log_spacemap READ\-ONLY COMPATIBLE yes DEPENDENCIES com.delphix:spacemap_v2 .TE This feature improves performance for heavily-fragmented pools, especially when workloads are heavy in random-writes. It does so by logging all the metaslab changes on a single spacemap every TXG instead of scattering multiple writes to all the metaslab spacemaps. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fR. .RE .sp .ne 2 .na \fBlz4_compress\fR .ad .RS 4n .TS l l . GUID org.illumos:lz4_compress READ\-ONLY COMPATIBLE no DEPENDENCIES none .TE \fBlz4\fR is a high-performance real-time compression algorithm that features significantly faster compression and decompression as well as a higher compression ratio than the older \fBlzjb\fR compression. Typically, \fBlz4\fR compression is approximately 50% faster on compressible data and 200% faster on incompressible data than \fBlzjb\fR. It is also approximately 80% faster on decompression, while giving approximately 10% better compression ratio. When the \fBlz4_compress\fR feature is set to \fBenabled\fR, the administrator can turn on \fBlz4\fR compression on any dataset on the pool using the zfs(8) command. Please note that doing so will immediately activate the \fBlz4_compress\fR feature on the underlying pool using the zfs(8) command. Also, all newly written metadata will be compressed with \fBlz4\fR algorithm. Since this feature is not read-only compatible, this operation will render the pool unimportable on systems without support for the \fBlz4_compress\fR feature. Booting off of \fBlz4\fR-compressed root pools is supported. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fB. .RE .sp .ne 2 .na \fBmulti_vdev_crash_dump\fR .ad .RS 4n .TS l l . GUID com.joyent:multi_vdev_crash_dump READ\-ONLY COMPATIBLE no DEPENDENCIES none .TE This feature allows a dump device to be configured with a pool comprised of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz configuration. When the \fBmulti_vdev_crash_dump\fR feature is set to \fBenabled\fR, the administrator can use the \fBdumpadm\fR(1M) command to configure a dump device on a pool comprised of multiple vdevs. Under Linux this feature is registered for compatibility but not used. New pools created under Linux will have the feature \fBenabled\fR but will never transition to \fB\fBactive\fR. This functionality is not required in order to support crash dumps under Linux. Existing pools where this feature is \fB\fBactive\fR can be imported. .RE .sp .ne 2 .na \fBobsolete_counts\fR .ad .RS 4n .TS l l . GUID com.delphix:obsolete_counts READ\-ONLY COMPATIBLE yes DEPENDENCIES device_removal .TE This feature is an enhancement of device_removal, which will over time reduce the memory used to track removed devices. 
When indirect blocks are freed or remapped, we note that their part of the indirect mapping is "obsolete", i.e. no longer needed. This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used on a top-level vdev, and will never return to being \fBenabled\fR. .RE .sp .ne 2 .na \fBproject_quota\fR .ad .RS 4n .TS l l . GUID org.zfsonlinux:project_quota READ\-ONLY COMPATIBLE yes DEPENDENCIES extensible_dataset .TE This feature allows administrators to account space and object usage information against the project identifier (ID). The project ID is a new object-based attribute. When upgrading an existing filesystem, objects without a project ID attribute will be assigned a zero project ID. After this feature is enabled, newly created objects will inherit their parent directory's project ID if the parent inherit flag is set (via \fBchattr +/-P\fR or \fBzfs project [-s|-C]\fR). Otherwise, the new object's project ID will be set to zero. An object's project ID can be changed at any time by the owner (or privileged user) via \fBchattr -p $prjid\fR or \fBzfs project -p $prjid\fR. This feature will become \fBactive\fR as soon as it is enabled and will never return to being \fBdisabled\fR. Each filesystem will be upgraded automatically when remounted or when a new file is created under that filesystem. The upgrade can also be triggered on filesystems via `zfs set version=current <pool/fs>`. The upgrade process runs in the background and may take a while to complete for filesystems containing a large number of files. .RE .sp .ne 2 .na \fB\fBredaction_bookmarks\fR\fR .ad .RS 4n .TS l l . GUID com.delphix:redaction_bookmarks READ\-ONLY COMPATIBLE no DEPENDENCIES bookmarks, extensible_dataset .TE This feature enables the use of redacted zfs send. Redacted \fBzfs send\fR creates redaction bookmarks, which store the list of blocks redacted by the send that created them. For more information about redacted send, see \fBzfs\fR(8). .RE .sp .ne 2 .na \fB\fBredacted_datasets\fR\fR .ad .RS 4n .TS l l . GUID com.delphix:redacted_datasets READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE This feature enables the receiving of redacted zfs send streams. Redacted zfs send streams create redacted datasets when received. These datasets are missing some of their blocks, and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receive, see \fBzfs\fR(8). .RE .sp .ne 2 .na \fBresilver_defer\fR .ad .RS 4n .TS l l . GUID com.datto:resilver_defer READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature allows zfs to postpone new resilvers if an existing one is already in progress. Without this feature, any new resilvers will cause the currently running one to be immediately restarted from the beginning. This feature becomes \fBactive\fR once a resilver has been deferred, and returns to being \fBenabled\fR when the deferred resilver begins. .RE .sp .ne 2 .na \fBsha512\fR .ad .RS 4n .TS l l . GUID org.illumos:sha512 READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE This feature enables the use of the SHA-512/256 truncated hash algorithm (FIPS 180-4) for checksum and dedup. The native 64-bit arithmetic of SHA-512 provides an approximate 50% performance boost over SHA-256 on 64-bit hardware and is thus a good minimum-change replacement candidate for systems where hash performance is important, but these systems cannot for whatever reason utilize the faster \fBskein\fR and \fBedonr\fR algorithms.
When the \fBsha512\fR feature is set to \fBenabled\fR, the administrator can turn on the \fBsha512\fR checksum on any dataset using \fBzfs set checksum=sha512\fR. See zfs(8). This feature becomes \fBactive\fR once a \fBchecksum\fR property has been set to \fBsha512\fR, and will return to being \fBenabled\fR once all filesystems that have ever had their checksum set to \fBsha512\fR are destroyed. .RE .sp .ne 2 .na \fBskein\fR .ad .RS 4n .TS l l . GUID org.illumos:skein READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE This feature enables the use of the Skein hash algorithm for checksum and dedup. Skein is a high-performance secure hash algorithm that was a finalist in the NIST SHA-3 competition. It provides a very high security margin and high performance on 64-bit hardware (80% faster than SHA-256). This implementation also utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key (stored on the pool) before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. When the \fBskein\fR feature is set to \fBenabled\fR, the administrator can turn on the \fBskein\fR checksum on any dataset using \fBzfs set checksum=skein\fR. See zfs(8). This feature becomes \fBactive\fR once a \fBchecksum\fR property has been set to \fBskein\fR, and will return to being \fBenabled\fR once all filesystems that have ever had their checksum set to \fBskein\fR are destroyed. .RE .sp .ne 2 .na \fBspacemap_histogram\fR .ad .RS 4n .TS l l . GUID com.delphix:spacemap_histogram READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This features allows ZFS to maintain more information about how free space is organized within the pool. If this feature is \fBenabled\fR, ZFS will set this feature to \fBactive\fR when a new space map object is created or an existing space map is upgraded to the new format. Once the feature is \fBactive\fR, it will remain in that state until the pool is destroyed. .RE .sp .ne 2 .na \fBspacemap_v2\fR .ad .RS 4n .TS l l . GUID com.delphix:spacemap_v2 READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature enables the use of the new space map encoding which consists of two words (instead of one) whenever it is advantageous. The new encoding allows space maps to represent large regions of space more efficiently on-disk while also increasing their maximum addressable offset. This feature becomes \fBactive\fR once it is \fBenabled\fR, and never returns back to being \fBenabled\fR. .RE .sp .ne 2 .na \fBuserobj_accounting\fR .ad .RS 4n .TS l l . GUID org.zfsonlinux:userobj_accounting READ\-ONLY COMPATIBLE yes DEPENDENCIES extensible_dataset .TE This feature allows administrators to account the object usage information by user and group. This feature becomes \fBactive\fR as soon as it is enabled and will never return to being \fBenabled\fR. Each filesystem will be upgraded automatically when remounted, or when new files are created under that filesystem. The upgrade can also be started manually on filesystems by running `zfs set version=current `. The upgrade process runs in the background and may take a while to complete for filesystems containing a large number of files. .RE .sp .ne 2 .na \fBzpool_checkpoint\fR .ad .RS 4n .TS l l . 
GUID com.delphix:zpool_checkpoint READ\-ONLY COMPATIBLE yes DEPENDENCIES none .TE This feature enables the \fBzpool checkpoint\fR subcommand that can checkpoint the state of the pool at the time it was issued and later rewind back to it or discard it. This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand is used to checkpoint the pool. The feature will only return back to being \fBenabled\fR when the pool is rewound or the checkpoint has been discarded. .RE .sp .ne 2 .na \fBzstd_compress\fR .ad .RS 4n .TS l l . GUID org.freebsd:zstd_compress READ\-ONLY COMPATIBLE no DEPENDENCIES extensible_dataset .TE \fBzstd\fR is a high-performance compression algorithm that features a combination of high compression ratios and high speed. Compared to \fBgzip\fR, \fBzstd\fR offers slighty better compression at much higher speeds. Compared to \fBlz4\fR, \fBzstd\fR offers much better compression while being only modestly slower. Typically, \fBzstd\fR compression speed ranges from 250 to 500 MB/s per thread and decompression speed is over 1 GB/s per thread. When the \fBzstd\fR feature is set to \fBenabled\fR, the administrator can turn on \fBzstd\fR compression of any dataset by running `zfs set compress=zstd `. This feature becomes \fBactive\fR once a \fBcompress\fR property has been set to \fBzstd\fR, and will return to being \fBenabled\fR once all filesystems that have ever had their compress property set to \fBzstd\fR are destroyed. Booting off of \fBzstd\fR-compressed root pools is not yet supported. .RE .SH "SEE ALSO" zpool(8) Index: vendor-sys/openzfs/dist/man/man8/fsck.zfs.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/fsck.zfs.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/fsck.zfs.8 (revision 364928) @@ -1,67 +1,67 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH fsck.zfs 8 "2013 MAR 16" "ZFS on Linux" "System Administration Commands" +.TH FSCK.ZFS 8 "Aug 24, 2020" OpenZFS .SH NAME fsck.zfs \- Dummy ZFS filesystem checker. .SH SYNOPSIS .LP .BI "fsck.zfs [" "options" "] <" "dataset" ">" .SH DESCRIPTION .LP \fBfsck.zfs\fR is a shell stub that does nothing and always returns true. It is installed by ZoL because some Linux distributions expect a fsck helper for all filesystems. .SH OPTIONS .HP All \fIoptions\fR and the \fIdataset\fR are ignored. .SH "NOTES" .LP ZFS datasets are checked by running \fBzpool scrub\fR on the containing pool. An individual ZFS dataset is never checked independently of its pool, which is unlike a regular filesystem. 
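.LP
For example (the pool name \fItank\fR is assumed), the closest equivalent of
running fsck on a ZFS dataset is to scrub the containing pool and review the
result:
.sp
.nf
# zpool scrub tank
# zpool status tank
.fi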
.SH "BUGS" .LP On some systems, if the \fIdataset\fR is in a degraded pool, then it might be appropriate for \fBfsck.zfs\fR to return exit code 4 to indicate an uncorrected filesystem error. .LP Similarly, if the \fIdataset\fR is in a faulted pool and has a legacy /etc/fstab record, then \fBfsck.zfs\fR should return exit code 8 to indicate a fatal operational error. .SH "AUTHORS" .LP Darik Horn . .SH "SEE ALSO" .BR fsck (8), .BR fstab (5), .BR zpool-scrub (8) Index: vendor-sys/openzfs/dist/man/man8/mount.zfs.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/mount.zfs.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/mount.zfs.8 (revision 364928) @@ -1,144 +1,144 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH mount.zfs 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands" +.TH MOUNT.ZFS 8 "Aug 24, 2020" OpenZFS .SH NAME mount.zfs \- mount a ZFS filesystem .SH SYNOPSIS .LP .BI "mount.zfs [\-sfnvh] [\-o " options "]" " dataset mountpoint .SH DESCRIPTION .BR mount.zfs is part of the zfsutils package for Linux. It is a helper program that is usually invoked by the .BR mount (8) or .BR zfs (8) commands to mount a ZFS dataset. All .I options are handled according to the FILESYSTEM INDEPENDENT MOUNT OPTIONS section in the .BR mount (8) manual, except for those described below. The .I dataset parameter is a ZFS filesystem name, as output by the .B "zfs list -H -o name command. This parameter never has a leading slash character and is not a device name. The .I mountpoint parameter is the path name of a directory. .SH OPTIONS .TP .BI "\-s" Ignore bad or sloppy mount options. .TP .BI "\-f" Do a fake mount; do not perform the mount operation. .TP .BI "\-n" Do not update the /etc/mtab file. .TP .BI "\-v" Increase verbosity. .TP .BI "\-h" Print the usage message. .TP .BI "\-o context" This flag sets the SELinux context for all files in the filesystem under that mountpoint. .TP .BI "\-o fscontext" This flag sets the SELinux context for the filesystem being mounted. .TP .BI "\-o defcontext" This flag sets the SELinux context for unlabeled files. .TP .BI "\-o rootcontext" This flag sets the SELinux context for the root inode of the filesystem. .TP .BI "\-o legacy" This private flag indicates that the .I dataset has an entry in the /etc/fstab file. .TP .BI "\-o noxattr" This private flag disables extended attributes. .TP .BI "\-o xattr This private flag enables directory-based extended attributes and, if appropriate, adds a ZFS context to the selinux system policy. 
.TP .BI "\-o saxattr This private flag enables system attributed-based extended attributes and, if appropriate, adds a ZFS context to the selinux system policy. .TP .BI "\-o dirxattr Equivalent to .BR xattr . .TP .BI "\-o zfsutil" This private flag indicates that .BR mount (8) is being called by the .BR zfs (8) command. .SH NOTES ZFS conventionally requires that the .I mountpoint be an empty directory, but the Linux implementation inconsistently enforces the requirement. The .BR mount.zfs helper does not mount the contents of zvols. .SH FILES .TP 18n .I /etc/fstab The static filesystem table. .TP .I /etc/mtab The mounted filesystem table. .SH "AUTHORS" The primary author of .BR mount.zfs is Brian Behlendorf . This man page was written by Darik Horn . .SH "SEE ALSO" .BR fstab (5), .BR mount (8), .BR zfs (8) Index: vendor-sys/openzfs/dist/man/man8/vdev_id.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/vdev_id.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/vdev_id.8 (revision 364928) @@ -1,77 +1,77 @@ -.TH vdev_id 8 +.TH VDEV_ID 8 "Aug 24, 2020" OpenZFS .SH NAME vdev_id \- generate user-friendly names for JBOD disks .SH SYNOPSIS .LP .nf \fBvdev_id\fR <-d dev> [-c config_file] [-g sas_direct|sas_switch] [-m] [-p phys_per_port] \fBvdev_id\fR -h .fi .SH DESCRIPTION The \fBvdev_id\fR command is a udev helper which parses the file .BR /etc/zfs/vdev_id.conf (5) to map a physical path in a storage topology to a channel name. The channel name is combined with a disk enclosure slot number to create an alias that reflects the physical location of the drive. This is particularly helpful when it comes to tasks like replacing failed drives. Slot numbers may also be re-mapped in case the default numbering is unsatisfactory. The drive aliases will be created as symbolic links in /dev/disk/by-vdev. The currently supported topologies are sas_direct and sas_switch. A multipath mode is supported in which dm-mpath devices are handled by examining the first-listed running component disk as reported by the .BR multipath (8) command. In multipath mode the configuration file should contain a channel definition with the same name for each path to a given enclosure. .BR vdev_id also supports creating aliases based on existing udev links in the /dev hierarchy using the \fIalias\fR configuration file keyword. See the .BR vdev_id.conf (5) man page for details. .SH OPTIONS .TP \fB\-c\fR Specifies the path to an alternate configuration file. The default is /etc/zfs/vdev_id.conf. .TP \fB\-d\fR This is the only mandatory argument. Specifies the name of a device in /dev, i.e. "sda". .TP \fB\-g\fR Identifies a physical topology that governs how physical paths are mapped to channels. \fIsas_direct\fR - in this mode a channel is uniquely identified by a PCI slot and a HBA port number \fIsas_switch\fR - in this mode a channel is uniquely identified by a SAS switch port number .TP \fB\-m\fR Specifies that .BR vdev_id (8) will handle only dm-multipath devices. If set to "yes" then .BR vdev_id (8) will examine the first running component disk of a dm-multipath device as listed by the .BR multipath (8) command to determine the physical path. .TP \fB\-p\fR Specifies the number of PHY devices associated with a SAS HBA port or SAS switch port. .BR vdev_id (8) internally uses this value to determine which HBA or switch port a device is connected to. The default is 4. .TP \fB\-h\fR Print a usage summary. 
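.SH EXAMPLE
.LP
A minimal invocation (the device name \fBsda\fR and the default configuration
file \fI/etc/zfs/vdev_id.conf\fR are assumed) maps a single disk to its
configured alias:
.sp
.nf
# vdev_id -d sda
.fi
.LP
In normal operation \fBvdev_id\fR is run from udev rules rather than by hand;
the command above is only a sketch for verifying a configuration.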
.SH SEE ALSO .LP \fBvdev_id.conf\fR(5) Index: vendor-sys/openzfs/dist/man/man8/zed.8.in =================================================================== --- vendor-sys/openzfs/dist/man/man8/zed.8.in (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zed.8.in (revision 364928) @@ -1,268 +1,268 @@ .\" .\" This file is part of the ZFS Event Daemon (ZED) .\" for ZFS on Linux (ZoL) . .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). .\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. .\" Refer to the ZoL git commit log for authoritative copyright attribution. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License Version 1.0 (CDDL-1.0). .\" You can obtain a copy of the license from the top-level file .\" "OPENSOLARIS.LICENSE" or at . .\" You may not use this file except in compliance with the license. .\" -.TH ZED 8 "Octember 1, 2013" "ZFS on Linux" "System Administration Commands" +.TH ZED 8 "Aug 24, 2020" OpenZFS .SH NAME ZED \- ZFS Event Daemon .SH SYNOPSIS .HP .B zed .\" [\fB\-c\fR \fIconfigfile\fR] [\fB\-d\fR \fIzedletdir\fR] [\fB\-f\fR] [\fB\-F\fR] [\fB\-h\fR] [\fB\-I\fR] [\fB\-L\fR] [\fB\-M\fR] [\fB\-p\fR \fIpidfile\fR] [\fB\-P\fR \fIpath\fR] [\fB\-s\fR \fIstatefile\fR] [\fB\-v\fR] [\fB\-V\fR] [\fB\-Z\fR] .SH DESCRIPTION .PP \fBZED\fR (ZFS Event Daemon) monitors events generated by the ZFS kernel module. When a zevent (ZFS Event) is posted, \fBZED\fR will run any ZEDLETs (ZFS Event Daemon Linkage for Executable Tasks) that have been enabled for the corresponding zevent class. .SH OPTIONS .TP .BI \-h Display a summary of the command-line options. .TP .BI \-L Display license information. .TP .BI \-V Display version information. .TP .BI \-v Be verbose. .TP .BI \-f Force the daemon to run if at all possible, disabling security checks and throwing caution to the wind. Not recommended for use in production. .TP .BI \-F Run the daemon in the foreground. .TP .BI \-M Lock all current and future pages in the virtual memory address space. This may help the daemon remain responsive when the system is under heavy memory pressure. .TP .BI \-I Request that the daemon idle rather than exit when the kernel modules are not loaded. Processing of events will start, or resume, when the kernel modules are (re)loaded. Under Linux the kernel modules cannot be unloaded while the daemon is running. .TP .BI \-Z Zero the daemon's state, thereby allowing zevents still within the kernel to be reprocessed. .\" .TP .\" .BI \-c\ configfile .\" Read the configuration from the specified file. .TP .BI \-d\ zedletdir Read the enabled ZEDLETs from the specified directory. .TP .BI \-p\ pidfile Write the daemon's process ID to the specified file. .TP .BI \-P\ path Custom $PATH for zedlets to use. Normally zedlets run in a locked-down environment, with hardcoded paths to the ZFS commands ($ZFS, $ZPOOL, $ZED, ...), and a hardcoded $PATH. This is done for security reasons. However, the ZFS test suite uses a custom PATH for its ZFS commands, and passes it to zed with -P. In short, -P is only to be used by the ZFS test suite; never use it in production! .TP .BI \-s\ statefile Write the daemon's state to the specified file. .SH ZEVENTS .PP A zevent is comprised of a list of nvpairs (name/value pairs). 
Each zevent contains an EID (Event IDentifier) that uniquely identifies it throughout the lifetime of the loaded ZFS kernel module; this EID is a monotonically increasing integer that resets to 1 each time the kernel module is loaded. Each zevent also contains a class string that identifies the type of event. For brevity, a subclass string is defined that omits the leading components of the class string. Additional nvpairs exist to provide event details. .PP The kernel maintains a list of recent zevents that can be viewed (along with their associated lists of nvpairs) using the "\fBzpool events \-v\fR" command. .SH CONFIGURATION .PP ZEDLETs to be invoked in response to zevents are located in the \fIenabled-zedlets\fR directory. These can be symlinked or copied from the \fIinstalled-zedlets\fR directory; symlinks allow for automatic updates from the installed ZEDLETs, whereas copies preserve local modifications. As a security measure, ZEDLETs must be owned by root. They must have execute permissions for the user, but they must not have write permissions for group or other. Dotfiles are ignored. .PP ZEDLETs are named after the zevent class for which they should be invoked. In particular, a ZEDLET will be invoked for a given zevent if either its class or subclass string is a prefix of its filename (and is followed by a non-alphabetic character). As a special case, the prefix "all" matches all zevents. Multiple ZEDLETs may be invoked for a given zevent. .SH ZEDLETS .PP ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. Common variables used by ZEDLETs can be stored in the default rc file which is sourced by scripts; these variables should be prefixed with "ZED_". .PP The zevent nvpairs are passed to ZEDLETs as environment variables. Each nvpair name is converted to an environment variable in the following manner: 1) it is prefixed with "ZEVENT_", 2) it is converted to uppercase, and 3) each non-alphanumeric character is converted to an underscore. Some additional environment variables have been defined to present certain nvpair values in a more convenient form. An incomplete list of zevent environment variables is as follows: .TP .B ZEVENT_EID The Event IDentifier. .TP .B ZEVENT_CLASS The zevent class string. .TP .B ZEVENT_SUBCLASS The zevent subclass string. .TP .B ZEVENT_TIME The time at which the zevent was posted as "\fIseconds\fR\ \fInanoseconds\fR" since the Epoch. .TP .B ZEVENT_TIME_SECS The \fIseconds\fR component of ZEVENT_TIME. .TP .B ZEVENT_TIME_NSECS The \fInanoseconds\fR component of ZEVENT_TIME. .TP .B ZEVENT_TIME_STRING An almost-RFC3339-compliant string for ZEVENT_TIME. .PP Additionally, the following ZED & ZFS variables are defined: .TP .B ZED_PID The daemon's process ID. .TP .B ZED_ZEDLET_DIR The daemon's current \fIenabled-zedlets\fR directory. .TP .B ZFS_ALIAS The ZFS alias (\fIname-version-release\fR) string used to build the daemon. .TP .B ZFS_VERSION The ZFS version used to build the daemon. .TP .B ZFS_RELEASE The ZFS release used to build the daemon. .PP ZEDLETs may need to call other ZFS commands. The installation paths of the following executables are defined: \fBZDB\fR, \fBZED\fR, \fBZFS\fR, \fBZINJECT\fR, and \fBZPOOL\fR. These variables can be overridden in the rc file if needed. .SH FILES .\" .TP .\" @sysconfdir@/zfs/zed.conf .\" The default configuration file for the daemon. 
.TP .I @sysconfdir@/zfs/zed.d The default directory for enabled ZEDLETs. .TP .I @sysconfdir@/zfs/zed.d/zed.rc The default rc file for common variables used by ZEDLETs. .TP .I @zfsexecdir@/zed.d The default directory for installed ZEDLETs. .TP .I @runstatedir@/zed.pid The default file containing the daemon's process ID. .TP .I @runstatedir@/zed.state The default file containing the daemon's state. .SH SIGNALS .TP .B HUP Reconfigure the daemon and rescan the directory for enabled ZEDLETs. .TP .B TERM Terminate the daemon. .SH NOTES .PP \fBZED\fR requires root privileges. .\" Do not taunt zed. .SH BUGS .PP Events are processed synchronously by a single thread. This can delay the processing of simultaneous zevents. .PP There is no maximum timeout for ZEDLET execution. Consequently, a misbehaving ZEDLET can delay the processing of subsequent zevents. .PP The ownership and permissions of the \fIenabled-zedlets\fR directory (along with all parent directories) are not checked. If any of these directories are improperly owned or permissioned, an unprivileged user could insert a ZEDLET to be executed as root. The requirement that ZEDLETs be owned by root mitigates this to some extent. .PP ZEDLETs are unable to return state/status information to the kernel. .PP Some zevent nvpair types are not handled. These are denoted by zevent environment variables having a "_NOT_IMPLEMENTED_" value. .PP Internationalization support via gettext has not been added. .PP The configuration file is not yet implemented. .PP The diagnosis engine is not yet implemented. .SH LICENSE .PP \fBZED\fR (ZFS Event Daemon) is distributed under the terms of the Common Development and Distribution License Version 1.0 (CDDL\-1.0). .PP Developed at Lawrence Livermore National Laboratory (LLNL\-CODE\-403049). .SH SEE ALSO .BR zfs (8), .BR zpool (8) .BR zpool-events (8) Index: vendor-sys/openzfs/dist/man/man8/zfs-mount-generator.8.in =================================================================== --- vendor-sys/openzfs/dist/man/man8/zfs-mount-generator.8.in (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zfs-mount-generator.8.in (revision 364928) @@ -1,248 +1,248 @@ .\" .\" Copyright 2018 Antonio Russo .\" Copyright 2019 Kjeld Schouten-Lebbing .\" Copyright 2020 InsanePrawn .\" .\" Permission is hereby granted, free of charge, to any person obtaining .\" a copy of this software and associated documentation files (the .\" "Software"), to deal in the Software without restriction, including .\" without limitation the rights to use, copy, modify, merge, publish, .\" distribute, sublicense, and/or sell copies of the Software, and to .\" permit persons to whom the Software is furnished to do so, subject to .\" the following conditions: .\" .\" The above copyright notice and this permission notice shall be .\" included in all copies or substantial portions of the Software. .\" .\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, .\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF .\" MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND .\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE .\" LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION .\" OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION .\" WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-.TH "ZFS\-MOUNT\-GENERATOR" "8" "2020-01-19" "ZFS" "zfs-mount-generator" "\"" +.TH ZFS-MOUNT-GENERATOR 8 "Aug 24, 2020" OpenZFS .SH "NAME" zfs\-mount\-generator \- generates systemd mount units for ZFS .SH SYNOPSIS .B @systemdgeneratordir@/zfs\-mount\-generator .sp .SH DESCRIPTION zfs\-mount\-generator implements the \fBGenerators Specification\fP of .BR systemd (1), and is called during early boot to generate .BR systemd.mount (5) units for automatically mounted datasets. Mount ordering and dependencies are created for all tracked pools (see below). .SS ENCRYPTION KEYS If the dataset is an encryption root, a service that loads the associated key (either from file or through a .BR systemd\-ask\-password (1) prompt) will be created. This service .BR RequiresMountsFor the path of the key (if file-based) and also copies the mount unit's .BR After , .BR Before and .BR Requires . All mount units of encrypted datasets add the key\-load service for their encryption root to their .BR Wants and .BR After . The service will not be .BR Want ed or .BR Require d by .BR local-fs.target directly, and so will only be started manually or as a dependency of a started mount unit. .SS UNIT ORDERING AND DEPENDENCIES mount unit's .BR Before \-> key\-load service (if any) \-> mount unit \-> mount unit's .BR After It is worth noting that when a mount unit is activated, it activates all available mount units for parent paths to its mountpoint, i.e. activating the mount unit for /tmp/foo/1/2/3 automatically activates all available mount units for /tmp, /tmp/foo, /tmp/foo/1, and /tmp/foo/1/2. This is true for any combination of mount units from any sources, not just ZFS. .SS CACHE FILE Because ZFS pools may not be available very early in the boot process, information on ZFS mountpoints must be stored separately. The output of the command .PP .RS 4 zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand,encroot,keylocation,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for,org.openzfs.systemd:before,org.openzfs.systemd:after,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore .RE .PP for datasets that should be mounted by systemd, should be kept separate from the pool, at .PP .RS 4 .RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME . .RE .PP The cache file, if writeable, will be kept synchronized with the pool state by the ZEDLET .PP .RS 4 history_event-zfs-list-cacher.sh . .RE .PP .sp .SS PROPERTIES The behavior of the generator script can be influenced by the following dataset properties: .sp .TP 4 .BR canmount = on | off | noauto If a dataset has .BR mountpoint set and .BR canmount is not .BR off , a mount unit will be generated. Additionally, if .BR canmount is .BR on , .BR local-fs.target will gain a dependency on the mount unit. This behavior is equal to the .BR auto and .BR noauto legacy mount options, see .BR systemd.mount (5). Encryption roots always generate a key-load service, even for .BR canmount=off . .TP 4 .BR org.openzfs.systemd:requires\-mounts\-for = \fIpath\fR... Space\-separated list of mountpoints to require to be mounted for this mount unit. .TP 4 .BR org.openzfs.systemd:before = \fIunit\fR... The mount unit and associated key\-load service will be ordered before this space\-separated list of units. .TP 4 .BR org.openzfs.systemd:after = \fIunit\fR... The mount unit and associated key\-load service will be ordered after this space\-separated list of units.
.TP 4 .BR org.openzfs.systemd:wanted\-by = \fIunit\fR... Space-separated list of units that will gain a .BR Wants dependency on this mount unit. Setting this property implies .BR noauto . .TP 4 .BR org.openzfs.systemd:required\-by = \fIunit\fR... Space-separated list of units that will gain a .BR Requires dependency on this mount unit. Setting this property implies .BR noauto . .TP 4 .BR org.openzfs.systemd:nofail = unset | on | off Toggles between a .BR Wants and .BR Requires type of dependency between the mount unit and .BR local-fs.target , if .BR noauto isn't set or implied. .BR on : Mount will be .BR WantedBy local-fs.target .BR off : Mount will be .BR Before and .BR RequiredBy local-fs.target .BR unset : Mount will be .BR Before and .BR WantedBy local-fs.target .TP 4 .BR org.openzfs.systemd:ignore = on | off If set to .BR on , do not generate a mount unit for this dataset. .RE See also .BR systemd.mount (5) .PP .SH EXAMPLE To begin, enable tracking for the pool: .PP .RS 4 touch .RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME .RE .PP Then, enable the tracking ZEDLET: .PP .RS 4 ln -s "@zfsexecdir@/zed.d/history_event-zfs-list-cacher.sh" "@sysconfdir@/zfs/zed.d" systemctl enable zfs-zed.service systemctl restart zfs-zed.service .RE .PP Force the running of the ZEDLET by setting a monitored property, e.g. .BR canmount , for at least one dataset in the pool: .PP .RS 4 zfs set canmount=on .I DATASET .RE .PP This forces an update to the stale cache file. To test the generator output, run .PP .RS 4 @systemdgeneratordir@/zfs-mount-generator /tmp/zfs-mount-generator . . .RE .PP This will generate units and dependencies in .I /tmp/zfs-mount-generator for you to inspect them. The second and third argument are ignored. If you're satisfied with the generated units, instruct systemd to re-run all generators: .PP .RS 4 systemctl daemon-reload .RE .PP .sp .SH SEE ALSO .BR zfs (5) .BR zfs-events (5) .BR zed (8) .BR zpool (5) .BR systemd (1) .BR systemd.target (5) .BR systemd.special (7) .BR systemd.mount (7) Index: vendor-sys/openzfs/dist/man/man8/zfs.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zfs.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zfs.8 (revision 364928) @@ -1,741 +1,777 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. 
All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd June 30, 2019 .Dt ZFS 8 .Os .Sh NAME .Nm zfs .Nd configures ZFS file systems .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Nm .Cm .Op Ar .Sh DESCRIPTION The .Nm command configures ZFS datasets within a ZFS storage pool, as described in .Xr zpool 8 . A dataset is identified by a unique path within the ZFS namespace. For example: .Bd -literal pool/{filesystem,volume,snapshot} .Ed .Pp where the maximum length of a dataset name is .Dv MAXNAMELEN .Pq 256 bytes and the maximum amount of nesting allowed in a path is 50 levels deep. .Pp A dataset can be one of the following: .Bl -tag -width "file system" .It Sy file system A ZFS dataset of type .Sy filesystem can be mounted within the standard system namespace and behaves like other file systems. While ZFS file systems are designed to be POSIX compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to non-standard behavior when checking file system free space. .It Sy volume A logical volume exported as a raw or block device. This type of dataset should only be used when a block device is required. File systems are typically used in most environments. .It Sy snapshot A read-only version of a file system or volume at a given point in time. It is specified as .Ar filesystem Ns @ Ns Ar name or .Ar volume Ns @ Ns Ar name . .It Sy bookmark Much like a .Sy snapshot , but without the hold on on-disk data. It can be used as the source of a send (but not for a receive). It is specified as .Ar filesystem Ns # Ns Ar name or .Ar volume Ns # Ns Ar name . .El .Pp For details see .Xr zfsconcepts 8 . .Ss Properties Properties are divided into two types, native properties and user-defined .Po or .Qq user .Pc properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about properties, see the .Xr zfsprops 8 man page. .Ss Encryption Enabling the .Sy encryption feature allows for the creation of encrypted filesystems and volumes. ZFS will encrypt file and zvol data, file attributes, ACLs, permission bits, directory listings, FUID mappings, and .Sy userused / .Sy groupused data. For an overview of encryption see the .Xr zfs-load-key 8 command manual. .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Bl -tag -width "" .It Nm Fl ? Displays a help message. .It Xo .Nm .Fl V , -version .Xc An alias for the .Nm zfs Cm version subcommand. .It Xo .Nm .Cm version .Xc Displays the software version of the .Nm userland utility and the zfs kernel module. .El .Ss Dataset Management .Bl -tag -width "" .It Xr zfs-list 8 Lists the property information for the given datasets in tabular form. .It Xr zfs-create 8 Creates a new ZFS file system or volume. .It Xr zfs-destroy 8 Destroys the given dataset(s), snapshot(s), or bookmark. 
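As a minimal illustration, reusing the snapshot name from the EXAMPLES section
below, a single snapshot can be removed with:
.Bd -literal
# zfs destroy pool/home/bob@yesterday
.Ed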
.It Xr zfs-rename 8 Renames the given dataset (filesystem or snapshot). .It Xr zfs-upgrade 8 Manage upgrading the on-disk version of filesystems. .El .Ss Snapshots .Bl -tag -width "" .It Xr zfs-snapshot 8 Creates snapshots with the given names. .It Xr zfs-rollback 8 Roll back the given dataset to a previous snapshot. .It Xo .Xr zfs-hold 8 / .Xr zfs-release 8 .Xc Add or remove a hold reference to the specified snapshot or snapshots. If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Nm zfs Cm destroy command return .Er EBUSY . .It Xr zfs-diff 8 Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. .El .Ss Clones .Bl -tag -width "" .It Xr zfs-clone 8 Creates a clone of the given snapshot. .It Xr zfs-promote 8 Promotes a clone file system to no longer be dependent on its .Qq origin snapshot. .El .Ss Send & Receive .Bl -tag -width "" .It Xr zfs-send 8 Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. .It Xr zfs-receive 8 Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the .Xr zfs-send 8 subcommand, which by default creates a full stream. .It Xr zfs-bookmark 8 Creates a new bookmark of the given snapshot or bookmark. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Nm zfs Cm send command. .It Xr zfs-redact 8 Generate a new redaction bookmark. This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. .El .Ss Properties .Bl -tag -width "" .It Xr zfs-get 8 Displays properties for the given datasets. .It Xr zfs-set 8 Sets the property or list of properties to the given value(s) for each dataset. .It Xr zfs-inherit 8 Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. .El .Ss Quotas .Bl -tag -width "" .It Xo .Xr zfs-userspace 8 / .Xr zfs-groupspace 8 / .Xr zfs-projectspace 8 .Xc Displays space consumed by, and quotas on, each user, group, or project in the specified filesystem or snapshot. .It Xr zfs-project 8 List, set, or clear project ID and/or inherit flag on the file(s) or directories. .El .Ss Mountpoints .Bl -tag -width "" .It Xr zfs-mount 8 Displays all ZFS file systems currently mounted, or mount ZFS filesystem on a path described by its .Sy mountpoint property. .It Xr zfs-unmount 8 Unmounts currently mounted ZFS file systems. .El .Ss Shares .Bl -tag -width "" .It Xr zfs-share 8 Shares available ZFS file systems. .It Xr zfs-unshare 8 Unshares currently shared ZFS file systems. .El .Ss Delegated Administration .Bl -tag -width "" .It Xr zfs-allow 8 Delegate permissions on the specified filesystem or volume. .It Xr zfs-unallow 8 Remove delegated permissions on the specified filesystem or volume. .El .Ss Encryption .Bl -tag -width "" .It Xr zfs-change-key 8 Add or change an encryption key on the specified dataset. .It Xr zfs-load-key 8 Load the key for the specified encrypted dataset, enabling access. .It Xr zfs-unload-key 8 Unload a key for the specified dataset, removing the ability to access the dataset. 
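As an illustrative sketch only (the dataset name is hypothetical), a key is
typically loaded before the encrypted dataset is used and unloaded afterwards:
.Bd -literal
# zfs load-key pool/encfs
# zfs unload-key pool/encfs
.Ed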
.El .Ss Channel Programs .Bl -tag -width "" .It Xr zfs-program 8 Execute ZFS administrative operations programmatically via a Lua script-language channel program. .El .Ss Jails .Bl -tag -width "" .It Xr zfs-jail 8 Attaches a filesystem to a jail. .It Xr zfs-unjail 8 Detaches a filesystem from a jail. .El .Ss Waiting .Bl -tag -width "" .It Xr zfs-wait 8 Wait for background activity in a filesystem to complete. .El .Sh EXIT STATUS The .Nm utility exits 0 on success, 1 if an error occurs, and 2 if invalid command line options were specified. .Sh EXAMPLES .Bl -tag -width "" .It Sy Example 1 No Creating a ZFS File System Hierarchy The following commands create a file system named .Em pool/home and a file system named .Em pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. .Bd -literal # zfs create pool/home # zfs set mountpoint=/export/home pool/home # zfs create pool/home/bob .Ed .It Sy Example 2 No Creating a ZFS Snapshot The following command creates a snapshot named .Sy yesterday . This snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of the .Em pool/home/bob file system. .Bd -literal # zfs snapshot pool/home/bob@yesterday .Ed .It Sy Example 3 No Creating and Destroying Multiple Snapshots The following command creates snapshots named .Sy yesterday of .Em pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Bd -literal # zfs snapshot -r pool/home@yesterday # zfs destroy -r pool/home@yesterday .Ed .It Sy Example 4 No Disabling and Enabling File System Compression The following command disables the .Sy compression property for all file systems under .Em pool/home . The next command explicitly enables .Sy compression for .Em pool/home/anne . .Bd -literal # zfs set compression=off pool/home # zfs set compression=on pool/home/anne .Ed .It Sy Example 5 No Listing ZFS Datasets The following command lists all active file systems and volumes in the system. Snapshots are displayed if the .Sy listsnaps property is .Sy on . The default is .Sy off . See .Xr zpool 8 for more information on pool properties. .Bd -literal # zfs list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .Ed .It Sy Example 6 No Setting a Quota on a ZFS File System The following command sets a quota of 50 Gbytes for .Em pool/home/bob . .Bd -literal # zfs set quota=50G pool/home/bob .Ed .It Sy Example 7 No Listing ZFS Properties The following command lists all properties for .Em pool/home/bob . 
.Bd -literal # zfs get all pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /pool/home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob acltype off default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 4 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .Ed .Pp The following command gets a single property value. .Bd -literal # zfs get -H -o value compression pool/home/bob on .Ed The following command lists all properties with local settings for .Em pool/home/bob . .Bd -literal # zfs get -r -s local -o name,property,value all pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed .It Sy Example 8 No Rolling Back a ZFS File System The following command reverts the contents of .Em pool/home/anne to the snapshot named .Sy yesterday , deleting all intermediate snapshots. .Bd -literal # zfs rollback -r pool/home/anne@yesterday .Ed .It Sy Example 9 No Creating a ZFS Clone The following command creates a writable file system whose initial contents are the same as .Em pool/home/bob@yesterday . .Bd -literal # zfs clone pool/home/bob@yesterday pool/clone .Ed .It Sy Example 10 No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal # zfs create pool/project/production populate /pool/project/production with data # zfs snapshot pool/project/production@today # zfs clone pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them # zfs promote pool/project/beta # zfs rename pool/project/production pool/project/legacy # zfs rename pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed # zfs destroy pool/project/legacy .Ed .It Sy Example 11 No Inheriting ZFS Properties The following command causes .Em pool/home/bob and .Em pool/home/anne to inherit the .Sy checksum property from their parent. 
.Bd -literal # zfs inherit checksum pool/home/bob pool/home/anne .Ed .It Sy Example 12 No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a and .Em poolB/received/fs@b , respectively. .Em poolB must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . .Bd -literal # zfs send pool/fs@a | \e ssh host zfs receive poolB/received/fs@a # zfs send -i a pool/fs@b | \e ssh host zfs receive poolB/received/fs .Ed .It Sy Example 13 No Using the zfs receive -d Option The following command sends a full stream of .Em poolA/fsA/fsB@snap to a remote machine, receiving it into .Em poolB/received/fsA/fsB@snap . The .Em fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Em poolB must contain the file system .Em poolB/received . If .Em poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal # zfs send poolA/fsA/fsB@snap | \e ssh host zfs receive -d poolB/received .Ed .It Sy Example 14 No Setting User Properties The following example sets the user-defined .Sy com.example:department property for a dataset. .Bd -literal # zfs set com.example:department=12345 tank/accounting .Ed .It Sy Example 15 No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal # zfs destroy -r pool/users@7daysago # zfs rename -r pool/users@6daysago @7daysago # zfs rename -r pool/users@5daysago @6daysago # zfs rename -r pool/users@4daysago @5daysago # zfs rename -r pool/users@3daysago @4daysago # zfs rename -r pool/users@2daysago @3daysago # zfs rename -r pool/users@yesterday @2daysago # zfs rename -r pool/users@today @yesterday # zfs snapshot -r pool/users@today .Ed .It Sy Example 16 No Setting sharenfs Property Options on a ZFS File System The following commands show how to set .Sy sharenfs property options to enable .Sy rw access for a set of .Sy IP addresses and to enable root access for system .Sy neo on the .Em tank/home file system. .Bd -literal # zfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home .Ed .Pp If you are using .Sy DNS for host name resolution, specify the fully qualified hostname. .It Sy Example 17 No Delegating ZFS Administration Permissions on a ZFS Dataset The following example shows how to set permissions so that user .Sy cindys can create, destroy, mount, and take snapshots on .Em tank/cindys . The permissions on .Em tank/cindys are also displayed. .Bd -literal # zfs allow cindys create,destroy,mount,snapshot tank/cindys # zfs allow tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .Pp Because the .Em tank/cindys mount point permission is set to 755 by default, user .Sy cindys will be unable to mount file systems under .Em tank/cindys . Add an ACE similar to the following syntax to provide mount point access: .Bd -literal # chmod A+user:cindys:add_subdirectory:allow /tank/cindys .Ed .It Sy Example 18 No Delegating Create Time Permissions on a ZFS Dataset The following example shows how to grant anyone in the group .Sy staff to create file systems in .Em tank/users . 
This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on .Em tank/users are also displayed. .Bd -literal # zfs allow staff create,mount tank/users # zfs allow -c destroy tank/users # zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed .It Sy Example 19 No Defining and Granting a Permission Set on a ZFS Dataset The following example shows how to define and grant a permission set on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal # zfs allow -s @pset create,destroy,snapshot,mount tank/users # zfs allow staff @pset tank/users # zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 20 No Delegating Property Permissions on a ZFS Dataset The following example shows to grant the ability to set quotas and reservations on the .Em users/home file system. The permissions on .Em users/home are also displayed. .Bd -literal # zfs allow cindys quota,reservation users/home # zfs allow users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation cindys% zfs set quota=10G users/home/marks cindys% zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed .It Sy Example 21 No Removing ZFS Delegated Permissions on a ZFS Dataset The following example shows how to remove the snapshot permission from the .Sy staff group on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal # zfs unallow staff snapshot tank/users # zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 22 No Showing the differences between a snapshot and a ZFS Dataset The following example shows how to see what has changed between a prior snapshot of a ZFS dataset and its current state. The .Fl F option is used to indicate type information for the files affected. .Bd -literal # zfs diff -F tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed .It Sy Example 23 No Creating a bookmark The following example create a bookmark to a snapshot. This bookmark can then be used instead of snapshot in send streams. .Bd -literal # zfs bookmark rpool@snapshot rpool#bookmark .Ed .It Sy Example 24 No Setting sharesmb Property Options on a ZFS File System The following example show how to share SMB filesystem through ZFS. Note that that a user and his/her password must be given. .Bd -literal # smbmount //127.0.0.1/share_tmp /mnt/tmp \\ -o user=workgroup/turbo,password=obrut,uid=1000 .Ed .Pp Minimal .Em /etc/samba/smb.conf configuration required: .Pp Samba will need to listen to 'localhost' (127.0.0.1) for the ZFS utilities to communicate with Samba. This is the default behavior for most Linux distributions. .Pp Samba must be able to authenticate a user. This can be done in a number of ways, depending on if using the system password file, LDAP or the Samba specific smbpasswd file. 
How to do this is outside the scope of this manual. Please refer to the .Xr smb.conf 5 man page for more information. .Pp See the .Sy USERSHARE section of the .Xr smb.conf 5 man page for all configuration options in case you need to modify any options to the share afterwards. Do note that any changes done with the .Xr net 8 command will be undone if the share is ever unshared (such as at a reboot etc). .El .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZFS_MOUNT_HELPER" .It Ev ZFS_MOUNT_HELPER Cause .Nm zfs mount to use .Em /bin/mount to mount zfs datasets. This option is provided for backwards compatibility with older zfs versions. .El .Sh INTERFACE STABILITY .Sy Committed . .Sh SEE ALSO .Xr attr 1 , .Xr gzip 1 , .Xr ssh 1 , .Xr chmod 2 , .Xr fsync 2 , .Xr stat 2 , .Xr write 2 , .Xr acl 5 , .Xr attributes 5 , .Xr exports 5 , .Xr exportfs 8 , .Xr mount 8 , .Xr net 8 , .Xr selinux 8 , +.Xr zfs-allow 8 , +.Xr zfs-bookmark 8 , +.Xr zfs-change-key 8 , +.Xr zfs-clone 8 , +.Xr zfs-create 8 , +.Xr zfs-destroy 8 , +.Xr zfs-diff 8 , +.Xr zfs-get 8 , +.Xr zfs-groupspace 8 , +.Xr zfs-hold 8 , +.Xr zfs-inherit 8 , +.Xr zfs-jail 8 , +.Xr zfs-list 8 , +.Xr zfs-load-key 8 , +.Xr zfs-mount 8 , +.Xr zfs-program 8 , +.Xr zfs-project 8 , +.Xr zfs-projectspace 8 , +.Xr zfs-promote 8 , +.Xr zfs-receive 8 , +.Xr zfs-redact 8 , +.Xr zfs-release 8 , +.Xr zfs-rename 8 , +.Xr zfs-rollback 8 , +.Xr zfs-send 8 , +.Xr zfs-set 8 , +.Xr zfs-share 8 , +.Xr zfs-snapshot 8 , +.Xr zfs-unallow 8 , +.Xr zfs-unjail 8 , +.Xr zfs-unload-key 8 , +.Xr zfs-unmount 8 , +.Xr zfs-unshare 8 , +.Xr zfs-upgrade 8 , +.Xr zfs-userspace 8 , +.Xr zfs-wait 8 , .Xr zfsconcepts 8 , .Xr zfsprops 8 , .Xr zpool 8 Index: vendor-sys/openzfs/dist/man/man8/zinject.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zinject.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zinject.8 (revision 364928) @@ -1,198 +1,198 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" -.TH zinject 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands" +.TH ZINJECT 8 "Aug 24, 2020" OpenZFS .SH NAME zinject \- ZFS Fault Injector .SH DESCRIPTION .BR zinject creates artificial problems in a ZFS pool by simulating data corruption or device failures. This program is dangerous. .SH SYNOPSIS .TP .B "zinject" List injection records. .TP .B "zinject \-b \fIobjset:object:level:blkd\fB [\-f \fIfrequency\fB] [\-amu] \fIpool\fB" Force an error into the pool at a bookmark. .TP .B "zinject \-c <\fIid\fB | all> Cancel injection records. .TP .B "zinject \-d \fIvdev\fB \-A \fIpool\fB Force a vdev into the DEGRADED or FAULTED state. 
.TP .B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB Add an artificial delay to IO requests on a particular device, such that the requests take a minimum of 'latency' milliseconds to complete. Each delay has an associated number of 'lanes' which defines the number of concurrent IO requests that can be processed. For example, with a single lane delay of 10 ms (-D 10:1), the device will only be able to service a single IO request at a time with each request taking 10 ms to complete. So, if only a single request is submitted every 10 ms, the average latency will be 10 ms; but if more than one request is submitted every 10 ms, the average latency will be more than 10 ms. Similarly, if a delay of 10 ms is specified to have two lanes (-D 10:2), then the device will be able to service two requests at a time, each with a minimum latency of 10 ms. So, if two requests are submitted every 10 ms, then the average latency will be 10 ms; but if more than two requests are submitted every 10 ms, the average latency will be more than 10 ms. Also note, these delays are additive. So two invocations of '-D 10:1', is roughly equivalent to a single invocation of '-D 10:2'. This also means, one can specify multiple lanes with differing target latencies. For example, an invocation of '-D 10:1' followed by '-D 25:2' will create 3 lanes on the device; one lane with a latency of 10 ms and two lanes with a 25 ms latency. .TP .B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-f \fIfrequency\fB] [\-F] \fIpool\fB" Force a vdev error. .TP .B "zinject \-I [\-s \fIseconds\fB | \-g \fItxgs\fB] \fIpool\fB" Simulate a hardware failure that fails to honor a cache flush. .TP .B "zinject \-p \fIfunction\fB \fIpool\fB Panic inside the specified function. .TP .B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" Force an error into the contents of a file. .TP .B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" Force an error into the metadnode for a file or directory. .TP .B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" Force an error into the MOS of a pool. .SH OPTIONS .TP .BI "\-a" Flush the ARC before injection. .TP .BI "\-b" " objset:object:level:start:end" Force an error into the pool at this bookmark tuple. Each number is in hexadecimal, and only one block can be specified. .TP .BI "\-C" " dvas" Inject the given error only into specific DVAs. The mask should be specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This option is not applicable to logical data errors such as .BR "decompress" and .BR "decrypt" . .TP .BI "\-d" " vdev" A vdev specified by path or GUID. .TP .BI "\-e" " device_error" Specify .BR "checksum" " for an ECKSUM error," .BR "decompress" " for a data decompression error," .BR "decrypt" " for a data decryption error," .BR "corrupt" " to flip a bit in the data after a read," .BR "dtl" " for an ECHILD error," .BR "io" " for an EIO error where reopening the device will succeed, or" .BR "nxio" " for an ENXIO error where reopening the device will fail." For EIO and ENXIO, the "failed" reads or writes still occur. The probe simply sets the error value reported by the I/O pipeline so it appears the read or write failed. Decryption errors only currently work with file data. 
.TP .BI "\-f" " frequency" Only inject errors a fraction of the time. Expressed as a real number percentage between 0.0001 and 100. .TP .BI "\-F" Fail faster. Do fewer checks. .TP .BI "\-g" " txgs" Run for this many transaction groups before reporting failure. .TP .BI "\-h" Print the usage message. .TP .BI "\-l" " level" Inject an error at a particular block level. The default is 0. .TP .BI "\-L" " label_error" Set the label error region to one of .BR " nvlist" "," .BR " pad1" "," .BR " pad2" ", or" .BR " uber" "." .TP .BI "\-m" Automatically remount the underlying filesystem. .TP .BI "\-q" Quiet mode. Only print the handler number added. .TP .BI "\-r" " range" Inject an error over a particular logical range of an object, which will be translated to the appropriate blkid range according to the object's properties. .TP .BI "\-s" " seconds" Run for this many seconds before reporting failure. .TP .BI "\-T" " failure" Set the failure type to one of .BR " all" "," .BR " claim" "," .BR " free" "," .BR " read" ", or" .BR " write" "." .TP .BI "\-t" " mos_type" Set this to .BR "mos " "for any data in the MOS," .BR "mosdir " "for an object directory," .BR "config " "for the pool configuration," .BR "bpobj " "for the block pointer list," .BR "spacemap " "for the space map," .BR "metaslab " "for the metaslab, or" .BR "errlog " "for the persistent error log." .TP .BI "\-u" Unload the pool after injection. .SH "ENVIRONMENT VARIABLES" .TP .B "ZINJECT_DEBUG" Run \fBzinject\fR in debug mode. .SH "AUTHORS" This man page was written by Darik Horn excerpting the \fBzinject\fR usage message and source code. .SH "SEE ALSO" .BR zpool (8), .BR zfs (8) Index: vendor-sys/openzfs/dist/man/man8/zpool-iostat.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zpool-iostat.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zpool-iostat.8 (revision 364928) @@ -1,247 +1,247 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd August 9, 2019 .Dt ZPOOL-IOSTAT 8 .Os .Sh NAME .Nm zpool Ns Pf - Cm iostat .Nd Display logical I/O statistics for the given ZFS storage pools/vdevs .Sh SYNOPSIS .Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d .Op Fl ghHLnpPvy .Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... 
Oc Oc .Op Ar interval Op Ar count .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d .Op Fl ghHLnpPvy .Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc .Op Ar interval Op Ar count .Xc Displays logical I/O statistics for the given pools/vdevs. Physical I/Os may be observed via .Xr iostat 1 . If writes are located nearby, they may be merged into a single larger operation. Additional I/O may be generated depending on the level of vdev redundancy. To filter output, you may pass in a list of pools, a pool and list of vdevs in that pool, or a list of any vdevs from any pool. If no items are specified, statistics for every pool in the system are shown. When given an .Ar interval , the statistics are printed every .Ar interval -seconds until ^C is pressed. If +seconds until ^C is pressed. If .Fl n -flag is specified the headers are displayed only once, otherwise they are +flag is specified the headers are displayed only once, otherwise they are displayed periodically. If count is specified, the command exits after count reports are printed. The first report printed is always the statistics since boot regardless of whether .Ar interval and .Ar count are passed. However, this behavior can be suppressed with the .Fl y flag. Also note that the units of .Sy K , .Sy M , .Sy G ... that are printed in the report are in base 1024. To get the raw values, use the .Fl p flag. .Bl -tag -width Ds .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... Run a script (or scripts) on each vdev and include the output as a new column in the .Nm zpool Cm iostat output. Users can run any script found in their .Pa ~/.zpool.d directory or from the system .Pa /etc/zfs/zpool.d directory. Script names containing the slash (/) character are not allowed. The default search path can be overridden by setting the ZPOOL_SCRIPTS_PATH environment variable. A privileged user can run .Fl c if they have the ZPOOL_SCRIPTS_AS_ROOT environment variable set. If a script requires the use of a privileged command, like .Xr smartctl 8 , then it's recommended you allow the user access to it in .Pa /etc/sudoers or add the user to the .Pa /etc/sudoers.d/zfs file. .Pp If .Fl c is passed without a script name, it prints a list of all scripts. .Fl c also sets verbose mode .No \&( Ns Fl v Ns No \&). .Pp Script output should be in the form of "name=value". The column name is set to "name" and the value is set to "value". Multiple lines can be used to output multiple columns. The first line of output not in the "name=value" format is displayed without a column title, and no more output after that is displayed. This can be useful for printing error messages. Blank or NULL values are printed as a '-' to make output awk-able. .Pp The following environment variables are set before running each script: .Bl -tag -width "VDEV_PATH" .It Sy VDEV_PATH Full path to the vdev .El .Bl -tag -width "VDEV_UPATH" .It Sy VDEV_UPATH Underlying path to the vdev (/dev/sd*). For use with device mapper, multipath, or partitioned vdevs. .El .Bl -tag -width "VDEV_ENC_SYSFS_PATH" .It Sy VDEV_ENC_SYSFS_PATH The sysfs path to the enclosure for the vdev (if any). .El .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 2 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl g Display vdev GUIDs instead of the normal device names. 
These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl n Print headers only once when passed .It Fl p Display numbers in parsable (exact) values. Time values are in nanoseconds. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl r Print request size histograms for the leaf vdev's IO. This includes histograms of individual IOs (ind) and aggregate IOs (agg). These stats can be useful for observing how well IO aggregation is working. Note that TRIM IOs may exceed 16M, but will be counted as 16M. .It Fl v Verbose statistics Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .It Fl y Omit statistics since boot. Normally the first line of output reports the statistics since boot. This option suppresses that first line of output. .Ar interval .It Fl w Display latency histograms: .Pp .Ar total_wait : Total IO time (queuing + disk IO time). .Ar disk_wait : Disk IO time (time reading/writing the disk). .Ar syncq_wait : Amount of time IO spent in synchronous priority queues. Does not include disk time. .Ar asyncq_wait : Amount of time IO spent in asynchronous priority queues. Does not include disk time. .Ar scrub : Amount of time IO spent in scrub queue. Does not include disk time. .It Fl l Include average latency statistics: .Pp .Ar total_wait : Average total IO time (queuing + disk IO time). .Ar disk_wait : Average disk IO time (time reading/writing the disk). .Ar syncq_wait : Average amount of time IO spent in synchronous priority queues. Does not include disk time. .Ar asyncq_wait : Average amount of time IO spent in asynchronous priority queues. Does not include disk time. .Ar scrub : Average queuing time in scrub queue. Does not include disk time. .Ar trim : Average queuing time in trim queue. Does not include disk time. .It Fl q Include active queue statistics. Each priority queue has both pending ( .Ar pend ) and active ( .Ar activ ) IOs. Pending IOs are waiting to be issued to the disk, and active IOs have been issued to disk and are waiting for completion. These stats are broken out by priority queue: .Pp .Ar syncq_read/write : Current number of entries in synchronous priority queues. .Ar asyncq_read/write : Current number of entries in asynchronous priority queues. .Ar scrubq_read : Current number of entries in scrub queue. .Ar trimq_write : Current number of entries in trim queue. .Pp All queue statistics are instantaneous measurements of the number of entries in the queues. If you specify an interval, the measurements will be sampled from the end of the interval. .El .El .Sh SEE ALSO .Xr zpool-list 8 , .Xr zpool-status 8 , .Xr iostat 1 , .Xr smartctl 8 Index: vendor-sys/openzfs/dist/man/man8/zpool.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zpool.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zpool.8 (revision 364928) @@ -1,568 +1,599 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). 
.\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd August 9, 2019 .Dt ZPOOL 8 .Os .Sh NAME .Nm zpool .Nd configure ZFS storage pools .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Nm .Cm .Op Ar .Sh DESCRIPTION The .Nm command configures ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for ZFS datasets. All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. .Pp For an overview of creating and managing ZFS storage pools see the .Xr zpoolconcepts 8 manual page. .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm .Fl ? .Xc Displays a help message. .It Xo .Nm .Fl V, -version .Xc An alias for the .Nm zpool Cm version subcommand. .It Xo .Nm .Cm version .Xc Displays the software version of the .Nm userland utility and the zfs kernel module. .El .Ss Creation .Bl -tag -width Ds .It Xr zpool-create 8 Creates a new storage pool containing the virtual devices specified on the command line. .It Xr zpool-initialize 8 Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. .El .Ss Destruction .Bl -tag -width Ds .It Xr zpool-destroy 8 Destroys the given pool, freeing up any devices for other use. .It Xr zpool-labelclear 8 Removes ZFS label information from the specified .Ar device . .El .Ss Virtual Devices .Bl -tag -width Ds .It Xo .Xr zpool-attach 8 / .Xr zpool-detach 8 .Xc Increases or decreases redundancy by .Cm attach Ns -ing or .Cm detach Ns -ing a device on an existing vdev (virtual device). .It Xo .Xr zpool-add 8 / .Xr zpool-remove 8 .Xc Adds the specified virtual devices to the given pool, or removes the specified device from the pool. .It Xr zpool-replace 8 Replaces an existing device (which may be faulted) with a new one. .It Xr zpool-split 8 Creates a new pool by splitting all mirrors in an existing pool (which decreases its redundancy). .El .Ss Properties Available pool properties listed in the .Xr zpoolprops 8 manual page. .Bl -tag -width Ds .It Xr zpool-list 8 Lists the given pools along with a health status and space usage. 
.It Xo .Xr zpool-get 8 / .Xr zpool-set 8 .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). .El .Ss Monitoring .Bl -tag -width Ds .It Xr zpool-status 8 Displays the detailed health status for the given pools. .It Xr zpool-iostat 8 Displays logical I/O statistics for the given pools/vdevs. Physical I/Os may be observed via .Xr iostat 1 . .It Xr zpool-events 8 Lists all recent events generated by the ZFS kernel modules. These events are consumed by the .Xr zed 8 and used to automate administrative tasks such as replacing a failed device with a hot spare. For more information about the subclasses and event payloads that can be generated see the .Xr zfs-events 5 man page. .It Xr zpool-history 8 Displays the command history of the specified pool(s) or all pools if no pool is specified. .El .Ss Maintenance .Bl -tag -width Ds .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. .It Xr zpool-checkpoint 8 Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import --rewind-to-checkpoint . .It Xr zpool-trim 8 Initiates an immediate on-demand TRIM operation for all of the free space in a pool. This operation informs the underlying storage devices of all blocks in the pool which are no longer allocated and allows thinly provisioned devices to reclaim the space. .It Xr zpool-sync 8 This command forces all in-core dirty data to be written to the primary pool storage and not the ZIL. It will also update administrative information including quota reporting. Without arguments, .Sy zpool sync will sync all pools on the system. Otherwise, it will sync only the specified pool(s). .It Xr zpool-upgrade 8 Manage the on-disk format version of storage pools. .It Xr zpool-wait 8 Waits until all background activity of the given types has ceased in the given pool. .El .Ss Fault Resolution .Bl -tag -width Ds .It Xo .Xr zpool-offline 8 .Xr zpool-online 8 .Xc Takes the specified physical device offline or brings it online. .It Xr zpool-resilver 8 Starts a resilver. If an existing resilver is already running it will be restarted from the beginning. .It Xr zpool-reopen 8 Reopen all the vdevs associated with the pool. .It Xr zpool-clear 8 Clears device errors in a pool. .El .Ss Import & Export .Bl -tag -width Ds .It Xr zpool-import 8 Make disks containing ZFS storage pools available for use on the system. .It Xr zpool-export 8 Exports the given pools from the system. .It Xr zpool-reguid 8 Generates a new unique identifier for the pool. .El .Sh EXIT STATUS The following exit values are returned: .Bl -tag -width Ds .It Sy 0 Successful completion. .It Sy 1 An error occurred. .It Sy 2 Invalid command line options were specified. .El .Sh EXAMPLES .Bl -tag -width Ds .It Sy Example 1 No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that consists of six disks. .Bd -literal # zpool create tank raidz sda sdb sdc sdd sde sdf .Ed .It Sy Example 2 No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror contains two disks. .Bd -literal # zpool create tank mirror sda sdb mirror sdc sdd .Ed .It Sy Example 3 No Creating a ZFS Storage Pool by Using Partitions The following command creates an unmirrored pool using two disk partitions. .Bd -literal # zpool create tank sda1 sdb2 .Ed .It Sy Example 4 No Creating a ZFS Storage Pool by Using Files The following command creates an unmirrored pool using files. 
While not recommended, a pool based on files can be useful for experimental purposes. .Bd -literal # zpool create tank /path/to/file/a /path/to/file/b .Ed .It Sy Example 5 No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Em tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Bd -literal # zpool add tank mirror sda sdb .Ed .It Sy Example 6 No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Em zion is faulted due to a missing device. The results from this command are similar to the following: .Bd -literal # zpool list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed .It Sy Example 7 No Destroying a ZFS Storage Pool The following command destroys the pool .Em tank and any datasets contained within. .Bd -literal # zpool destroy -f tank .Ed .It Sy Example 8 No Exporting a ZFS Storage Pool The following command exports the devices in pool .Em tank so that they can be relocated or later imported. .Bd -literal # zpool export tank .Ed .It Sy Example 9 No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Em tank for use on the system. The results from this command are similar to the following: .Bd -literal # zpool import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE sda ONLINE sdb ONLINE # zpool import tank .Ed .It Sy Example 10 No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software. .Bd -literal # zpool upgrade -a This system is currently running ZFS version 2. .Ed .It Sy Example 11 No Managing Hot Spares The following command creates a new pool with an available hot spare: .Bd -literal # zpool create tank mirror sda sdb spare sdc .Ed .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command: .Bd -literal # zpool replace tank sda sdd .Ed .Pp Once the data has been resilvered, the spare is automatically removed and is made available for use should another device fail. The hot spare can be permanently removed from the pool using the following command: .Bd -literal # zpool remove tank sdc .Ed .It Sy Example 12 No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Bd -literal # zpool create pool mirror sda sdb mirror sdc sdd log mirror \\ sde sdf .Ed .It Sy Example 13 No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Bd -literal # zpool add pool cache sdc sdd .Ed .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. 
Capacity and reads can be monitored using the .Cm iostat option as follows: .Bd -literal # zpool iostat -v pool 5 .Ed .It Sy Example 14 No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . .Pp Given this configuration: .Bd -literal pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 sde ONLINE 0 0 0 sdf ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Sy mirror-2 is: .Bd -literal # zpool remove tank mirror-2 .Ed .Pp The command to remove the mirrored data .Sy mirror-1 is: .Bd -literal # zpool remove tank mirror-1 .Ed .It Sy Example 15 No Displaying expanded space on a device The following command displays the detailed information for the pool .Em data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10GB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. .Bd -literal # zpool list -v data NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G - 48% sda - - - - - sdb - - - 10G - sdc - - - - - .Ed .It Sy Example 16 No Adding output columns Additional columns can be added to the .Nm zpool Cm status and .Nm zpool Cm iostat output with .Fl c option. .Bd -literal # zpool status -c vendor,model,size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T # zpool iostat -vc size capacity operations bandwidth pool alloc free read write read write size ---------- ----- ----- ----- ----- ----- ----- ---- rpool 14.6G 54.9G 4 55 250K 2.69M sda1 14.6G 54.9G 4 55 250K 2.69M 70G ---------- ----- ----- ----- ----- ----- ----- ---- .Ed .El .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZFS_ABORT" .It Ev ZFS_ABORT Cause .Nm zpool to dump core on exit for the purposes of running .Sy ::findleaks . .El .Bl -tag -width "ZFS_COLOR" .It Ev ZFS_COLOR Use ANSI color in .Nm zpool status output. .El .Bl -tag -width "ZPOOL_IMPORT_PATH" .It Ev ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool. This is a colon-separated list of directories in which .Nm zpool looks for device nodes and files. Similar to the .Fl d option in .Nm zpool import . .El .Bl -tag -width "ZPOOL_IMPORT_UDEV_TIMEOUT_MS" .It Ev ZPOOL_IMPORT_UDEV_TIMEOUT_MS The maximum time in milliseconds that .Nm zpool import will wait for an expected device to be available. .El .Bl -tag -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" .It Ev ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE If set, suppress warning about non-native vdev ashift in .Nm zpool status . The value is not used, only the presence or absence of the variable matters. .El .Bl -tag -width "ZPOOL_VDEV_NAME_GUID" .It Ev ZPOOL_VDEV_NAME_GUID Cause .Nm zpool subcommands to output vdev guids by default. This behavior is identical to the .Nm zpool status -g command line option. 
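A minimal illustration (the value shown is arbitrary and serves only as an
example of exporting the variable for a single invocation):
.Bd -literal
# ZPOOL_VDEV_NAME_GUID=1 zpool status
.Ed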
.El .Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" .It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS Cause .Nm zpool subcommands to follow links for vdev names by default. This behavior is identical to the .Nm zpool status -L command line option. .El .Bl -tag -width "ZPOOL_VDEV_NAME_PATH" .It Ev ZPOOL_VDEV_NAME_PATH Cause .Nm zpool subcommands to output full vdev path names by default. This behavior is identical to the .Nm zpool status -P command line option. .El .Bl -tag -width "ZFS_VDEV_DEVID_OPT_OUT" .It Ev ZFS_VDEV_DEVID_OPT_OUT Older ZFS on Linux implementations had issues when attempting to display pool config VDEV names if a .Sy devid NVP value is present in the pool's config. .Pp For example, a pool that originated on illumos platform would have a devid value in the config and .Nm zpool status would fail when listing the config. This would also be true for future Linux based pools. .Pp A pool can be stripped of any .Sy devid values on import or prevented from adding them on .Nm zpool create or .Nm zpool add by setting .Sy ZFS_VDEV_DEVID_OPT_OUT . .El .Bl -tag -width "ZPOOL_SCRIPTS_AS_ROOT" .It Ev ZPOOL_SCRIPTS_AS_ROOT Allow a privileged user to run the .Nm zpool status/iostat with the .Fl c option. Normally, only unprivileged users are allowed to run .Fl c . .El .Bl -tag -width "ZPOOL_SCRIPTS_PATH" .It Ev ZPOOL_SCRIPTS_PATH The search path for scripts when running .Nm zpool status/iostat with the .Fl c option. This is a colon-separated list of directories and overrides the default .Pa ~/.zpool.d and .Pa /etc/zfs/zpool.d search paths. .El .Bl -tag -width "ZPOOL_SCRIPTS_ENABLED" .It Ev ZPOOL_SCRIPTS_ENABLED Allow a user to run .Nm zpool status/iostat with the .Fl c option. If .Sy ZPOOL_SCRIPTS_ENABLED is not set, it is assumed that the user is allowed to run .Nm zpool status/iostat -c . .El .Sh INTERFACE STABILITY .Sy Evolving .Sh SEE ALSO -.Xr zpoolconcepts 8 , -.Xr zpoolprops 8 , .Xr zfs-events 5 , .Xr zfs-module-parameters 5 , .Xr zpool-features 5 , .Xr zed 8 , -.Xr zfs 8 +.Xr zfs 8 , +.Xr zpool-add 8 , +.Xr zpool-attach 8 , +.Xr zpool-checkpoint 8 , +.Xr zpool-clear 8 , +.Xr zpool-create 8 , +.Xr zpool-destroy 8 , +.Xr zpool-detach 8 , +.Xr zpool-events 8 , +.Xr zpool-export 8 , +.Xr zpool-get 8 , +.Xr zpool-history 8 , +.Xr zpool-import 8 , +.Xr zpool-initialize 8 , +.Xr zpool-iostat 8 , +.Xr zpool-labelclear 8 , +.Xr zpool-list 8 , +.Xr zpool-offline 8 , +.Xr zpool-online 8 , +.Xr zpool-reguid 8 , +.Xr zpool-remove 8 , +.Xr zpool-reopen 8 , +.Xr zpool-replace 8 , +.Xr zpool-resilver 8 , +.Xr zpool-scrub 8 , +.Xr zpool-set 8 , +.Xr zpool-split 8 , +.Xr zpool-status 8 , +.Xr zpool-sync 8 , +.Xr zpool-trim 8 , +.Xr zpool-upgrade 8 , +.Xr zpool-wait 8 , +.Xr zpoolconcepts 8 , +.Xr zpoolprops 8 Index: vendor-sys/openzfs/dist/man/man8/zstreamdump.8 =================================================================== --- vendor-sys/openzfs/dist/man/man8/zstreamdump.8 (revision 364927) +++ vendor-sys/openzfs/dist/man/man8/zstreamdump.8 (revision 364928) @@ -1,58 +1,58 @@ '\" te .\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. 
When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with .\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zstreamdump 8 "29 Aug 2012" "ZFS pool 28, filesystem 5" "System Administration Commands" +.TH ZSTREAMDUMP 8 "Aug 24, 2020" OpenZFS .SH NAME zstreamdump \- filter data in zfs send stream .SH SYNOPSIS .LP .nf \fBzstreamdump\fR [\fB-C\fR] [\fB-v\fR] [\fB-d\fR] .fi .SH DESCRIPTION .sp .LP The \fBzstreamdump\fR utility reads from the output of the \fBzfs send\fR command, then displays headers and some statistics from that output. See \fBzfs\fR(8). .SH OPTIONS .sp .LP The following options are supported: .sp .ne 2 .na \fB-C\fR .ad .sp .6 .RS 4n Suppress the validation of checksums. .RE .sp .ne 2 .na \fB-v\fR .ad .sp .6 .RS 4n Verbose. Dump all headers, not only begin and end headers. .RE .sp .ne 2 .na \fB-d\fR .ad .sp .6 .RS 4n Dump contents of blocks modified. Implies verbose. .RE .SH SEE ALSO .sp .LP \fBzfs\fR(8) Index: vendor-sys/openzfs/dist/module/Makefile.bsd =================================================================== --- vendor-sys/openzfs/dist/module/Makefile.bsd (revision 364927) +++ vendor-sys/openzfs/dist/module/Makefile.bsd (revision 364928) @@ -1,359 +1,359 @@ .if !defined(WITH_CTF) WITH_CTF=1 .endif .include SRCDIR=${.CURDIR} INCDIR=${.CURDIR:H}/include KMOD= openzfs .PATH: ${SRCDIR}/avl \ ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/os/freebsd/spl \ ${SRCDIR}/os/freebsd/zfs \ ${SRCDIR}/unicode \ ${SRCDIR}/zcommon \ ${SRCDIR}/zfs \ ${SRCDIR}/zstd \ ${SRCDIR}/zstd/lib CFLAGS+= -I${.OBJDIR:H}/include CFLAGS+= -I${INCDIR} CFLAGS+= -I${INCDIR}/spl CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs CFLAGS+= -I${SRCDIR}/zstd/include CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DHAVE_KSID -DCOMPAT_FREEBSD11 .if ${MACHINE_ARCH} == "amd64" CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3 .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS .else CFLAGS += -DNDEBUG .endif .if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" # kernel must also be built with this option for this to work CFLAGS+= -DDEBUG_VFS_LOCKS .endif .if defined(WITH_GCOV) && ${WITH_GCOV} == "true" CFLAGS+= -fprofile-arcs -ftest-coverage .endif DEBUG_FLAGS=-g .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "arm" CFLAGS+= -DBITS_PER_LONG=32 .else CFLAGS+= -DBITS_PER_LONG=64 .endif SRCS= vnode_if.h device_if.h bus_if.h # avl SRCS+= avl.c #lua SRCS+= lapi.c \ lauxlib.c \ lbaselib.c \ lcode.c \ lcompat.c \ lcorolib.c \ lctype.c \ ldebug.c \ ldo.c \ lfunc.c \ lgc.c \ llex.c \ lmem.c \ lobject.c \ lopcodes.c \ lparser.c \ lstate.c \ lstring.c \ lstrlib.c \ ltable.c \ ltablib.c \ ltm.c \ lvm.c \ lzio.c #nvpair SRCS+= nvpair.c \ fnvpair.c \ nvpair_alloc_spl.c \ nvpair_alloc_fixed.c #os/freebsd/spl SRCS+= acl_common.c \ btree.c \ callb.c \ list.c \ spl_acl.c \ spl_cmn_err.c \ spl_dtrace.c \ spl_kmem.c \ spl_kstat.c \ spl_misc.c \ spl_policy.c \ spl_string.c \ spl_sunddi.c \ spl_sysevent.c \ 
spl_taskq.c \ spl_uio.c \ spl_vfs.c \ spl_vm.c \ spl_zone.c \ sha256c.c \ sha512c.c \ spl_procfs_list.c \ spl_zlib.c .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "arm" SRCS+= spl_atomic.c .endif #os/freebsd/zfs SRCS+= abd_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ kmod_core.c \ spa_os.c \ sysctl_os.c \ vdev_file.c \ vdev_label_os.c \ vdev_geom.c \ zfs_acl.c \ zfs_ctldir.c \ zfs_dir.c \ zfs_ioctl_compat.c \ zfs_ioctl_os.c \ zfs_log.c \ zfs_replay.c \ zfs_vfsops.c \ zfs_vnops.c \ zfs_znode.c \ zio_crypt.c \ zvol_os.c #unicode SRCS+= uconv.c \ u8_textprep.c #zcommon SRCS+= zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_avx512.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c #zfs SRCS+= abd.c \ aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ bptree.c \ bqueue.c \ dataset_kstats.c \ ddt.c \ ddt_zap.c \ dmu.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ dmu_recv.c \ dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ dsl_bookmark.c \ dsl_dir.c \ dsl_crypt.c \ dsl_destroy.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ dsl_synctask.c \ dsl_userhold.c \ fm.c \ gzip.c \ lzjb.c \ lz4.c \ metaslab.c \ mmp.c \ multilist.c \ objlist.c \ pathname.c \ range_tree.c \ refcount.c \ rrwlock.c \ sa.c \ sha256.c \ skein_zfs.c \ spa.c \ spa_boot.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ spa_history.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ space_map.c \ space_reftree.c \ txg.c \ uberblock.c \ unique.c \ vdev.c \ vdev_cache.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ vdev_rebuild.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ zap.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ zcp_get.c \ zcp_global.c \ zcp_iter.c \ zcp_set.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ zfs_debug.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ zfs_ioctl.c \ zfs_onexit.c \ zfs_quota.c \ zfs_ratelimit.c \ zfs_rlock.c \ zfs_sa.c \ zil.c \ zio.c \ zio_checksum.c \ zio_compress.c \ zio_inject.c \ zle.c \ zrlock.c \ zthr.c \ zvol.c #zstd SRCS+= zfs_zstd.c \ zstd.c beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ -f /etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/usr/lib .endif .include CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.lapi.c= -Wno-cast-qual CFLAGS.lcompat.c= -Wno-cast-qual CFLAGS.lobject.c= -Wno-cast-qual CFLAGS.ltable.c= -Wno-cast-qual CFLAGS.lvm.c= -Wno-cast-qual CFLAGS.nvpair.c= -DHAVE_RPC_TYPES -Wno-cast-qual CFLAGS.spl_string.c= -Wno-cast-qual CFLAGS.spl_vm.c= -Wno-cast-qual CFLAGS.spl_zlib.c= -Wno-cast-qual CFLAGS.abd.c= -Wno-cast-qual CFLAGS.zfs_log.c= -Wno-cast-qual CFLAGS.zfs_vnops.c= -Wno-pointer-arith CFLAGS.u8_textprep.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zprop_common.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual 
CFLAGS.dmu_traverse.c= -Wno-cast-qual CFLAGS.dsl_dir.c= -Wno-cast-qual CFLAGS.dsl_deadlist.c= -Wno-cast-qual CFLAGS.dsl_prop.c= -Wno-cast-qual CFLAGS.fm.c= -Wno-cast-qual CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.zap_leaf.c= -Wno-cast-qual CFLAGS.zap_micro.c= -Wno-cast-qual CFLAGS.zcp.c= -Wno-cast-qual CFLAGS.zfs_fm.c= -Wno-cast-qual CFLAGS.zfs_ioctl.c= -Wno-cast-qual CFLAGS.zil.c= -Wno-cast-qual CFLAGS.zio.c= -Wno-cast-qual CFLAGS.zrlock.c= -Wno-cast-qual CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith -CFLAGS.zstd.c= -fno-tree-vectorize +CFLAGS.zstd.c= -fno-tree-vectorize -U__BMI__ Index: vendor-sys/openzfs/dist/module/lua/setjmp/setjmp_ppc.S =================================================================== --- vendor-sys/openzfs/dist/module/lua/setjmp/setjmp_ppc.S (revision 364927) +++ vendor-sys/openzfs/dist/module/lua/setjmp/setjmp_ppc.S (revision 364928) @@ -1,165 +1,165 @@ /* $FreeBSD$ */ /* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */ /* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */ /* kernel version of this file, does not have signal goop */ /* int setjmp(jmp_buf env) */ #define _ASM #include #ifdef __powerpc64__ #if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1) #if defined(_CALL_ELF) && _CALL_ELF == 2 #define PPC64_ELF_ABI_v2 #endif /* _CALL_ELF */ #endif /* PPC64_ELF_ABI_ */ #endif /* __powerpc64__ */ #ifdef __powerpc64__ #define LD_REG ld #define ST_REG std #define REGWIDTH 8 #else #define LD_REG lwz #define ST_REG stw #define REGWIDTH 4 #endif /* __powerpc64__ */ #define JMP_r1 1*REGWIDTH #define JMP_r2 2*REGWIDTH #define JMP_r14 3*REGWIDTH #define JMP_r15 4*REGWIDTH #define JMP_r16 5*REGWIDTH #define JMP_r17 6*REGWIDTH #define JMP_r18 7*REGWIDTH #define JMP_r19 8*REGWIDTH #define JMP_r20 9*REGWIDTH #define JMP_r21 10*REGWIDTH #define JMP_r22 11*REGWIDTH #define JMP_r23 12*REGWIDTH #define JMP_r24 13*REGWIDTH #define JMP_r25 14*REGWIDTH #define JMP_r26 15*REGWIDTH #define JMP_r27 16*REGWIDTH #define JMP_r28 17*REGWIDTH #define JMP_r29 18*REGWIDTH #define JMP_r30 19*REGWIDTH #define JMP_r31 20*REGWIDTH #define JMP_lr 21*REGWIDTH #define JMP_cr 22*REGWIDTH #define JMP_ctr 23*REGWIDTH #define JMP_xer 24*REGWIDTH #ifdef __powerpc64__ #ifdef PPC64_ELF_ABI_v2 #define ENTRY(name) \ .align 2 ; \ .type name,@function; \ - .globl name; \ + .weak name; \ name: #else /* PPC64_ELF_ABI_v1 */ #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) #define ENTRY(name) \ .align 2 ; \ - .globl name; \ - .globl GLUE(.,name); \ + .weak name; \ + .weak GLUE(.,name); \ .pushsection ".opd","aw"; \ name: \ .quad GLUE(.,name); \ .quad .TOC.@tocbase; \ .quad 0; \ .popsection; \ .type GLUE(.,name),@function; \ GLUE(.,name): #endif /* PPC64_ELF_ABI_v2 */ #else /* 32-bit */ #define ENTRY(name) \ .text; \ .p2align 4; \ - .globl name; \ - .type name,@function; \ + .weak name; \ + .type name,@function; \ name: #endif /* __powerpc64__ */ ENTRY(setjmp) ST_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ ST_REG 1, JMP_r1 (3) ST_REG 2, JMP_r2 (3) ST_REG 14, JMP_r14(3) ST_REG 15, JMP_r15(3) ST_REG 16, JMP_r16(3) 
ST_REG 17, JMP_r17(3) ST_REG 18, JMP_r18(3) ST_REG 19, JMP_r19(3) ST_REG 20, JMP_r20(3) ST_REG 21, JMP_r21(3) ST_REG 22, JMP_r22(3) ST_REG 23, JMP_r23(3) ST_REG 24, JMP_r24(3) ST_REG 25, JMP_r25(3) ST_REG 26, JMP_r26(3) ST_REG 27, JMP_r27(3) ST_REG 28, JMP_r28(3) ST_REG 29, JMP_r29(3) ST_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ mfcr 0 ST_REG 0, JMP_cr(3) mflr 0 ST_REG 0, JMP_lr(3) mfctr 0 ST_REG 0, JMP_ctr(3) mfxer 0 ST_REG 0, JMP_xer(3) /* f14-f31, fpscr */ li 3, 0 blr ENTRY(longjmp) LD_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ LD_REG 1, JMP_r1 (3) LD_REG 2, JMP_r2 (3) LD_REG 14, JMP_r14(3) LD_REG 15, JMP_r15(3) LD_REG 16, JMP_r16(3) LD_REG 17, JMP_r17(3) LD_REG 18, JMP_r18(3) LD_REG 19, JMP_r19(3) LD_REG 20, JMP_r20(3) LD_REG 21, JMP_r21(3) LD_REG 22, JMP_r22(3) LD_REG 23, JMP_r23(3) LD_REG 24, JMP_r24(3) LD_REG 25, JMP_r25(3) LD_REG 26, JMP_r26(3) LD_REG 27, JMP_r27(3) LD_REG 28, JMP_r28(3) LD_REG 29, JMP_r29(3) LD_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ LD_REG 0, JMP_cr(3) mtcr 0 LD_REG 0, JMP_lr(3) mtlr 0 LD_REG 0, JMP_ctr(3) mtctr 0 LD_REG 0, JMP_xer(3) mtxer 0 /* f14-f31, fpscr */ mr 3, 4 blr #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif Index: vendor-sys/openzfs/dist/module/os/freebsd/spl/list.c =================================================================== --- vendor-sys/openzfs/dist/module/os/freebsd/spl/list.c (revision 364927) +++ vendor-sys/openzfs/dist/module/os/freebsd/spl/list.c (revision 364928) @@ -1,246 +1,244 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Generic doubly-linked list implementation */ #include #include #include #include #include #define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) #define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) #define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) #define list_insert_after_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ lnew->list_prev = (node); \ lnew->list_next = (node)->list_next; \ (node)->list_next->list_prev = lnew; \ (node)->list_next = lnew; \ } #define list_insert_before_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ lnew->list_next = (node); \ lnew->list_prev = (node)->list_prev; \ (node)->list_prev->list_next = lnew; \ (node)->list_prev = lnew; \ } #define list_remove_node(node) \ (node)->list_prev->list_next = (node)->list_next; \ (node)->list_next->list_prev = (node)->list_prev; \ (node)->list_next = (node)->list_prev = NULL void list_create(list_t *list, size_t size, size_t offset) { ASSERT(list); ASSERT(size > 0); ASSERT(size >= offset + sizeof (list_node_t)); list->list_size = size; list->list_offset = offset; list->list_head.list_next = list->list_head.list_prev = &list->list_head; } void list_destroy(list_t *list) { list_node_t *node = &list->list_head; ASSERT(list); ASSERT(list->list_head.list_next == node); ASSERT(list->list_head.list_prev == node); node->list_next = node->list_prev = NULL; } void list_insert_after(list_t *list, void *object, void *nobject) { if (object == NULL) { list_insert_head(list, nobject); } else { list_node_t *lold = list_d2l(list, object); list_insert_after_node(list, lold, nobject); } } void list_insert_before(list_t *list, void *object, void *nobject) { if (object == NULL) { list_insert_tail(list, nobject); } else { list_node_t *lold = list_d2l(list, object); list_insert_before_node(list, lold, nobject); } } void list_insert_head(list_t *list, void *object) { list_node_t *lold = &list->list_head; list_insert_after_node(list, lold, object); } void list_insert_tail(list_t *list, void *object) { list_node_t *lold = &list->list_head; list_insert_before_node(list, lold, object); } void list_remove(list_t *list, void *object) { list_node_t *lold = list_d2l(list, object); ASSERT(!list_empty(list)); ASSERT(lold->list_next != NULL); list_remove_node(lold); } void * list_remove_head(list_t *list) { list_node_t *head = list->list_head.list_next; if (head == &list->list_head) return (NULL); list_remove_node(head); return (list_object(list, head)); } void * list_remove_tail(list_t *list) { list_node_t *tail = list->list_head.list_prev; if (tail == &list->list_head) return (NULL); list_remove_node(tail); return (list_object(list, tail)); } void * list_head(list_t *list) { if (list_empty(list)) return (NULL); return (list_object(list, list->list_head.list_next)); } void * list_tail(list_t *list) { if (list_empty(list)) return (NULL); return (list_object(list, list->list_head.list_prev)); } void * list_next(list_t *list, void *object) { list_node_t *node = list_d2l(list, object); if (node->list_next != &list->list_head) return (list_object(list, node->list_next)); return (NULL); } void * list_prev(list_t *list, void *object) { list_node_t *node = list_d2l(list, object); if (node->list_prev != &list->list_head) return (list_object(list, node->list_prev)); return (NULL); } /* * Insert src list after dst list. Empty src list thereafter. 
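The list_d2l()/list_object() macros above convert between a consumer's object and the list_node_t embedded inside it using nothing but a byte offset recorded at list_create() time. The following is a minimal user-space sketch of that offset technique, assuming illustrative type names (demo_node_t, vdev_entry_t) that are not part of the SPL API:

#include <stddef.h>
#include <stdio.h>

/* Illustrative embedded link; not the SPL's list_node_t. */
typedef struct demo_node {
	struct demo_node *next;
	struct demo_node *prev;
} demo_node_t;

/* A consumer object with the link embedded at a known offset. */
typedef struct vdev_entry {
	int		ve_id;
	char		ve_name[16];
	demo_node_t	ve_link;
} vdev_entry_t;

/* Same arithmetic as list_d2l(): object pointer + offset = node. */
static demo_node_t *
obj_to_node(void *obj, size_t offset)
{
	return ((demo_node_t *)((char *)obj + offset));
}

/* Same arithmetic as list_object(): node pointer - offset = object. */
static void *
node_to_obj(demo_node_t *node, size_t offset)
{
	return ((void *)((char *)node - offset));
}

int
main(void)
{
	size_t off = offsetof(vdev_entry_t, ve_link);
	vdev_entry_t ve = { .ve_id = 7, .ve_name = "sda" };
	vdev_entry_t *back = node_to_obj(obj_to_node(&ve, off), off);

	printf("round trip: id=%d name=%s\n", back->ve_id, back->ve_name);
	return (0);
}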
*/ void list_move_tail(list_t *dst, list_t *src) { list_node_t *dstnode = &dst->list_head; list_node_t *srcnode = &src->list_head; ASSERT(dst->list_size == src->list_size); ASSERT(dst->list_offset == src->list_offset); if (list_empty(src)) return; dstnode->list_prev->list_next = srcnode->list_next; srcnode->list_next->list_prev = dstnode->list_prev; dstnode->list_prev = srcnode->list_prev; srcnode->list_prev->list_next = dstnode; /* empty src list */ srcnode->list_next = srcnode->list_prev = srcnode; } void list_link_replace(list_node_t *lold, list_node_t *lnew) { ASSERT(list_link_active(lold)); ASSERT(!list_link_active(lnew)); lnew->list_next = lold->list_next; lnew->list_prev = lold->list_prev; lold->list_prev->list_next = lnew; lold->list_next->list_prev = lnew; lold->list_next = lold->list_prev = NULL; } void list_link_init(list_node_t *link) { link->list_next = NULL; link->list_prev = NULL; } int list_link_active(list_node_t *link) { EQUIV(link->list_next == NULL, link->list_prev == NULL); return (link->list_next != NULL); } int list_is_empty(list_t *list) { return (list_empty(list)); } Index: vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_ctldir.c =================================================================== --- vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_ctldir.c (revision 364927) +++ vendor-sys/openzfs/dist/module/os/linux/zfs/zfs_ctldir.c (revision 364928) @@ -1,1241 +1,1260 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2018 George Melikov. All Rights Reserved. * Copyright (c) 2019 Datto, Inc. All rights reserved. + * Copyright (c) 2020 The MathWorks, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' and 'shares' directory, but this may * expand in the future. The elements are built dynamically, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. 
Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding inode. * * All mounts are handled automatically by an user mode helper which invokes * the mount procedure. Unmounts are handled by allowing the mount * point to expire so the kernel may automatically unmount it. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same * zfsvfs_t as the head filesystem (what '.zfs' lives under). * * File systems mounted on top of the '.zfs/snapshot/' paths * (ie: snapshots) are complete ZFS filesystems and have their own unique * zfsvfs_t. However, the fsid reported by these mounts will be the same * as that used by the parent zfsvfs_t to make NFS happy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" /* * Two AVL trees are maintained which contain all currently automounted * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t * entry which MUST: * * - be attached to both trees, and * - be unique, no duplicate entries are allowed. * * The zfs_snapshots_by_name tree is indexed by the full dataset name * while the zfs_snapshots_by_objsetid tree is indexed by the unique * objsetid. This allows for fast lookups either by name or objsetid. */ static avl_tree_t zfs_snapshots_by_name; static avl_tree_t zfs_snapshots_by_objsetid; static krwlock_t zfs_snapshot_lock; /* * Control Directory Tunables (.zfs) */ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; int zfs_admin_snapshot = 0; typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ zfs_refcount_t se_refcount; /* reference count */ } zfs_snapentry_t; static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); /* * Allocate a new zfs_snapentry_t being careful to make a copy of the * the snapshot name and provided mount point. No reference is taken. */ static zfs_snapentry_t * zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa, uint64_t objsetid, struct dentry *root_dentry) { zfs_snapentry_t *se; se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); se->se_name = kmem_strdup(full_name); se->se_path = kmem_strdup(full_path); se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; zfs_refcount_create(&se->se_refcount); return (se); } /* * Free a zfs_snapentry_t the caller must ensure there are no active * references. */ static void zfsctl_snapshot_free(zfs_snapentry_t *se) { zfs_refcount_destroy(&se->se_refcount); kmem_strfree(se->se_name); kmem_strfree(se->se_path); kmem_free(se, sizeof (zfs_snapentry_t)); } /* * Hold a reference on the zfs_snapentry_t. */ static void zfsctl_snapshot_hold(zfs_snapentry_t *se) { zfs_refcount_add(&se->se_refcount, NULL); } /* * Release a reference on the zfs_snapentry_t. When the number of * references drops to zero the structure will be freed. 
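The hold/rele contract described above (the entry is freed only when the count returns to zero) can be illustrated with a small user-space sketch using C11 atomics; snapentry_demo_t and its helpers are stand-ins, not the kernel's zfs_refcount_t interface:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for zfs_snapentry_t. */
typedef struct snapentry_demo {
	char		*sd_name;
	atomic_int	sd_refcount;
} snapentry_demo_t;

static snapentry_demo_t *
entry_alloc(const char *name)
{
	snapentry_demo_t *se = calloc(1, sizeof (*se));

	se->sd_name = strdup(name);
	atomic_init(&se->sd_refcount, 1);	/* caller's reference */
	return (se);
}

static void
entry_hold(snapentry_demo_t *se)
{
	atomic_fetch_add(&se->sd_refcount, 1);
}

static void
entry_rele(snapentry_demo_t *se)
{
	/* Whichever caller drops the last reference frees the entry. */
	if (atomic_fetch_sub(&se->sd_refcount, 1) == 1) {
		free(se->sd_name);
		free(se);
	}
}

int
main(void)
{
	snapentry_demo_t *se = entry_alloc("tank@today");

	entry_hold(se);	/* e.g. while linked into a lookup structure */
	entry_rele(se);	/* lookup structure's reference dropped */
	entry_rele(se);	/* caller's reference dropped; entry is freed */
	return (0);
}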
*/ static void zfsctl_snapshot_rele(zfs_snapentry_t *se) { if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) zfsctl_snapshot_free(se); } /* * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part * of the trees a reference is held. */ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); zfsctl_snapshot_hold(se); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); } /* * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped, * this can result in the structure being freed if that was the last * remaining reference. */ static void zfsctl_snapshot_remove(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); avl_remove(&zfs_snapshots_by_name, se); avl_remove(&zfs_snapshots_by_objsetid, se); zfsctl_snapshot_rele(se); } /* * Snapshot name comparison function for the zfs_snapshots_by_name. */ static int snapentry_compare_by_name(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; int ret; ret = strcmp(se_a->se_name, se_b->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Snapshot name comparison function for the zfs_snapshots_by_objsetid. */ static int snapentry_compare_by_objsetid(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; if (se_a->se_spa != se_b->se_spa) return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); if (se_a->se_objsetid < se_b->se_objsetid) return (-1); else if (se_a->se_objsetid > se_b->se_objsetid) return (1); else return (0); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname * is found a pointer to the zfs_snapentry_t is returned and a reference * taken on the structure. The caller is responsible for dropping the * reference with zfsctl_snapshot_rele(). If the snapname is not found * NULL will be returned. */ static zfs_snapentry_t * zfsctl_snapshot_find_by_name(char *snapname) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_name = snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id * rather than the snapname. In all other respects it behaves the same * as zfsctl_snapshot_find_by_name(). */ static zfs_snapentry_t * zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_spa = spa; search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is * removed, renamed, and added back to the new correct location in the tree. */ static int zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) { zfs_snapentry_t *se; ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); se = zfsctl_snapshot_find_by_name(old_snapname); if (se == NULL) return (SET_ERROR(ENOENT)); zfsctl_snapshot_remove(se); kmem_strfree(se->se_name); se->se_name = kmem_strdup(new_snapname); zfsctl_snapshot_add(se); zfsctl_snapshot_rele(se); return (0); } /* * Delayed task responsible for unmounting an expired automounted snapshot. 
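Both comparison functions above normalize their result to exactly -1, 0 or 1, as the AVL code expects, instead of returning raw strcmp() output or a potentially overflowing subtraction. A stand-alone sketch of the same pattern, with an illustrative demo_entry_t in place of zfs_snapentry_t:

#include <stdint.h>
#include <string.h>

/* Illustrative entry; mirrors only the fields the comparators need. */
typedef struct demo_entry {
	const char	*de_name;
	uintptr_t	de_pool;	/* stand-in for the spa_t pointer */
	uint64_t	de_objsetid;
} demo_entry_t;

static int
demo_compare_by_name(const void *a, const void *b)
{
	int ret = strcmp(((const demo_entry_t *)a)->de_name,
	    ((const demo_entry_t *)b)->de_name);

	/* Collapse strcmp()'s arbitrary magnitudes to -1/0/1. */
	return (ret < 0 ? -1 : (ret > 0 ? 1 : 0));
}

static int
demo_compare_by_objsetid(const void *a, const void *b)
{
	const demo_entry_t *ea = a, *eb = b;

	/* Order by pool first, then by objset id, without subtraction. */
	if (ea->de_pool != eb->de_pool)
		return (ea->de_pool < eb->de_pool ? -1 : 1);
	if (ea->de_objsetid != eb->de_objsetid)
		return (ea->de_objsetid < eb->de_objsetid ? -1 : 1);
	return (0);
}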
*/ static void snapentry_expire(void *data) { zfs_snapentry_t *se = (zfs_snapentry_t *)data; spa_t *spa = se->se_spa; uint64_t objsetid = se->se_objsetid; if (zfs_expire_snapshot <= 0) { zfsctl_snapshot_rele(se); return; } se->se_taskqid = TASKQID_INVALID; (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); /* * Reschedule the unmount if the zfs_snapentry_t wasn't removed. * This can occur when the snapshot is busy. */ rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_rele(se); } rw_exit(&zfs_snapshot_lock); } /* * Cancel an automatic unmount of a snapname. This callback is responsible * for dropping the reference on the zfs_snapentry_t which was taken when * during dispatch. */ static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { se->se_taskqid = TASKQID_INVALID; zfsctl_snapshot_rele(se); } } /* * Dispatch the unmount task for delayed handling with a hold protecting it. */ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); if (delay <= 0) return; zfsctl_snapshot_hold(se); se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); } /* * Schedule an automatic unmount of objset id to occur in delay seconds from * now. Any previous delayed unmount will be cancelled in favor of the * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name() * and held until the outstanding task is handled or cancelled. */ int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) { zfs_snapentry_t *se; int error = ENOENT; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, delay); zfsctl_snapshot_rele(se); error = 0; } rw_exit(&zfs_snapshot_lock); return (error); } /* * Check if snapname is currently mounted. Returned non-zero when mounted * and zero when unmounted. */ static boolean_t zfsctl_snapshot_ismounted(char *snapname) { zfs_snapentry_t *se; boolean_t ismounted = B_FALSE; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { zfsctl_snapshot_rele(se); ismounted = B_TRUE; } rw_exit(&zfs_snapshot_lock); return (ismounted); } /* * Check if the given inode is a part of the virtual .zfs directory. */ boolean_t zfsctl_is_node(struct inode *ip) { return (ITOZ(ip)->z_is_ctldir); } /* * Check if the given inode is a .zfs/snapshots/snapname directory. */ boolean_t zfsctl_is_snapdir(struct inode *ip) { return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); } /* * Allocate a new inode with the passed id and ops. 
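The snapdir predicate above works because each snapshot's control-directory inode number is derived by subtracting the objset id from a fixed base, so the id can be recovered from the inode number and the predicate only has to check which side of the base an inode falls on. A tiny sketch of that mapping, where DEMO_INO_SNAPDIRS is an illustrative stand-in value, not the real ZFSCTL_INO_SNAPDIRS constant:

#include <assert.h>
#include <stdint.h>

#define	DEMO_INO_SNAPDIRS	(UINT64_MAX - 100)

/* Each snapshot objset id maps to a unique inode at or below the base. */
static uint64_t
snapdir_ino(uint64_t objsetid)
{
	return (DEMO_INO_SNAPDIRS - objsetid);
}

static int
ino_is_snapdir(uint64_t ino)
{
	return (ino <= DEMO_INO_SNAPDIRS);
}

int
main(void)
{
	uint64_t ino = snapdir_ino(42);

	assert(ino_is_snapdir(ino));
	assert(DEMO_INO_SNAPDIRS - ino == 42);	/* recover the objset id */
	return (0);
}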
*/ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { inode_timespec_t now; struct inode *ip; znode_t *zp; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_id = id; zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_moved = B_FALSE; zp->z_is_sa = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; zp->z_is_stale = B_FALSE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; zp->z_mapcnt = 0; zp->z_size = 0; zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; ip->i_ctime = now; ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) ip->i_opflags &= ~IOP_XATTR; #endif if (insert_inode_locked(ip)) { unlock_new_inode(ip); iput(ip); return (NULL); } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); unlock_new_inode(ip); return (ip); } /* * Lookup the inode with given id, it will be allocated if needed. */ static struct inode * zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; /* May fail due to concurrent zfsctl_inode_alloc() */ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); } return (ip); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. All other entities * under the '.zfs' directory are created dynamically as needed. * * Because the dynamically created '.zfs' directory entries assume the use * of 64-bit inode numbers this support must be disabled on 32-bit systems. */ int zfsctl_create(zfsvfs_t *zfsvfs) { ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); return (0); } /* * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. * Only called when the filesystem is unmounted. */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { if (zfsvfs->z_issnap) { zfs_snapentry_t *se; spa_t *spa = zfsvfs->z_os->os_spa; uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); if (se != NULL) zfsctl_snapshot_remove(se); rw_exit(&zfs_snapshot_lock); if (se != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_rele(se); } } else if (zfsvfs->z_ctldir) { iput(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); igrab(ZTOZSB(zp)->z_ctldir); return (ZTOZSB(zp)->z_ctldir); } /* * Generate a long fid to indicate a snapdir. 
We encode whether snapdir is * already mounted in gen field. We do this because nfsd lookup will not * trigger automount. Next time the nfsd does fh_to_dentry, we will notice * this and do automount and return ESTALE to force nfsd revalidate and follow * mount. */ static int zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) { zfid_short_t *zfid = (zfid_short_t *)fidp; zfid_long_t *zlfid = (zfid_long_t *)fidp; uint32_t gen = 0; uint64_t object; uint64_t objsetid; int i; struct dentry *dentry; if (fidp->fid_len < LONG_FID_LEN) { fidp->fid_len = LONG_FID_LEN; return (SET_ERROR(ENOSPC)); } object = ip->i_ino; objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; zfid->zf_len = LONG_FID_LEN; dentry = d_obtain_alias(igrab(ip)); if (!IS_ERR(dentry)) { gen = !!d_mountpoint(dentry); dput(dentry); } for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; return (0); } /* * Generate an appropriate fid for an entry in the .zfs directory. */ int zfsctl_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t object = zp->z_id; zfid_short_t *zfid; int i; ZFS_ENTER(zfsvfs); if (zfsctl_is_snapdir(ip)) { ZFS_EXIT(zfsvfs); return (zfsctl_snapdir_fid(ip, fidp)); } if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; ZFS_EXIT(zfsvfs); return (0); } /* * Construct a full dataset name in full_name: "pool/dataset@snap_name" */ static int zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, char *full_name) { objset_t *os = zfsvfs->z_os; if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, full_name); if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(full_name, "@"); (void) strcat(full_name, snap_name); return (0); } /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ static int zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, int path_len, char *full_path) { objset_t *os = zfsvfs->z_os; fstrans_cookie_t cookie; char *snapname; boolean_t case_conflict; uint64_t id, pos = 0; int error = 0; if (zfsvfs->z_vfs->vfs_mntpoint == NULL) return (SET_ERROR(ENOENT)); cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto out; if (id == objsetid) break; } snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint, snapname); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); return (error); } /* * Special case the handling of "..". 
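The fid routines above pack 64-bit identifiers into byte arrays one byte at a time, least-significant byte first, so the on-wire encoding does not depend on host endianness. A small round-trip sketch of that packing (the 6-byte width below is illustrative):

#include <assert.h>
#include <stdint.h>

/* Pack 'value' little-endian into 'len' bytes, as the zf_* loops do. */
static void
fid_pack(uint8_t *dst, int len, uint64_t value)
{
	for (int i = 0; i < len; i++)
		dst[i] = (uint8_t)(value >> (8 * i));
}

static uint64_t
fid_unpack(const uint8_t *src, int len)
{
	uint64_t value = 0;

	for (int i = 0; i < len; i++)
		value |= (uint64_t)src[i] << (8 * i);
	return (value);
}

int
main(void)
{
	uint8_t buf[6] = { 0 };
	uint64_t objsetid = 0x1234567890ULL;

	fid_pack(buf, sizeof (buf), objsetid);
	assert(fid_unpack(buf, sizeof (buf)) == objsetid);
	return (0);
}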
*/ int zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; ZFS_ENTER(zfsvfs); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, &zpl_fops_snapdir, &zpl_ops_snapdir); } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, &zpl_fops_shares, &zpl_ops_shares); } else { *ipp = NULL; } if (*ipp == NULL) error = SET_ERROR(ENOENT); ZFS_EXIT(zfsvfs); return (error); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem inode as necessary. */ int zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); uint64_t id; int error; ZFS_ENTER(zfsvfs); error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); if (error) { ZFS_EXIT(zfsvfs); return (error); } *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, &simple_dir_operations, &simple_dir_inode_operations); if (*ipp == NULL) error = SET_ERROR(ENOENT); ZFS_EXIT(zfsvfs); return (error); } /* * Renaming a directory under '.zfs/snapshot' will automatically trigger * a rename of the snapshot to the new given name. The rename is confined * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. */ int zfsctl_snapdir_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(sdip); char *to, *from, *real, *fsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); ZFS_ENTER(zfsvfs); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { snm = real; } else if (error != ENOTSUP) { goto out; } } dmu_objset_name(zfsvfs->z_os, fsname); error = zfsctl_snapshot_name(ITOZSB(sdip), snm, ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) goto out; /* * Cannot move snapshots out of the snapdir. */ if (sdip != tdip) { error = SET_ERROR(EINVAL); goto out; } /* * No-op when names are identical. */ if (strcmp(snm, tnm) == 0) { error = 0; goto out; } rw_enter(&zfs_snapshot_lock, RW_WRITER); error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (error == 0) (void) zfsctl_snapshot_rename(snm, tnm); rw_exit(&zfs_snapshot_lock); out: kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zfsvfs); return (error); } /* * Removing a directory under '.zfs/snapshot' will automatically trigger * the removal of the snapshot with the given name. 
*/ int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *snapname, *real; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); ZFS_ENTER(zfsvfs); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, name, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { name = real; } else if (error != ENOTSUP) { goto out; } } error = zfsctl_snapshot_name(ITOZSB(dip), name, ZFS_MAX_DATASET_NAME_LEN, snapname); if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); if (error != 0) goto out; error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); if ((error == 0) || (error == ENOENT)) error = dsl_destroy_snapshot(snapname, B_FALSE); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zfsvfs); return (error); } /* * Creating a directory under '.zfs/snapshot' will automatically trigger * the creation of a new snapshot with the given name. */ int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *dsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { error = SET_ERROR(EILSEQ); goto out; } dmu_objset_name(zfsvfs->z_os, dsname); error = zfs_secpolicy_snapshot_perms(dsname, cr); if (error != 0) goto out; if (error == 0) { error = dmu_objset_snapshot_one(dsname, dirname); if (error != 0) goto out; error = zfsctl_snapdir_lookup(dip, dirname, ipp, 0, cr, NULL, NULL); } out: kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); return (error); } /* + * Flush everything out of the kernel's export table and such. + * This is needed as once the snapshot is used over NFS, its + * entries in svc_export and svc_expkey caches hold reference + * to the snapshot mount point. There is no known way of flushing + * only the entries related to the snapshot. + */ +static void +exportfs_flush(void) +{ + char *argv[] = { "/usr/sbin/exportfs", "-f", NULL }; + char *envp[] = { NULL }; + + (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); +} + +/* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed, is just a * best effort. In the case where it does fail, perhaps because * it's in use, the unmount will fail harmlessly. */ int zfsctl_snapshot_unmount(char *snapname, int flags) { char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, NULL }; char *envp[] = { NULL }; zfs_snapentry_t *se; int error; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { rw_exit(&zfs_snapshot_lock); return (SET_ERROR(ENOENT)); } rw_exit(&zfs_snapshot_lock); + + exportfs_flush(); if (flags & MNT_FORCE) argv[4] = "-fn"; argv[5] = se->se_path; dprintf("unmount; path=%s\n", se->se_path); error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); zfsctl_snapshot_rele(se); /* * The umount system utility will return 256 on error. We must * assume this error is because the file system is busy so it is * converted to the more sensible EBUSY. 
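As the comments here note, call_usermodehelper() reports a wait-style status of the form (exitcode << 8) plus the signal, so "umount returned 256" simply means the helper exited with code 1, and the mount path later tests a shifted exit-code bit before treating the failure as a concurrent-mount EBUSY. A tiny user-space illustration of that decoding; DEMO_MOUNT_BUSY is an illustrative value, not the kernel's MOUNT_BUSY definition:

#include <stdio.h>

#define	DEMO_MOUNT_BUSY	0x10

static void
decode_helper_status(int status)
{
	int exitcode = (status >> 8) & 0xff;
	int signal = status & 0x7f;

	printf("raw=%d exitcode=%d signal=%d busy=%d\n", status, exitcode,
	    signal, (status & (DEMO_MOUNT_BUSY << 8)) != 0);
}

int
main(void)
{
	decode_helper_status(0);		/* helper succeeded */
	decode_helper_status(256);		/* umount exited with code 1 */
	decode_helper_status(DEMO_MOUNT_BUSY << 8); /* "busy" exit code */
	return (0);
}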
*/ if (error) error = SET_ERROR(EBUSY); return (error); } int zfsctl_snapshot_mount(struct path *path, int flags) { struct dentry *dentry = path->dentry; struct inode *ip = dentry->d_inode; zfsvfs_t *zfsvfs; zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; if (ip == NULL) return (SET_ERROR(EISDIR)); zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_name(zfsvfs, dname(dentry), ZFS_MAX_DATASET_NAME_LEN, full_name); if (error) goto error; /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); /* * Multiple concurrent automounts of a snapshot are never allowed. * The snapshot may be manually mounted as many times as desired. */ if (zfsctl_snapshot_ismounted(full_name)) { error = 0; goto error; } /* * Attempt to mount the snapshot from user space. Normally this * would be done using the vfs_kern_mount() function, however that * function is marked GPL-only and cannot be used. On error we * careful to log the real error to the console and return EISDIR * to safely abort the automount. This should be very rare. * * If the user mode helper happens to return EBUSY, a concurrent * mount is already in progress in which case the error is ignored. * Take note that if the program was executed successfully the return * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); argv[5] = full_name; argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { zfs_dbgmsg("Unable to automount %s error=%d", full_path, error); error = SET_ERROR(EISDIR); } else { /* * EBUSY, this could mean a concurrent mount, or the * snapshot has already been mounted at completely * different place. We return 0 so VFS will retry. For * the latter case the VFS will retry several times * and return ELOOP, which is probably not a very good * behavior. */ error = 0; } goto error; } /* * Follow down in to the mounted snapshot and set MNT_SHRINKABLE * to identify this as an automounted filesystem. 
*/ spath = *path; path_get(&spath); if (follow_down_one(&spath)) { snap_zfsvfs = ITOZSB(spath.dentry->d_inode); snap_zfsvfs->z_parent = zfsvfs; dentry = spath.dentry; spath.mnt->mnt_flags |= MNT_SHRINKABLE; rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_alloc(full_name, full_path, snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), dentry); zfsctl_snapshot_add(se); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); rw_exit(&zfs_snapshot_lock); } path_put(&spath); error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); ZFS_EXIT(zfsvfs); return (error); } /* * Get the snapdir inode from fid */ int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, struct inode **ipp) { int error; struct path path; char *mnt; struct dentry *dentry; mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, MAXPATHLEN, mnt); if (error) goto out; /* Trigger automount */ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); if (error) goto out; path_put(&path); /* * Get the snapdir inode. Note, we don't want to use the above * path because it contains the root of the snapshot rather * than the snapdir. */ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); if (*ipp == NULL) { error = SET_ERROR(ENOENT); goto out; } /* check gen, see zfsctl_snapdir_fid */ dentry = d_obtain_alias(igrab(*ipp)); if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { iput(*ipp); *ipp = NULL; error = SET_ERROR(ENOENT); } if (!IS_ERR(dentry)) dput(dentry); out: kmem_free(mnt, MAXPATHLEN); return (error); } int zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); znode_t *zp; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); zrele(dzp); } ZFS_EXIT(zfsvfs); return (error); } /* * Initialize the various pieces we'll need to create and manipulate .zfs * directories. Currently this is unused but available. */ void zfsctl_init(void) { avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_name)); avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); } /* * Cleanup the various pieces we needed for .zfs directories. In particular * ensure the expiry timer is canceled safely. */ void zfsctl_fini(void) { avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); } module_param(zfs_admin_snapshot, int, 0644); MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); module_param(zfs_expire_snapshot, int, 0644); MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); Index: vendor-sys/openzfs/dist/module/zcommon/zfs_fletcher.c =================================================================== --- vendor-sys/openzfs/dist/module/zcommon/zfs_fletcher.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zcommon/zfs_fletcher.c (revision 364928) @@ -1,940 +1,940 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ /* * Fletcher Checksums * ------------------ * * ZFS's 2nd and 4th order Fletcher checksums are defined by the following * recurrence relations: * * a = a + f * i i-1 i-1 * * b = b + a * i i-1 i * * c = c + b (fletcher-4 only) * i i-1 i * * d = d + c (fletcher-4 only) * i i-1 i * * Where * a_0 = b_0 = c_0 = d_0 = 0 * and * f_0 .. f_(n-1) are the input data. * * Using standard techniques, these translate into the following series: * * __n_ __n_ * \ | \ | * a = > f b = > i * f * n /___| n - i n /___| n - i * i = 1 i = 1 * * * __n_ __n_ * \ | i*(i+1) \ | i*(i+1)*(i+2) * c = > ------- f d = > ------------- f * n /___| 2 n - i n /___| 6 n - i * i = 1 i = 1 * * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. * Since the additions are done mod (2^64), errors in the high bits may not * be noticed. For this reason, fletcher-2 is deprecated. * * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. * A conservative estimate of how big the buffer can get before we overflow * can be estimated using f_i = 0xffffffff for all i: * * % bc * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 * 2264 * quit * % * * So blocks of up to 2k will not overflow. Our largest block size is * 128k, which has 32k 4-byte words, so we can compute the largest possible * accumulators, then divide by 2^64 to figure the max amount of overflow: * * % bc * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } * a/2^64;b/2^64;c/2^64;d/2^64 * 0 * 0 * 1365 * 11186858 * quit * % * * So a and b cannot overflow. To make sure each bit of input has some * effect on the contents of c and d, we can look at what the factors of * the coefficients in the equations for c_n and d_n are. The number of 2s * in the factors determines the lowest set bit in the multiplier. Running * through the cases for n*(n+1)/2 reveals that the highest power of 2 is * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow * the 64-bit accumulators, every bit of every f_i effects every accumulator, * even for 128k blocks. * * If we wanted to make a stronger version of fletcher4 (fletcher4c?), * we could do our calculations mod (2^32 - 1) by adding in the carries * periodically, and store the number of carries in the top 32-bits. * * -------------------- * Checksum Performance * -------------------- * * There are two interesting components to checksum performance: cached and * uncached performance. With cached data, fletcher-2 is about four times * faster than fletcher-4. 
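The 2264-byte bound from the bc session above can also be reproduced in C. This sketch assumes the GCC/Clang unsigned __int128 extension so the 4th-order running sum is computed exactly:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Worst-case input: every 32-bit word is 0xffffffff. */
	const unsigned __int128 limit = (unsigned __int128)1 << 64;
	const unsigned __int128 f = 0xffffffffULL;
	unsigned __int128 d = 0;
	uint64_t i;

	/* i*(i+1)*(i+2) is always divisible by 6, so the sum stays exact. */
	for (i = 1; d < limit; i++)
		d += f * i * (i + 1) * (i + 2) / 6;

	/*
	 * Prints 2264, matching the bc session: the first buffer size at
	 * which the 'd' accumulator can exceed 64 bits.
	 */
	printf("%llu\n", (unsigned long long)((i - 1) * 4));
	return (0);
}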
With uncached data, the performance difference is * negligible, since the cost of a cache fill dominates the processing time. * Even though fletcher-4 is slower than fletcher-2, it is still a pretty * efficient pass over the data. * * In normal operation, the data which is being checksummed is in a buffer * which has been filled either by: * * 1. a compression step, which will be mostly cached, or * 2. a bcopy() or copyin(), which will be uncached (because the * copy is cache-bypassing). * * For both cached and uncached data, both fletcher checksums are much faster * than sha-256, and slower than 'off', which doesn't touch the data at all. */ #include #include #include #include #include #include #include #include #define FLETCHER_MIN_SIMD_SIZE 64 static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static boolean_t fletcher_4_scalar_valid(void); static const fletcher_4_ops_t fletcher_4_scalar_ops = { .init_native = fletcher_4_scalar_init, .fini_native = fletcher_4_scalar_fini, .compute_native = fletcher_4_scalar_native, .init_byteswap = fletcher_4_scalar_init, .fini_byteswap = fletcher_4_scalar_fini, .compute_byteswap = fletcher_4_scalar_byteswap, .valid = fletcher_4_scalar_valid, .name = "scalar" }; static fletcher_4_ops_t fletcher_4_fastest_impl = { .name = "fastest", .valid = fletcher_4_scalar_valid }; static const fletcher_4_ops_t *fletcher_4_impls[] = { &fletcher_4_scalar_ops, &fletcher_4_superscalar_ops, &fletcher_4_superscalar4_ops, #if defined(HAVE_SSE2) &fletcher_4_sse2_ops, #endif #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) &fletcher_4_ssse3_ops, #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) &fletcher_4_avx2_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512F) &fletcher_4_avx512f_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) &fletcher_4_avx512bw_ops, #endif -#if defined(__aarch64__) +#if defined(__aarch64__) && !defined(__FreeBSD__) &fletcher_4_aarch64_neon_ops, #endif }; /* Hold all supported implementations */ static uint32_t fletcher_4_supp_impls_cnt = 0; static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; /* Select fletcher4 implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX - 1) #define IMPL_SCALAR (0) static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; #define IMPL_READ(i) (*(volatile uint32_t *) &(i)) static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; #endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; /*ARGSUSED*/ void fletcher_init(zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } int fletcher_2_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; b1 
+= a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } /*ARGSUSED*/ void fletcher_2_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) fletcher_2_incremental_native((void *) buf, size, zcp); } int fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; b1 += a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } /*ARGSUSED*/ void fletcher_2_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) { ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); } static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); } static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += ip[0]; b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += BSWAP_32(ip[0]); b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static boolean_t fletcher_4_scalar_valid(void) { return (B_TRUE); } int fletcher_4_impl_set(const char *val) { int err = -EINVAL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); size_t i, val_len; val_len = strlen(val); while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ val_len--; /* check mandatory implementations */ for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { const char *name = fletcher_4_impl_selectors[i].fis_name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = fletcher_4_impl_selectors[i].fis_sel; err = 0; break; } } if (err != 0 && fletcher_4_initialized) { /* check all supported implementations */ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { const char *name = fletcher_4_supp_impls[i]->name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = i; err = 0; break; } } } if (err == 0) { atomic_swap_32(&fletcher_4_impl_chosen, impl); membar_producer(); } return (err); } /* * Returns the Fletcher 4 operations for checksums. When a SIMD * implementation is not allowed in the current context, then fallback * to the fastest generic implementation. 
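fletcher_4_impl_set() above matches a user-supplied name against the selector table only after trimming trailing whitespace (writes through the module parameter usually carry a newline) and requires an exact-length match so that, for example, "scal" does not select "scalar". A user-space sketch of the same matching, with an illustrative selector table:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *demo_selectors[] = { "cycle", "fastest", "scalar" };

static int
demo_impl_lookup(const char *val)
{
	size_t nsel = sizeof (demo_selectors) / sizeof (demo_selectors[0]);
	size_t val_len = strlen(val);

	/* Trim trailing whitespace, e.g. the '\n' from an echo into sysfs. */
	while (val_len > 0 && isspace((unsigned char)val[val_len - 1]))
		val_len--;

	/* Exact-length comparison so a prefix never matches a longer name. */
	for (size_t i = 0; i < nsel; i++) {
		if (val_len == strlen(demo_selectors[i]) &&
		    strncmp(val, demo_selectors[i], val_len) == 0)
			return ((int)i);
	}
	return (-1);
}

int
main(void)
{
	printf("%d %d\n", demo_impl_lookup("fastest\n"),
	    demo_impl_lookup("avx2"));	/* prints "1 -1" */
	return (0);
}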
*/ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { if (!kfpu_allowed()) return (&fletcher_4_superscalar4_ops); const fletcher_4_ops_t *ops = NULL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); ops = fletcher_4_supp_impls[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } static inline void fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); ops->init_native(&ctx); ops->compute_native(&ctx, buf, size); ops->fini_native(&ctx, zcp); } /*ARGSUSED*/ void fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_native_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } void fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } static inline void fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); ops->init_byteswap(&ctx); ops->compute_byteswap(&ctx, buf, size); ops->fini_byteswap(&ctx, zcp); } /*ARGSUSED*/ void fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_byteswap_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } /* Incremental Fletcher 4 */ #define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) static inline void fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, const zio_cksum_t *nzcp) { const uint64_t c1 = size / sizeof (uint32_t); const uint64_t c2 = c1 * (c1 + 1) / 2; const uint64_t c3 = c2 * (c1 + 2) / 3; /* * Value of 'c3' overflows on buffer sizes close to 16MiB. For that * reason we split incremental fletcher4 computation of large buffers * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. 
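 * As a rough sketch of the bound: with c1 = size / sizeof (uint32_t),
 * c3 grows roughly as c1^3 / 6.  At size = 16MiB, c1 = 2^22 and the
 * intermediate product c2 * (c1 + 2) is about 2^65, which no longer
 * fits in a uint64_t; at the 8MiB step size used here (c1 = 2^21) the
 * arithmetic stays comfortably within 64 bits.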
*/ ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + c2 * zcp->zc_word[0]; zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0]; zcp->zc_word[0] += nzcp->zc_word[0]; } static inline void fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, zio_cksum_t *zcp) { while (size > 0) { zio_cksum_t nzc; uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); if (native) fletcher_4_native(buf, len, NULL, &nzc); else fletcher_4_byteswap(buf, len, NULL, &nzc); fletcher_4_incremental_combine(zcp, len, &nzc); size -= len; buf += len; } } int fletcher_4_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); return (0); } int fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); return (0); } #if defined(_KERNEL) /* * Fletcher 4 kstats */ static int fletcher_4_kstat_headers(char *buf, size_t size) { ssize_t off = 0; off += snprintf(buf + off, size, "%-17s", "implementation"); off += snprintf(buf + off, size - off, "%-15s", "native"); (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap"); return (0); } static int fletcher_4_kstat_data(char *buf, size_t size, void *data) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data; ssize_t off = 0; if (curr_stat == fastest_stat) { off += snprintf(buf + off, size - off, "%-17s", "fastest"); off += snprintf(buf + off, size - off, "%-15s", fletcher_4_supp_impls[fastest_stat->native]->name); off += snprintf(buf + off, size - off, "%-15s\n", fletcher_4_supp_impls[fastest_stat->byteswap]->name); } else { ptrdiff_t id = curr_stat - fletcher_4_stat_data; off += snprintf(buf + off, size - off, "%-17s", fletcher_4_supp_impls[id]->name); off += snprintf(buf + off, size - off, "%-15llu", (u_longlong_t)curr_stat->native); off += snprintf(buf + off, size - off, "%-15llu\n", (u_longlong_t)curr_stat->byteswap); } return (0); } static void * fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) { if (n <= fletcher_4_supp_impls_cnt) ksp->ks_private = (void *) (fletcher_4_stat_data + n); else ksp->ks_private = NULL; return (ksp->ks_private); } #endif #define FLETCHER_4_FASTEST_FN_COPY(type, src) \ { \ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ } #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); #if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; hrtime_t start; uint64_t run_bw, run_time_ns, best_run = 0; zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); fletcher_checksum_func_t 
*fletcher_4_test = native ? fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; uint64_t run_count = 0; /* temporary set an implementation */ fletcher_4_impl_chosen = i; kpreempt_disable(); start = gethrtime(); do { for (l = 0; l < 32; l++, run_count++) fletcher_4_test(data, data_size, NULL, &zc); run_time_ns = gethrtime() - start; } while (run_time_ns < FLETCHER_4_BENCH_NS); kpreempt_enable(); run_bw = data_size * run_count * NANOSEC; run_bw /= run_time_ns; /* B/s */ if (native) stat->native = run_bw; else stat->byteswap = run_bw; if (run_bw > best_run) { best_run = run_bw; if (native) { fastest_stat->native = i; FLETCHER_4_FASTEST_FN_COPY(native, fletcher_4_supp_impls[i]); } else { fastest_stat->byteswap = i; FLETCHER_4_FASTEST_FN_COPY(byteswap, fletcher_4_supp_impls[i]); } } } /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } #endif /* _KERNEL */ /* * Initialize and benchmark all supported implementations. */ static void fletcher_4_benchmark(void) { fletcher_4_ops_t *curr_impl; int i, c; /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; if (curr_impl->valid && curr_impl->valid()) fletcher_4_supp_impls[c++] = curr_impl; } membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ #if defined(_KERNEL) static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ char *databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); #else /* * Skip the benchmark in user space to avoid impacting libzpool * consumers (zdb, zhack, zinject, ztest). The last implementation * is assumed to be the fastest and used by default. */ memcpy(&fletcher_4_fastest_impl, fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], sizeof (fletcher_4_fastest_impl)); fletcher_4_fastest_impl.name = "fastest"; membar_producer(); #endif /* _KERNEL */ } void fletcher_4_init(void) { /* Determine the fastest available implementation. 
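 * In the kernel each supported implementation is timed for roughly
 * FLETCHER_4_BENCH_NS (50ms) against a 128KiB test buffer and its
 * throughput is derived as
 *
 *	run_bw = data_size * run_count * NANOSEC / run_time_ns
 *
 * in bytes per second; the implementation with the best native and
 * byteswap results is copied into fletcher_4_fastest_impl.  In user
 * space the benchmark is skipped and the last supported implementation
 * is assumed to be the fastest.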
*/ fletcher_4_benchmark(); #if defined(_KERNEL) /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { fletcher_4_kstat->ks_data = NULL; fletcher_4_kstat->ks_ndata = UINT32_MAX; kstat_set_raw_ops(fletcher_4_kstat, fletcher_4_kstat_headers, fletcher_4_kstat_data, fletcher_4_kstat_addr); kstat_install(fletcher_4_kstat); } #endif /* Finish initialization */ fletcher_4_initialized = B_TRUE; } void fletcher_4_fini(void) { #if defined(_KERNEL) if (fletcher_4_kstat != NULL) { kstat_delete(fletcher_4_kstat); fletcher_4_kstat = NULL; } #endif } /* ABD adapters */ static void abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) { const fletcher_4_ops_t *ops = fletcher_4_impl_get(); cdp->acd_private = (void *) ops; if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->init_native(cdp->acd_ctx); else ops->init_byteswap(cdp->acd_ctx); } static void abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) { fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; ASSERT(ops); if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); else ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); } static void abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, zio_abd_checksum_data_t *cdp) { zio_cksum_t *zcp = cdp->acd_zcp; ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); abd_fletcher_4_fini(cdp); cdp->acd_private = (void *)&fletcher_4_scalar_ops; if (native) fletcher_4_incremental_native(data, size, zcp); else fletcher_4_incremental_byteswap(data, size, zcp); } static int abd_fletcher_4_iter(void *data, size_t size, void *private) { zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private; fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (asize > 0) { if (native) ops->compute_native(ctx, data, asize); else ops->compute_byteswap(ctx, data, asize); size -= asize; data = (char *)data + asize; } if (size > 0) { ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); /* At this point we have to switch to scalar impl */ abd_fletcher_4_simd2scalar(native, data, size, cdp); } return (0); } zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_init = abd_fletcher_4_init, .acf_fini = abd_fletcher_4_fini, .acf_iter = abd_fletcher_4_iter }; #if defined(_KERNEL) && defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; int i, cnt = 0; /* list fastest */ fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, fletcher_4_supp_impls[i]->name); } return (cnt); } static int fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) { return (fletcher_4_impl_set(val)); } /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. 
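 * The selection is exposed below as the zfs_fletcher_4_impl module
 * parameter, so on a typical Linux system it can be changed at runtime
 * through the usual sysfs path, for example
 *
 *	echo scalar > /sys/module/zfs/parameters/zfs_fletcher_4_impl
 *
 * (path shown for illustration only; it follows the standard module
 * parameter convention).  Reading the parameter back lists the
 * supported implementations with the active one in brackets.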
*/ module_param_call(zfs_fletcher_4_impl, fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_native_varsize); EXPORT_SYMBOL(fletcher_4_byteswap); EXPORT_SYMBOL(fletcher_4_incremental_native); EXPORT_SYMBOL(fletcher_4_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_abd_ops); #endif Index: vendor-sys/openzfs/dist/module/zfs/arc.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/arc.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/arc.c (revision 364928) @@ -1,10565 +1,10582 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2019, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. 
To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed and the arc_meta_limit honored. For example, * when using the ZPL each dentry holds a references on a znode. These * dentries must be pruned before the arc buffer holding the znode can * be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. 
This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. 
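 * Schematically, the choice made on such a read can be pictured as
 * (illustrative pseudo-code only; the precise sharing rules follow):
 *
 *	if (consumer accepts compressed data and b_pabd is compressed)
 *		share b_pabd with the new arc_buf_t;
 *	else if (b_pabd is compressed)
 *		decompress b_pabd into a newly allocated buffer;
 *	else
 *		copy or share the uncompressed data;
 *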
If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. 
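 * To summarize, a protected header may therefore carry up to three
 * representations of the same block at once:
 *
 *	b_crypt_hdr.b_rabd	raw data, encrypted and possibly
 *				compressed, exactly as stored on disk
 *	b_l1hdr.b_pabd		decrypted data, still in its on-disk
 *				compression state
 *	arc_buf_t b_data	fully transformed copy (decrypted and,
 *				if requested, decompressed) handed out
 *				to a consumer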
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ int zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ int arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ int arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_ms; static int arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. 
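 * (For reference, with the defaults above arc_no_grow_shift = 5 stops
 * ARC growth once free memory falls below arc_c / 32, roughly 3% of
 * the target size, and arc_shrink_shift = 7 reclaims about arc_c / 128
 * per shrink step.)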
*/ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; unsigned long zfs_arc_dnode_limit = 0; unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle. */ unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. */ unsigned long zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ unsigned long zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux specific */ unsigned long zfs_arc_sys_free = 0; int zfs_arc_min_prefetch_ms = 0; int zfs_arc_min_prescient_prefetch_ms = 0; int zfs_arc_p_dampener_disable = 1; int zfs_arc_meta_prune = 10000; int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; /* The 6 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { 
"mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). 
*/ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ _NOTE(CONSTCOND) \ } while (0) kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; arc_state_t *arc_mru; arc_state_t *arc_mfu; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ /* max size for dnodes */ #define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ /* size of all b_rabd's in entire arc */ #define arc_raw_size ARCSTAT(arcstat_raw_size) /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) /* number of bytes in the arc from arc_buf_t's */ #define arc_overhead_size ARCSTAT(arcstat_overhead_size) /* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. We want to use aggsum_t's for these * instead, but still be able to export the kstat in the same way as before. * The solution is to always use the aggsum version, except in the kstat update * callback. 
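/*
 * (The aggsum_t values below are updated with aggsum_add() on the hot
 * paths; the expensive precise read, aggsum_value(), is only performed
 * in the kstat update callback, which is what keeps this scheme cheap.)
 */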
*/ aggsum_t arc_size; aggsum_t arc_meta_used; aggsum_t astat_data_size; aggsum_t astat_metadata_size; aggsum_t astat_dbuf_size; aggsum_t astat_dnode_size; aggsum_t astat_bonus_size; aggsum_t astat_hdr_size; aggsum_t astat_l2_hdr_size; aggsum_t astat_abd_chunk_waste_size; hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define HT_LOCK_ALIGN 64 #define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[HT_LOCK_PAD]; #endif }; #define BUF_LOCKS 8192 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS]; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define 
HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ +int l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void 
arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ unsigned long l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ int l2arc_rebuild_enabled = B_TRUE; unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. 
*/ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. 
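 * A typical caller therefore looks roughly like (illustrative only):
 *
 *	kmutex_t *hash_lock;
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL)
 *		... an equal header won the race; use 'exists' instead ...
 *	mutex_exit(hash_lock);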
*/ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. 
*/ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; hdr_full_cons(vbuf, unused, kmflag); bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_full_crypt_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; hdr_full_dest(vbuf, unused); arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr __maybe_unused = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
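 * For instance, on a machine with 16GiB of physical memory hsize grows
 * until hsize * 8KiB >= 16GiB, giving 2^21 buckets; at 8 bytes per
 * pointer that is a 16MB hash table, matching the rule of thumb of
 * roughly 1MB per GB of memory.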
*/ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? 
HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. 
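 *
 * In short, for L2ARC reads we always validate against the checksum of
 * the complete logical block, never that of an individual gang member.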
*/ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. 
Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. 
* However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. 
This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. 
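 *
 * (Encrypted bufs are always also marked compressed, as the IMPLY
 * asserts above note, because the raw data is kept exactly as it is
 * stored on disk.)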
*/ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf, hash_lock); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. 
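 *
 * (If it does happen, the hdr is flagged ARC_FLAG_IO_ERROR and the
 * error is converted to EIO below.)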
*/ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. 
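 *
 * As in arc_evictable_space_increment(), bufs that share their b_data
 * with the hdr's b_pabd are skipped, since that memory is already
 * accounted for once via arc_hdr_size().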
*/ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } state = hdr->b_l1hdr.b_state; if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. 
*/ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
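 *
 * Ghost states hold no data, so only the logical size (lsize) of the
 * block is charged to the new state here.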
*/ (void) zfs_refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: aggsum_add(&astat_data_size, space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, space); break; case ARC_SPACE_BONUS: aggsum_add(&astat_bonus_size, space); break; case ARC_SPACE_DNODE: aggsum_add(&astat_dnode_size, space); break; case ARC_SPACE_DBUF: aggsum_add(&astat_dbuf_size, space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. 
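 *
 * Like ARC_SPACE_DATA, this category is excluded from arc_meta_used
 * below; it still counts toward the overall arc_size.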
*/ aggsum_add(&astat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) aggsum_add(&arc_meta_used, space); aggsum_add(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: aggsum_add(&astat_data_size, -space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, -space); break; case ARC_SPACE_BONUS: aggsum_add(&astat_bonus_size, -space); break; case ARC_SPACE_DNODE: aggsum_add(&astat_dnode_size, -space); break; case ARC_SPACE_DBUF: aggsum_add(&astat_dbuf_size, -space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: aggsum_add(&astat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); /* * We use the upper bound here rather than the precise value * because the arc_meta_max value doesn't need to be * precise. It's only consumed by humans via arcstats. */ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) arc_meta_max = aggsum_upper_bound(&arc_meta_used); aggsum_add(&arc_meta_used, -space); } ASSERT(aggsum_compare(&arc_size, space) >= 0); aggsum_add(&arc_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. 
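 *
 * For example, arc_alloc_buf() below passes B_FALSE for the encrypted,
 * compressed and noauth flags to get a plain uncompressed buf, while
 * arc_alloc_raw_buf() passes encrypted (and therefore compressed) as
 * B_TRUE so the buf is marked as holding the raw on-disk bytes.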
*/ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; if (encrypted) hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. 
Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. 
We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_put(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. 
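 *
 * (Here "undiscoverable" means the hdr has no DVA identity set, so no
 * other thread can find it through the buf hash table; those are the
 * two cases HDR_EMPTY_OR_LOCKED() accepts.)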
*/ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) { hdr->b_crypt_hdr.b_ebufcnt -= 1; /* * If we have no more encrypted buffers and we've * already gotten a copy of the decrypted data we can * free b_rabd to save some space. */ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_hdr_free_abd(hdr, B_TRUE); } } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. 
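 *
 * (b_freeze_cksum only ever covers uncompressed buf data, see
 * arc_cksum_compute(), so it has no meaning once no such buf remains.)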
*/ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, do_adapt); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, do_adapt); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t *hdr; int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; /* * Allocate the hdr's buffer. This will contain either * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ arc_hdr_alloc_abd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. 
* The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); /* * if the caller wanted a new full header and the header is to be * encrypted we will actually allocate the header from the full crypt * cache instead. The same applies to freeing from the old cache. */ if (HDR_PROTECTED(hdr) && new == hdr_full_cache) new = hdr_full_crypt_cache; if (HDR_PROTECTED(hdr) && old == hdr_full_cache) old = hdr_full_crypt_cache; nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function allows an L1 header to be reallocated as a crypt * header and vice versa. 
If we are going to a crypt header, the * new fields will be zeroed out. */ static arc_buf_hdr_t * arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) { arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; unsigned nsize, osize; /* * This function requires that hdr is in the arc_anon state. * Therefore it won't have any L2ARC data for us to worry * about copying. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); ASSERT3P(hdr->b_hash_next, ==, NULL); if (need_crypt) { ncache = hdr_full_crypt_cache; nsize = sizeof (hdr->b_crypt_hdr); ocache = hdr_full_cache; osize = HDR_FULL_SIZE; } else { ncache = hdr_full_cache; nsize = HDR_FULL_SIZE; ocache = hdr_full_crypt_cache; osize = sizeof (hdr->b_crypt_hdr); } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); /* * Copy all members that aren't locks or condvars to the new header. * No lists are pointing to us (as we asserted above), so we don't * need to worry about the list nodes. */ nhdr->b_dva = hdr->b_dva; nhdr->b_birth = hdr->b_birth; nhdr->b_type = hdr->b_type; nhdr->b_flags = hdr->b_flags; nhdr->b_psize = hdr->b_psize; nhdr->b_lsize = hdr->b_lsize; nhdr->b_spa = hdr->b_spa; nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* * This zfs_refcount_add() exists only to ensure that the individual * arc buffers always point to a header that is referenced, avoiding * a small race condition that could trigger ASSERTs. 
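 *
 * (This temporary FTAG hold is dropped again below, once the per-buf
 * b_hdr pointers and the refcount have been transferred to nhdr.)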
*/ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { mutex_enter(&buf->b_evict_lock); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); } zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); if (need_crypt) { arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); } else { arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); } /* unset all members of the original hdr */ bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_type = ARC_BUFC_INVALID; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; hdr->b_spa = 0; hdr->b_l1hdr.b_freeze_cksum = NULL; hdr->b_l1hdr.b_buf = NULL; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } buf_discard_identity(hdr); kmem_cache_free(ocache, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); if (!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); if (iv != NULL) bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); if (mac != NULL) bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. 
*/ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr. */ arc_hdr_free_abd(hdr, B_FALSE); arc_share_buf(hdr, buf); } return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_psize, -psize); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. 
This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identity can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (!HDR_PROTECTED(hdr)) { kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_full_crypt_cache, hdr); } } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header.
*/ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime))) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. 
* dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += bytes_evicted; if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ cond_resched(); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. 
*/ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Try to reduce pinned dnodes with a floor of arc_dnode_limit. * Request that 10% of the LRUs be scanned by the superblock * shrinker. */ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, arc_dnode_size_limit) > 0) { arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - arc_dnode_size_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); } /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. 
*/ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and skips evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { int64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * The goal of this function is to evict enough meta data buffers from the * ARC in order to enforce the arc_meta_limit. Achieving this is slightly * more complicated than it appears because it is common for data buffers * to have holds on meta data buffers. In addition, dnode meta data buffers * will be held by the dnodes in the block preventing them from being freed. * This means we can't simply traverse the ARC and expect to always find * enough unheld meta data buffers to release. * * Therefore, this function has been updated to make alternating passes * over the ARC releasing data buffers and then newly unheld meta data * buffers. This ensures forward progress is maintained and meta_used * will decrease. Normally this is sufficient, but if required the ARC * will call the registered prune callbacks causing dentries and inodes to * be dropped from the VFS cache. This will make dnode meta data buffers * available for reclaim. */ static uint64_t arc_evict_meta_balanced(uint64_t meta_used) { int64_t delta, prune = 0, adjustmnt; uint64_t total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); restart: /* * This slightly differs from the way we evict from the mru in * arc_evict because we don't have a "target" value (i.e. no * "meta" arc_p). As a result, I think we can completely * cannibalize the metadata in the MRU before we evict the * metadata from the MFU. I think we probably need to implement a * "metadata arc_p" value to do this properly. */ adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } /* * We can't afford to recalculate adjustmnt here. If we do, * new metadata buffers can sneak into the MRU or ANON lists, * thus penalizing the MFU metadata. Although the fudge factor is * small, it has been empirically shown to be significant for * certain workloads (e.g. creating many empty directories). As * such, we use the original calculation for adjustmnt, and * simply decrement the amount of data evicted from the MRU.
*/ if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); } adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); } /* * If after attempting to make the requested adjustment to the ARC * the meta limit is still being exceeded then request that the * higher layers drop some cached objects which have holds on ARC * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. */ if (meta_used > arc_meta_limit) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; arc_prune_async(prune); } } if (restarts > 0) { restarts--; goto restart; } } return (total_evicted); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_evict_meta_only(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t arc_evict_meta(uint64_t meta_used) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) return (arc_evict_meta_only(meta_used)); else return (arc_evict_meta_balanced(meta_used)); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_evict_type(arc_state_t *state) { multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). 
*/ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; uint64_t asize = aggsum_value(&arc_size); uint64_t ameta = aggsum_value(&arc_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(asize - arc_c), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Re-sum ARC stats after the first round of evictions. */ asize = aggsum_value(&arc_size); ameta = aggsum_value(&arc_meta_used); /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. 
*/ target = asize - arc_c; if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed()), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict.
*/ uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } else { arc_c = arc_c_min; } if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. */ arc_prune_async(zfs_arc_meta_prune); } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } /* ARGSUSED */ static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { /* * This is necessary so that any changes which may have been made to * many of the zfs_arc_* module parameters will be propagated to * their actual internal variable counterparts. Without this, * changing those module params at runtime would have no effect. */ arc_tuning_update(B_FALSE); /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date(the arc_evict_zthr has a maximum sleep * time of 1 second); but that should suffice. The * arc_state_t structures can be queried directly if more * accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). 
However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ return (arc_evict_needed); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ /* ARGSUSED */ static void arc_evict_cb(void *arg, zthr_t *zthr) { uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Evict from cache */ evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } /* ARGSUSED */ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { int64_t free_memory = arc_available_memory(); /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ /* ARGSUSED */ static void arc_reap_cb(void *arg, zthr_t *zthr) { int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). 
If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal to arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - if (state == arc_l2c_only) - return; - ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ?
1 : (mrug_size / mfug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); if (aggsum_upper_bound(&arc_size) >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ boolean_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, do_adapt); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { ASSERT(type == ARC_BUFC_DATA); return (abd_alloc(size, B_FALSE)); } } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, B_TRUE); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount) { mutex_enter(&arc_evict_lock); if (arc_is_overflowing()) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); if (amount != 0) { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); if (last != NULL) { ASSERT3U(last->aew_count, >, arc_evict_count); aw.aew_count = last->aew_count + amount; } else { aw.aew_count = arc_evict_count + amount; } list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count * reaches aew_count, or when the ARC is no longer * overflowing and eviction completes. */ cv_wait(&aw.aew_cv, &arc_evict_lock); /* * In case of "false" wakeup, we will still be on the * list. 
*/ if (list_link_active(&aw.aew_node)) list_remove(&arc_evict_waiters, &aw); cv_destroy(&aw.aew_cv); } } mutex_exit(&arc_evict_lock); } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); if (do_adapt) arc_adapt(size, state); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past its target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. * * We do the overflowing check without holding the arc_evict_lock to * reduce lock contention in this hot path. Note that * arc_wait_for_eviction() will acquire the lock and check again to * ensure we are truly overflowing before blocking. */ if (arc_is_overflowing()) { arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); } VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (aggsum_upper_bound(&arc_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer.
*/ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. 
* If it was a prefetch, we will explicitly move it to * the head of the list now. */ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(&buf->b_evict_lock); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { if (buf == NULL) return; bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ /* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; 
boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { if (!acb->acb_done) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb); zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. 
Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. 
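 *
 * As an illustrative sketch only (not a call site taken from this file), a
 * simple synchronous consumer could use the generic arc_getbuf_func()
 * callback defined above:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * On a cache hit the callback runs immediately with a NULL zio; on a miss
 * it runs from arc_read_done() once the I/O completes.  Callers that pass
 * ARC_FLAG_NOWAIT instead return as soon as the read has been issued, and
 * every caller eventually releases the buffer it was handed with
 * arc_buf_destroy().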
*/ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto out; } ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. 
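				 *
				 * (Without the upgrade, a synchronous reader
				 * would sit waiting behind an I/O queued at
				 * asynchronous priority; the
				 * arcstat_async_upgrade_sync counter below
				 * tracks how often this happens.)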
*/ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); goto out; } mutex_exit(hash_lock); goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { ARCSTAT_BUMP( arcstat_demand_hit_prescient_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); } ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } } if (rc != 0) { (void) remove_reference(hdr, hash_lock, private); arc_buf_destroy_impl(buf); buf = NULL; } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } /* * Gracefully handle a damaged logical block size as a * checksum error. 
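		 *
		 * A corrupt block pointer can claim a logical size larger
		 * than spa_maxblocksize(); rather than trying to allocate a
		 * header for it, return ECKSUM so the caller handles it like
		 * any other checksum failure.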
*/ if (lsize > spa_maxblocksize(spa)) { rc = SET_ERROR(ECKSUM); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, encrypted_read); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } /* * This is a delicate dance that we play here. * This hdr might be in the ghost list so we access * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); arc_hdr_alloc_abd(hdr, alloc_flags); } if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. 
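			 *
			 * Concretely, ZIO_FLAG_RAW_ENCRYPT (set just below)
			 * tells the ZIO layer to hand back the raw bytes
			 * without authenticating them, and ARC_FLAG_NOAUTH
			 * (set further down) records on the header that
			 * authentication is still owed.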
*/ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); } if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. 
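				 *
				 * Only PSIZE bytes actually exist on the
				 * cache device in that case, so the read size
				 * is reset to PSIZE below and
				 * l2arc_read_done() restores the data to the
				 * form the header expects.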
*/ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the blkptr * itself via decode_embedded_bp_compressed(). */ if (l2arc_ndev != 0 && !embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). 
If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. 
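		 *
		 * (Classic check, lock, re-check: the earlier
		 * HDR_HAS_L2HDR() test was only a hint taken without
		 * l2ad_mtx held, so it must be repeated here before the L2
		 * header is actually torn down.)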
*/ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) hdr->b_crypt_hdr.b_ebufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). 
*/ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; nhdr->b_l1hdr.b_mfu_ghost_hits = 0; nhdr->b_l1hdr.b_l2_hits = 0; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) zfs_refcount_add_many(&arc_anon->arcs_size, arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. 
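	 *
	 * In other words, make this callback idempotent: the checksum, the
	 * watch state, and any data buffers attached on the first pass
	 * (shared or owned, raw or not) are released here so they can be
	 * rebuilt below from the reexecuted zio.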
*/ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. 
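		 *
		 * The cases below are, in order: raw (encrypted) data is
		 * copied into b_rabd; data that is compressed in the hdr but
		 * not in the buf is copied from the zio; otherwise the
		 * uncompressed contents of the buf itself are copied.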
*/ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
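			 *
			 * The three ways an existing header can legitimately
			 * be found are handled below: a sync-to-convergence
			 * rewrite (the stale anonymous header is destroyed
			 * and ours inserted), a nopwrite (the old and new
			 * block pointers must be identical), and a dedup
			 * write that collided with an identical block already
			 * in the table.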
*/ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. 
*/ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. 
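	 *
	 * As a purely illustrative example (the percentages are tunables,
	 * not guaranteed defaults): with arc_c = 8 GiB and limits of 50%,
	 * 25% and 20%, a TXG is throttled only if total dirty data exceeds
	 * 4 GiB, anonymous data alone exceeds 2 GiB, and this pool accounts
	 * for more than 20% of that anonymous data.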
*/ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, data_esize >> 10, reserve >> 10, arc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = zfs_refcount_count(&state->arcs_size); evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); ARCSTAT(arcstat_size) = aggsum_value(&arc_size); ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); ARCSTAT(arcstat_metadata_size) = aggsum_value(&astat_metadata_size); ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); #if defined(COMPAT_FREEBSD11) ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + aggsum_value(&astat_dnode_size) + aggsum_value(&astat_dbuf_size); #endif ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); ARCSTAT(arcstat_abd_chunk_waste_size) = aggsum_value(&astat_abd_chunk_waste_size); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. 
*/ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); arc_p = (arc_c >> 1); if (arc_meta_limit > arc_c_max) arc_meta_limit = arc_c_max; if (arc_dnode_size_limit > arc_meta_limit) arc_dnode_size_limit = arc_meta_limit; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 16M - */ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_meta_min <= arc_c_max)) { arc_meta_min = zfs_arc_meta_min; if (arc_meta_limit < arc_meta_min) arc_meta_limit = arc_meta_min; if (arc_dnode_size_limit < arc_meta_min) arc_dnode_size_limit = arc_meta_min; } WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); /* Valid range: - */ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; if ((limit != arc_meta_limit) && (limit >= arc_meta_min) && (limit <= arc_c_max)) arc_meta_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); /* Valid range: - */ limit = zfs_arc_dnode_limit ? 
zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; if ((limit != arc_dnode_size_limit) && (limit >= arc_meta_min) && (limit <= arc_meta_limit)) arc_dnode_size_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N */ if (zfs_arc_p_min_shift) arc_p_min_shift = zfs_arc_p_min_shift; /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if ((zfs_arc_lotsfree_percent >= 0) && (zfs_arc_lotsfree_percent <= 100)) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_init(void) { arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_mru->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 
zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size); zfs_refcount_create(&arc_mru->arcs_size); zfs_refcount_create(&arc_mru_ghost->arcs_size); zfs_refcount_create(&arc_mfu->arcs_size); zfs_refcount_create(&arc_mfu_ghost->arcs_size); zfs_refcount_create(&arc_l2c_only->arcs_size); aggsum_init(&arc_meta_used, 0); aggsum_init(&arc_size, 0); aggsum_init(&astat_data_size, 0); aggsum_init(&astat_metadata_size, 0); aggsum_init(&astat_hdr_size, 0); aggsum_init(&astat_l2_hdr_size, 0); aggsum_init(&astat_bonus_size, 0); aggsum_init(&astat_dnode_size, 0); aggsum_init(&astat_dbuf_size, 0); aggsum_init(&astat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size); zfs_refcount_destroy(&arc_mru->arcs_size); zfs_refcount_destroy(&arc_mru_ghost->arcs_size); zfs_refcount_destroy(&arc_mfu->arcs_size); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); aggsum_fini(&arc_meta_used); aggsum_fini(&arc_size); aggsum_fini(&astat_data_size); aggsum_fini(&astat_metadata_size); aggsum_fini(&astat_hdr_size); aggsum_fini(&astat_l2_hdr_size); aggsum_fini(&astat_bonus_size); aggsum_fini(&astat_dnode_size); aggsum_fini(&astat_dbuf_size); aggsum_fini(&astat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if 
defined(_KERNEL) arc_lowmem_init(); #endif /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); #ifndef _KERNEL /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; arc_p = (arc_c >> 1); /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* Initialize maximum observed usage to zero */ arc_meta_max = 0; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. */ percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_size_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1)); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). 
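	 *
	 * For example, on a 64-bit system with 64 GiB of RAM and the
	 * defaults described above: zfs_dirty_data_max_max =
	 * MIN(4 GiB, 25% of 64 GiB) = 4 GiB, and zfs_dirty_data_max =
	 * MIN(10% of 64 GiB, 4 GiB) = 4 GiB.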
*/ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
* It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. 
* * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. 
* * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. 
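 * The check below therefore includes not just the raw write size but also
 * the log block overhead of persistent L2ARC and, when TRIM-ahead is
 * enabled, the region trimmed ahead of the write hand.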
*/ dev_size = dev->l2ad_end - dev->l2ad_start; tsize = size + l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", l2arc_log_blk_overhead(size, dev), dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (arc_warm == B_FALSE) size += l2arc_write_boost; } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. 
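 *
 * If the write zio failed, the headers (and any log blocks issued along
 * with them) are instead dropped from the device and the space they would
 * have occupied is reclaimed.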
*/ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); ARCSTAT_INCR(arcstat_l2_psize, -psize); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { /* * Restore the lbps array in the header to its previous state. 
* If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { bzero(l2dhdr, dev->l2ad_dev_hdr_asize); } else { bzero(&l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); } break; } bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. 
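 *
 * If the checksum does not verify, or decryption/decompression of the
 * buffer fails, the read is reissued to the original storage device.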
*/ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). 
*/ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; /* * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the write size, whichever is greater. */ distance += MAX(64 * 1024 * 1024, (distance * l2arc_trim_ahead) / 100); } top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. 
*/ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. 
*/ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. 
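 *
 * In every path below the returned abd is zero-padded out to asize, so the
 * full allocation written to the cache device is initialized.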
*/ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { cabd = abd_alloc_for_io(asize, ismd); tmp = abd_borrow_buf(cabd, asize); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); if (psize >= size) { abd_return_buf(cabd, tmp, asize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) bzero((char *)tmp + psize, asize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, asize); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_put(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. 
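 *
 * The outer loop below walks the feed lists in the priority order defined
 * by l2arc_sublist_lock(): MFU metadata, MRU metadata, MFU data, MRU data.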
*/ for (int try = 0; try < L2ARC_FEED_TYPES; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3U(arc_hdr_size(hdr), >, 0); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. 
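 *
 * This first pass also allocates the write callback and the root zio that
 * the per-buffer zio_write_phys() children are attached to.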
*/ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) l2arc_log_blk_commit(dev, pio, cb); zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } +static boolean_t +l2arc_hdr_limit_reached(void) +{ + int64_t s = aggsum_upper_bound(&astat_l2_hdr_size); + + return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); +} + /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ /* ARGSUSED */ static void l2arc_feed_thread(void *unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. 
This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ - if (arc_reclaim_needed()) { + if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. 
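 *
 * Entries are added as log blocks are committed or restored, and removed
 * by l2arc_evict() once the corresponding region of the device is
 * reclaimed.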
*/ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Decide if vdev is eligible for L2ARC rebuild */ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); } void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; l2arc_dev_hdr_phys_t *l2dhdr; uint64_t l2dhdr_asize; spa_t *spa; int err; boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); spa = dev->l2ad_spa; l2dhdr = dev->l2ad_dev_hdr; l2dhdr_asize = dev->l2ad_dev_hdr_asize; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if ((err = l2arc_dev_hdr_read(dev)) != 0) l2dhdr_valid = B_FALSE; if (l2dhdr_valid && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. 
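 *
 * If a rebuild thread is already running for this device, wait here until
 * it notices the cancellation and exits before tearing the device down.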
*/ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. 
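 *
 * One rebuild thread is spawned per eligible cache device by
 * l2arc_spa_rebuild_start() after pool import.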
*/ static void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ - if (arc_reclaim_needed()) { + if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. 
*/ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); bcopy(&lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); bzero(l2dhdr, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. 
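 *
 * The partial progress is still recorded with zfs_dbgmsg() below so
 * it can be found in the internal debug log.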
*/ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_put(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. 
*/ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, dev->l2ad_hand, dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; + /* + * Usually arc_adapt() is called only for data, not headers, but + * since we may allocate significant amount of memory here, let ARC + * grow its arc_c. 
+ */ + arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls the former. */ vdev_space_update(dev->l2ad_vdev, asize, 0, 0); ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. 
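 *
 * In that case only the L2ARC header state is attached below (b_dev,
 * b_daddr, the HAS_L2HDR flag) and the existing header is accounted
 * for on l2ad_buflist; the cached data itself is left alone.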
*/ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. 
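 *
 * The write is issued with zio_write_phys() and waited on
 * synchronously; if it fails we only note the error with zfs_dbgmsg()
 * and carry on.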
*/ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_put(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. 
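 *
 * dh_start_lbps[0] tracks the newest log block and dh_start_lbps[1]
 * the one before it; shifting the two entries here makes room for the
 * block we are about to write and keeps the on-disk header pointing
 * at the head of the chain.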
*/ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; _NOTE(CONSTCOND) L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ bzero(tmpbuf + psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ bcopy(lb, tmpbuf, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_put(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. 
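 *
 * For example, a block pointer whose encoded psize is 0 or larger than
 * sizeof (l2arc_log_blk_phys_t) is rejected no matter where it points.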
*/ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; bzero(le, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. 
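 *
 * Purely illustrative example: with bottom = 100 and top = 200,
 * check = 150 overlaps while check = 250 does not; with bottom = 200
 * and top = 100 (wrapped around), check = 250 and check = 50 both
 * overlap while check = 150 does not.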
*/ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long, param_get_long, ZMOD_RW, "Min arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long, param_get_long, ZMOD_RW, "Max arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Metadata limit for arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of arc size for arc meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, param_get_long, ZMOD_RW, "Min arc metadata"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, "Limit number of restarts in arc_evict_meta"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_int, ZMOD_RW, "Seconds before growing arc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim arc to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed arc buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, 
meta_percent, INT, ZMOD_RW, + "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, param_get_long, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/dmu.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/dmu.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/dmu.c (revision 364928) @@ -1,2482 +1,2483 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ unsigned long zfs_per_txg_dirty_frees_percent = 5; /* * Enable/disable forcing txg sync when dirty in dmu_offset_next. */ int zfs_dmu_offset_next_sync = 0; /* * Limit the amount we can prefetch with one call to this amount. 
This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" 
}, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; static int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. 
If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
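 *
 * dmu_buf_hold_array_by_dnode() below is the dnode-based variant: it
 * holds every dbuf covering [offset, offset + length), issues the
 * reads under a single zio_root() so they can proceed in parallel,
 * and only then waits for the I/O to complete.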
*/ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn), B_TRUE); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. 
If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * See comment before the definition of dmu_prefetch_max. */ len = MIN(len, dmu_prefetch_max); /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. 
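 *
 * If the search returns ESRCH there are no allocated L1 indirect
 * blocks left before *start, so the chunk can extend all the way
 * down to 'minimum'.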
*/ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ /*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. 
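 *
 * The accounting below therefore charges l1blks << dn_indblkshift
 * bytes of dirtied indirect blocks to the open TXG.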
*/ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { if (err == 0) err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
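 *
 * Each pass of the loop handles at most DMU_MAX_ACCESS / 2 bytes, so
 * a single call cannot pin an unbounded number of dbufs at once.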
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. 
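 *
 * dmu_write_by_dnode() behaves like dmu_write() but takes an already
 * held dnode, so callers that keep a dnode hold (such as Lustre) avoid
 * the per-call dnode_hold()/dnode_rele() lookup.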
*/ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; typedef struct xuio_stats { /* loaned yet not returned arc_buf */ kstat_named_t xuiostat_onloan_rbuf; kstat_named_t xuiostat_onloan_wbuf; /* whether a copy is made when loaning out a read buffer */ kstat_named_t xuiostat_rbuf_copied; kstat_named_t xuiostat_rbuf_nocopy; /* whether a copy is made when assigning a write buffer */ kstat_named_t xuiostat_wbuf_copied; kstat_named_t xuiostat_wbuf_nocopy; } xuio_stats_t; static xuio_stats_t xuio_stats = { { "onloan_read_buf", KSTAT_DATA_UINT64 }, { "onloan_write_buf", KSTAT_DATA_UINT64 }, { "read_buf_copied", KSTAT_DATA_UINT64 }, { "read_buf_nocopy", KSTAT_DATA_UINT64 }, { "write_buf_copied", KSTAT_DATA_UINT64 }, { "write_buf_nocopy", KSTAT_DATA_UINT64 } }; #define XUIOSTAT_INCR(stat, val) \ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) #define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) #ifdef HAVE_UIO_ZEROCOPY int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = (iovec_t *)uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. 
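 *
 * The caller must have sized the xuio with dmu_xuio_init() first;
 * priv->next is expected to stay below priv->cnt (asserted below).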
*/ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_lsize(abuf)); iov = (iovec_t *)uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } #endif /* HAVE_UIO_ZEROCOPY */ static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied(void) { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy(void) { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; #ifdef HAVE_UIO_ZEROCOPY xuio_t *xuio = NULL; #endif /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); #ifdef HAVE_UIO_ZEROCOPY if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) uio_advance(uio, tocpy); if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else #endif #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #else err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); #endif if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. 
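 *
 * Illustrative, hypothetical caller-side usage (abbreviated; on error the
 * uio still reflects whatever was already copied by uiomove()):
 *
 *   error = dmu_read_uio(os, object, uio, nbytes);
 *
 * A ZPL-style caller that already holds the znode's SA dbuf would normally
 * use the dbuf variant instead, e.g.:
 *
 *   error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes);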
*/ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #else err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); #endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. 
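 *
 * For reference, a hedged sketch of the loaned-buffer producer pattern this
 * path serves (db_handle/offset/blksz are placeholders): the caller loans a
 * buffer, fills it, hands it off under an assigned transaction, and only
 * returns the buffer itself if the hand-off fails:
 *
 *   arc_buf_t *abuf = dmu_request_arcbuf(db_handle, blksz);
 *   ...fill abuf->b_data...
 *   error = dmu_assign_arcbuf_by_dbuf(db_handle, offset, abuf, tx);
 *   if (error != 0)
 *           dmu_return_arcbuf(abuf);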
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. 
*/ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). 
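 *
 * Illustrative caller-side dispatch on these return values (hypothetical
 * names; real ZIL get_data callbacks differ in detail):
 *
 *   error = dmu_sync(zio, lrc_txg, my_done_cb, zgd);
 *   switch (error) {
 *   case 0:        wait for my_done_cb(), then log the resulting blkptr
 *   case EEXIST:
 *   case ENOENT:   do not log the write
 *   case EALREADY: the block is already being synced; track its progress
 *   case EIO:      txg_wait_synced() and treat the data as on-disk
 *   }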
*/ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. 
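 *
 * Restated compactly (this mirrors the check performed just below):
 *
 *   nopwrite is kept only if dr_next == NULL &&
 *       !dnode_block_freed(dn, db->db_blkid)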
*/ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. 
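 *
 * Restating the adjustments applied just below:
 *
 *   os_encrypted && !(wp & WP_NOFILL)      -> zp_encrypt = TRUE
 *     encrypted object type                -> copies capped at SPA_DVAS_PER_BP - 1,
 *                                             nopwrite disabled
 *     unencrypted (metadata) object type   -> dedup disabled
 *     level 0 and DNODE/OBJSET type        -> compress = ZIO_COMPRESS_EMPTY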
*/ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); bzero(zp->zp_iv, ZIO_DATA_IV_LEN); bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * This function is only called from zfs_holey_common() for zpl_llseek() * in order to determine the location of holes. In order to accurately * report holes all dirty data must be synced to disk. This causes extremely * poor performance when seeking for holes in a dirty file. As a compromise, * only provide hole data when the dnode is clean. When a dnode is dirty * report the dnode as having no holes which is always a safe thing to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; boolean_t clean = B_TRUE; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Check if dnode is dirty */ for (i = 0; i < TXG_SIZE; i++) { if (multilist_link_active(&dn->dn_dirty_link[i])) { clean = B_FALSE; break; } } /* * If compatibility option is on, sync any current changes before * we go trundling through the block pointers. */ if (!clean && zfs_dmu_offset_next_sync) { clean = B_TRUE; dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } if (clean) err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); else err = SET_ERROR(EBUSY); dnode_rele(dn, FTAG); return (err); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. 
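 *
 * Illustrative usage (hypothetical caller, abbreviated):
 *
 *   dmu_object_info_t doi;
 *   error = dmu_object_info(os, object, &doi);
 *   if (error == 0)
 *           ...doi.doi_data_block_size, doi.doi_max_offset, etc. are valid...
 *   else if (error == ENOENT)
 *           ...the object is not allocated...
 *
 *   error = dmu_object_info(os, object, NULL);   existence check only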
*/ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); +EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, 
zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, "Limit one prefetch call to this size"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/dnode_sync.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/dnode_sync.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/dnode_sync.c (revision 364928) @@ -1,846 +1,856 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2020 Oxide Computer Company */ #include #include #include #include #include #include #include #include #include #include #include static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) { dmu_buf_impl_t *db; int txgoff = tx->tx_txg & TXG_MASK; int nblkptr = dn->dn_phys->dn_nblkptr; int old_toplvl = dn->dn_phys->dn_nlevels - 1; int new_level = dn->dn_next_nlevels[txgoff]; int i; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); /* * Lock ordering requires that we hold the children's db_mutexes (by * calling dbuf_find()) before holding the parent's db_rwlock. The lock * order is imposed by dbuf_read's steps of "grab the lock to protect * db_parent, get db_parent, hold db_parent's db_rwlock". 
*/ dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); for (i = 0; i < nblkptr; i++) { children[i] = dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); } /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); if (dn->dn_dbuf != NULL) rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); rw_enter(&db->db_rwlock, RW_WRITER); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { dmu_buf_impl_t *child = children[i]; if (child == NULL) continue; #ifdef ZFS_DEBUG DB_DNODE_ENTER(child); ASSERT3P(DB_DNODE(child), ==, dn); DB_DNODE_EXIT(child); #endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != &dn->dn_phys->dn_blkptr[child->db_blkid]); mutex_exit(&child->db_mtx); continue; } ASSERT(child->db_parent == NULL || child->db_parent == dn->dn_dbuf); child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) child->db_blkptr = (blkptr_t *)db->db.db_data + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, child->db_blkptr, "changed db_blkptr to new indirect %s", ""); mutex_exit(&child->db_mtx); } bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); rw_exit(&db->db_rwlock); if (dn->dn_dbuf != NULL) rw_exit(&dn->dn_dbuf->db_rwlock); dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); } static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); /* * Save some useful information on the holes being * punched, including logical size, type, and indirection * level. Retaining birth time enables detection of when * holes are punched for reducing the number of free * records transmitted during a zfs send. 
*/ uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t type = BP_GET_TYPE(bp); uint64_t lvl = BP_GET_LEVEL(bp); bzero(bp, sizeof (blkptr_t)); if (spa_feature_is_active(dn->dn_objset->os_spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, type); BP_SET_LEVEL(bp, lvl); BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); } } dnode_diduse_space(dn, -bytesfreed); } #ifdef ZFS_DEBUG static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<<epbs); num = end - start + 1; ASSERT3U(off, >=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { uint64_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; int j; ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level - 1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); dr = dbuf_find_dirty_eq(child, txg); /* data_old better be zeroed */ if (dr) { buf = dr->dt.dl.dr_data->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } /* * db_data better be zeroed unless it's dirty in a * future txg. */ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && list_is_empty(&child->db_dirty_records)) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } mutex_exit(&child->db_mtx); dbuf_rele(child, FTAG); } DB_DNODE_EXIT(db); } #endif /* * We don't usually free the indirect blocks here. If in one txg we have a * free_range and a write to the same indirect block, it's important that we * preserve the hole's birth times. Therefore, we don't free any indirect * blocks in free_children(). If an indirect block happens to turn into all * holes, it will be freed by dbuf_write_children_ready, which happens at a * point in the syncing process where we know for certain the contents of the * indirect block. * * However, if we're freeing a dnode, its space accounting must go to zero * before we actually try to free the dnode, or we will trip an assertion. In * addition, we know the case described above cannot occur, because the dnode is * being freed. Therefore, we free the indirect blocks immediately in that * case. */ static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, boolean_t free_indirects, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend; unsigned int epbs, shift, i; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 * 2 - if this block was evicted since we read it from * dmu_tx_hold_free(). */ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); /* * If we modify this indirect block, and we are not freeing the * dnode (!free_indirects), then this indirect block needs to get * written to disk by dbuf_write().
If it is dirty, we know it will * be written (otherwise, we would have incorrect on-disk state * because the space would be freed but still referenced by the BP * in this indirect block). Therefore we VERIFY that it is * dirty. * * Our VERIFY covers some cases that do not actually have to be * dirty, but the open-context code happens to dirty. E.g. if the * blocks we are freeing are all holes, because in that case, we * are only freeing part of this indirect block, so it is an * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). */ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); dmu_buf_unlock_parent(db, dblt, FTAG); dbuf_release_bp(db); bp = db->db.db_data; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; } else { start = dbstart; } dbend = ((db->db_blkid + 1) << epbs) - 1; end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); rw_enter(&db->db_rwlock, RW_WRITER); free_blocks(dn, bp, end - start + 1, tx); rw_exit(&db->db_rwlock); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, id, TRUE, FALSE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); free_children(subdb, blkid, nblks, free_indirects, tx); dbuf_rele(subdb, FTAG); } } if (free_indirects) { rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); } /* * Traverse the indicated range of the provided file * and "free" all the blocks contained there. */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; boolean_t trunc = B_FALSE; if (blkid > dn->dn_phys->dn_maxblkid) return; ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); if (blkid + nblks > dn->dn_phys->dn_maxblkid) { nblks = dn->dn_phys->dn_maxblkid - blkid + 1; trunc = B_TRUE; } /* There are no indirect blocks in the object */ if (dnlevel == 1) { if (blkid >= dn->dn_phys->dn_nblkptr) { /* this range was never made persistent */ return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); free_blocks(dn, bp + blkid, nblks, tx); } else { int shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); int start = blkid >> shift; int end = (blkid + nblks - 1) >> shift; dmu_buf_impl_t *db; ASSERT(start < dn->dn_phys->dn_nblkptr); bp += start; for (int i = start; i <= end; i++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } /* * Do not truncate the maxblkid if we are performing a raw * receive. The raw receive sets the maxblkid manually and * must not be overridden. 
Usually, the last DRR_FREE record * will be at the maxblkid, because the source system sets * the maxblkid when truncating. However, if the last block * was freed by overwriting with zeros and being compressed * away to a hole, the source system will generate a DRR_FREE * record while leaving the maxblkid after the end of that * record. In this case we need to leave the maxblkid as * indicated in the DRR_OBJECT record, so that it matches the * source system, ensuring that the cryptographic hashes will * match. */ if (trunc && !dn->dn_objset->os_raw_receive) { uint64_t off __maybe_unused; dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) { dnode_sync_free_range_arg_t *dsfra = arg; dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } /* * Try to kick all the dnode's dbufs out of the cache... */ void dnode_evict_dbufs(dnode_t *dn) { dmu_buf_impl_t *db_marker; dmu_buf_impl_t *db, *db_next; db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { #ifdef ZFS_DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); DB_DNODE_EXIT(db); #endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING && zfs_refcount_is_zero(&db->db_holds)) { db_marker->db_level = db->db_level; db_marker->db_blkid = db->db_blkid; db_marker->db_state = DB_SEARCH; avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); /* * We need to use the "marker" dbuf rather than * simply getting the next dbuf, because * dbuf_destroy() may actually remove multiple dbufs. * It can call itself recursively on the parent dbuf, * which may also be removed from dn_dbufs. The code * flow would look like: * * dbuf_destroy(): * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): * if (!cacheable || pending_evict) * dbuf_destroy() */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); avl_remove(&dn->dn_dbufs, db_marker); } else { db->db_pending_evict = TRUE; mutex_exit(&db->db_mtx); db_next = AVL_NEXT(&dn->dn_dbufs, db); } } mutex_exit(&dn->dn_dbufs_mtx); kmem_free(db_marker, sizeof (dmu_buf_impl_t)); dnode_evict_bonus(dn); } void dnode_evict_bonus(dnode_t *dn) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus != NULL) { if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; } } rw_exit(&dn->dn_struct_rwlock); } static void dnode_undirty_dbufs(list_t *list) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; if (db->db_level != 0) dnode_undirty_dbufs(&dr->dt.di.dr_children); mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? 
*/ list_remove(list, dr); ASSERT(list_head(&db->db_dirty_records) == dr); list_remove_head(&db->db_dirty_records); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); } else { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } static void dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; ASSERT(dmu_tx_is_syncing(tx)); /* * Our contents should have been freed in dnode_sync() by the * free range record inserted by the caller of dnode_free(). */ ASSERT0(DN_USED_BYTES(dn->dn_phys)); ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); /* * XXX - It would be nice to assert this, but we may still * have residual holds from async evictions from the arc... * * zfs_obj_to_path() also depends on this being * commented out. * * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1); */ /* Undirty next bits */ dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; dn->dn_next_maxblkid[txgoff] = 0; /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_have_spill = B_FALSE; dn->dn_num_slots = 1; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* * Now that we've released our hold, the dnode may * be evicted, so we mustn't access it. */ } /* * Write out the dnode's dirty buffers. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn __maybe_unused = { 0 }; boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); /* * Do user accounting if it is enabled and this is not * an encrypted receive. 
*/ if (dmu_objset_userused_enabled(os) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object) && (!os->os_encrypted || !dmu_objset_is_receiving(os))) { mutex_enter(&dn->dn_mtx); dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); dn->dn_oldflags = dn->dn_phys->dn_flags; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; if (dmu_objset_userobjused_enabled(dn->dn_objset)) dn->dn_phys->dn_flags |= DNODE_FLAG_USEROBJUSED_ACCOUNTED; mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { /* Once we account for it, we should always account for it */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED)); } mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ dnp->dn_nlevels = 1; dnp->dn_nblkptr = dn->dn_nblkptr; } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } dnp->dn_extra_slots = dn->dn_num_slots - 1; ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(dnp->dn_nlevels < 2 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); if (dn->dn_next_type[txgoff] != 0) { dnp->dn_type = dn->dn_type; dn->dn_next_type[txgoff] = 0; } if (dn->dn_next_blksz[txgoff] != 0) { ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || !range_tree_is_empty(dn->dn_free_ranges[txgoff])); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; } if (dn->dn_next_bonuslen[txgoff] != 0) { if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) dnp->dn_bonuslen = 0; else dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; ASSERT(dnp->dn_bonuslen <= DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); dn->dn_next_bonuslen[txgoff] = 0; } if (dn->dn_next_bonustype[txgoff] != 0) { ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } boolean_t freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg; /* * Remove the spill block if we have been explicitly asked to * remove it, or if the object is being removed. */ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) kill_spill = B_TRUE; dn->dn_rm_spillblk[txgoff] = 0; } if (dn->dn_next_indblkshift[txgoff] != 0) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; dn->dn_next_indblkshift[txgoff] = 0; } /* * Just take the live (open-context) values for checksum and compress. * Strictly speaking it's a future leak, but nothing bad happens if we * start using the new checksum or compress algorithm a little early. 
*/ dnp->dn_checksum = dn->dn_checksum; dnp->dn_compress = dn->dn_compress; mutex_exit(&dn->dn_mtx); if (kill_spill) { free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* process all the "freed" ranges in the file */ if (dn->dn_free_ranges[txgoff] != NULL) { dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; dsfra.dsfra_free_indirects = freeing_dnode; + mutex_enter(&dn->dn_mtx); if (freeing_dnode) { ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], 0, dn->dn_maxblkid + 1)); } - mutex_enter(&dn->dn_mtx); - range_tree_vacate(dn->dn_free_ranges[txgoff], + /* + * Because dnode_sync_free_range() must drop dn_mtx during its + * processing, using it as a callback to range_tree_vacate() is + * not safe. No other operations (besides destroy) are allowed + * once range_tree_vacate() has begun, and dropping dn_mtx + * would leave a window open for another thread to observe that + * invalid (and unsafe) state. + */ + range_tree_walk(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); + range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); } if (freeing_dnode) { dn->dn_objset->os_freed_dnodes++; dnode_sync_free(dn, tx); return; } if (dn->dn_num_slots > DNODE_MIN_SLOTS) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; } /* * This must be done after dnode_sync_free_range() * and dnode_increase_indirection(). See dnode_new_blkid() * for an explanation of the high bit being set. */ if (dn->dn_next_maxblkid[txgoff]) { mutex_enter(&dn->dn_mtx); dnp->dn_maxblkid = dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET; dn->dn_next_maxblkid[txgoff] = 0; mutex_exit(&dn->dn_mtx); } if (dn->dn_next_nblkptr[txgoff]) { /* this should only happen on a realloc */ ASSERT(dn->dn_allocated_txg == tx->tx_txg); if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { /* zero the new blkptrs we are gaining */ bzero(dnp->dn_blkptr + dnp->dn_nblkptr, sizeof (blkptr_t) * (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); #ifdef ZFS_DEBUG } else { int i; ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); /* the blkptrs we are losing better be unallocated */ for (i = 0; i < dnp->dn_nblkptr; i++) { if (i >= dn->dn_next_nblkptr[txgoff]) ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); } #endif } mutex_enter(&dn->dn_mtx); dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; dn->dn_next_nblkptr[txgoff] = 0; mutex_exit(&dn->dn_mtx); } dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet * initiated the IO for the dnode's dbuf. 
*/ } Index: vendor-sys/openzfs/dist/module/zfs/dsl_dir.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/dsl_dir.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/dsl_dir.c (revision 364928) @@ -1,2414 +1,2403 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #ifdef _KERNEL #include #endif /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. 
* When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initialized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). - * - * There is a special case when we receive a filesystem that already exists. In - * this case a temporary clone name of %X is created (see dmu_recv_begin). We - * never update the filesystem counts for temporary clones. - * - * Likewise, we do not update the snapshot counts for temporary snapshots, - * such as those created by zfs diff. 
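As a rough illustration of the two traversals this comment describes (descend to initialize counts when a limit is first set, then walk up validating against any initialized limits when something is created), here is a toy model with made-up types; none of this is the ZFS API:

#include <stdint.h>
#include <stddef.h>

#define	CNT_UNINIT	UINT64_MAX	/* "no count property on disk" */
#define	NO_LIMIT	UINT64_MAX	/* "no limit set at this level" */

typedef struct fsnode {
	struct fsnode *parent;
	struct fsnode *children[8];
	int nchildren;
	uint64_t fs_count;	/* CNT_UNINIT until a limit initializes it */
	uint64_t fs_limit;
} fsnode_t;

/* Descend and fill in counts; an initialized subtree is already correct. */
static uint64_t
init_counts(fsnode_t *n)
{
	if (n->fs_count != CNT_UNINIT)
		return (n->fs_count);
	uint64_t cnt = 0;
	for (int i = 0; i < n->nchildren; i++)
		cnt += 1 + init_counts(n->children[i]);
	n->fs_count = cnt;
	return (cnt);
}

/* Walk up from the parent of the new fs; stop at an uninitialized node. */
static int
check_limit(fsnode_t *n, uint64_t delta)
{
	for (; n != NULL && n->fs_count != CNT_UNINIT; n = n->parent) {
		if (n->fs_count + delta > n->fs_limit)
			return (-1);	/* EDQUOT in the real code */
	}
	return (0);
}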
*/ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { dsl_dir_t *ddulrta_dd; uint64_t ddlrta_txg; } ddulrt_arg_t; static void dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; dsl_pool_t *dp __maybe_unused = dd->dd_pool; dd->dd_dbuf = NULL; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_async_rele(dd->dd_parent, dd); spa_async_close(dd->dd_pool->dp_spa, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; dmu_object_info_t doi; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); if (dsl_dir_is_zapified(dd)) { err = zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj); if (err == 0) { /* check for on-disk format errata */ if (dsl_dir_incompatible_encryption_version( dd)) { dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; } } else if (err != ENOENT) { goto errout; } } dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strlcpy(dd->dd_myname, tail, sizeof (dd->dd_myname)); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa), sizeof (dd->dd_myname)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. 
*/ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); if (dsl_dir_is_zapified(dd)) { uint64_t obj; err = zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); if (err == 0) dsl_dir_livelist_open(dd, obj); else if (err != ENOENT) goto errout; } } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* * Remove a reference to the given dsl dir that is being asynchronously * released. Async releases occur from a taskq performing eviction of * dsl datasets and dirs. This process is identical to a normal release * with the exception of using the async API for releasing the reference on * the spa. 
*/ void dsl_dir_async_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); p = NULL; } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); err = getcomponent(name, buf, &next); if (err != 0) goto error; /* Make sure the name is in the specified pool. 
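A small stand-alone sketch of the component-at-a-time parsing that dsl_dir_hold() drives through getcomponent(): "pool/a/b@snap" yields "pool", "a", "b", and the caller stops when it sees the remaining "@snap". Toy limits, simplified error handling:

#include <stdio.h>
#include <string.h>

#define	NAME_MAX_LEN	256

static int
next_component(const char *path, char *comp, const char **rest)
{
	if (path == NULL || path[0] == '\0')
		return (-1);			/* ENOENT */
	const char *p = strpbrk(path, "/@");
	if (p != NULL && (p[1] == '/' || p[1] == '@'))
		return (-1);			/* two separators in a row */
	size_t len = (p == NULL) ? strlen(path) : (size_t)(p - path);
	if (len >= NAME_MAX_LEN)
		return (-1);			/* ENAMETOOLONG */
	memcpy(comp, path, len);
	comp[len] = '\0';
	*rest = (p != NULL && *p == '/') ? p + 1 : p;
	return (0);
}

int
main(void)
{
	const char *rest = "pool/a/b@snap";
	char comp[NAME_MAX_LEN];

	while (next_component(rest, comp, &rest) == 0) {
		printf("component: %s\n", comp);
		if (rest != NULL && rest[0] == '@')
			break;			/* snapshot part, stop here */
	}
	return (0);
}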
*/ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) { err = SET_ERROR(EXDEV); goto error; } ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_dd; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); goto error; } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; if (err == 0) *ddp = dd; error: kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* - * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and - * temporary datasets. + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets. 
*/ - if (chld_dd->dd_myname[0] == '$' || - chld_dd->dd_myname[0] == '%') { + if (chld_dd->dd_myname[0] == '$') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. 
We allow the limit to be exceeded if the user has permission to * write the property value. We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr, proc_t *proc) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; const char *zonedstr; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL if (crgetzoneid(cr) != GLOBAL_ZONEID) return (ENFORCE_ALWAYS); /* * We are checking the saved credentials of the user process, which is * not the current process. Note that we can't use secpolicy_zfs(), * because it only works if the cred is that of the current process (on * Linux). */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. 
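A simplified model of how the three-way enforcement result is consumed: ENFORCE_NEVER short-circuits, ENFORCE_ABOVE skips only the local limit but still walks the ancestors, ENFORCE_ALWAYS checks every level. The real code recomputes enforcement per level and distinguishes filesystem from snapshot limits, which this toy omits:

#include <stdint.h>
#include <stddef.h>

typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t;

typedef struct lnode {
	struct lnode *parent;
	uint64_t count;
	uint64_t limit;		/* UINT64_MAX means "no limit set" */
} lnode_t;

static int
limit_check(lnode_t *n, uint64_t delta, enforce_res_t enforce)
{
	if (enforce == ENFORCE_NEVER || delta == 0)
		return (0);
	for (; n != NULL; n = n->parent) {
		/* With ENFORCE_ABOVE, the first (local) limit is not binding. */
		if (enforce == ENFORCE_ALWAYS && n->count + delta > n->limit)
			return (-1);	/* EDQUOT */
		enforce = ENFORCE_ALWAYS;
	}
	return (0);
}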
*/ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd)) return (0); err = zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count); if (err == ENOENT) return (0); if (err != 0) return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr, proc); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We don't count this temporary - * clone, whose name begins with a '%'. We also ignore hidden ($FREE, - * $MOS & $ORIGIN) objsets. + * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets. */ - if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && - strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + if (dd->dd_myname[0] == '$' && strcmp(prop, + DD_FIELD_FILESYSTEM_COUNT) == 0) { return; + } /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. 
*/ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } uint64_t dsl_dir_get_used(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_bytes); } uint64_t dsl_dir_get_compressed(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_compressed_bytes); } uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); } uint64_t dsl_dir_get_reservation(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_reserved); } uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd) { /* a fixed point number, 100x the ratio */ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); } uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_uncompressed_bytes); } uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); } uint64_t dsl_dir_get_usedds(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); } uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); } uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } void dsl_dir_get_origin(dsl_dir_t *dd, char *buf) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_get_quota(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_get_reservation(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_get_logicalused(dd)); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_get_usedsnap(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_get_usedds(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_get_usedrefreserv(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_get_usedchild(dd)); } mutex_exit(&dd->dd_lock); uint64_t count; if (dsl_dir_get_filesystem_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (dsl_dir_get_snapshot_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dir_get_origin(dd, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t 
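parent_delta() is easiest to see with numbers: the parent is already charged MAX(used, reserved), so only growth past the reservation propagates upward. A small worked example with illustrative values only:

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int64_t
parent_delta_example(uint64_t used, uint64_t reserved, int64_t delta)
{
	uint64_t old_accounted = MAX(used, reserved);
	uint64_t new_accounted = MAX(used + delta, reserved);
	return ((int64_t)(new_accounted - old_accounted));
}

int
main(void)
{
	/* 10M used, 50M reserved: a 20M write stays inside the reservation. */
	printf("%lld\n",
	    (long long)parent_delta_example(10 << 20, 50 << 20, 20 << 20)); /* 0 */
	/* 40M used, 50M reserved: only 10M of a 20M write exceeds it. */
	printf("%lld\n",
	    (long long)parent_delta_example(40 << 20, 50 << 20, 20 << 20)); /* 10M */
	return (0);
}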
dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (int i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i & TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg; uint64_t quota; struct tempreserve *tr; int retval; uint64_t ref_rsrv; top_of_function: txg = tx->tx_txg; retval = EDQUOT; ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ uint64_t est_inflight = dsl_dir_space_towrite(dd); for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error != 0) { mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. 
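A reduced model of the space-available computation, keeping only the quota cap, the parent's contribution, and the unused-reservation top-up; the ancestor/delta/ondiskonly parameters of the real function are left out:

#include <stdint.h>
#include <stddef.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

typedef struct sdir {
	struct sdir *parent;
	uint64_t used;
	uint64_t quota;		/* 0 means "no quota" */
	uint64_t reserved;
	uint64_t pool_size;	/* only meaningful at the root */
} sdir_t;

static uint64_t
space_available(sdir_t *d)
{
	uint64_t parentspace = (d->parent != NULL) ?
	    space_available(d->parent) : UINT64_MAX;
	uint64_t quota = (d->quota != 0) ? d->quota : UINT64_MAX;

	if (d->parent == NULL)
		quota = MIN(quota, d->pool_size);

	/* Space we reserved beyond what the parent gave us. */
	if (d->reserved > d->used && parentspace != UINT64_MAX)
		parentspace += d->reserved - d->used;

	if (d->used > quota)
		return (0);			/* over quota */
	return (MIN(parentspace, quota - d->used));
}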
In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ uint64_t deferred = 0; if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); if (avail < quota) { quota = avail; retval = SET_ERROR(ENOSPC); } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txg & TXG_MASK] += asize; uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent != NULL && parent_rsrv != 0) { /* * Recurse on our parent without recursion. This has been * observed to be potentially large stack usage even within * the test suite. Largest seen stack was 7632 bytes on linux. */ dd = dd->dd_parent; asize = parent_rsrv; ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, netfree, B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). 
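The goto top_of_function loop above replaces what would naturally be tail recursion on the parent, to bound stack usage. The same shape written as an ordinary while loop, with toy reservation logic standing in for the real quota math:

#include <stdint.h>
#include <stddef.h>

typedef struct rdir {
	struct rdir *parent;
	uint64_t quota;		/* 0 = none */
	uint64_t used;
	uint64_t pending;	/* tempreserved + towrite estimate */
} rdir_t;

static int
tempreserve_chain(rdir_t *d, uint64_t asize)
{
	while (d != NULL) {
		uint64_t quota = (d->quota != 0) ? d->quota : UINT64_MAX;

		if (d->used + d->pending + asize > quota)
			return (-1);	/* EDQUOT/ERESTART in the real code */
		d->pending += asize;	/* charge this level */
		/*
		 * dsl_dir_tempreserve_impl() passes parent_delta() to the
		 * next level, not asize; kept simple here.
		 */
		d = d->parent;
	}
	return (0);
}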
*/ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD * version however it has been adjusted to use an iterative rather than * recursive algorithm to minimize stack usage. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; do { mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); dd = dd->dd_parent; space = parent_space; } while (space && dd); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. 
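The tempreserve cookie is essentially a per-level undo log: each charge is recorded in a list handed back as an opaque cookie, and one clear call unwinds the whole chain. A minimal sketch of that pattern with a toy list and no locking (the real code stores dsl_dir pointers and treats the ARC entry specially):

#include <stdlib.h>
#include <stdint.h>

typedef struct tr {
	struct tr *next;
	uint64_t *counter;	/* what was charged */
	uint64_t size;
} tr_t;

static void
reserve(tr_t **cookie, uint64_t *counter, uint64_t size)
{
	tr_t *t = calloc(1, sizeof (*t));

	if (t == NULL)
		abort();		/* toy: no error path */
	t->counter = counter;
	t->size = size;
	*counter += size;
	t->next = *cookie;
	*cookie = t;
}

static void
reserve_clear(tr_t *cookie)
{
	while (cookie != NULL) {
		tr_t *t = cookie;

		cookie = t->next;
		*t->counter -= t->size;	/* undo the charge */
		free(t);
	}
}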
*/ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); dsl_dir_phys(dd)->dd_used_bytes += used; dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; dsl_dir_phys(dd)->dd_compressed_bytes += compressed; if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dsl_dir_phys(dd)->dd_used_breakdown[t]; ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); } #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. 
*/ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. 
*/ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. 
*/ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; proc_t *ddra_proc; } dsl_dir_rename_arg_t; typedef struct dsl_valid_rename_arg { int char_delta; int nest_delta; } dsl_valid_rename_arg_t; /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); int namelen = strlen(namebuf) + dvra->char_delta; int depth = get_dataset_depth(namebuf) + dvra->nest_delta; if (namelen >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; dsl_valid_rename_arg_t dvra; dsl_dataset_t *parentds; objset_t *parentos; const char *mynewname; int error; /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EXDEV)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* can't rename below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(newparent->dd_pool, dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); dvra.char_delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - get_dataset_depth(ddra->ddra_oldname); /* if the name length is growing, validate child name lengths */ if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. 
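A toy rendering of closest_common_ancestor() and would_change(): find the first shared node on the two parent chains, then fold the delta through each level's reservation on the way up to that ancestor:

#include <stdint.h>
#include <stddef.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

typedef struct tnode {
	struct tnode *parent;
	uint64_t used;
	uint64_t reserved;
} tnode_t;

static tnode_t *
common_ancestor(tnode_t *a, tnode_t *b)
{
	for (; a != NULL; a = a->parent)
		for (tnode_t *n = b; n != NULL; n = n->parent)
			if (n == a)
				return (a);
	return (NULL);
}

static int64_t
delta_at_ancestor(tnode_t *n, int64_t delta, tnode_t *ancestor)
{
	/* Each level absorbs whatever its reservation already covered. */
	for (; n != NULL && n != ancestor; n = n->parent) {
		uint64_t old_acct = MAX(n->used, n->reserved);
		uint64_t new_acct = MAX(n->used + delta, n->reserved);

		delta = (int64_t)(new_acct - old_acct);
	}
	return (delta);
}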
* * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* check for encryption errors */ error = dsl_dir_rename_crypt_check(dd, newparent); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EACCES)); } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred, ddra->ddra_proc); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. 
*/ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname, B_TRUE); dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); ddra.ddra_proc = curproc; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr, proc_t *proc) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr, proc); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr, proc); if (err != 0) return (err); return (0); } inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { inode_timespec_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { inode_timespec_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); 
return (doi.doi_type == DMU_OTN_ZAP_METADATA); } void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) { objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, SPA_FEATURE_LIVELIST)); dsl_deadlist_open(&dd->dd_livelist, mos, obj); bplist_create(&dd->dd_pending_allocs); bplist_create(&dd->dd_pending_frees); } void dsl_dir_livelist_close(dsl_dir_t *dd) { dsl_deadlist_close(&dd->dd_livelist); bplist_destroy(&dd->dd_pending_allocs); bplist_destroy(&dd->dd_pending_frees); } void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) { uint64_t obj; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; livelist_condense_entry_t to_condense = spa->spa_to_condense; if (!dsl_deadlist_is_open(&dd->dd_livelist)) return; /* * If the livelist being removed is set to be condensed, stop the * condense zthr and indicate the cancellation in the spa_to_condense * struct in case the condense no-wait synctask has already started */ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL && (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { /* * We use zthr_wait_cycle_done instead of zthr_cancel * because we don't want to destroy the zthr, just have * it skip its current task. */ spa->spa_to_condense.cancelled = B_TRUE; zthr_wait_cycle_done(ll_condense_thread); /* * If we've returned from zthr_wait_cycle_done without * clearing the to_condense data structure it's either * because the no-wait synctask has started (which is * indicated by 'syncing' field of to_condense) and we * can expect it to clear to_condense on its own. * Otherwise, we returned before the zthr ran. The * checkfunc will now fail as cancelled == B_TRUE so we * can safely NULL out ds, allowing a different dir's * livelist to be condensed. * * We can be sure that the to_condense struct will not * be repopulated at this stage because both this * function and dsl_livelist_try_condense execute in * syncing context. 
*/ if ((spa->spa_to_condense.ds != NULL) && !spa->spa_to_condense.syncing) { dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; } } dsl_dir_livelist_close(dd); VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, tx)); if (total) { dsl_deadlist_free(dp->dp_meta_objset, obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); } } static int dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); switch (activity) { case ZFS_WAIT_DELETEQ: { #ifdef _KERNEL objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) break; mutex_enter(&os->os_user_ptr_lock); void *user = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (dmu_objset_type(os) != DMU_OST_ZFS || user == NULL || zfs_get_vfs_flag_unmounted(os)) { *in_progress = B_FALSE; return (0); } uint64_t readonly = B_FALSE; error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, NULL); if (error != 0) break; if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { *in_progress = B_FALSE; return (0); } uint64_t count, unlinked_obj; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &unlinked_obj); if (error != 0) { dsl_dataset_rele(ds, FTAG); break; } error = zap_count(os, unlinked_obj, &count); if (error == 0) *in_progress = (count != 0); break; #else /* * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. */ *in_progress = B_FALSE; break; #endif } default: panic("unrecognized value for activity %d", activity); } return (error); } int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *waited) { int error = 0; boolean_t in_progress; dsl_pool_t *dp = dd->dd_pool; for (;;) { dsl_pool_config_enter(dp, FTAG); error = dsl_dir_activity_in_progress(dd, ds, activity, &in_progress); dsl_pool_config_exit(dp, FTAG); if (error != 0 || !in_progress) break; *waited = B_TRUE; if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == 0 || dd->dd_activity_cancelled) { error = SET_ERROR(EINTR); break; } } return (error); } void dsl_dir_cancel_waiters(dsl_dir_t *dd) { mutex_enter(&dd->dd_activity_lock); dd->dd_activity_cancelled = B_TRUE; cv_broadcast(&dd->dd_activity_cv); while (dd->dd_activity_waiters > 0) cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); mutex_exit(&dd->dd_activity_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif Index: vendor-sys/openzfs/dist/module/zfs/sa.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/sa.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/sa.c (revision 364928) @@ -1,2258 +1,2257 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #ifdef _KERNEL #include #endif /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. The data will be stored in the bonus buffer of * the dnode and if necessary a special "spill" block will be used to handle * overflow situations. The spill block will be sized to fit the data * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the * spill block is stored at the end of the current bonus buffer. Any * attributes that would be in the way of the blkptr_t will be relocated * into the spill block. * * Attribute registration: * * Stored persistently on a per dataset basis * a mapping between attribute "string" names and their actual attribute * numeric values, length, and byteswap function. The names are only used * during registration. All attributes are known by their unique attribute * id value. If an attribute can have a variable size then the value * 0 will be used to indicate this. * * Attribute Layout: * * Attribute layouts are a way to compactly store multiple attributes, but * without taking the overhead associated with managing each attribute * individually. Since you will typically have the same set of attributes * stored in the same order a single table will be used to represent that * layout. The ZPL for example will usually have only about 10 different * layouts (regular files, device files, symlinks, * regular files + scanstamp, files/dir with extended attributes, and then * you have the possibility of all of those minus ACL, because it would * be kicked out into the spill block) * * Layouts are simply an array of the attributes and their * ordering i.e. [0, 1, 4, 5, 2] * * Each distinct layout is given a unique layout number and that is what's * stored in the header at the beginning of the SA data buffer. * * A layout only covers a single dbuf (bonus or spill). If a set of * attributes is split up between the bonus buffer and a spill buffer then * two different layouts will be used. This allows us to byteswap the * spill without looking at the bonus buffer and keeps the on disk format of * the bonus and spill buffer the same. * * Adding a single attribute will cause the entire set of attributes to * be rewritten and could result in a new layout number being constructed * as part of the rewrite if no such layout exists for the new set of * attributes. The new attribute will be appended to the end of the already * existing attributes. * * Both the attribute registration and attribute layout information are * stored in normal ZAP attributes. Their should be a small number of * known layouts and the set of attributes is assumed to typically be quite * small. * * The registered attributes and layout "table" information is maintained * in core and a special "sa_os_t" is attached to the objset_t. * * A special interface is provided to allow for quickly applying * a large set of attributes at once. 
sa_replace_all_by_template() is * used to set an array of attributes. This is used by the ZPL when * creating a brand new file. The template that is passed into the function * specifies the attribute, size for variable length attributes, location of * data and special "data locator" function if the data isn't in a contiguous * location. * * Byteswap implications: * * Since the SA attributes are not entirely self describing we can't do * the normal byteswap processing. The special ZAP layout attribute and * attribute registration attributes define the byteswap function and the * size of the attributes, unless it is variable sized. * The normal ZFS byteswapping infrastructure assumes you don't need * to read any objects in order to do the necessary byteswapping. Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); arc_byteswap_func_t sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS #define SA_COPY_DATA(f, s, t, l) \ do { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ bcopy(s, t, l); \ } \ } else { \ sa_copy_data(f, s, t, l); \ } \ } while (0) #else #define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l) #endif /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. 
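 *
 * A minimal consumer sketch (illustrative; "zpl_attrs" and the ZPL_*
 * indices stand in for whatever attribute table the caller registers):
 *
 *	sa_attr_type_t *attr_table;
 *	sa_handle_t *hdl;
 *	uint64_t size;
 *
 *	VERIFY0(sa_setup(os, sa_obj, zpl_attrs, ZPL_END, &attr_table));
 *	VERIFY0(sa_handle_get(os, obj, NULL, SA_HDL_SHARED, &hdl));
 *	VERIFY0(sa_lookup(hdl, attr_table[ZPL_SIZE], &size, sizeof (size)));
 *	size += delta;
 *	VERIFY0(sa_update(hdl, attr_table[ZPL_SIZE], &size, sizeof (size),
 *	    tx));
 *	sa_handle_destroy(hdl);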
*/ sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * This is only used for objects of type DMU_OT_ZNODE */ sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; /*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } /*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } static boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) { int i; uint64_t crc = -1ULL; for (i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ static int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); 
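		/*
		 * Each bulk element is resolved against the bonus buffer
		 * first and then, if not present there, against the spill
		 * block; SA_ATTR_INFO() fills in sa_addr/sa_size when the
		 * attribute is located.  Callers typically build the array
		 * with SA_ADD_BULK_ATTR(), e.g. (sketch only):
		 *
		 *	sa_bulk_attr_t bulk[2];
		 *	int count = 0;
		 *
		 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
		 *	    NULL, &mode, 8);
		 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
		 *	    NULL, &size, 8);
		 *	error = sa_bulk_lookup(hdl, bulk, count);
		 */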
bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (SET_ERROR(ENOENT)); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; default: break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = 
AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { bcopy(datastart, target, buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); bcopy(dataptr, saptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different values pertaining to system attribute * buffers. * * Return the size of the sa_hdr_phys_t header for the buffer. Each * variable length attribute except the first contributes two bytes to * the header size, which is then rounded up to an 8-byte boundary. * * The following output parameters are also computed. * * index - The index of the first attribute in attr_desc that will * spill over. Only valid if will_spill is set. * * total - The total number of bytes of all system attributes described * in attr_desc. * * will_spill - Set when spilling is necessary. It is only set when * the buftype is SA_BONUS. */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, int *total, boolean_t *will_spill) { int var_size_count = 0; int i; int hdrsize; int extra_hdrsize; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; *will_spill = B_FALSE; extra_hdrsize = 0; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz, might_spill_here; int tmp_hdrsize; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (*will_spill) continue; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) var_size_count++; /* * Calculate what the SA header size would be if this * attribute doesn't spill. */ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ? sizeof (uint16_t) : 0); /* * Check whether this attribute spans into the space * that would be used by the spill block pointer should * a spill block be needed. */ might_spill_here = buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(tmp_hdrsize, 8)) > (full_space - sizeof (blkptr_t)); if (is_var_sz && var_size_count > 1) { if (buftype == SA_SPILL || tmp_hdrsize + *total < full_space) { /* * Record the extra header size in case this * increase needs to be reversed due to * spill-over. */ hdrsize = tmp_hdrsize; if (*index != -1 || might_spill_here) extra_hdrsize += sizeof (uint16_t); } else { ASSERT(buftype == SA_BONUS); if (*index == -1) *index = i; *will_spill = B_TRUE; continue; } } /* * Store index of where spill *could* occur. 
Then * continue to count the remaining attribute sizes. The * sum is used later for sizing bonus and spill buffer. */ if (might_spill_here) *index = i; if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } if (*will_spill) hdrsize -= extra_hdrsize; hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. */ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; int dnodesize; int spill_idx; int hdrsize; int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); bonuslen = DN_BONUS_SIZE(dnodesize); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, bonuslen, &spill_idx, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx], attr_count - spill_idx, hdl->sa_spill, SA_SPILL, hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; if (spilling && i == spill_idx) { /* switch to spill buffer */ VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } data_start = (void 
*)P2ROUNDUP(((uintptr_t)data_start + length), 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = SET_ERROR(ENOENT); switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. 
*/ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); ASSERT(error != 0); return (error); } int sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t 
*lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num) == 0); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { sa_idx_tab_t *tab; while ((tab = list_head(&layout->lot_idx_tab))) { ASSERT(zfs_refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } static void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; if (reg_length) { attr_length = reg_length; idx_len = 0; } 
else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } /*ARGSUSED*/ static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } static void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; int num_lengths = 1; int i; sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lengths in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? 
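	 * (The stored sa_magic reads back as BSWAP_32(SA_MAGIC) when the
	 * dataset was last written on a host of the opposite endianness;
	 * any other unexpected value is treated as corruption below.)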
*/ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) { mutex_exit(&sa->sa_lock); zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x " "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC, db->db.db_object); return (SET_ERROR(EIO)); } sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } /*ARGSUSED*/ static void sa_evict_sync(void *dbu) { panic("evicting sa dbuf\n"); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); zfs_refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa __maybe_unused = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); } void sa_spill_rele(sa_handle_t *hdl) { mutex_enter(&hdl->sa_lock); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; } mutex_exit(&hdl->sa_lock); } void sa_handle_destroy(sa_handle_t *hdl) { dmu_buf_t *db = hdl->sa_bonus; mutex_enter(&hdl->sa_lock); (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); if (hdl->sa_spill_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele(hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ if (hdl_type == SA_HDL_SHARED) handle = dmu_buf_get_user(db); if (handle == NULL) { sa_handle_t *winner = NULL; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_dbu.dbu_evict_func_sync = NULL; handle->sa_dbu.dbu_evict_func_async = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; handle->sa_bonus_tab = NULL; handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); if (hdl_type == SA_HDL_SHARED) { dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, NULL); winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); } if (winner != NULL) { kmem_cache_free(sa_cache, handle); handle = winner; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if ((error = dmu_bonus_hold(objset, objid, NULL, &db))) return (error); return (sa_handle_get_from_db(objset, db, userp, hdl_type, 
handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, void *tag) { dmu_buf_rele(db, tag); } static int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } static int sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); error = sa_lookup_impl(hdl, &bulk, 1); return (error); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; mutex_enter(&hdl->sa_lock); error = sa_lookup_locked(hdl, attr, buf, buflen); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, uio_resid(uio)), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } /* * For the existed object that is upgraded from old system, its ondisk layout * has no slot for the project ID attribute. But quota accounting logic needs * to access related slots by offset directly. So we need to adjust these old * objects' layout to make the project ID to some unified and fixed offset. */ int sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) { znode_t *zp = sa_get_userdata(hdl); dmu_buf_t *db = sa_get_db(hdl); zfsvfs_t *zfsvfs = ZTOZSB(zp); int count = 0, err = 0; sa_bulk_attr_t *bulk, *attrs; zfs_acl_locator_cb_t locate = { 0 }; uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links; uint64_t crtime[2], mtime[2], ctime[2], atime[2]; zfs_acl_phys_t znode_acl = { 0 }; char scanstamp[AV_SCANSTAMP_SZ]; if (zp->z_acl_cached == NULL) { zfs_acl_t *aclp; mutex_enter(&zp->z_acl_lock); err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); mutex_exit(&zp->z_acl_lock); if (err != 0 && err != ENOENT) return (err); } bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); mutex_enter(&hdl->sa_lock); mutex_enter(&zp->z_lock); err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid, sizeof (uint64_t)); if (unlikely(err == 0)) /* Someone has added project ID attr by race. 
*/ err = EEXIST; if (err != ENOENT) goto out; /* First do a bulk query of the attributes that aren't cached */ if (zp->z_is_sa) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &znode_acl, 88); } err = sa_bulk_lookup_locked(hdl, bulk, count); if (err != 0) goto out; err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8); if (err != 0 && err != ENOENT) goto out; zp->z_projid = projid; zp->z_pflags |= ZFS_PROJID; links = ZTONLNK(zp); count = 0; err = 0; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if (zp->z_acl_cached != NULL) { SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &zp->z_acl_cached->z_acl_count, 8); if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) zfs_acl_xform(zp, zp->z_acl_cached, CRED()); locate.cb_aclp = zp->z_acl_cached; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); } if (xattr) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { 
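		/*
		 * The anti-virus scanstamp was historically stashed past the
		 * end of the old znode_phys_t in the bonus buffer; carry it
		 * over as a real SA attribute and clear the flag so it is not
		 * migrated again.
		 */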
bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, scanstamp, AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; } VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0); if (znode_acl.z_acl_extern_obj) { VERIFY(0 == dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); } zp->z_is_sa = B_TRUE; out: mutex_exit(&zp->z_lock); mutex_exit(&hdl->sa_lock); kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END); kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); return (err); } #endif static sa_idx_tab_t * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) { sa_idx_tab_t *idx_tab; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Deterimine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or with crypto enabled which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT((IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused? */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; zfs_refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE; 
mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will be * also be replaced, possibly with just an empty spill block * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting and old formatted znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; int bonus_data_size = 0; int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length, reg_length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make of copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * loop through bonus and spill buffer if it exists, and * build up new attr_descriptor to reset the attributes */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); for (; k != 2; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite * from sa_lengths[]. 
*/ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; reg_length = SA_REGISTERED_LEN(sa, attr); if (reg_length == 0) { length = hdr->sa_lengths[length_idx]; length_idx++; } else { length = reg_length; } if (attr == newattr) { /* * There is nothing to do for SA_REMOVE, * so it is just skipped. */ if (action == SA_REMOVE) continue; /* * Duplicate attributes are not allowed, so the * action can not be SA_ADD here. */ ASSERT3S(action, ==, SA_REPLACE); /* * Only a variable-sized attribute can be * replaced here, and its size must be changing. */ ASSERT3U(reg_length, ==, 0); ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } else { SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { reg_length = SA_REGISTERED_LEN(sa, newattr); IMPLY(reg_length != 0, reg_length == buflen); SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) vmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; dmu_buf_t *saved_spill; ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); saved_spill = hdl->sa_spill; /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); /* * If saved_spill is NULL and current sa_spill is not NULL that * means we increased the refcount of the spill buffer through * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we * must release the hold before calling dmu_tx_commit() to avoid * making a copy of this buffer in dbuf_sync_leaf() due to the * reference count now being greater than 1. 
*/ if (!saved_spill && hdl->sa_spill) { if (hdl->sa_spill_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); hdl->sa_spill_tab = NULL; } dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; } return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db(hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db(hdl->sa_bonus, blksize, nblocks); } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return (hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } #ifdef _KERNEL EXPORT_SYMBOL(sa_handle_get); EXPORT_SYMBOL(sa_handle_get_from_db); EXPORT_SYMBOL(sa_handle_destroy); EXPORT_SYMBOL(sa_buf_hold); EXPORT_SYMBOL(sa_buf_rele); EXPORT_SYMBOL(sa_spill_rele); EXPORT_SYMBOL(sa_lookup); EXPORT_SYMBOL(sa_update); EXPORT_SYMBOL(sa_remove); 
EXPORT_SYMBOL(sa_bulk_lookup); EXPORT_SYMBOL(sa_bulk_lookup_locked); EXPORT_SYMBOL(sa_bulk_update); EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); EXPORT_SYMBOL(sa_handle_object); EXPORT_SYMBOL(sa_register_update_callback); EXPORT_SYMBOL(sa_setup); EXPORT_SYMBOL(sa_replace_all_by_template); EXPORT_SYMBOL(sa_replace_all_by_template_locked); EXPORT_SYMBOL(sa_enabled); EXPORT_SYMBOL(sa_cache_init); EXPORT_SYMBOL(sa_cache_fini); EXPORT_SYMBOL(sa_set_sa_object); EXPORT_SYMBOL(sa_hdrsize); EXPORT_SYMBOL(sa_handle_lock); EXPORT_SYMBOL(sa_handle_unlock); EXPORT_SYMBOL(sa_lookup_uio); EXPORT_SYMBOL(sa_add_projid); #endif /* _KERNEL */ Index: vendor-sys/openzfs/dist/module/zfs/spa.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/spa.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/spa.c (revision 364928) @@ -1,9754 +1,9755 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. 
*/ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". 
* * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ unsigned long zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ int zfs_livelist_condense_zthr_pause = 0; int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ int zfs_livelist_condense_sync_cancel = 0; int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. 
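 *
 * Each property is added as a nested nvlist keyed by the property name and
 * carrying both the value source and the value itself, conceptually:
 *
 *	"size"    -> { ZPROP_SOURCE = <zprop_source_t>, ZPROP_VALUE = <uint64> }
 *	"comment" -> { ZPROP_SOURCE = <zprop_source_t>, ZPROP_VALUE = <string> }
 *
 * (Illustrative layout only; spa_prop_add_list() above is what builds each
 * entry.)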
*/ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. 
*/ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. 
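 *
 * The loop below walks the pool-props ZAP and handles two encodings: an
 * 8-byte integer length marks a numeric property (with ZPOOL_PROP_BOOTFS
 * special-cased so the stored dataset object number is resolved back into
 * a dataset name), while a 1-byte integer length marks a string property
 * fetched with zap_lookup().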
*/ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. 
*/ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. 
*/ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. 
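 *
 * The work is split across a dsl_sync_task(): spa_change_guid_check()
 * refuses to proceed while a checkpoint exists or is being discarded, or
 * when the root vdev is not healthy, and spa_change_guid_sync() then
 * rewrites the root vdev GUID (adjusting vdev_guid_sum), dirties the
 * config, and logs the change. On success the cachefile is rewritten and
 * an ESC_ZFS_POOL_REGUID sysevent is posted.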
*/ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); flags |= TASKQ_DYNAMIC; break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. Under Linux this * means incrementing the priority value on platforms * like illumos it should be decremented. 
*/ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) pri++; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } /* * Same as spa_taskq_dispatch_ent() but block on the task until completion. */ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } /* * Disabled until spa_thread() can be adapted for Linux. 
*/ #undef HAVE_SPA_THREAD #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. 
*/ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks (zfsonlinux bug #3681). * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); } /* * Opposite of spa_activate(). 
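 *
 * Tears down, roughly in reverse order, what spa_activate() set up: the
 * zvol/prefetch/upgrade taskqs, the per-txg root zios, the dirty and
 * evicting-objset lists, the zio taskqs, the metaslab classes, the error
 * lists, the keystore, and finally the covering process/thread if one was
 * created.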
*/ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. 
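 *
 * The nvlist is expected to describe a recursive vdev tree, conceptually
 * something like:
 *
 *	root
 *	    ZPOOL_CONFIG_CHILDREN = [
 *		{ type = "mirror",
 *		  ZPOOL_CONFIG_CHILDREN = [ { type = "disk", ... },
 *					    { type = "disk", ... } ] },
 *		...
 *	    ]
 *
 * (Illustrative shape only; leaf vdevs simply lack a ZPOOL_CONFIG_CHILDREN
 * array, which is why the ENOENT lookup result below is treated as
 * success.)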
*/ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_head(&spa->spa_log_summary)) { VERIFY0(e->lse_mscount); list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If the log space map feature is enabled and the pool is getting * exported (but not destroyed), we want to spend some time flushing * as many metaslabs as we can in an attempt to destroy log space * maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * Stop syncing. 
*/ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; if (vc->vdev_mg != NULL) taskq_wait(vc->vdev_mg->mg_taskq); } } if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. 
*/ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. 
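 * (Specifically, the function below matches each configured cache device
 * against the previous sav_vdevs array by GUID and reuses the existing
 * vdev_t when it finds one, so devices that are still present are carried
 * over rather than closed and re-opened.)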
*/ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. 
*/ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). 
*/ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). 
*/ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ int spa_load_verify_shift = 4; int spa_load_verify_metadata = B_TRUE; int spa_load_verify_data = B_TRUE; /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. 
This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. 
*/ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } /* ARGSUSED */ static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t za; zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, &za); zap_cursor_fini(&zc); if (err == 0) *llp = za.za_first_integer; return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ /* ARGSUSED */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. 
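 *
 * Editorial sketch, not upstream text; a simplified outline of the flow
 * implemented below (error handling omitted):
 *
 *	ll_obj = first entry of the deleted-clones ZAP (zap_obj)
 *	if (the livelist ll_obj still contains sublists)
 *		open its deadlist and take the first entry dle
 *		gather dle's block pointers into to_free in open context
 *		synctask sublist_delete_sync: free them and drop dle
 *	else
 *		synctask livelist_delete_sync: destroy the empty livelist,
 *		    remove it from the ZAP and decrement the livelist
 *		    feature refcount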
*/ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t ll = { 0 }; dsl_deadlist_entry_t *dle; bplist_t to_free; dsl_deadlist_open(&ll, mos, ll_obj); dle = dsl_deadlist_first(&ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = &ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %d remaining", dle->dle_bpobj.bpo_object, ll_obj, count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(&ll); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. 
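 *
 * Editorial sketch, not upstream text: the sizes captured in open context
 * (lca->first_size, lca->next_size) act as high-water marks, so only block
 * pointers appended after that point are re-examined here, roughly:
 *
 *	if (cur_first_size > lca->first_size)
 *		iterate first->dle_bpobj starting at index lca->first_size,
 *		    letting livelist_track_new_cb() append ALLOCs to
 *		    lca->to_keep and FREEs to new_frees
 *
 * and likewise for the "next" entry.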
*/ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, cur_first_size, next_obj, cur_next_size, first->dle_bpobj.bpo_object, first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. 
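 *
 * Editorial note, not upstream text: spa_livelist_condense_cb_check()
 * only lets the zthr run while
 *	(ds != NULL && !syncing && !cancelled)
 * so setting spa_to_condense.syncing here keeps the same pair of entries
 * from being picked up again until spa_livelist_condense_sync() completes
 * and clears the flag.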
*/ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* ARGSUSED */ /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. 
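 *
 * Editorial sketch, not upstream text: the portion of the config consumed
 * below looks roughly like this (the guid values are made up):
 *
 *	config
 *	    ZPOOL_CONFIG_SPLIT (nvlist)
 *	        ZPOOL_CONFIG_SPLIT_LIST (uint64 array)
 *	            { 0x1122334455667788, 0, 0x99aabbccddeeff00 }
 *
 * A zero guid denotes a hole vdev and is skipped when attempting the
 * reopen.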
*/ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #endif /* * Determine whether the activity check is required. 
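 *
 * Editorial summary, not upstream text: the function below returns B_FALSE
 * (no activity check required) when any of the following hold:
 *
 *	- ZFS_IMPORT_SKIP_MMP was requested (e.g. by zdb)
 *	- multihost is effectively off (ub_mmp_delay == 0)
 *	- the uberblock matches the txg/timestamp/mmp_seq recorded by an
 *	  earlier tryimport
 *	- the label hostid matches this host
 *	- the pool state shows a clean export (state != POOL_STATE_ACTIVE)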
*/ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. 
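 *
 * Editorial example, not upstream text, assuming the common defaults of
 * zfs_multihost_interval = 1000ms, zfs_multihost_import_intervals = 20 and
 * an MMP_IMPORT_SAFETY_FACTOR of 200 (percent): the baseline computed
 * above is MAX(1s, 20 * 1s) = 20s, and in the first case below, with
 * MMP_FAIL_INT(ub) = 10 and MMP_INTERVAL(ub) = 1000ms, the delay becomes
 *
 *	import_delay = 10 * 1s * 200 / 100 = 20s
 *
 * before the caller adds its random 0-25% factor.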
*/ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), MMP_INTERVAL(ub), import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%u", import_delay, MMP_INTERVAL(ub), ub->ub_mmp_delay, import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%u leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%u", import_delay, multihost_interval, import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * spa_get_random(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", txg, ub->ub_txg, timestamp, ub->ub_timestamp, mmp_config, ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " - "See: http://illumos.org/msg/ZFS-8000-EY", + "See: https://openzfs.github.io/openzfs-docs/msg/" + "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. 
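 *
 * Editorial note, not upstream text: in practice this path is exercised by
 * checkpoint-aware tooling; for instance, something along the lines of
 * "zdb -k <pool>" opens the checkpointed state read-only (see
 * spa_importing_readonly_checkpoint()) while the live pool stays imported.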
*/ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). 
* This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. 
*/ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. 
*/ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. 
spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. 
*/ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. 
*/ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. 
If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. 
*/ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
*/ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. 
*/ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. 
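 *
 * (Editorial addition - the sketch below condenses the uberblock selection
 * performed just above by spa_ld_select_uberblock(); sketch_ub is a made-up
 * stand-in for the real uberblock_t.)
 */

/*
 * sketch_select_uberblock(): prefer the highest txg that does not exceed
 * max_txg, breaking txg ties with the newer timestamp.  The real comparator
 * (vdev_uberblock_compare()) considers additional fields.
 */
struct sketch_ub {
	uint64_t txg;
	uint64_t timestamp;
};

static const struct sketch_ub *
sketch_select_uberblock(const struct sketch_ub *ub, int count, uint64_t max_txg)
{
	const struct sketch_ub *best = NULL;

	for (int i = 0; i < count; i++) {
		if (ub[i].txg > max_txg)
			continue;		/* honor an explicit rewind bound */
		if (best == NULL || ub[i].txg > best->txg ||
		    (ub[i].txg == best->txg &&
		    ub[i].timestamp > best->timestamp))
			best = &ub[i];
	}
	return (best);		/* NULL: no usable uberblock was found */
}

/*
 * (End of editorial sketch.)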
*/ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. 
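 *
 * (Editorial addition - a skeleton of that two-pass strategy.  The function
 * pointers are hypothetical stand-ins, and EAGAIN is assumed from <errno.h>.)
 */

/*
 * sketch_load_with_reload(): pass 1 loads from the caller-provided
 * (untrusted) config; if validation against the MOS copy reports EAGAIN,
 * meaning the trusted copy differs too much, tear everything down and run
 * the same steps again, this time trusting the MOS config.
 */
static int
sketch_load_with_reload(int (*open_from_config)(void),
    int (*validate_against_mos)(int reloading), void (*teardown)(void))
{
	int err = open_from_config();		/* pass 1: untrusted config */
	if (err != 0)
		return (err);

	err = validate_against_mos(0);
	if (err != EAGAIN)
		return (err);			/* success, or a hard failure */

	teardown();				/* spa_ld_prepare_for_reload() role */
	err = open_from_config();		/* pass 2 ... */
	if (err != 0)
		return (err);
	return (validate_against_mos(1));	/* ... now trusting the MOS copy */
}

/*
 * (End of editorial sketch.)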
*/ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. 
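 *
 * (Editorial aside - before the next step, a sketch of the overall shape of
 * spa_load_impl(), which this code is part of: an ordered list of load
 * steps where the first failure aborts the whole load.  The step type and
 * function are made up for illustration.)
 */

/*
 * sketch_run_steps(): run each load step in order; any nonzero return
 * stops the pipeline and is propagated to the caller.
 */
typedef int (*sketch_step_fn)(void *arg);

static int
sketch_run_steps(const sketch_step_fn *steps, int nsteps, void *arg)
{
	for (int i = 0; i < nsteps; i++) {
		int err = steps[i](arg);

		if (err != 0)
			return (err);	/* first failure stops the load */
	}
	return (0);
}

/*
 * (End of editorial sketch.)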
*/ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. 
*/ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. 
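 *
 * (Editorial aside - the sketch below condenses the rewind/retry loop from
 * spa_load_best() and spa_load_retry() earlier in this file.  try_load() is
 * a hypothetical callback, and the step-by-one rewind is a simplification:
 * the real loop lowers the cap to just below the uberblock it last tried.)
 */

/*
 * sketch_rewind_loop(): keep retrying the load with a progressively lower
 * txg cap until it succeeds or the cap drops below the oldest txg the
 * caller will accept (derived from TXG_DEFER_SIZE, or TXG_INITIAL for an
 * extreme rewind).
 */
static int
sketch_rewind_loop(int (*try_load)(uint64_t max_txg),
    uint64_t start_txg, uint64_t min_txg)
{
	uint64_t max_txg = start_txg;
	int err;

	do {
		err = try_load(max_txg);
		if (err == 0)
			return (0);		/* found a loadable txg */
	} while (max_txg-- > min_txg);

	return (err);				/* ran out of acceptable txgs */
}

/*
 * (End of editorial sketch.)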
*/ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. 
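 *
 * (Editorial aside - a toy rendering of the spare-marking loop in
 * spa_add_spares() just above.  The sketch_spare struct and the in_use()
 * callback are hypothetical stand-ins for the nvlist and guid lookups.)
 */

/*
 * sketch_mark_shared_spares(): a spare that some other pool is actively
 * using is reported as "cannot open / spared" so userland shows it as busy.
 */
struct sketch_spare {
	uint64_t guid;
	int state;			/* 0 = ok, 1 = cannot open */
	int aux;			/* 0 = none, 1 = in use elsewhere */
};

static void
sketch_mark_shared_spares(struct sketch_spare *spares, int nspares,
    int (*in_use)(uint64_t guid))
{
	for (int i = 0; i < nspares; i++) {
		if (in_use(spares[i].guid)) {
			spares[i].state = 1;	/* VDEV_STATE_CANT_OPEN role */
			spares[i].aux = 1;	/* VDEV_AUX_SPARED role */
		}
	}
}

/*
 * (End of editorial sketch.)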
*/ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. 
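 *
 * (Editorial addition - the lazy-cache pattern described above, reduced to
 * a plain array with locking elided; the real code holds
 * spa_feat_stats_lock.  All sketch_* names and callbacks are hypothetical.)
 */

/*
 * sketch_get_feature_stats(): the first call pays the cost of reading the
 * counts from disk and remembers them; later calls only refresh from
 * in-core data, so the path keeps working while pool I/O is suspended.
 */
struct sketch_featcache {
	int populated;
	uint64_t refcount[8];		/* one slot per hypothetical feature */
};

static void
sketch_get_feature_stats(struct sketch_featcache *c,
    void (*fill_from_disk)(uint64_t *counts),
    void (*refresh_from_core)(uint64_t *counts))
{
	if (!c->populated) {
		fill_from_disk(c->refcount);	/* first call: may issue I/O */
		c->populated = 1;
	} else {
		refresh_from_core(c->refcount);	/* later calls: memory only */
	}
}

/*
 * (End of editorial sketch.)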
*/ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended) == 0); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. 
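 *
 * (Editorial addition - the shape of the validation loop that follows,
 * reduced to hypothetical callbacks and types.  "Lenient" corresponds to
 * the spare/l2cache import modes, where a bad device is skipped instead of
 * failing the whole request.)
 */

/*
 * sketch_validate_aux(): publish the candidate list before probing (the
 * sav_pending role, so concurrent in-use checks can see it), probe each
 * device, and always clear the pending list on the way out.
 */
struct sketch_aux {
	void **pending;			/* published candidate list */
	int npending;
};

static int
sketch_validate_aux(struct sketch_aux *sav, void **devs, int ndevs,
    int lenient, int (*probe)(void *dev))
{
	int err = 0;

	sav->pending = devs;		/* make candidates visible */
	sav->npending = ndevs;

	for (int i = 0; i < ndevs; i++) {
		err = probe(devs[i]);
		if (err != 0 && !lenient)
			break;		/* strict mode: first failure wins */
		err = 0;		/* lenient mode: ignore and continue */
	}

	sav->pending = NULL;		/* always clear, even on failure */
	sav->npending = 0;
	return (err);
}

/*
 * (End of editorial sketch.)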
*/ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. 
*/ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; char *feat_name; char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. 
*/ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. 
*/ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. 
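 *
 * (Editorial addition - the suspend/load/resume bracket that spa_import()
 * uses around the load that follows, reduced to hypothetical callbacks.)
 */

/*
 * sketch_import(): background work is suspended before the load is
 * attempted and resumed only once the pool has loaded cleanly; on failure
 * the half-constructed state is torn down instead.
 */
static int
sketch_import(void (*suspend_async)(void), void (*resume_async)(void),
    int (*load_pool)(void), void (*teardown)(void))
{
	suspend_async();	/* nothing runs until we know it's healthy */

	int err = load_pool();
	if (err != 0) {
		teardown();	/* unload, deactivate, remove */
		return (err);
	}

	resume_async();		/* safe to let background tasks run now */
	return (0);
}

/*
 * (End of editorial sketch.)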
*/ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. 
*/ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. 
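 *
 * (Editorial addition - a condensed outline of spa_export_common() below,
 * showing only the ordering described above.  Every callback is a
 * hypothetical stand-in; error handling, events and the many special cases
 * are omitted, and EBUSY is assumed from <errno.h>.)
 */

/*
 * sketch_export(): drain outstanding work, refuse to proceed while the pool
 * is still referenced, then push the new state to the labels and drop the
 * cached configuration - unless 'hardforce' asks us to skip both.
 */
static int
sketch_export(int busy, int hardforce,
    void (*drain_io)(void), void (*mark_labels_dirty)(void),
    void (*unload)(void), void (*drop_cached_config)(void))
{
	drain_io();			/* wait for pending I/O to settle */
	if (busy)
		return (EBUSY);		/* active references: refuse */

	if (!hardforce)
		mark_labels_dirty();	/* final sync writes the new state */

	unload();			/* tear down the in-core pool */

	if (!hardforce)
		drop_cached_config();	/* forget it in the config cache */
	return (0);
}

/*
 * (End of editorial sketch.)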
*/ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } if (spa->spa_sync_on) { /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ if (spa->spa_root_vdev != NULL) { vdev_t *rvd = spa->spa_root_vdev; vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. 
*/ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz toplevels. 
*/ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ if (tvd->vdev_ops == &vdev_raidz_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa))) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (rebuild) { /* * For rebuilds, the parent vdev must support reconstruction * using only space maps. This means the only allowable * parents are the root vdev or a mirror vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. We do * this to ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. 
* * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. 
*/ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. 
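	 *
	 * Illustrative scenario (pool and disk names hypothetical): if spare
	 * sdf is configured in both pools "tank" and "backup", and it has
	 * taken over for a failed disk in "tank" whose original device is
	 * now being detached, sdf becomes a permanent member of "tank"; the
	 * loop below therefore removes sdf (by guid) from the spare list of
	 * "backup" as well.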
*/ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. 
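	 *
	 * The 'nv' argument maps arbitrary pair names to vdev guids; only
	 * the uint64 value of each pair is consumed below. A caller could
	 * build it roughly like this (sketch only, not the actual ioctl
	 * path):
	 *
	 *	nvlist_t *nv = fnvlist_alloc();
	 *	fnvlist_add_uint64(nv, vdev_name, vdev_guid);
	 *	err = spa_vdev_initialize(spa, nv, POOL_INITIALIZE_START,
	 *	    vdev_errlist);
	 *	fnvlist_free(nv);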
*/ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. 
The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. 
*/ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. 
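	 *
	 * For instance, with children (disk0, spareA, spareB) -- names
	 * illustrative -- the check below may return spareA (child 1) for
	 * detach once spareB has no missing or outage DTLs and spareA itself
	 * is not required, leaving the desired disk-plus-spare pair.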
*/ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. 
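	 *
	 * Each SPA_ASYNC_* bit tested below was latched into 'tasks' above
	 * after being requested via spa_async_request() (for example
	 * spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE)), so every request
	 * is serviced at most once per wakeup of this thread.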
*/ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * If any devices are done replacing, detach them. Then if no * top-level vdevs are rebuilding attempt to kick off a scrub. */ if (tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); if (!vdev_rebuild_active(spa->spa_root_vdev)) (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. 
*/ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int 
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. 
*/ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. 
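	 *
	 * Summary of the spa_avz_action values handled below: AVZ_ACTION_NONE
	 * and AVZ_ACTION_INITIALIZE leave an existing AVZ in place (one is
	 * created only if none exists yet), AVZ_ACTION_REBUILD rebuilds the
	 * AVZ from the current vdev tree and destroys any per-vdev ZAPs that
	 * are no longer referenced, and AVZ_ACTION_DESTROY tears down the old
	 * AVZ and its ZAPs before a fresh one is created.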
*/ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. 
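 *
 * This is expected to run in syncing context as a sync task; 'arg' is the
 * nvlist of property name/value pairs that was validated earlier (the
 * ZPOOL_PROP_INVAL case below relies on spa_prop_validate() having already
 * checked feature@ properties).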
*/ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuration has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), (longlong_t)intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. 
spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. 
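 *
 * Rough worked example (the module defaults cited here are assumptions and
 * may differ): with zfs_vdev_async_write_max_active = 10 and
 * zfs_vdev_queue_depth_pct = 1000, the max_queue_depth computed below is
 * 10 * 1000 / 100 = 100 queued allocations per metaslab group.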
*/ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); normal->mc_alloc_max_slots[i] = slots_per_allocator; special->mc_alloc_max_slots[i] = slots_per_allocator; dedup->mc_alloc_max_slots[i] = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. 
if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. 
*/ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. 
We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. 
* Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. 
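 *
 * Return value (summarizing the body below): 0 on success with *in_progress
 * set, or EINVAL when use_guid is true but the guid does not resolve to a
 * leaf vdev in this pool.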
*/ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. 
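 *
 * A condensed sketch of the handshake described above (illustrative only;
 * activity_is_in_progress() is a stand-in for the per-activity check done
 * by spa_activity_in_progress() below, which may briefly drop and retake
 * locks as described above):
 *
 *	Waiting thread:
 *		mutex_enter(&spa->spa_activities_lock);
 *		while (activity_is_in_progress(spa))
 *			cv_wait_sig(&spa->spa_activities_cv,
 *			    &spa->spa_activities_lock);
 *		mutex_exit(&spa->spa_activities_lock);
 *
 *	Completing thread:
 *		<update the activity's in-memory state>
 *		spa_notify_waiters(spa);	(takes spa_activities_lock,
 *						 then cv_broadcast()s)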
*/ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) break; /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. 
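 *
 * (The export side of this handshake is spa_wake_waiters() above: it sets
 * spa_waiters_cancel, broadcasts, and then blocks until spa_waiters drops
 * to zero, so the spa_t stays valid for as long as this thread is counted.)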
*/ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity to complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes.
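 *
 * A typical call, sketched for illustration (the event name must come from
 * sys/sysevent/eventdefs.h as noted above):
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);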
*/ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare state is global across all pools */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC state is global across all pools */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, "log2(fraction of arc that can be used by inflight I/Os when " "verifying pool during import)"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ Index: vendor-sys/openzfs/dist/module/zfs/vdev_raidz_math.c =================================================================== --- vendor-sys/openzfs/dist/module/zfs/vdev_raidz_math.c (revision 364927) +++ vendor-sys/openzfs/dist/module/zfs/vdev_raidz_math.c (revision 364928) @@ -1,666 +1,666 @@ /* * CDDL HEADER START * * The
contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include /* Opaque implementation with NULL methods to represent original methods */ static const raidz_impl_ops_t vdev_raidz_original_impl = { .name = "original", .is_supported = raidz_will_scalar_work, }; /* RAIDZ parity op that contain the fastest methods */ static raidz_impl_ops_t vdev_raidz_fastest_impl = { .name = "fastest" }; /* All compiled in implementations */ const raidz_impl_ops_t *raidz_all_maths[] = { &vdev_raidz_original_impl, &vdev_raidz_scalar_impl, #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ &vdev_raidz_sse2_impl, #endif #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */ &vdev_raidz_ssse3_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ &vdev_raidz_avx2_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ &vdev_raidz_avx512f_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ &vdev_raidz_avx512bw_impl, #endif -#if defined(__aarch64__) +#if defined(__aarch64__) && !defined(__FreeBSD__) &vdev_raidz_aarch64_neon_impl, &vdev_raidz_aarch64_neonx2_impl, #endif #if defined(__powerpc__) && defined(__altivec__) &vdev_raidz_powerpc_altivec_impl, #endif }; /* Indicate that benchmark has been completed */ static boolean_t raidz_math_initialized = B_FALSE; /* Select raidz implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX - 1) #define IMPL_ORIGINAL (0) #define IMPL_SCALAR (1) #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; static uint32_t user_sel_impl = IMPL_FASTEST; /* Hold all supported implementations */ static size_t raidz_supp_impl_cnt = 0; static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; #if defined(_KERNEL) /* * kstats values for supported implementations * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] */ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; /* kstat for benchmarked implementations */ static kstat_t *raidz_math_kstat = NULL; #endif /* * Returns the RAIDZ operations for raidz_map() parity calculations. When * a SIMD implementation is not allowed in the current context, then fallback * to the fastest generic implementation. 
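 *
 * Typical use, sketched for illustration (this mirrors how the raidz map
 * code consumes the returned ops table; the surrounding raidz_map_t
 * context is assumed):
 *
 *	rm->rm_ops = vdev_raidz_math_get_ops();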
*/ const raidz_impl_ops_t * vdev_raidz_math_get_ops(void) { if (!kfpu_allowed()) return (&vdev_raidz_scalar_impl); raidz_impl_ops_t *ops = NULL; const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); switch (impl) { case IMPL_FASTEST: ASSERT(raidz_math_initialized); ops = &vdev_raidz_fastest_impl; break; case IMPL_CYCLE: /* Cycle through all supported implementations */ ASSERT(raidz_math_initialized); ASSERT3U(raidz_supp_impl_cnt, >, 0); static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; ops = raidz_supp_impl[idx]; break; case IMPL_ORIGINAL: ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; break; case IMPL_SCALAR: ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; break; default: ASSERT3U(impl, <, raidz_supp_impl_cnt); ASSERT3U(raidz_supp_impl_cnt, >, 0); if (impl < ARRAY_SIZE(raidz_all_maths)) ops = raidz_supp_impl[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } /* * Select parity generation method for raidz_map */ int vdev_raidz_math_generate(raidz_map_t *rm) { raidz_gen_f gen_parity = NULL; switch (raidz_parity(rm)) { case 1: gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; break; case 2: gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; break; case 3: gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; break; default: gen_parity = NULL; cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", raidz_parity(rm)); break; } /* if method is NULL execute the original implementation */ if (gen_parity == NULL) return (RAIDZ_ORIGINAL_IMPL); gen_parity(rm); return (0); } static raidz_rec_f reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, const int nbaddata) { if (nbaddata == 1 && parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } return ((raidz_rec_f) NULL); } static raidz_rec_f reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, const int nbaddata) { if (nbaddata == 1) { if (parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } else if (parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_Q]); } } else if (nbaddata == 2 && parity_valid[CODE_P] && parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_PQ]); } return ((raidz_rec_f) NULL); } static raidz_rec_f reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, const int nbaddata) { if (nbaddata == 1) { if (parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } else if (parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_Q]); } else if (parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_R]); } } else if (nbaddata == 2) { if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_PQ]); } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_PR]); } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_QR]); } } else if (nbaddata == 3 && parity_valid[CODE_P] && parity_valid[CODE_Q] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_PQR]); } return ((raidz_rec_f) NULL); } /* * Select data reconstruction method for raidz_map * @parity_valid - Parity validity flag * @dt - Failed data index array * @nbaddata - Number of failed data columns */ int vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; switch (raidz_parity(rm)) { case PARITY_P: rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); break; case PARITY_PQ: rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); break; case PARITY_PQR: rec_fn = 
reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", raidz_parity(rm)); break; } if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else return (rec_fn(rm, dt)); } const char *raidz_gen_name[] = { "gen_p", "gen_pq", "gen_pqr" }; const char *raidz_rec_name[] = { "rec_p", "rec_q", "rec_r", "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; #if defined(_KERNEL) #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) static int raidz_math_kstat_headers(char *buf, size_t size) { int i; ssize_t off; ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); off = snprintf(buf, size, "%-17s", "implementation"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) off += snprintf(buf + off, size - off, "%-16s", raidz_gen_name[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) off += snprintf(buf + off, size - off, "%-16s", raidz_rec_name[i]); (void) snprintf(buf + off, size - off, "\n"); return (0); } static int raidz_math_kstat_data(char *buf, size_t size, void *data) { raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data; ssize_t off = 0; int i; ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); if (cstat == fstat) { off += snprintf(buf + off, size - off, "%-17s", "fastest"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { int id = fstat->gen[i]; off += snprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { int id = fstat->rec[i]; off += snprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } } else { ptrdiff_t id = cstat - raidz_impl_kstats; off += snprintf(buf + off, size - off, "%-17s", raidz_supp_impl[id]->name); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) off += snprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) off += snprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->rec[i]); } (void) snprintf(buf + off, size - off, "\n"); return (0); } static void * raidz_math_kstat_addr(kstat_t *ksp, loff_t n) { if (n <= raidz_supp_impl_cnt) ksp->ks_private = (void *) (raidz_impl_kstats + n); else ksp->ks_private = NULL; return (ksp->ks_private); } #define BENCH_D_COLS (8ULL) #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ #define BENCH_NS MSEC2NSEC(25) /* 25ms */ typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); static void benchmark_gen_impl(raidz_map_t *rm, const int fn) { (void) fn; vdev_raidz_generate_parity(rm); } static void benchmark_rec_impl(raidz_map_t *rm, const int fn) { static const int rec_tgt[7][3] = { {1, 2, 3}, /* rec_p: bad QR & D[0] */ {0, 2, 3}, /* rec_q: bad PR & D[0] */ {0, 1, 3}, /* rec_r: bad PQ & D[0] */ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ }; vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); } /* * Benchmarking of all supported implementations (raidz_supp_impl_cnt) * is performed by setting the rm_ops pointer and calling the top level * generate/reconstruct methods of bench_rm. 
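 *
 * Each timed run is folded into a per-column throughput, which is what the
 * kstat above reports (per-disk B/s of an 8 data + parity column map):
 *
 *	speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC / (t_diff * BENCH_COLS)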
*/ static void benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) { uint64_t run_cnt, speed, best_speed = 0; hrtime_t t_start, t_diff; raidz_impl_ops_t *curr_impl; raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; int impl, i; for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { /* set an implementation to benchmark */ curr_impl = raidz_supp_impl[impl]; bench_rm->rm_ops = curr_impl; run_cnt = 0; t_start = gethrtime(); do { for (i = 0; i < 25; i++, run_cnt++) bench_fn(bench_rm, fn); t_diff = gethrtime() - t_start; } while (t_diff < BENCH_NS); speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; speed /= (t_diff * BENCH_COLS); if (bench_fn == benchmark_gen_impl) raidz_impl_kstats[impl].gen[fn] = speed; else raidz_impl_kstats[impl].rec[fn] = speed; /* Update fastest implementation method */ if (speed > best_speed) { best_speed = speed; if (bench_fn == benchmark_gen_impl) { fstat->gen[fn] = impl; vdev_raidz_fastest_impl.gen[fn] = curr_impl->gen[fn]; } else { fstat->rec[fn] = impl; vdev_raidz_fastest_impl.rec[fn] = curr_impl->rec[fn]; } } } } #endif /* * Initialize and benchmark all supported implementations. */ static void benchmark_raidz(void) { raidz_impl_ops_t *curr_impl; int i, c; /* Move supported impl into raidz_supp_impl */ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; if (curr_impl->init) curr_impl->init(); if (curr_impl->is_supported()) raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; } membar_producer(); /* complete raidz_supp_impl[] init */ raidz_supp_impl_cnt = c; /* number of supported impl */ #if defined(_KERNEL) zio_t *bench_zio = NULL; raidz_map_t *bench_rm = NULL; uint64_t bench_parity; /* Fake a zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { bench_parity = fn + 1; /* New raidz_map is needed for each generate_p/q/r */ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, BENCH_D_COLS + bench_parity, bench_parity); benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); vdev_raidz_map_free(bench_rm); } /* Benchmark data reconstruction methods */ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, BENCH_COLS, PARITY_PQR); for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); vdev_raidz_map_free(bench_rm); /* cleanup the bench zio */ abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); #else /* * Skip the benchmark in user space to avoid impacting libzpool * consumers (zdb, zhack, zinject, ztest). The last implementation * is assumed to be the fastest and used by default. */ memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt - 1], sizeof (vdev_raidz_fastest_impl)); strcpy(vdev_raidz_fastest_impl.name, "fastest"); #endif /* _KERNEL */ } void vdev_raidz_math_init(void) { /* Determine the fastest available implementation. 
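 * In kernel context this runs the benchmark above; in user space (libzpool)
 * the benchmark is skipped and the last compiled-in implementation is simply
 * copied in as "fastest".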
*/ benchmark_raidz(); #if defined(_KERNEL) /* Install kstats for all implementations */ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (raidz_math_kstat != NULL) { raidz_math_kstat->ks_data = NULL; raidz_math_kstat->ks_ndata = UINT32_MAX; kstat_set_raw_ops(raidz_math_kstat, raidz_math_kstat_headers, raidz_math_kstat_data, raidz_math_kstat_addr); kstat_install(raidz_math_kstat); } #endif /* Finish initialization */ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); raidz_math_initialized = B_TRUE; } void vdev_raidz_math_fini(void) { raidz_impl_ops_t const *curr_impl; #if defined(_KERNEL) if (raidz_math_kstat != NULL) { kstat_delete(raidz_math_kstat); raidz_math_kstat = NULL; } #endif for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { curr_impl = raidz_all_maths[i]; if (curr_impl->fini) curr_impl->fini(); } } static const struct { char *name; uint32_t sel; } math_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, { "original", IMPL_ORIGINAL }, { "scalar", IMPL_SCALAR } }; /* * Function sets desired raidz implementation. * * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update * zfs_vdev_raidz_impl. * * @val Name of raidz implementation to use * @param Unused. */ int vdev_raidz_impl_set(const char *val) { int err = -EINVAL; char req_name[RAIDZ_IMPL_NAME_MAX]; uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); size_t i; /* sanitize input */ i = strnlen(val, RAIDZ_IMPL_NAME_MAX); if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) return (err); strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); while (i > 0 && !!isspace(req_name[i-1])) i--; req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { if (strcmp(req_name, math_impl_opts[i].name) == 0) { impl = math_impl_opts[i].sel; err = 0; break; } } /* check all supported impl if init() was already called */ if (err != 0 && raidz_math_initialized) { /* check all supported implementations */ for (i = 0; i < raidz_supp_impl_cnt; i++) { if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { impl = i; err = 0; break; } } } if (err == 0) { if (raidz_math_initialized) atomic_swap_32(&zfs_vdev_raidz_impl, impl); else atomic_swap_32(&user_sel_impl, impl); } return (err); } #if defined(_KERNEL) && defined(__linux__) static int zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) { return (vdev_raidz_impl_set(val)); } static int zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) { int i, cnt = 0; char *fmt; const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); ASSERT(raidz_math_initialized); /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < raidz_supp_impl_cnt; i++) { fmt = (i == impl) ? 
"[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); } return (cnt); } module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, zfs_vdev_raidz_impl_get, NULL, 0644); MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); #endif Index: vendor-sys/openzfs/dist/module/zstd/Makefile.in =================================================================== --- vendor-sys/openzfs/dist/module/zstd/Makefile.in (revision 364927) +++ vendor-sys/openzfs/dist/module/zstd/Makefile.in (revision 364928) @@ -1,35 +1,38 @@ ifneq ($(KBUILD_EXTMOD),) src = @abs_srcdir@ obj = @abs_builddir@ zstd_include = $(src)/include else zstd_include = $(srctree)/$(src)/include endif MODULE := zzstd obj-$(CONFIG_ZFS) := $(MODULE).o asflags-y := -I$(zstd_include) ccflags-y := -I$(zstd_include) # Zstd uses -O3 by default, so we should follow ccflags-y += -O3 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h # Set it for other compilers, too. $(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize +# SSE register return with SSE disabled if -march=znverX is passed +$(obj)/lib/zstd.o: c_flags += -U__BMI__ + # Quiet warnings about frame size due to unused code in unmodified zstd lib $(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480 # Disable aarch64 neon SIMD instructions for kernel mode $(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h $(MODULE)-objs += zfs_zstd.o $(MODULE)-objs += lib/zstd.o all: mkdir -p lib Index: vendor-sys/openzfs/dist/tests/runfiles/common.run =================================================================== --- vendor-sys/openzfs/dist/tests/runfiles/common.run (revision 364927) +++ vendor-sys/openzfs/dist/tests/runfiles/common.run (revision 364928) @@ -1,900 +1,900 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. 
# [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos'] tags = ['functional', 'alloc_class'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] tests = ['run_sha2_test', 'run_skein_test', 'filetest_001_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 
'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = ['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] 
[tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', - 'zfs_receive_raw_-d', 'zfs_receive_from_zstd'] + 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 
'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_features_001_pos', 
'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'create-o_ashift', 'zpool_create_tempname'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname', 'zpool_events_errors'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_shared_device', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] 
[tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 
'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 
'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/largest_pool] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/persist_l2arc] tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] tags = ['functional', 'persist_l2arc'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 
'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] tags = ['functional', 'redundancy'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 
'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos'] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] 
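Each bracketed group in this runfile follows the same key layout: 'tests' lists the scripts to execute, 'tags' labels the group so the runner can filter on it, and optional per-group overrides such as 'pre =', 'post =', 'user =', and 'timeout =' adjust setup, teardown, privilege, and time limits; the empty 'pre =' / 'post =' settings seen in several groups above appear to disable the default setup and cleanup scripts for those groups. A purely hypothetical entry, sketched only to show the shape (the group and test names below do not exist in the suite):

    [tests/functional/example_group]
    tests = ['example_001_pos', 'example_002_neg']
    pre =
    post =
    timeout = 600
    tags = ['functional', 'example_group']
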
[tests/functional/zvol/zvol_misc]
tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil']
tags = ['functional', 'zvol', 'zvol_misc']

[tests/functional/zvol/zvol_swap]
tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
tags = ['functional', 'zvol', 'zvol_swap']

[tests/functional/libzfs]
tests = ['many_fds', 'libzfs_input']
tags = ['functional', 'libzfs']

[tests/functional/log_spacemap]
tests = ['log_spacemap_import_logs']
pre =
post =
tags = ['functional', 'log_spacemap']

Index: vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am
===================================================================
--- vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am	(revision 364927)
+++ vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am	(revision 364928)
@@ -1,31 +1,32 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_receive
dist_pkgdata_SCRIPTS = \
	setup.ksh \
	cleanup.ksh \
	zfs_receive_001_pos.ksh \
	zfs_receive_002_pos.ksh \
	zfs_receive_003_pos.ksh \
	zfs_receive_004_neg.ksh \
	zfs_receive_005_neg.ksh \
	zfs_receive_006_pos.ksh \
	zfs_receive_007_neg.ksh \
	zfs_receive_008_pos.ksh \
	zfs_receive_009_neg.ksh \
	zfs_receive_010_pos.ksh \
	zfs_receive_011_pos.ksh \
	zfs_receive_012_pos.ksh \
	zfs_receive_013_pos.ksh \
	zfs_receive_014_pos.ksh \
	zfs_receive_015_pos.ksh \
	zfs_receive_016_pos.ksh \
	receive-o-x_props_override.ksh \
	zfs_receive_from_encrypted.ksh \
	zfs_receive_from_zstd.ksh \
+	zfs_receive_new_props.ksh \
	zfs_receive_to_encrypted.ksh \
	zfs_receive_raw.ksh \
	zfs_receive_raw_incremental.ksh \
	zfs_receive_raw_-d.ksh \
	zfs_receive_-e.ksh

dist_pkgdata_DATA = \
	zstd_test_data.txt

Index: vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh
===================================================================
--- vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh	(nonexistent)
+++ vendor-sys/openzfs/dist/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh	(revision 364928)
@@ -0,0 +1,77 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# ZFS receive test to handle Issue #10698
+#
+# STRATEGY:
+# 1. Create a pool with filesystem_limits disabled
+# 2. Create a filesystem on that pool
+# 3. Enable filesystem limits on that pool
+# 4. On a pool with filesystem limits enabled, create a filesystem and set a
+#    limit
+# 5. Snapshot limited filesystem
+# 6. send -R limited filesystem and receive over filesystem with limits disabled
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	destroy_pool "$poolname"
+	destroy_pool "$rpoolname"
+	log_must rm -f "$vdevfile"
+	log_must rm -f "$rvdevfile"
+	log_must rm -f "$streamfile"
+}
+
+log_onexit cleanup
+
+log_assert "ZFS should handle receiving streams with filesystem limits on \
+	pools where the feature was recently enabled"
+
+poolname=sendpool
+rpoolname=recvpool
+vdevfile="$TEST_BASE_DIR/vdevfile.$$"
+rvdevfile="$TEST_BASE_DIR/rvdevfile.$$"
+sendfs="$poolname/fs"
+recvfs="$rpoolname/rfs"
+streamfile="$TEST_BASE_DIR/streamfile.$$"
+
+log_must truncate -s $MINVDEVSIZE "$rvdevfile"
+log_must truncate -s $MINVDEVSIZE "$vdevfile"
+log_must zpool create -O mountpoint=none -o feature@filesystem_limits=disabled \
+	"$rpoolname" "$rvdevfile"
+log_must zpool create -O mountpoint=none "$poolname" "$vdevfile"
+
+log_must zfs create "$recvfs"
+log_must zpool set feature@filesystem_limits=enabled "$rpoolname"
+
+log_must zfs create -o filesystem_limit=100 "$sendfs"
+log_must zfs snapshot "$sendfs@a"
+
+log_must zfs send -R "$sendfs@a" >"$streamfile"
+log_must eval "zfs recv -svuF $recvfs <$streamfile"
+
+log_pass "ZFS can handle receiving streams with filesystem limits on \
+	pools where the feature was recently enabled"
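
The script above only asserts that the recursive receive completes. A minimal sketch of running the new case and then inspecting the receive side by hand is shown below; the wrapper invocation and its -t option are assumptions based on the stock scripts/zfs-tests.sh, the pool and dataset names come from the test itself, and nothing here is part of this commit.

    # Run only the new case through the test-suite wrapper (assumed invocation;
    # the in-tree path mirrors the file added above).
    ./scripts/zfs-tests.sh -t tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_new_props.ksh

    # After reproducing the scenario by hand with the same pool names, check the
    # receive side: the feature state on recvpool, and whether the sent
    # filesystem_limit arrived on the received dataset. Expected values are not
    # asserted by the test and are left for inspection here.
    zpool get feature@filesystem_limits recvpool
    zfs get filesystem_limit recvpool/rfs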