diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h
index 71db4dc35608..60e8ea873b23 100644
--- a/cmd/zpool/zpool_util.h
+++ b/cmd/zpool/zpool_util.h
@@ -1,139 +1,139 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	ZPOOL_UTIL_H
 #define	ZPOOL_UTIL_H
 
 #include <libnvpair.h>
 #include <libzfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /* Path to scripts you can run with "zpool status/iostat -c" */
 #define	ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d"
 
 /*
  * Basic utility functions
  */
 void *safe_malloc(size_t);
 void *safe_realloc(void *, size_t);
 void zpool_no_memory(void);
 uint_t num_logs(nvlist_t *nv);
 uint64_t array64_max(uint64_t array[], unsigned int len);
 int highbit64(uint64_t i);
 int lowbit64(uint64_t i);
 
 /*
  * Misc utility functions
  */
 char *zpool_get_cmd_search_path(void);
 
 /*
  * Virtual device functions
  */
 
 nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force,
     int check_rep, boolean_t replacing, boolean_t dryrun, int argc,
     char **argv);
 nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
     nvlist_t *props, splitflags_t flags, int argc, char **argv);
 
 /*
  * Pool list functions
  */
 int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
     boolean_t, zpool_iter_f, void *);
 
 /* Vdev list functions */
 typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
 int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
 
 typedef struct zpool_list zpool_list_t;
 
 zpool_list_t *pool_list_get(int, char **, zprop_list_t **, boolean_t, int *);
 void pool_list_update(zpool_list_t *);
 int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
 void pool_list_free(zpool_list_t *);
 int pool_list_count(zpool_list_t *);
 void pool_list_remove(zpool_list_t *, zpool_handle_t *);
 
 extern libzfs_handle_t *g_zfs;
 
 
 typedef	struct vdev_cmd_data
 {
 	char **lines;	/* Array of lines of output, minus the column name */
 	int lines_cnt;	/* Number of lines in the array */
 
 	char **cols;	/* Array of column names */
 	int cols_cnt;	/* Number of column names */
 
 	/* Where the output came from: the vdev, its pool and the command */
 	char *path;	/* vdev path */
 	char *upath;	/* vdev underlying path */
 	char *pool;	/* Pool name */
 	char *cmd;	/* backpointer to cmd */
 	char *vdev_enc_sysfs_path;	/* enclosure sysfs path (if any) */
 } vdev_cmd_data_t;
 
 typedef struct vdev_cmd_data_list
 {
 	char *cmd;		/* Command to run */
 	unsigned int count;	/* Number of vdev_cmd_data items (vdevs) */
 
 	/* fields used to select only certain vdevs, if requested */
 	libzfs_handle_t *g_zfs;
 	char **vdev_names;
 	int vdev_names_count;
 	int cb_name_flags;
 
 	vdev_cmd_data_t *data;	/* Array of 'count' vdev_cmd_data items */
 
 	/* Unique column names and widths (uniq_cols_cnt of each? confirm) */
 	char **uniq_cols;
 	int uniq_cols_cnt;
 	int *uniq_cols_width;
 
 } vdev_cmd_data_list_t;
 
 vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv,
     char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
     int cb_name_flags);
 
 void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl);
 
 int check_device(const char *path, boolean_t force,
     boolean_t isspare, boolean_t iswholedisk);
 boolean_t check_sector_size_database(char *path, int *sector_size);
-void vdev_error(const char *fmt, ...);
+void vdev_error(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 int check_file(const char *file, boolean_t force, boolean_t isspare);
 void after_zpool_upgrade(zpool_handle_t *zhp);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* ZPOOL_UTIL_H */
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index 3d83da641ecb..57d41e5eaaf3 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -1,1870 +1,1880 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2016, 2017 Intel Corporation.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
  */
 
 /*
  * Functions to convert between a list of vdevs and an nvlist representing the
  * configuration.  Each entry in the list can be one of:
  *
  * 	Device vdevs
  * 		disk=(path=..., devid=...)
  * 		file=(path=...)
  *
  * 	Group vdevs
  * 		raidz[1|2]=(...)
  * 		mirror=(...)
  *
  * 	Hot spares
  *
  * While the underlying implementation supports it, group vdevs cannot contain
  * other group vdevs.  All userland verification of devices is contained within
  * this file.  If successful, the nvlist returned can be passed directly to the
  * kernel; we've done as much verification as possible in userland.
  *
  * Hot spares are a special case, and passed down as an array of disk vdevs, at
  * the same level as the root of the vdev tree.
  *
  * The only function exported by this file is 'make_root_vdev'.  The
  * function performs several passes:
  *
  * 	1. Construct the vdev specification.  Performs syntax validation and
  *         makes sure each device is valid.
  * 	2. Check for devices in use.  Using libblkid to make sure that no
  *         devices are also in use.  Some can be overridden using the 'force'
  *         flag, others cannot.
  * 	3. Check for replication errors if the 'force' flag is not specified.
  *         Validates that the replication level is consistent across the
  *         entire pool.
  * 	4. Call libzfs to label any whole disks with an EFI label.
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <libintl.h>
 #include <libnvpair.h>
 #include <libzutil.h>
 #include <limits.h>
 #include <sys/spa.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 #include "zpool_util.h"
 #include <sys/zfs_context.h>
 #include <sys/stat.h>
 
 /*
  * A vdev specification can produce several errors.  vdev_error() prints a
  * one-time header to stderr, worded according to 'is_force', before the
  * first message and records that in 'error_seen'.  'fmt' is printf-style.
  */
 boolean_t error_seen;
 boolean_t is_force;
 
-/*PRINTFLIKE1*/
 void
 vdev_error(const char *fmt, ...)
 {
 	va_list ap;
 
 	if (!error_seen) {
 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
 		if (!is_force)
 			(void) fprintf(stderr, gettext("use '-f' to override "
 			    "the following errors:\n"));
 		else
 			(void) fprintf(stderr, gettext("the following errors "
 			    "must be manually repaired:\n"));
 		error_seen = B_TRUE;
 	}
 
 	va_start(ap, fmt);
 	(void) vfprintf(stderr, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * Check that a file is valid, i.e. not in use by another pool.  Returns -1
  * after vdev_error() if it is in use; 0 otherwise, or if it cannot be opened.
  */
 int
 check_file(const char *file, boolean_t force, boolean_t isspare)
 {
 	char  *name;
 	int fd;
 	int ret = 0;
 	pool_state_t state;
 	boolean_t inuse;
 
 	if ((fd = open(file, O_RDONLY)) < 0)
 		return (0);
 
 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
 		const char *desc;
 
 		switch (state) {
 		case POOL_STATE_ACTIVE:
 			desc = gettext("active");
 			break;
 
 		case POOL_STATE_EXPORTED:
 			desc = gettext("exported");
 			break;
 
 		case POOL_STATE_POTENTIALLY_ACTIVE:
 			desc = gettext("potentially active");
 			break;
 
 		default:
 			desc = gettext("unknown");
 			break;
 		}
 
 		/*
 		 * Allow hot spares to be shared between pools.
 		 */
 		if (state == POOL_STATE_SPARE && isspare) {
 			free(name);
 			(void) close(fd);
 			return (0);
 		}
 		/* 'force' never overrides an active pool or a hot spare. */
 		if (state == POOL_STATE_ACTIVE ||
 		    state == POOL_STATE_SPARE || !force) {
 			switch (state) {
 			case POOL_STATE_SPARE:
 				vdev_error(gettext("%s is reserved as a hot "
 				    "spare for pool %s\n"), file, name);
 				break;
 			default:
 				vdev_error(gettext("%s is part of %s pool "
 				    "'%s'\n"), file, desc, name);
 				break;
 			}
 			ret = -1;
 		}
 
 		free(name);
 	}
 
 	(void) close(fd);
 	return (ret);
 }
 
 /*
  * Resolve 'arg' with zfs_resolve_shortname(); return 0 if the result is a
  * whole disk or can be stat'ed.  Otherwise copy 'arg' into 'path', zero
  * 'statbuf', clear '*wholedisk' and return the resolver's result, which is
  * 0 when resolution worked but stat64() failed -- NOTE(review): intended?
  */
 static int
 is_shorthand_path(const char *arg, char *path, size_t path_size,
     struct stat64 *statbuf, boolean_t *wholedisk)
 {
 	int error;
 
 	error = zfs_resolve_shortname(arg, path, path_size);
 	if (error == 0) {
 		*wholedisk = zfs_dev_is_whole_disk(path);
 		if (*wholedisk || (stat64(path, statbuf) == 0))
 			return (0);
 	}
 
 	strlcpy(path, arg, path_size);
 	memset(statbuf, 0, sizeof (*statbuf));
 	*wholedisk = B_FALSE;
 
 	return (error);
 }
 
 /*
  * Determine if the given path is a hot spare within the given configuration.
  * If no configuration is given we rely solely on the label.
  */
 static boolean_t
 is_spare(nvlist_t *config, const char *path)
 {
 	int fd;
 	pool_state_t state;
 	char *name = NULL;
 	nvlist_t *label;
 	uint64_t guid, spareguid;
 	nvlist_t *nvroot;
 	nvlist_t **spares;
 	uint_t i, nspares;
 	boolean_t inuse;
 	/* dRAID spares are accepted before the path is ever opened. */
 	if (zpool_is_draid_spare(path))
 		return (B_TRUE);
 
 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
 		return (B_FALSE);
 
 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
 	    !inuse ||
 	    state != POOL_STATE_SPARE ||
 	    zpool_read_label(fd, &label, NULL) != 0) {
 		free(name);
 		(void) close(fd);
 		return (B_FALSE);
 	}
 	free(name);
 	(void) close(fd);
 
 	if (config == NULL) {
 		nvlist_free(label);
 		return (B_TRUE);
 	}
 	/* Look for the label's guid among the spares listed in 'config'. */
 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
 	nvlist_free(label);
 
 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		for (i = 0; i < nspares; i++) {
 			verify(nvlist_lookup_uint64(spares[i],
 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
 			if (spareguid == guid)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Create a leaf vdev nvlist (path, type, and whole_disk/ashift when they
  * apply), or return NULL after printing an error.  Valid forms for a
  * leaf vdev are:
  *
  *	/dev/xxx	Complete disk path
  *	/xxx		Full path to file
  *	xxx		Shorthand for <zfs_vdev_paths>/xxx
  *	draid*		Virtual dRAID spare (only if is_primary)
  */
 static nvlist_t *
 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
 {
 	char path[MAXPATHLEN];
 	struct stat64 statbuf;
 	nvlist_t *vdev = NULL;
 	char *type = NULL;
 	boolean_t wholedisk = B_FALSE;
 	uint64_t ashift = 0;
 	int err;
 
 	/*
 	 * Determine what type of vdev this is, and put the full path into
 	 * 'path'.  We detect whether this is a device or file afterwards by
 	 * checking the st_mode of the file.
 	 */
 	if (arg[0] == '/') {
 		/*
 		 * Complete device or file path.  Exact type is determined by
 		 * examining the file descriptor afterwards.  Symbolic links
 		 * are resolved to their real paths to determine whole disk
 		 * and S_ISBLK/S_ISREG type checks.  However, we are careful
 		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
 		 * can leverage udev's persistent device labels.
 		 */
 		if (realpath(arg, path) == NULL) {
 			(void) fprintf(stderr,
 			    gettext("cannot resolve path '%s'\n"), arg);
 			return (NULL);
 		}
 
 		wholedisk = zfs_dev_is_whole_disk(path);
 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
 			(void) fprintf(stderr,
 			    gettext("cannot open '%s': %s\n"),
 			    path, strerror(errno));
 			return (NULL);
 		}
 
 		/* After whole disk check restore original passed path */
 		strlcpy(path, arg, sizeof (path));
 	} else if (zpool_is_draid_spare(arg)) {
 		if (!is_primary) {
 			(void) fprintf(stderr,
 			    gettext("cannot open '%s': dRAID spares can only "
 			    "be used to replace primary vdevs\n"), arg);
 			return (NULL);
 		}
 
 		wholedisk = B_TRUE;
 		strlcpy(path, arg, sizeof (path));
 		type = VDEV_TYPE_DRAID_SPARE;
 	} else {
 		err = is_shorthand_path(arg, path, sizeof (path),
 		    &statbuf, &wholedisk);
 		if (err != 0) {
 			/*
 			 * If we got ENOENT, then the user gave us
 			 * gibberish, so try to direct them with a
 			 * reasonable error message.  Otherwise,
 			 * regurgitate strerror().  NOTE(review): that
 			 * branch reports errno, not 'err' -- confirm.
 			 */
 			if (err == ENOENT) {
 				(void) fprintf(stderr,
 				    gettext("cannot open '%s': no such "
 				    "device in %s\n"), arg, DISK_ROOT);
 				(void) fprintf(stderr,
 				    gettext("must be a full path or "
 				    "shorthand device name\n"));
 				return (NULL);
 			} else {
 				(void) fprintf(stderr,
 				    gettext("cannot open '%s': %s\n"),
 				    path, strerror(errno));
 				return (NULL);
 			}
 		}
 	}
 
 	if (type == NULL) {
 		/*
 		 * Determine whether this is a device or a file.
 		 */
 		if (wholedisk || S_ISBLK(statbuf.st_mode)) {
 			type = VDEV_TYPE_DISK;
 		} else if (S_ISREG(statbuf.st_mode)) {
 			type = VDEV_TYPE_FILE;
 		} else {
 			fprintf(stderr, gettext("cannot use '%s': must "
 			    "be a block device or regular file\n"), path);
 			return (NULL);
 		}
 	}
 
 	/*
 	 * Finally, we have the complete device or file, and we know that it is
 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
 	 */
 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
 
 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
 		    (uint64_t)wholedisk) == 0);
 
 	/*
 	 * Apply a custom ashift.  NOTE(review): error returns leak 'vdev'.
 	 */
 	if (props != NULL) {
 		char *value = NULL;
 
 		if (nvlist_lookup_string(props,
 		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
 			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
 				(void) fprintf(stderr,
 				    gettext("ashift must be a number.\n"));
 				return (NULL);
 			}
 			if (ashift != 0 &&
 			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
 				(void) fprintf(stderr,
 				    gettext("invalid 'ashift=%" PRIu64 "' "
 				    "property: only values between %" PRId32 " "
 				    "and %" PRId32 " are allowed.\n"),
 				    ashift, ASHIFT_MIN, ASHIFT_MAX);
 				return (NULL);
 			}
 		}
 	}
 
 	/*
 	 * If the device is known to incorrectly report its physical sector
 	 * size explicitly provide the known correct value.
 	 */
 	if (ashift == 0) {
 		int sector_size;
 
 		if (check_sector_size_database(path, &sector_size) == B_TRUE)
 			ashift = highbit64(sector_size) - 1;
 	}
 
 	if (ashift > 0)
 		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
 
 	return (vdev);
 }
 
 /*
  * Go through and verify the replication level of the pool is consistent.
  * Performs the following checks:
  *
  * 	For the new spec, verifies that devices in mirrors and raidz are the
  * 	same size.
  *
  * 	If the current configuration already has inconsistent replication
  * 	levels, ignore any other potential problems in the new spec.
  *
  * 	Otherwise, make sure that the current spec (if there is one) and the new
  * 	spec have consistent replication levels.
  *
  *	If there is no current spec (create), make sure new spec has at least
  *	one general purpose vdev.
  */
 typedef struct replication_level {
 	char *zprl_type;	/* vdev type string of the toplevel vdev */
 	uint64_t zprl_children;	/* child count (1 for a leaf toplevel) */
 	uint64_t zprl_parity;	/* nparity for raidz/draid, otherwise 0 */
 } replication_level_t;
 
 #define	ZPOOL_FUZZ	(16 * 1024 * 1024)	/* tolerated size delta */
 
 /*
  * If 'a' is raidz or draid (treated alike here) and 'b' is a mirror, store
  * them in '*raidz' and '*mirror' and return B_TRUE; otherwise B_FALSE.
  */
 static boolean_t
 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
     replication_level_t **raidz, replication_level_t **mirror)
 {
 	if ((strcmp(a->zprl_type, "raidz") == 0 ||
 	    strcmp(a->zprl_type, "draid") == 0) &&
 	    strcmp(b->zprl_type, "mirror") == 0) {
 		*raidz = a;
 		*mirror = b;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Returns B_TRUE when both levels are raidz or draid, in any combination.
  */
 static boolean_t
 is_raidz_draid(replication_level_t *a, replication_level_t *b)
 {
 	if ((strcmp(a->zprl_type, "raidz") == 0 ||
 	    strcmp(a->zprl_type, "draid") == 0) &&
 	    (strcmp(b->zprl_type, "raidz") == 0 ||
 	    strcmp(b->zprl_type, "draid") == 0)) {
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Given a list of toplevel vdevs, return the current replication level in a
  * safe_malloc()'d structure.  If the config is inconsistent, NULL is returned;
  * with 'fatal' set, an error is first printed for each inconsistency found.
  */
 static replication_level_t *
 get_replication(nvlist_t *nvroot, boolean_t fatal)
 {
 	nvlist_t **top;
 	uint_t t, toplevels;
 	nvlist_t **child;
 	uint_t c, children;
 	nvlist_t *nv;
 	char *type;
 	replication_level_t lastrep = {0};
 	replication_level_t rep;
 	replication_level_t *ret;
 	replication_level_t *raidz, *mirror;
 	boolean_t dontreport;
 
 	ret = safe_malloc(sizeof (replication_level_t));
 
 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &top, &toplevels) == 0);
 
 	for (t = 0; t < toplevels; t++) {
 		uint64_t is_log = B_FALSE;
 
 		nv = top[t];
 
 		/*
 		 * For separate logs we ignore the top level vdev replication
 		 * constraints.
 		 */
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
 		if (is_log)
 			continue;
 
 		/* Ignore holes introduced by removing aux devices */
 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
 		if (strcmp(type, VDEV_TYPE_HOLE) == 0)
 			continue;
 
 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 		    &child, &children) != 0) {
 			/*
 			 * This is a 'file' or 'disk' vdev.
 			 */
 			rep.zprl_type = type;
 			rep.zprl_children = 1;
 			rep.zprl_parity = 0;
 		} else {
 			int64_t vdev_size;
 
 			/*
 			 * This is a mirror or RAID-Z vdev.  Go through and make
 			 * sure the contents are all the same (files vs. disks),
 			 * keeping track of the number of elements in the
 			 * process.
 			 *
 			 * We also check that the size of each vdev (if it can
 			 * be determined) is the same.
 			 */
 			rep.zprl_type = type;
 			rep.zprl_children = 0;
 
 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
 			    strcmp(type, VDEV_TYPE_DRAID) == 0) {
 				verify(nvlist_lookup_uint64(nv,
 				    ZPOOL_CONFIG_NPARITY,
 				    &rep.zprl_parity) == 0);
 				assert(rep.zprl_parity != 0);
 			} else {
 				rep.zprl_parity = 0;
 			}
 
 			/*
 			 * The 'dontreport' variable indicates that we've
 			 * already reported an error for this spec, so don't
 			 * bother doing it again.
 			 */
 			type = NULL;
 			dontreport = 0;
 			vdev_size = -1LL;
 			for (c = 0; c < children; c++) {
 				nvlist_t *cnv = child[c];
 				char *path;
 				struct stat64 statbuf;
 				int64_t size = -1LL;
 				char *childtype;
 				int fd, err;
 
 				rep.zprl_children++;
 
 				verify(nvlist_lookup_string(cnv,
 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
 
 				/*
 				 * If this is a replacing or spare vdev, then
 				 * get the real first child of the vdev: do this
 				 * in a loop because replacing and spare vdevs
 				 * can be nested.
 				 */
 				while (strcmp(childtype,
 				    VDEV_TYPE_REPLACING) == 0 ||
 				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
 					nvlist_t **rchild;
 					uint_t rchildren;
 
 					verify(nvlist_lookup_nvlist_array(cnv,
 					    ZPOOL_CONFIG_CHILDREN, &rchild,
 					    &rchildren) == 0);
 					assert(rchildren == 2);
 					cnv = rchild[0];
 
 					verify(nvlist_lookup_string(cnv,
 					    ZPOOL_CONFIG_TYPE,
 					    &childtype) == 0);
 				}
 
 				verify(nvlist_lookup_string(cnv,
 				    ZPOOL_CONFIG_PATH, &path) == 0);
 
 				/*
 				 * If we have a raidz/mirror that combines disks
 				 * with files, report it as an error.
 				 */
 				if (!dontreport && type != NULL &&
 				    strcmp(type, childtype) != 0) {
 					if (ret != NULL)
 						free(ret);
 					ret = NULL;
 					if (fatal)
 						vdev_error(gettext(
 						    "mismatched replication "
 						    "level: %s contains both "
 						    "files and devices\n"),
 						    rep.zprl_type);
 					else
 						return (NULL);
 					dontreport = B_TRUE;
 				}
 
 				/*
 				 * According to stat(2), the value of 'st_size'
 				 * is undefined for block devices and character
 				 * devices.  But there is no effective way to
 				 * determine the real size in userland.
 				 *
 				 * Instead, we'll take advantage of an
 				 * implementation detail of spec_size().  If the
 				 * device is currently open, then we (should)
 				 * return a valid size.
 				 *
 				 * If we still don't get a valid size (indicated
 				 * by a size of 0 or MAXOFFSET_T), then ignore
 				 * this device altogether.
 				 */
 				if ((fd = open(path, O_RDONLY)) >= 0) {
 					err = fstat64_blk(fd, &statbuf);
 					(void) close(fd);
 				} else {
 					err = stat64(path, &statbuf);
 				}
 
 				if (err != 0 ||
 				    statbuf.st_size == 0 ||
 				    statbuf.st_size == MAXOFFSET_T)
 					continue;
 
 				size = statbuf.st_size;
 
 				/*
 				 * Also make sure that devices and
 				 * slices have a consistent size.  If
 				 * they differ by a significant amount
 				 * (~16MB) then report an error.
 				 */
 				if (!dontreport &&
 				    (vdev_size != -1LL &&
 				    (llabs(size - vdev_size) >
 				    ZPOOL_FUZZ))) {
 					if (ret != NULL)
 						free(ret);
 					ret = NULL;
 					if (fatal)
 						vdev_error(gettext(
 						    "%s contains devices of "
 						    "different sizes\n"),
 						    rep.zprl_type);
 					else
 						return (NULL);
 					dontreport = B_TRUE;
 				}
 
 				type = childtype;
 				vdev_size = size;
 			}
 		}
 
 		/*
 		 * At this point, we have the replication of the last toplevel
 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
 		 * different.
 		 */
 		if (lastrep.zprl_type != NULL) {
 			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
 			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
 				/*
 				 * Accept raidz and mirror when they can
 				 * handle the same number of disk failures.
 				 */
 				if (raidz->zprl_parity !=
 				    mirror->zprl_children - 1) {
 					if (ret != NULL)
 						free(ret);
 					ret = NULL;
 					if (fatal)
 						vdev_error(gettext(
 						    "mismatched replication "
 						    "level: "
 						    "%s and %s vdevs with "
 						    "different redundancy, "
 						    "%llu vs. %llu (%llu-way) "
 						    "are present\n"),
 						    raidz->zprl_type,
 						    mirror->zprl_type,
+						    (u_longlong_t)
 						    raidz->zprl_parity,
+						    (u_longlong_t)
 						    mirror->zprl_children - 1,
+						    (u_longlong_t)
 						    mirror->zprl_children);
 					else
 						return (NULL);
 				}
 			} else if (is_raidz_draid(&lastrep, &rep)) {
 				/*
 				 * Accept raidz and draid when they can
 				 * handle the same number of disk failures.
 				 */
 				if (lastrep.zprl_parity != rep.zprl_parity) {
 					if (ret != NULL)
 						free(ret);
 					ret = NULL;
 					if (fatal)
 						vdev_error(gettext(
 						    "mismatched replication "
 						    "level: %s and %s vdevs "
 						    "with different "
 						    "redundancy, %llu vs. "
 						    "%llu are present\n"),
 						    lastrep.zprl_type,
 						    rep.zprl_type,
+						    (u_longlong_t)
 						    lastrep.zprl_parity,
+						    (u_longlong_t)
 						    rep.zprl_parity);
 					else
 						return (NULL);
 				}
 			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
 			    0) {
 				if (ret != NULL)
 					free(ret);
 				ret = NULL;
 				if (fatal)
 					vdev_error(gettext(
 					    "mismatched replication level: "
 					    "both %s and %s vdevs are "
 					    "present\n"),
 					    lastrep.zprl_type, rep.zprl_type);
 				else
 					return (NULL);
 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
 				if (ret)
 					free(ret);
 				ret = NULL;
 				if (fatal)
 					vdev_error(gettext(
 					    "mismatched replication level: "
 					    "both %llu and %llu device parity "
 					    "%s vdevs are present\n"),
+					    (u_longlong_t)
 					    lastrep.zprl_parity,
-					    rep.zprl_parity,
+					    (u_longlong_t)rep.zprl_parity,
 					    rep.zprl_type);
 				else
 					return (NULL);
 			} else if (lastrep.zprl_children != rep.zprl_children) {
 				if (ret)
 					free(ret);
 				ret = NULL;
 				if (fatal)
 					vdev_error(gettext(
 					    "mismatched replication level: "
 					    "both %llu-way and %llu-way %s "
 					    "vdevs are present\n"),
+					    (u_longlong_t)
 					    lastrep.zprl_children,
+					    (u_longlong_t)
 					    rep.zprl_children,
 					    rep.zprl_type);
 				else
 					return (NULL);
 			}
 		}
 		lastrep = rep;
 	}
 	/* NOTE(review): 'rep' is unset if every toplevel was skipped above. */
 	if (ret != NULL)
 		*ret = rep;
 
 	return (ret);
 }
 
 /*
  * Check the replication level of the vdev spec against the current pool.  If
  * the pool's own level is already inconsistent, return 0 without checking.
  * Otherwise return -1 if the new spec is self-inconsistent (reported by
  * get_replication()) or differs from the pool (reported here), else 0.
  */
 static int
 check_replication(nvlist_t *config, nvlist_t *newroot)
 {
 	nvlist_t **child;
 	uint_t	children;
 	replication_level_t *current = NULL, *new;
 	replication_level_t *raidz, *mirror;
 	int ret;
 
 	/*
 	 * If we have a current pool configuration, check to see if it's
 	 * self-consistent.  If not, simply return success.
 	 */
 	if (config != NULL) {
 		nvlist_t *nvroot;
 
 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 		    &nvroot) == 0);
 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
 			return (0);
 	}
 	/*
 	 * for spares there may be no children, and therefore no
 	 * replication level to check
 	 */
 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0) || (children == 0)) {
 		free(current);
 		return (0);
 	}
 
 	/*
 	 * If all we have is logs then there's no replication level to check.
 	 */
 	if (num_logs(newroot) == children) {
 		free(current);
 		return (0);
 	}
 
 	/*
 	 * Get the replication level of the new vdev spec, reporting any
 	 * inconsistencies found.
 	 */
 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
 		free(current);
 		return (-1);
 	}
 
 	/*
 	 * Check to see if the new vdev spec matches the replication level of
 	 * the current pool.
 	 */
 	ret = 0;
 	if (current != NULL) {
 		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
 		    is_raidz_mirror(new, current, &raidz, &mirror)) {
 			if (raidz->zprl_parity != mirror->zprl_children - 1) {
 				vdev_error(gettext(
 				    "mismatched replication level: pool and "
 				    "new vdev with different redundancy, %s "
 				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
 				    raidz->zprl_type,
 				    mirror->zprl_type,
-				    raidz->zprl_parity,
-				    mirror->zprl_children - 1,
-				    mirror->zprl_children);
+				    (u_longlong_t)raidz->zprl_parity,
+				    (u_longlong_t)mirror->zprl_children - 1,
+				    (u_longlong_t)mirror->zprl_children);
 				ret = -1;
 			}
 		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
 			vdev_error(gettext(
 			    "mismatched replication level: pool uses %s "
 			    "and new vdev is %s\n"),
 			    current->zprl_type, new->zprl_type);
 			ret = -1;
 		} else if (current->zprl_parity != new->zprl_parity) {
 			vdev_error(gettext(
 			    "mismatched replication level: pool uses %llu "
 			    "device parity and new vdev uses %llu\n"),
-			    current->zprl_parity, new->zprl_parity);
+			    (u_longlong_t)current->zprl_parity,
+			    (u_longlong_t)new->zprl_parity);
 			ret = -1;
 		} else if (current->zprl_children != new->zprl_children) {
 			vdev_error(gettext(
 			    "mismatched replication level: pool uses %llu-way "
 			    "%s and new vdev uses %llu-way %s\n"),
-			    current->zprl_children, current->zprl_type,
-			    new->zprl_children, new->zprl_type);
+			    (u_longlong_t)current->zprl_children,
+			    current->zprl_type,
+			    (u_longlong_t)new->zprl_children,
+			    new->zprl_type);
 			ret = -1;
 		}
 	}
 
 	free(new);
 	if (current != NULL)
 		free(current);
 
 	return (ret);
 }
 
 static int
 zero_label(char *path)
 {
 	const int size = 4096;	/* bytes to zero at the start of 'path' */
 	char buf[size];
 	int err, fd;		/* err holds write()'s return value */
 
 	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
 		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
 		    path, strerror(errno));
 		return (-1);
 	}
 
 	memset(buf, 0, size);
 	err = write(fd, buf, size);
 	(void) fdatasync(fd);
 	(void) close(fd);
 	/* NOTE(review): errno may have been reset by fdatasync()/close(). */
 	if (err == -1) {
 		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
 		    "of '%s': %s\n"), size, path, strerror(errno));
 		return (-1);
 	}
 
 	if (err != size) {
 		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
 		    "of '%s'\n"), err, size, path);
 		return (-1);
 	}
 	/* Exactly 'size' bytes were written. */
 	return (0);
 }
 
 /*
  * Go through and find any whole disks in the vdev specification, labelling them
  * as appropriate.  When constructing the vdev spec, we were unable to open this
  * device in order to provide a devid.  Now that we have labelled the disk and
  * know that slice 0 is valid, we can construct the devid now.
  *
  * If the disk was already labeled with an EFI label, we will have gotten the
  * devid already (because we were able to open the whole disk).  Otherwise, we
  * need to get the devid after we label the disk.
  *
  * The walk is recursive: a vdev with a ZPOOL_CONFIG_CHILDREN array is an
  * interior node whose children, spares and l2cache entries are each visited
  * in turn; a vdev without one is a leaf and only VDEV_TYPE_DISK leaves are
  * acted upon.
  *
  * Returns 0 on success.  On failure the value is not uniform: the saved
  * errno when realpath() fails, -1 when zpool_label_disk() fails, or the
  * non-zero result of zpool_label_disk_wait() / zero_label().  Callers in
  * this file only test the result against zero.
  */
 static int
 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
 {
 	nvlist_t **child;
 	uint_t c, children;
 	char *type, *path;
 	char devpath[MAXPATHLEN];
 	char udevpath[MAXPATHLEN];
 	uint64_t wholedisk;
 	struct stat64 statbuf;
 	int is_exclusive = 0;
 	int fd;
 	int ret;
 
 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
 
 	/* No children array means this nvlist describes a leaf vdev. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0) {
 
 		/* Only disk leaves need labelling; files etc. are left alone. */
 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
 			return (0);
 
 		/*
 		 * We have a disk device.  If this is a whole disk write
 		 * out the efi partition table, otherwise write zero's to
 		 * the first 4k of the partition.  This is to ensure that
 		 * libblkid will not misidentify the partition due to a
 		 * magic value left by the previous filesystem.
 		 */
 		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
 		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    &wholedisk));
 
 		if (!wholedisk) {
 			/*
 			 * Update device id string for mpath nodes (Linux only)
 			 */
 			if (is_mpath_whole_disk(path))
 				update_vdev_config_dev_strs(nv);
 
 			/*
 			 * Best effort: the result of zero_label() is
 			 * deliberately ignored for partitions.
 			 */
 			if (!is_spare(NULL, path))
 				(void) zero_label(path);
 			return (0);
 		}
 
 		/* devpath is the resolved device node; path may be a symlink. */
 		if (realpath(path, devpath) == NULL) {
 			ret = errno;
 			(void) fprintf(stderr,
 			    gettext("cannot resolve path '%s'\n"), path);
 			return (ret);
 		}
 
 		/*
 		 * Remove any previously existing symlink from a udev path to
 		 * the device before labeling the disk.  This ensures that
 		 * only newly created links are used.  Otherwise there is a
 		 * window between when udev deletes and recreates the link
 		 * during which access attempts will fail with ENOENT.
 		 */
 		strlcpy(udevpath, path, MAXPATHLEN);
 		(void) zfs_append_partition(udevpath, MAXPATHLEN);
 
 		/*
 		 * Probe with O_EXCL: a failure with EBUSY (or EPERM on
 		 * FreeBSD) is recorded as "already held exclusively".  Any
 		 * other open failure is not treated as exclusive here.
 		 */
 		fd = open(devpath, O_RDWR|O_EXCL);
 		if (fd == -1) {
 			if (errno == EBUSY)
 				is_exclusive = 1;
 #ifdef __FreeBSD__
 			if (errno == EPERM)
 				is_exclusive = 1;
 #endif
 		} else {
 			(void) close(fd);
 		}
 
 		/*
 		 * If the partition exists, contains a valid spare label,
 		 * and is opened exclusively there is no need to partition
 		 * it.  Hot spares have already been partitioned and are
 		 * held open exclusively by the kernel as a safety measure.
 		 *
 		 * If the provided path is for a /dev/disk/ device its
 		 * symbolic link will be removed, partition table created,
 		 * and then block until udev creates the new link.
 		 */
 		if (!is_exclusive && !is_spare(NULL, udevpath)) {
 			/* Last path component of the resolved device node. */
 			char *devnode = strrchr(devpath, '/') + 1;
 
 			/* 'ret' is reused as a scratch comparison result. */
 			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
 			if (ret == 0) {
 				ret = lstat64(udevpath, &statbuf);
 				if (ret == 0 && S_ISLNK(statbuf.st_mode))
 					(void) unlink(udevpath);
 			}
 
 			/*
 			 * When labeling a pool the raw device node name
 			 * is provided as it appears under /dev/.
 			 */
 			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
 				return (-1);
 
 			/*
 			 * Wait for udev to signal the device is available
 			 * by the provided path.
 			 */
 			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
 			if (ret) {
 				(void) fprintf(stderr,
 				    gettext("missing link: %s was "
 				    "partitioned but %s is missing\n"),
 				    devnode, udevpath);
 				return (ret);
 			}
 
 			/* Unlike the partition case, a failure here is fatal. */
 			ret = zero_label(udevpath);
 			if (ret)
 				return (ret);
 		}
 
 		/*
 		 * Update the path to refer to the partition.  The presence of
 		 * the 'whole_disk' field indicates to the CLI that we should
 		 * chop off the partition number when displaying the device in
 		 * future output.
 		 */
 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
 
 		/*
 		 * Update device id strings for whole disks (Linux only)
 		 */
 		update_vdev_config_dev_strs(nv);
 
 		return (0);
 	}
 
 	/* Interior vdev: recurse, stopping at the first failure. */
 	for (c = 0; c < children; c++)
 		if ((ret = make_disks(zhp, child[c])) != 0)
 			return (ret);
 
 	/* 'child' and 'children' are reused for the spare array. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if ((ret = make_disks(zhp, child[c])) != 0)
 				return (ret);
 
 	/* ... and again for the l2cache array. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if ((ret = make_disks(zhp, child[c])) != 0)
 				return (ret);
 
 	return (0);
 }
 
 /*
  * Walk the vdev tree rooted at 'nv' and report whether any leaf device is
  * in use.  Disk leaves are checked with check_device() and file leaves with
  * check_file(); leaves of any other type are never reported as in use.
  *
  * config    - pool configuration handed to is_spare() (may be NULL)
  * nv        - vdev (sub)tree to examine
  * force     - passed through to check_device() / check_file()
  * replacing - when set, a leaf that is_spare() finds in 'config' is allowed
  * isspare   - whether 'nv' itself is being added as a spare
  *
  * Returns B_TRUE if at least one leaf is in use, or if the partition path
  * for a whole disk cannot be constructed; B_FALSE otherwise.  Interior
  * nodes keep scanning after the first hit, so every leaf is visited.
  */
 static boolean_t
 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
     boolean_t replacing, boolean_t isspare)
 {
 	nvlist_t **child;
 	uint_t c, children;
 	char *type, *path;
 	int ret = 0;
 	char buf[MAXPATHLEN];
 	uint64_t wholedisk = B_FALSE;
 	boolean_t anyinuse = B_FALSE;
 
 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
 
 	/* No children array means this nvlist describes a leaf vdev. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0) {
 
 		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
 			verify(!nvlist_lookup_uint64(nv,
 			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
 
 		/*
 		 * As a generic check, we look to see if this is a replace of a
 		 * hot spare within the same pool.  If so, we allow it
 		 * regardless of what libblkid or zpool_in_use() says.
 		 */
 		if (replacing) {
 			(void) strlcpy(buf, path, sizeof (buf));
 			if (wholedisk) {
 				ret = zfs_append_partition(buf, sizeof (buf));
 				/*
 				 * This function returns boolean_t, so report
 				 * the failure as "in use" rather than leaking
 				 * a raw -1 through the boolean type.  Callers
 				 * only test for truth, so the outcome for them
 				 * is unchanged.
 				 */
 				if (ret == -1)
 					return (B_TRUE);
 			}
 
 			if (is_spare(config, buf))
 				return (B_FALSE);
 		}
 
 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
 			ret = check_device(path, force, isspare, wholedisk);
 
 		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
 			ret = check_file(path, force, isspare);
 
 		return (ret != 0);
 	}
 
 	/* Regular children are never treated as spares. */
 	for (c = 0; c < children; c++)
 		if (is_device_in_use(config, child[c], force, replacing,
 		    B_FALSE))
 			anyinuse = B_TRUE;
 
 	/* Entries of the spare array are checked with isspare set. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if (is_device_in_use(config, child[c], force, replacing,
 			    B_TRUE))
 				anyinuse = B_TRUE;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if (is_device_in_use(config, child[c], force, replacing,
 			    B_FALSE))
 				anyinuse = B_TRUE;
 
 	return (anyinuse);
 }
 
 /*
  * Parse the parity level out of a raidz or draid type string.
  *
  * A bare prefix ("raidz", "draid", or "draid:...") means single parity.
  * Otherwise the digits following the prefix must form a number without a
  * leading zero in the range 1..MAXPARITY for that type.  A raidz type must
  * end right after the number; a draid type may continue with ':'.
  *
  * Returns the parity, or zero when 'type' is neither raidz nor draid or
  * the parity cannot be determined.
  */
 static int
 get_parity(const char *type)
 {
 	const char *digits;
 	char *stop;
 	long level, limit;
 	int colon_ok;
 
 	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
 		digits = type + strlen(VDEV_TYPE_RAIDZ);
 		limit = VDEV_RAIDZ_MAXPARITY;
 		colon_ok = 0;
 	} else if (strncmp(type, VDEV_TYPE_DRAID,
 	    strlen(VDEV_TYPE_DRAID)) == 0) {
 		digits = type + strlen(VDEV_TYPE_DRAID);
 		limit = VDEV_DRAID_MAXPARITY;
 		colon_ok = 1;
 	} else {
 		/* not a parity based type */
 		return (0);
 	}
 
 	/* nothing specified: default to single parity */
 	if (*digits == '\0' || (colon_ok && *digits == ':'))
 		return (1);
 
 	/* a leading zero is never accepted */
 	if (*digits == '0')
 		return (0);
 
 	errno = 0;
 	level = strtol(digits, &stop, 10);
 	if (errno != 0)
 		return (0);
 	if (level < 1 || level > limit)
 		return (0);
 
 	/* only the permitted terminator may follow the number */
 	if (*stop != '\0' && !(colon_ok && *stop == ':'))
 		return (0);
 
 	return ((int)level);
 }
 
 /*
  * Classify 'type' as a grouping keyword and report how many devices the
  * grouping accepts.  'mindev' and 'maxdev' are optional and are only
  * written when non-NULL.
  *
  * Returns the canonical type prefix (raidz, draid, mirror, ...) for a
  * grouping keyword, or NULL when 'type' is not one.  For the special and
  * dedup allocation classes the caller's own string is handed back.
  */
 static const char *
 is_grouping(const char *type, int *mindev, int *maxdev)
 {
 	const char *canonical;
 	int required = 1;
 	int raidz, draid;
 
 	raidz = (strncmp(type, VDEV_TYPE_RAIDZ,
 	    strlen(VDEV_TYPE_RAIDZ)) == 0);
 	draid = (strncmp(type, VDEV_TYPE_DRAID,
 	    strlen(VDEV_TYPE_DRAID)) == 0);
 
 	if (raidz || draid) {
 		int parity = get_parity(type);
 
 		/* malformed parity: outputs are left untouched */
 		if (parity == 0)
 			return (NULL);
 
 		if (mindev != NULL)
 			*mindev = parity + 1;
 		if (maxdev != NULL)
 			*maxdev = 255;
 
 		return (raidz ? VDEV_TYPE_RAIDZ : VDEV_TYPE_DRAID);
 	}
 
 	/* every remaining keyword is unbounded above */
 	if (maxdev != NULL)
 		*maxdev = INT_MAX;
 
 	if (strcmp(type, "mirror") == 0) {
 		canonical = VDEV_TYPE_MIRROR;
 		required = 2;
 	} else if (strcmp(type, "spare") == 0) {
 		canonical = VDEV_TYPE_SPARE;
 	} else if (strcmp(type, "log") == 0) {
 		canonical = VDEV_TYPE_LOG;
 	} else if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
 	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
 		canonical = type;
 	} else if (strcmp(type, "cache") == 0) {
 		canonical = VDEV_TYPE_L2CACHE;
 	} else {
 		return (NULL);
 	}
 
 	if (mindev != NULL)
 		*mindev = required;
 
 	return (canonical);
 }
 
 /*
  * Extract the configuration parameters encoded in the dRAID type and
  * use them to generate a dRAID configuration.  The expected format is:
  *
  * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
  *
  * The intent is to be able to generate a good configuration when no
  * additional information is provided.  The only mandatory component
  * of the 'type' is the 'draid' prefix.  If a value is not provided
  * then reasonable defaults are used.  The optional components may
  * appear in any order but the d/s/c suffix is required.
  *
  * Valid inputs:
  * - data:     number of data devices per group (1-255)
  * - parity:   number of parity blocks per group (1-3)
  * - spares:   number of distributed spares (0-100)
  * - children: total number of devices (1-255)
  *
  * Examples:
  * - zpool create tank draid <devices...>
  * - zpool create tank draid2:8d:51c:2s <devices...>
  *
  * Parameters:
  * - nv:       nvlist that receives the nparity, ndata, nspares and
  *             ngroups values; only written on success
  * - type:     the full type string as given by the user
  * - children: number of child devices supplied for this vdev
  *
  * Returns 0 on success or EINVAL when the type string or the resulting
  * layout is invalid.  Most failures also print a diagnostic to stderr.
  */
 static int
 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
 {
 	uint64_t nparity = 1;
 	uint64_t nspares = 0;
 	uint64_t ndata = UINT64_MAX;
 	uint64_t ngroups = 1;
 	long value;
 
 	if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
 		return (EINVAL);
 
 	nparity = (uint64_t)get_parity(type);
 	if (nparity == 0)
 		return (EINVAL);
 
 	const char *p = type;
 	while ((p = strchr(p, ':')) != NULL) {
 		char *end;
 
 		p = p + 1;
 		errno = 0;
 
 		/*
 		 * The <ctype.h> classifiers take an unsigned char value;
 		 * passing a plain (possibly negative) char is undefined.
 		 * Requiring a digit here also rules out a sign, so 'value'
 		 * below can never be negative.
 		 */
 		if (!isdigit((unsigned char)p[0])) {
 			(void) fprintf(stderr, gettext("invalid dRAID "
 			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
 			    type);
 			return (EINVAL);
 		}
 
 		/* Expected non-zero value with c/d/s suffix */
 		value = strtol(p, &end, 10);
 		char suffix = tolower((unsigned char)*end);
 		if (errno != 0 ||
 		    (suffix != 'c' && suffix != 'd' && suffix != 's')) {
 			(void) fprintf(stderr, gettext("invalid dRAID "
 			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
 			    type);
 			return (EINVAL);
 		}
 
 		if (suffix == 'c') {
 			if ((uint64_t)value != children) {
 				fprintf(stderr,
 				    gettext("invalid number of dRAID children; "
 				    "%llu required but %llu provided\n"),
 				    (u_longlong_t)value,
 				    (u_longlong_t)children);
 				return (EINVAL);
 			}
 		} else if (suffix == 'd') {
 			ndata = (uint64_t)value;
 		} else if (suffix == 's') {
 			nspares = (uint64_t)value;
 		} else {
 			verify(0); /* Unreachable */
 		}
 	}
 
 	/*
 	 * When a specific number of data disks is not provided limit a
 	 * redundancy group to 8 data disks.  This value was selected to
 	 * provide a reasonable tradeoff between capacity and performance.
 	 */
 	if (ndata == UINT64_MAX) {
 		if (children > nspares + nparity) {
 			ndata = MIN(children - nspares - nparity, 8);
 		} else {
 			fprintf(stderr, gettext("request number of "
 			    "distributed spares %llu and parity level %llu\n"
 			    "leaves no disks available for data\n"),
 			    (u_longlong_t)nspares, (u_longlong_t)nparity);
 			return (EINVAL);
 		}
 	}
 
 	/* Verify the maximum allowed group size is never exceeded. */
 	if (ndata == 0 || (ndata + nparity > children - nspares)) {
 		/*
 		 * Report zero rather than a wrapped unsigned value when
 		 * spares plus parity already consume every child.
 		 */
 		uint64_t avail = 0;
 
 		if (children > nspares + nparity)
 			avail = children - nspares - nparity;
 
 		fprintf(stderr, gettext("requested number of dRAID data "
 		    "disks per group %llu is too high,\nat most %llu disks "
 		    "are available for data\n"), (u_longlong_t)ndata,
 		    (u_longlong_t)avail);
 		return (EINVAL);
 	}
 
 	if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
 		fprintf(stderr,
 		    gettext("invalid dRAID parity level %llu; must be "
 		    "between 1 and %d\n"), (u_longlong_t)nparity,
 		    VDEV_DRAID_MAXPARITY);
 		return (EINVAL);
 	}
 
 	/*
 	 * Verify the requested number of spares can be satisfied.
 	 * An arbitrary limit of 100 distributed spares is applied.
 	 */
 	if (nspares > 100 || nspares > (children - (ndata + nparity))) {
 		fprintf(stderr,
 		    gettext("invalid number of dRAID spares %llu; additional "
 		    "disks would be required\n"), (u_longlong_t)nspares);
 		return (EINVAL);
 	}
 
 	/*
 	 * Verify the requested number children is sufficient.  The two
 	 * subtraction based checks above can both wrap around when the
 	 * request exceeds the number of children (e.g. draid:8d:10s with
 	 * four disks), so this addition based check is the one that must
 	 * reject the configuration.  Falling through would reach the
 	 * ngroups loop below with a wrapped divisor.
 	 */
 	if (children < (ndata + nparity + nspares)) {
 		fprintf(stderr, gettext("%llu disks were provided, but at "
 		    "least %llu disks are required for this config\n"),
 		    (u_longlong_t)children,
 		    (u_longlong_t)(ndata + nparity + nspares));
 		return (EINVAL);
 	}
 
 	if (children > VDEV_DRAID_MAX_CHILDREN) {
 		fprintf(stderr, gettext("%llu disks were provided, but "
 		    "dRAID only supports up to %u disks\n"),
 		    (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
 		return (EINVAL);
 	}
 
 	/*
 	 * Calculate the minimum number of groups required to fill a slice.
 	 * This is the LCM of the stripe width (ndata + nparity) and the
 	 * number of data drives (children - nspares).
 	 */
 	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
 		ngroups++;
 
 	/* Store the basic dRAID configuration. */
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
 
 	return (0);
 }
 
 /*
  * Construct a syntactically valid vdev specification,
  * and ensure that all devices and files exist and can be opened.
  *
  * 'argv' is consumed left to right.  A grouping keyword (mirror, raidz,
  * draid, spare, cache) collects the arguments that follow it as children
  * until the next keyword; 'log', 'special' and 'dedup' only switch the
  * allocation class applied to the vdevs that follow; anything else is
  * handed to make_leaf_vdev() as a single device.
  *
  * Returns the new root nvlist, or NULL after printing a diagnostic when
  * the specification is invalid.  The intermediate arrays are released on
  * both the success and the error path; the returned nvlist holds whatever
  * nvlist_add_nvlist_array() stored in it.
  */
 static nvlist_t *
 construct_spec(nvlist_t *props, int argc, char **argv)
 {
 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
 	const char *type, *fulltype;
 	boolean_t is_log, is_special, is_dedup, is_spare;
 	boolean_t seen_logs;
 
 	top = NULL;
 	toplevels = 0;
 	spares = NULL;
 	l2cache = NULL;
 	nspares = 0;
 	nlogs = 0;
 	nl2cache = 0;
 	is_log = is_special = is_dedup = is_spare = B_FALSE;
 	seen_logs = B_FALSE;
 	nvroot = NULL;
 
 	while (argc > 0) {
 		fulltype = argv[0];
 		nv = NULL;
 
 		/*
 		 * If it's a mirror, raidz, or draid the subsequent arguments
 		 * are its leaves -- until we encounter the next mirror,
 		 * raidz or draid.
 		 */
 		if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
 			nvlist_t **child = NULL;
 			int c, children = 0;
 
 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
 				if (spares != NULL) {
 					(void) fprintf(stderr,
 					    gettext("invalid vdev "
 					    "specification: 'spare' can be "
 					    "specified only once\n"));
 					goto spec_out;
 				}
 				is_spare = B_TRUE;
 				is_log = is_special = is_dedup = B_FALSE;
 			}
 
 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
 				if (seen_logs) {
 					(void) fprintf(stderr,
 					    gettext("invalid vdev "
 					    "specification: 'log' can be "
 					    "specified only once\n"));
 					goto spec_out;
 				}
 				seen_logs = B_TRUE;
 				is_log = B_TRUE;
 				is_special = is_dedup = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				/*
 				 * A log is not a real grouping device.
 				 * We just set is_log and continue.
 				 */
 				continue;
 			}
 
 			/* Like 'log', these only select the class. */
 			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
 				is_special = B_TRUE;
 				is_log = is_dedup = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				continue;
 			}
 
 			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
 				is_dedup = B_TRUE;
 				is_log = is_special = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				continue;
 			}
 
 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
 				if (l2cache != NULL) {
 					(void) fprintf(stderr,
 					    gettext("invalid vdev "
 					    "specification: 'cache' can be "
 					    "specified only once\n"));
 					goto spec_out;
 				}
 				is_log = is_special = B_FALSE;
 				is_dedup = is_spare = B_FALSE;
 			}
 
 			if (is_log || is_special || is_dedup) {
 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
 					/* Name the class actually in effect. */
 					(void) fprintf(stderr,
 					    gettext("invalid vdev "
 					    "specification: unsupported '%s' "
 					    "device: %s\n"), is_log ? "log" :
 					    is_special ? "special" : "dedup",
 					    type);
 					goto spec_out;
 				}
 				/*
 				 * Only real log vdevs count here.  Counting
 				 * special or dedup mirrors as well would let
 				 * "log special mirror a b" slip past the
 				 * "log requires at least 1 device" check.
 				 */
 				if (is_log)
 					nlogs++;
 			}
 
 			for (c = 1; c < argc; c++) {
 				if (is_grouping(argv[c], NULL, NULL) != NULL)
 					break;
 
 				children++;
 				child = realloc(child,
 				    children * sizeof (nvlist_t *));
 				if (child == NULL)
 					zpool_no_memory();
 				if ((nv = make_leaf_vdev(props, argv[c],
 				    !(is_log || is_special || is_dedup ||
 				    is_spare))) == NULL) {
 					/*
 					 * The newest slot was never filled,
 					 * so only the earlier ones are freed.
 					 */
 					for (c = 0; c < children - 1; c++)
 						nvlist_free(child[c]);
 					free(child);
 					goto spec_out;
 				}
 
 				child[children - 1] = nv;
 			}
 
 			if (children < mindev) {
 				(void) fprintf(stderr, gettext("invalid vdev "
 				    "specification: %s requires at least %d "
 				    "devices\n"), argv[0], mindev);
 				for (c = 0; c < children; c++)
 					nvlist_free(child[c]);
 				free(child);
 				goto spec_out;
 			}
 
 			if (children > maxdev) {
 				(void) fprintf(stderr, gettext("invalid vdev "
 				    "specification: %s supports no more than "
 				    "%d devices\n"), argv[0], maxdev);
 				for (c = 0; c < children; c++)
 					nvlist_free(child[c]);
 				free(child);
 				goto spec_out;
 			}
 
 			/* 'c' is the keyword plus the leaves consumed. */
 			argc -= c;
 			argv += c;
 
 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
 				spares = child;
 				nspares = children;
 				continue;
 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
 				l2cache = child;
 				nl2cache = children;
 				continue;
 			} else {
 				/* create a top-level vdev with children */
 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
 				    0) == 0);
 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
 				    type) == 0);
 				verify(nvlist_add_uint64(nv,
 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
 				if (is_log) {
 					verify(nvlist_add_string(nv,
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
 					    VDEV_ALLOC_BIAS_LOG) == 0);
 				}
 				if (is_special) {
 					verify(nvlist_add_string(nv,
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
 					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
 				}
 				if (is_dedup) {
 					verify(nvlist_add_string(nv,
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
 					    VDEV_ALLOC_BIAS_DEDUP) == 0);
 				}
 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
 					/* is_grouping() set mindev to parity + 1 */
 					verify(nvlist_add_uint64(nv,
 					    ZPOOL_CONFIG_NPARITY,
 					    mindev - 1) == 0);
 				}
 				if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
 					if (draid_config_by_type(nv,
 					    fulltype, children) != 0) {
 						for (c = 0; c < children; c++)
 							nvlist_free(child[c]);
 						free(child);
 						/*
 						 * 'nv' has not been stored
 						 * in 'top' yet, so spec_out
 						 * would not release it.
 						 */
 						nvlist_free(nv);
 						goto spec_out;
 					}
 				}
 				verify(nvlist_add_nvlist_array(nv,
 				    ZPOOL_CONFIG_CHILDREN, child,
 				    children) == 0);
 
 				for (c = 0; c < children; c++)
 					nvlist_free(child[c]);
 				free(child);
 			}
 		} else {
 			/*
 			 * We have a device.  Pass off to make_leaf_vdev() to
 			 * construct the appropriate nvlist describing the vdev.
 			 */
 			if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
 			    is_special || is_dedup || is_spare))) == NULL)
 				goto spec_out;
 
 			verify(nvlist_add_uint64(nv,
 			    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
 			if (is_log) {
 				verify(nvlist_add_string(nv,
 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
 				    VDEV_ALLOC_BIAS_LOG) == 0);
 				nlogs++;
 			}
 
 			if (is_special) {
 				verify(nvlist_add_string(nv,
 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
 				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
 			}
 			if (is_dedup) {
 				verify(nvlist_add_string(nv,
 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
 				    VDEV_ALLOC_BIAS_DEDUP) == 0);
 			}
 			argc--;
 			argv++;
 		}
 
 		/* Either branch above left a finished top-level vdev in nv. */
 		toplevels++;
 		top = realloc(top, toplevels * sizeof (nvlist_t *));
 		if (top == NULL)
 			zpool_no_memory();
 		top[toplevels - 1] = nv;
 	}
 
 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
 		(void) fprintf(stderr, gettext("invalid vdev "
 		    "specification: at least one toplevel vdev must be "
 		    "specified\n"));
 		goto spec_out;
 	}
 
 	if (seen_logs && nlogs == 0) {
 		(void) fprintf(stderr, gettext("invalid vdev specification: "
 		    "log requires at least 1 device\n"));
 		goto spec_out;
 	}
 
 	/*
 	 * Finally, create nvroot and add all top-level vdevs to it.
 	 */
 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    top, toplevels) == 0);
 	if (nspares != 0)
 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    spares, nspares) == 0);
 	if (nl2cache != 0)
 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 		    l2cache, nl2cache) == 0);
 
 	/* Shared exit: nvroot is still NULL when reached via an error. */
 spec_out:
 	for (t = 0; t < toplevels; t++)
 		nvlist_free(top[t]);
 	for (t = 0; t < nspares; t++)
 		nvlist_free(spares[t]);
 	for (t = 0; t < nl2cache; t++)
 		nvlist_free(l2cache[t]);
 
 	free(spares);
 	free(l2cache);
 	free(top);
 
 	return (nvroot);
 }
 
 /*
  * Build the optional device specification for a pool split and hand it to
  * zpool_vdev_split().  When devices are named on the command line they are
  * turned into a vdev tree, labelled unless this is a dry run, and rejected
  * if any top-level entry carries a grouping keyword as its path.
  *
  * Returns the new root nvlist, or NULL on failure.
  */
 nvlist_t *
 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
     splitflags_t flags, int argc, char **argv)
 {
 	nvlist_t *newroot = NULL;
 
 	if (argc > 0) {
 		nvlist_t **kids;
 		uint_t nkids, i;
 
 		newroot = construct_spec(props, argc, argv);
 		if (newroot == NULL) {
 			(void) fprintf(stderr, gettext("Unable to build a "
 			    "pool from the specified devices\n"));
 			return (NULL);
 		}
 
 		if (!flags.dryrun && make_disks(zhp, newroot) != 0)
 			goto fail;
 
 		/* avoid any tricks in the spec */
 		verify(nvlist_lookup_nvlist_array(newroot,
 		    ZPOOL_CONFIG_CHILDREN, &kids, &nkids) == 0);
 		for (i = 0; i < nkids; i++) {
 			const char *grouping;
 			char *devpath;
 
 			verify(nvlist_lookup_string(kids[i],
 			    ZPOOL_CONFIG_PATH, &devpath) == 0);
 
 			/* the device limits are of no interest here */
 			grouping = is_grouping(devpath, NULL, NULL);
 			if (grouping == NULL)
 				continue;
 
 			(void) fprintf(stderr, gettext("Cannot use "
 			    "'%s' as a device for splitting\n"), grouping);
 			goto fail;
 		}
 	}
 
 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0)
 		goto fail;
 
 	return (newroot);
 
 fail:
 	nvlist_free(newroot);
 	return (NULL);
 }
 
 /*
  * Count the top-level vdevs under 'nvroot' that are neither flagged as a
  * log nor carry an allocation bias entry.
  */
 static int
 num_normal_vdevs(nvlist_t *nvroot)
 {
 	nvlist_t **tops;
 	uint_t ntops, idx;
 	uint_t count = 0;
 
 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &tops, &ntops) == 0);
 
 	for (idx = 0; idx < ntops; idx++) {
 		uint64_t is_log = B_FALSE;
 
 		/* a missing IS_LOG entry leaves the default in place */
 		(void) nvlist_lookup_uint64(tops[idx], ZPOOL_CONFIG_IS_LOG,
 		    &is_log);
 
 		if (!is_log &&
 		    !nvlist_exists(tops[idx], ZPOOL_CONFIG_ALLOCATION_BIAS))
 			count++;
 	}
 
 	return (count);
 }
 
 /*
  * Get and validate the contents of the given vdev specification.  This
  * ensures that the nvlist returned is well-formed, that all the devices
  * exist, and that they are not currently in use by any other known
  * consumer.  When 'zhp' is given, the current configuration of that pool
  * is fetched and used to perform additional checks when adding devices to
  * an existing pool, such as changing the replication level of the pool.
  * 'zhp' can be NULL to indicate that this is a new pool.  The 'force' flag
  * controls whether devices should be forcefully added, even if they
  * appear in use.
  *
  * Returns the validated root nvlist, or NULL on any failure.
  */
 nvlist_t *
 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
 {
 	nvlist_t *poolconfig = NULL;
 	nvlist_t *newroot;
 
 	is_force = force;
 
 	/*
 	 * Build the specification first.  Success means the syntax is
 	 * valid and every device could be opened.
 	 */
 	newroot = construct_spec(props, argc, argv);
 	if (newroot == NULL)
 		return (NULL);
 
 	if (zhp != NULL) {
 		poolconfig = zpool_get_config(zhp, NULL);
 		if (poolconfig == NULL)
 			goto fail;
 	}
 
 	/*
 	 * Make sure no device is shared with another subsystem.  This is
 	 * done even with 'force', since some uses (such as a dedicated
 	 * dump device) cannot be overridden by '-f'.
 	 */
 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE))
 		goto fail;
 
 	/*
 	 * Compare replication levels, including those of the existing
 	 * pool if there is one, and report any mismatch.
 	 */
 	if (check_rep && check_replication(poolconfig, newroot) != 0)
 		goto fail;
 
 	/*
 	 * A brand new pool needs at least one normal vdev.
 	 */
 	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
 		vdev_error(gettext("at least one general top-level vdev must "
 		    "be specified\n"));
 		goto fail;
 	}
 
 	/*
 	 * Label any whole disks found in the specification.
 	 */
 	if (!dryrun && make_disks(zhp, newroot) != 0)
 		goto fail;
 
 	return (newroot);
 
 fail:
 	nvlist_free(newroot);
 	return (NULL);
 }
diff --git a/include/libuutil.h b/include/libuutil.h
index 1d179945cca1..cadc20d2d8f3 100644
--- a/include/libuutil.h
+++ b/include/libuutil.h
@@ -1,356 +1,359 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_LIBUUTIL_H
 #define	_LIBUUTIL_H
 
 #include <sys/types.h>
 #include <stdarg.h>
 #include <stdio.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Standard flags codes.
  */
 #define	UU_DEFAULT		0
 
 /*
  * Standard error codes.
  */
 #define	UU_ERROR_NONE		0	/* no error */
 #define	UU_ERROR_INVALID_ARGUMENT 1	/* invalid argument */
 #define	UU_ERROR_UNKNOWN_FLAG	2	/* passed flag invalid */
 #define	UU_ERROR_NO_MEMORY	3	/* out of memory */
 #define	UU_ERROR_CALLBACK_FAILED 4	/* callback-initiated error */
 #define	UU_ERROR_NOT_SUPPORTED	5	/* operation not supported */
 #define	UU_ERROR_EMPTY		6	/* no value provided */
 #define	UU_ERROR_UNDERFLOW	7	/* value is too small */
 #define	UU_ERROR_OVERFLOW	8	/* value is too large */
 #define	UU_ERROR_INVALID_CHAR	9	/* value contains unexpected char */
 #define	UU_ERROR_INVALID_DIGIT	10	/* value contains digit not in base */
 
 #define	UU_ERROR_SYSTEM		99	/* underlying system error */
 #define	UU_ERROR_UNKNOWN	100	/* error status not known */
 
 /*
  * Standard program exit codes.
  */
 #define	UU_EXIT_OK	(*(uu_exit_ok()))
 #define	UU_EXIT_FATAL	(*(uu_exit_fatal()))
 #define	UU_EXIT_USAGE	(*(uu_exit_usage()))
 
 /*
  * Exit status profiles.
  */
 #define	UU_PROFILE_DEFAULT	0
 #define	UU_PROFILE_LAUNCHER	1
 
 /*
  * Error reporting functions.
  */
 uint32_t uu_error(void);
 const char *uu_strerror(uint32_t);
 
 /*
  * Program notification functions.
  */
 extern void uu_alt_exit(int);
 extern const char *uu_setpname(char *);
 extern const char *uu_getpname(void);
-/*PRINTFLIKE1*/
-extern void uu_warn(const char *, ...);
-extern void uu_vwarn(const char *, va_list);
-/*PRINTFLIKE1*/
-extern void uu_die(const char *, ...) __NORETURN;
-extern void uu_vdie(const char *, va_list) __NORETURN;
-/*PRINTFLIKE2*/
-extern void uu_xdie(int, const char *, ...) __NORETURN;
-extern void uu_vxdie(int, const char *, va_list) __NORETURN;
+extern void uu_warn(const char *, ...)
+    __attribute__((format(printf, 1, 2)));
+extern void uu_vwarn(const char *, va_list)
+    __attribute__((format(printf, 1, 0)));
+extern void uu_die(const char *, ...)
+    __attribute__((format(printf, 1, 2))) __NORETURN;
+extern void uu_vdie(const char *, va_list)
+    __attribute__((format(printf, 1, 0))) __NORETURN;
+extern void uu_xdie(int, const char *, ...)
+    __attribute__((format(printf, 2, 3))) __NORETURN;
+extern void uu_vxdie(int, const char *, va_list)
+    __attribute__((format(printf, 2, 0))) __NORETURN;
 
 /*
  * Exit status functions (not to be used directly)
  */
 extern int *uu_exit_ok(void);
 extern int *uu_exit_fatal(void);
 extern int *uu_exit_usage(void);
 
 /*
  * Identifier test flags and function.
  */
 #define	UU_NAME_DOMAIN		0x1	/* allow SUNW, or com.sun, prefix */
 #define	UU_NAME_PATH		0x2	/* allow '/'-delimited paths */
 
 int uu_check_name(const char *, uint_t);
 
 /*
  * Convenience functions.
  */
 #define	UU_NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
 
-/*PRINTFLIKE1*/
-extern char *uu_msprintf(const char *format, ...);
+extern char *uu_msprintf(const char *format, ...)
+    __attribute__((format(printf, 1, 2)));
 extern void *uu_zalloc(size_t);
 extern char *uu_strdup(const char *);
 extern void uu_free(void *);
 
 extern boolean_t uu_strcaseeq(const char *a, const char *b);
 extern boolean_t uu_streq(const char *a, const char *b);
 extern char *uu_strndup(const char *s, size_t n);
 extern boolean_t uu_strbw(const char *a, const char *b);
 extern void *uu_memdup(const void *buf, size_t sz);
 
 /*
  * Comparison function type definition.
  *   Developers should be careful in their use of the _private argument. If you
  *   break interface guarantees, you get undefined behavior.
  */
 typedef int uu_compare_fn_t(const void *__left, const void *__right,
     void *__private);
 
 /*
  * Walk variant flags.
  *   A data structure need not provide support for all variants and
  *   combinations.  Refer to the appropriate documentation.
  */
 #define	UU_WALK_ROBUST		0x00000001	/* walk can survive removes */
 #define	UU_WALK_REVERSE		0x00000002	/* reverse walk order */
 
 #define	UU_WALK_PREORDER	0x00000010	/* walk tree in pre-order */
 #define	UU_WALK_POSTORDER	0x00000020	/* walk tree in post-order */
 
 /*
  * Walk callback function return codes.
  */
 #define	UU_WALK_ERROR		-1
 #define	UU_WALK_NEXT		0
 #define	UU_WALK_DONE		1
 
 /*
  * Walk callback function type definition.
  */
 typedef int uu_walk_fn_t(void *_elem, void *_private);
 
 /*
  * lists: opaque structures
  */
 typedef struct uu_list_pool uu_list_pool_t;
 typedef struct uu_list uu_list_t;
 
 typedef struct uu_list_node {
 	uintptr_t uln_opaque[2];
 } uu_list_node_t;
 
 typedef struct uu_list_walk uu_list_walk_t;
 
 typedef uintptr_t uu_list_index_t;
 
 /*
  * lists: interface
  *
  * basic usage:
  *	typedef struct foo {
  *		...
  *		uu_list_node_t foo_node;
  *		...
  *	} foo_t;
  *
  *	static int
  *	foo_compare(void *l_arg, void *r_arg, void *private)
  *	{
  *		foo_t *l = l_arg;
  *		foo_t *r = r_arg;
  *
  *		if (... l greater than r ...)
  *			return (1);
  *		if (... l less than r ...)
  *			return (-1);
  *		return (0);
  *	}
  *
  *	...
  *		// at initialization time
  *		foo_pool = uu_list_pool_create("foo_pool",
  *		    sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare,
  *		    debugging? 0 : UU_LIST_POOL_DEBUG);
  *	...
  */
 uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t,
     uu_compare_fn_t *, uint32_t);
 #define	UU_LIST_POOL_DEBUG	0x00000001
 
 void uu_list_pool_destroy(uu_list_pool_t *);
 
 /*
  * usage:
  *
  *	foo_t *a;
  *	a = malloc(sizeof (*a));
  *	uu_list_node_init(a, &a->foo_list, pool);
  *	...
  *	uu_list_node_fini(a, &a->foo_list, pool);
  *	free(a);
  */
 void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *);
 void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *);
 
 uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t);
 #define	UU_LIST_DEBUG	0x00000001
 #define	UU_LIST_SORTED	0x00000002	/* list is sorted */
 
 void uu_list_destroy(uu_list_t *);	/* list must be empty */
 
 size_t uu_list_numnodes(uu_list_t *);
 
 void *uu_list_first(uu_list_t *);
 void *uu_list_last(uu_list_t *);
 
 void *uu_list_next(uu_list_t *, void *);
 void *uu_list_prev(uu_list_t *, void *);
 
 int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t);
 
 uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t);
 void *uu_list_walk_next(uu_list_walk_t *);
 void uu_list_walk_end(uu_list_walk_t *);
 
 void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *);
 void uu_list_insert(uu_list_t *, void *, uu_list_index_t);
 
 void *uu_list_nearest_next(uu_list_t *, uu_list_index_t);
 void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t);
 
 void *uu_list_teardown(uu_list_t *, void **);
 
 void uu_list_remove(uu_list_t *, void *);
 
 /*
  * lists: interfaces for non-sorted lists only
  */
 int uu_list_insert_before(uu_list_t *, void *_target, void *_elem);
 int uu_list_insert_after(uu_list_t *, void *_target, void *_elem);
 
 /*
  * avl trees: opaque structures
  */
 typedef struct uu_avl_pool uu_avl_pool_t;
 typedef struct uu_avl uu_avl_t;
 
 typedef struct uu_avl_node {
 #ifdef _LP64
 	uintptr_t uan_opaque[3];
 #else
 	uintptr_t uan_opaque[4];
 #endif
 } uu_avl_node_t;
 
 typedef struct uu_avl_walk uu_avl_walk_t;
 
 typedef uintptr_t uu_avl_index_t;
 
 /*
  * avl trees: interface
  *
  * basic usage:
  *	typedef struct foo {
  *		...
  *		uu_avl_node_t foo_node;
  *		...
  *	} foo_t;
  *
  *	static int
  *	foo_compare(void *l_arg, void *r_arg, void *private)
  *	{
  *		foo_t *l = l_arg;
  *		foo_t *r = r_arg;
  *
  *		if (... l greater than r ...)
  *			return (1);
  *		if (... l less than r ...)
  *			return (-1);
  *		return (0);
  *	}
  *
  *	...
  *		// at initialization time
  *		foo_pool = uu_avl_pool_create("foo_pool",
  *		    sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare,
  *		    debugging? 0 : UU_AVL_POOL_DEBUG);
  *	...
  */
 uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t,
     uu_compare_fn_t *, uint32_t);
 #define	UU_AVL_POOL_DEBUG	0x00000001
 
 void uu_avl_pool_destroy(uu_avl_pool_t *);
 
 /*
  * usage:
  *
  *	foo_t *a;
  *	a = malloc(sizeof (*a));
  *	uu_avl_node_init(a, &a->foo_avl, pool);
  *	...
  *	uu_avl_node_fini(a, &a->foo_avl, pool);
  *	free(a);
  */
 void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *);
 void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *);
 
 uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t);
 #define	UU_AVL_DEBUG	0x00000001
 
 void uu_avl_destroy(uu_avl_t *);	/* tree must be empty */
 
 size_t uu_avl_numnodes(uu_avl_t *);
 
 void *uu_avl_first(uu_avl_t *);
 void *uu_avl_last(uu_avl_t *);
 
 void *uu_avl_next(uu_avl_t *, void *);
 void *uu_avl_prev(uu_avl_t *, void *);
 
 int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t);
 
 uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t);
 void *uu_avl_walk_next(uu_avl_walk_t *);
 void uu_avl_walk_end(uu_avl_walk_t *);
 
 void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *);
 void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t);
 
 void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t);
 void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t);
 
 void *uu_avl_teardown(uu_avl_t *, void **);
 
 void uu_avl_remove(uu_avl_t *, void *);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _LIBUUTIL_H */
diff --git a/include/libuutil_impl.h b/include/libuutil_impl.h
index 50d8e012d5f2..753bbff2461d 100644
--- a/include/libuutil_impl.h
+++ b/include/libuutil_impl.h
@@ -1,175 +1,174 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_LIBUUTIL_IMPL_H
 #define	_LIBUUTIL_IMPL_H
 
 
 
 #include <libuutil.h>
 #include <pthread.h>
 
 #include <sys/avl_impl.h>
 #include <sys/byteorder.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 void uu_set_error(uint_t);
 
 
-/*PRINTFLIKE1*/
-void uu_panic(const char *format, ...);
+void uu_panic(const char *format, ...) __attribute__((format(printf, 1, 2)));
 
 
 /*
  * For debugging purposes, libuutil keeps around linked lists of all uu_lists
  * and uu_avls, along with pointers to their parents.  These can cause false
  * negatives when looking for memory leaks, so we encode the pointers by
  * storing them with swapped endianness;  this is not perfect, but it's about
  * the best we can do without wasting a lot of space.
  */
 #ifdef _LP64
 #define	UU_PTR_ENCODE(ptr)		BSWAP_64((uintptr_t)(void *)(ptr))
 #else
 #define	UU_PTR_ENCODE(ptr)		BSWAP_32((uintptr_t)(void *)(ptr))
 #endif
 
 #define	UU_PTR_DECODE(ptr)		((void *)UU_PTR_ENCODE(ptr))
 
 /*
  * uu_list structures
  */
 typedef struct uu_list_node_impl {
 	struct uu_list_node_impl *uln_next;
 	struct uu_list_node_impl *uln_prev;
 } uu_list_node_impl_t;
 
 struct uu_list_walk {
 	uu_list_walk_t	*ulw_next;
 	uu_list_walk_t	*ulw_prev;
 
 	uu_list_t	*ulw_list;
 	int8_t		ulw_dir;
 	uint8_t		ulw_robust;
 	uu_list_node_impl_t *ulw_next_result;
 };
 
 struct uu_list {
 	uintptr_t	ul_next_enc;
 	uintptr_t	ul_prev_enc;
 
 	uu_list_pool_t	*ul_pool;
 	uintptr_t	ul_parent_enc;	/* encoded parent pointer */
 	size_t		ul_offset;
 	size_t		ul_numnodes;
 	uint8_t		ul_debug;
 	uint8_t		ul_sorted;
 	uint8_t		ul_index;	/* mark for uu_list_index_ts */
 
 	uu_list_node_impl_t ul_null_node;
 	uu_list_walk_t	ul_null_walk;	/* for robust walkers */
 };
 
 #define	UU_LIST_PTR(ptr)		((uu_list_t *)UU_PTR_DECODE(ptr))
 
 #define	UU_LIST_POOL_MAXNAME	64
 
 struct uu_list_pool {
 	uu_list_pool_t	*ulp_next;
 	uu_list_pool_t	*ulp_prev;
 
 	char		ulp_name[UU_LIST_POOL_MAXNAME];
 	size_t		ulp_nodeoffset;
 	size_t		ulp_objsize;
 	uu_compare_fn_t	*ulp_cmp;
 	uint8_t		ulp_debug;
 	uint8_t		ulp_last_index;
 	pthread_mutex_t	ulp_lock;		/* protects null_list */
 	uu_list_t	ulp_null_list;
 };
 
 /*
  * uu_avl structures
  */
 typedef struct avl_node		uu_avl_node_impl_t;
 
 struct uu_avl_walk {
 	uu_avl_walk_t	*uaw_next;
 	uu_avl_walk_t	*uaw_prev;
 
 	uu_avl_t	*uaw_avl;
 	void		*uaw_next_result;
 	int8_t		uaw_dir;
 	uint8_t		uaw_robust;
 };
 
 struct uu_avl {
 	uintptr_t	ua_next_enc;
 	uintptr_t	ua_prev_enc;
 
 	uu_avl_pool_t	*ua_pool;
 	uintptr_t	ua_parent_enc;
 	uint8_t		ua_debug;
 	uint8_t		ua_index;	/* mark for uu_avl_index_ts */
 
 	struct avl_tree	ua_tree;
 	uu_avl_walk_t	ua_null_walk;
 };
 
 #define	UU_AVL_PTR(x)		((uu_avl_t *)UU_PTR_DECODE(x))
 
 #define	UU_AVL_POOL_MAXNAME	64
 
 struct uu_avl_pool {
 	uu_avl_pool_t	*uap_next;
 	uu_avl_pool_t	*uap_prev;
 
 	char		uap_name[UU_AVL_POOL_MAXNAME];
 	size_t		uap_nodeoffset;
 	size_t		uap_objsize;
 	uu_compare_fn_t	*uap_cmp;
 	uint8_t		uap_debug;
 	uint8_t		uap_last_index;
 	pthread_mutex_t	uap_lock;		/* protects null_avl */
 	uu_avl_t	uap_null_avl;
 };
 
 /*
  * atfork() handlers
  */
 void uu_avl_lockup(void);
 void uu_avl_release(void);
 
 void uu_list_lockup(void);
 void uu_list_release(void);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _LIBUUTIL_IMPL_H */
diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h
index 7109d42ffbb6..9970443103bf 100644
--- a/include/os/freebsd/spl/sys/ccompile.h
+++ b/include/os/freebsd/spl/sys/ccompile.h
@@ -1,284 +1,199 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_CCOMPILE_H
 #define	_SYS_CCOMPILE_H
 
 /*
  * This file contains definitions designed to enable different compilers
  * to be used harmoniously on Solaris systems.
  */
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-/*
- * Allow for version tests for compiler bugs and features.
- */
-#if defined(__GNUC__)
-#define	__GNUC_VERSION	\
-	(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define	__GNUC_VERSION	0
-#endif
-
-#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__)
-
-#if 0
-/*
- * analogous to lint's PRINTFLIKEn
- */
-#define	__sun_attr___PRINTFLIKE__(__n)	\
-		__attribute__((__format__(printf, __n, (__n)+1)))
-#define	__sun_attr___VPRINTFLIKE__(__n)	\
-		__attribute__((__format__(printf, __n, 0)))
-
-#define	__sun_attr___KPRINTFLIKE__	__sun_attr___PRINTFLIKE__
-#define	__sun_attr___KVPRINTFLIKE__	__sun_attr___VPRINTFLIKE__
-#else
-/*
- * Currently the openzfs codebase has a lot of formatting errors
- * which are not picked up in the linux build because they're not
- * doing formatting checks. LLVM's kprintf implementation doesn't
- * actually do format checks!
- *
- * For FreeBSD these break under gcc! LLVM shim'ed cmn_err as a
- * format attribute but also didn't check anything.  If one
- * replaces it with the above, all of the format issues
- * in the codebase show up.
- *
- * Once those format string issues are addressed, the above
- * should be flipped on once again.
- */
-#define	__sun_attr___PRINTFLIKE__(__n)
-#define	__sun_attr___VPRINTFLIKE__(__n)
-#define	__sun_attr___KPRINTFLIKE__(__n)
-#define	__sun_attr___KVPRINTFLIKE__(__n)
-
-#endif
-
-/*
- * This one's pretty obvious -- the function never returns
- */
-#define	__sun_attr___noreturn__ __attribute__((__noreturn__))
-
-/*
- * This is an appropriate label for functions that do not
- * modify their arguments, e.g. strlen()
- */
-#define	__sun_attr___pure__	__attribute__((__pure__))
-
-/*
- * This is a stronger form of __pure__. Can be used for functions
- * that do not modify their arguments and don't depend on global
- * memory.
- */
-#define	__sun_attr___const__	__attribute__((__const__))
-
-/*
- * structure packing like #pragma pack(1)
- */
-#define	__sun_attr___packed__	__attribute__((__packed__))
-
-#define	___sun_attr_inner(__a)	__sun_attr_##__a
-#define	__sun_attr__(__a)	___sun_attr_inner __a
-
-#else	/* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
-
-#define	__sun_attr__(__a)
-
-#endif	/* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
-
-/*
- * Shorthand versions for readability
- */
-
-#define	__PRINTFLIKE(__n)	__sun_attr__((__PRINTFLIKE__(__n)))
-#define	__VPRINTFLIKE(__n)	__sun_attr__((__VPRINTFLIKE__(__n)))
-#define	__KPRINTFLIKE(__n)	__sun_attr__((__KPRINTFLIKE__(__n)))
-#define	__KVPRINTFLIKE(__n)	__sun_attr__((__KVPRINTFLIKE__(__n)))
 #if	defined(_KERNEL) || defined(_STANDALONE)
-#define	__NORETURN		__sun_attr__((__noreturn__))
+#define	__NORETURN		__attribute__((__noreturn__))
 #endif /* _KERNEL || _STANDALONE */
-#define	__CONST			__sun_attr__((__const__))
-#define	__PURE			__sun_attr__((__pure__))
+#define	__CONST			__attribute__((__const__))
+#define	__PURE			__attribute__((__pure__))
 
 #if defined(INVARIANTS) && !defined(ZFS_DEBUG)
 #define	ZFS_DEBUG
 #undef 	NDEBUG
 #endif
 
 #define	EXPORT_SYMBOL(x)
 #define	MODULE_AUTHOR(s)
 #define	MODULE_DESCRIPTION(s)
 #define	MODULE_LICENSE(s)
 #define	module_param(a, b, c)
 #define	module_param_call(a, b, c, d, e)
 #define	module_param_named(a, b, c, d)
 #define	MODULE_PARM_DESC(a, b)
 #define	asm __asm
 #ifdef ZFS_DEBUG
 #undef NDEBUG
 #endif
 #if !defined(ZFS_DEBUG) && !defined(NDEBUG)
 #define	NDEBUG
 #endif
 
 #ifndef EINTEGRITY
 #define	EINTEGRITY 97 /* EINTEGRITY is new in 13 */
 #endif
 
 /*
  * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD
  * equivalents. This gives us more useful error messages from strerror(3).
  */
 #define	ECKSUM	EINTEGRITY
 #define	EFRAGS	ENOSPC
 
 /* Similar for ENOACTIVE */
 #define	ENOTACTIVE	ECANCELED
 
 #define	EREMOTEIO EREMOTE
 #define	ECHRNG ENXIO
 #define	ETIME ETIMEDOUT
 
 #ifndef LOCORE
 #ifndef HAVE_RPC_TYPES
 typedef int bool_t;
 typedef int enum_t;
 #endif
 #endif
 
 #ifndef __cplusplus
 #define	__init
 #define	__exit
 #endif
 
 #if defined(_KERNEL) || defined(_STANDALONE)
 #define	param_set_charp(a, b) (0)
 #define	ATTR_UID AT_UID
 #define	ATTR_GID AT_GID
 #define	ATTR_MODE AT_MODE
 #define	ATTR_XVATTR	AT_XVATTR
 #define	ATTR_CTIME	AT_CTIME
 #define	ATTR_MTIME	AT_MTIME
 #define	ATTR_ATIME	AT_ATIME
 #if defined(_STANDALONE)
 #define	vmem_free kmem_free
 #define	vmem_zalloc kmem_zalloc
 #define	vmem_alloc kmem_zalloc
 #else
 #define	vmem_free zfs_kmem_free
 #define	vmem_zalloc(size, flags) zfs_kmem_alloc(size, flags | M_ZERO)
 #define	vmem_alloc zfs_kmem_alloc
 #endif
 #define	MUTEX_NOLOCKDEP 0
 #define	RW_NOLOCKDEP 0
 
 #else
 #define	FALSE 0
 #define	TRUE 1
 	/*
 	 * XXX We really need to consolidate on standard
 	 * error codes in the common code
 	 */
 #define	ENOSTR ENOTCONN
 #define	ENODATA EINVAL
 
 
 #define	__BSD_VISIBLE 1
 #ifndef	IN_BASE
 #define	__POSIX_VISIBLE 201808
 #define	__XSI_VISIBLE 1000
 #endif
 #define	ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0]))
 #define	mmap64 mmap
 /* Note: this file can be used on linux/macOS when bootstrapping tools. */
 #if defined(__FreeBSD__)
 #define	open64 open
 #define	pwrite64 pwrite
 #define	ftruncate64 ftruncate
 #define	lseek64 lseek
 #define	pread64 pread
 #define	stat64 stat
 #define	lstat64 lstat
 #define	statfs64 statfs
 #define	readdir64 readdir
 #define	dirent64 dirent
 #endif
 #define	P2ALIGN(x, align)		((x) & -(align))
 #define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
 #define	P2ROUNDUP(x, align)		((((x) - 1) | ((align) - 1)) + 1)
 #define	P2PHASE(x, align)		((x) & ((align) - 1))
 #define	P2NPHASE(x, align)		(-(x) & ((align) - 1))
 #define	ISP2(x)			(((x) & ((x) - 1)) == 0)
 #define	IS_P2ALIGNED(v, a)	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
 #define	P2BOUNDARY(off, len, align) \
 	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
 
 /*
  * Typed version of the P2* macros.  These macros should be used to ensure
  * that the result is correctly calculated based on the data type of (x),
  * which is passed in as the last argument, regardless of the data
  * type of the alignment.  For example, if (x) is of type uint64_t,
  * and we want to round it up to a page boundary using "PAGESIZE" as
  * the alignment, we can do either
  *
  * P2ROUNDUP(x, (uint64_t)PAGESIZE)
  * or
  * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
  */
 #define	P2ALIGN_TYPED(x, align, type)   \
 	((type)(x) & -(type)(align))
 #define	P2PHASE_TYPED(x, align, type)   \
 	((type)(x) & ((type)(align) - 1))
 #define	P2NPHASE_TYPED(x, align, type)  \
 	(-(type)(x) & ((type)(align) - 1))
 #define	P2ROUNDUP_TYPED(x, align, type) \
 	((((type)(x) - 1) | ((type)(align) - 1)) + 1)
 #define	P2END_TYPED(x, align, type)     \
 	(-(~(type)(x) & -(type)(align)))
 #define	P2PHASEUP_TYPED(x, align, phase, type)  \
 	((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
 #define	P2CROSS_TYPED(x, y, align, type)        \
 	(((type)(x) ^ (type)(y)) > (type)(align) - 1)
 #define	P2SAMEHIGHBIT_TYPED(x, y, type) \
 	(((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
 
 #define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
 #define	RLIM64_INFINITY RLIM_INFINITY
 #ifndef HAVE_ERESTART
 #define	ERESTART EAGAIN
 #endif
 #define	ABS(a)	((a) < 0 ? -(a) : (a))
 
 #endif
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_CCOMPILE_H */
diff --git a/include/os/freebsd/spl/sys/cmn_err.h b/include/os/freebsd/spl/sys/cmn_err.h
index ba4cff37d5f3..bf41ecdb286d 100644
--- a/include/os/freebsd/spl/sys/cmn_err.h
+++ b/include/os/freebsd/spl/sys/cmn_err.h
@@ -1,89 +1,82 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
 
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_CMN_ERR_H
 #define	_SYS_CMN_ERR_H
 
 #if !defined(_ASM)
 #include <sys/_stdarg.h>
 #endif
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /* Common error handling severity levels */
 
 #define	CE_CONT		0	/* continuation		*/
 #define	CE_NOTE		1	/* notice		*/
 #define	CE_WARN		2	/* warning		*/
 #define	CE_PANIC	3	/* panic		*/
 #define	CE_IGNORE	4	/* print nothing	*/
 
 #ifndef _ASM
 
-/*PRINTFLIKE2*/
 extern void cmn_err(int, const char *, ...)
-    __KPRINTFLIKE(2);
+    __attribute__((format(printf, 2, 3)));
 
 extern void vzcmn_err(zoneid_t, int, const char *, __va_list)
-    __KVPRINTFLIKE(3);
+    __attribute__((format(printf, 3, 0)));
 
 extern void vcmn_err(int, const char *, __va_list)
-    __KVPRINTFLIKE(2);
+    __attribute__((format(printf, 2, 0)));
 
-/*PRINTFLIKE3*/
 extern void zcmn_err(zoneid_t, int, const char *, ...)
-    __KPRINTFLIKE(3);
+    __attribute__((format(printf, 3, 4)));
 
 extern void vzprintf(zoneid_t, const char *, __va_list)
-    __KVPRINTFLIKE(2);
+    __attribute__((format(printf, 2, 0)));
 
-/*PRINTFLIKE2*/
 extern void zprintf(zoneid_t, const char *, ...)
-    __KPRINTFLIKE(2);
+    __attribute__((format(printf, 2, 3)));
 
 extern void vuprintf(const char *, __va_list)
-    __KVPRINTFLIKE(1);
+    __attribute__((format(printf, 1, 0)));
 
-/*PRINTFLIKE1*/
 extern void panic(const char *, ...)
-    __KPRINTFLIKE(1) __NORETURN;
-
-extern void vpanic(const char *, __va_list)
-    __KVPRINTFLIKE(1) __NORETURN;
+    __attribute__((format(printf, 1, 2), __noreturn__));
 
 #endif /* !_ASM */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_CMN_ERR_H */
diff --git a/include/os/linux/spl/sys/cmn_err.h b/include/os/linux/spl/sys/cmn_err.h
index 314bbbaf9e95..79297067c17d 100644
--- a/include/os/linux/spl/sys/cmn_err.h
+++ b/include/os/linux/spl/sys/cmn_err.h
@@ -1,41 +1,44 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _SPL_CMN_ERR_H
 #define	_SPL_CMN_ERR_H
 
 #include <stdarg.h>
 
 #define	CE_CONT		0 /* continuation */
 #define	CE_NOTE		1 /* notice */
 #define	CE_WARN		2 /* warning */
 #define	CE_PANIC	3 /* panic */
 #define	CE_IGNORE	4 /* print nothing */
 
-extern void cmn_err(int, const char *, ...);
-extern void vcmn_err(int, const char *, va_list);
-extern void vpanic(const char *, va_list);
+extern void cmn_err(int, const char *, ...)
+    __attribute__((format(printf, 2, 3)));
+extern void vcmn_err(int, const char *, va_list)
+    __attribute__((format(printf, 2, 0)));
+extern void vpanic(const char *, va_list)
+    __attribute__((format(printf, 1, 0)));
 
 #define	fm_panic	panic
 
 #endif /* SPL_CMN_ERR_H */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 08eba250d3a3..05b31004b3a8 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1,1209 +1,1211 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, Klara Inc.
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/kstat.h>
 #include <sys/nvpair.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 #include <sys/spa_checksum.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Forward references that lots of things need.
  */
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct metaslab_group metaslab_group_t;
 typedef struct metaslab_class metaslab_class_t;
 typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 typedef struct zbookmark_phys zbookmark_phys_t;
 
 struct bpobj;
 struct bplist;
 struct dsl_pool;
 struct dsl_dataset;
 struct dsl_crypto_params;
 
 /*
  * Alignment Shift (ashift) is an immutable, internal top-level vdev property
  * which can only be set at vdev creation time. Physical writes are always done
  * according to it, which makes 2^ashift the smallest possible IO on a vdev.
  *
  * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB
  * (2^16 = 65,536).
  */
 #define	ASHIFT_MIN		9
 #define	ASHIFT_MAX		16
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
  * overhead, three DVAs per bp, plus one more bit in case we do anything
  * else that expands the ASIZE.
  */
 #define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 #define	SPA_COMPRESSBITS	7
 #define	SPA_VDEVBITS		24
 #define	SPA_COMPRESSMASK	((1U << SPA_COMPRESSBITS) - 1)
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
  */
 typedef struct dva {
 	uint64_t	dva_word[2];
 } dva_t;
 
 
 /*
  * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
  * secret and is suitable for use in MAC algorithms as the key.
  */
 typedef struct zio_cksum_salt {
 	uint8_t		zcs_bytes[32];
 } zio_cksum_salt_t;
 
 /*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			checksum[2]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			checksum[3]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * vdev		virtual device ID
  * offset	offset into virtual device
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
  * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
  * X		encryption
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg when dva[0] was written; zero if same as logical birth txg
  *              note that typically all the dva's would be written in this
  *              txg, but they could be different if they were moved by
  *              device removal.
  * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
 
 /*
  * The blkptr_t's of encrypted blocks also need to store the encryption
  * parameters so that the block can be decrypted. This layout is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|		vdev1		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|		vdev2		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|			salt					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|			IV1					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|		IV2		|	    fill count		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			MAC[0]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			MAC[1]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * salt		Salt for generating encryption keys
  * IV1		First 64 bits of encryption IV
  * X		Block requires encryption handling (set to 1)
  * E		blkptr_t contains embedded data (set to 0, see below)
  * fill count	number of non-zero blocks under this bp (truncated to 32 bits)
  * IV2		Last 32 bits of encryption IV
  * checksum[2]	128-bit checksum of the data this bp describes
  * MAC[2]	128-bit message authentication code for this data
  *
  * The X bit being set indicates that this block is one of 3 types. If this is
  * a level 0 block with an encrypted object type, the block is encrypted
  * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted
  * object type, this block is authenticated with an HMAC (see
  * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC
  * words to store a checksum-of-MACs from the level below (see
  * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED()
  * refers to both encrypted and authenticated blocks and BP_USES_CRYPT()
  * refers to any of these 3 kinds of blocks.
  *
  * The additional encryption parameters are the salt, IV, and MAC which are
  * explained in greater detail in the block comment at the top of zio_crypt.c.
  * The MAC occupies half of the checksum space since it serves a very similar
  * purpose: to prevent data corruption on disk. The only functional difference
  * is that the checksum is used to detect on-disk corruption whether or not the
  * encryption key is loaded and the MAC provides additional protection against
  * malicious disk tampering. We use the 3rd DVA to store the salt and first
  * 64 bits of the IV. As a result encrypted blocks can only have 2 copies
  * maximum instead of the normal 3. The last 32 bits of the IV are stored in
  * the upper bits of what is usually the fill count. Note that only blocks at
  * level 0 or -2 are ever encrypted, which allows us to guarantee that these
  * 32 bits are not trampled over by other code (see zio_crypt.c for details).
  * The salt and IV are not used for authenticated bps or bps with an indirect
  * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits
  * for the fill count.
  */
 
 /*
  * "Embedded" blkptr_t's don't actually point to a block, instead they
  * have a data payload embedded in the blkptr_t itself.  See the comment
  * in blkptr.c for more details.
  *
  * The blkptr_t is laid out as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|      payload                                                  |
  * 1	|      payload                                                  |
  * 2	|      payload                                                  |
  * 3	|      payload                                                  |
  * 4	|      payload                                                  |
  * 5	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|      payload                                                  |
  * 8	|      payload                                                  |
  * 9	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|      payload                                                  |
  * c	|      payload                                                  |
  * d	|      payload                                                  |
  * e	|      payload                                                  |
  * f	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * payload		contains the embedded data
  * B (byteorder)	byteorder (endianness)
  * D (dedup)		padding (set to zero)
  * X			encryption (set to zero)
  * E (embedded)		set to one
  * lvl			indirection level
  * type			DMU object type
  * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp			compression function of payload
  * PSIZE		size of payload after compression, in bytes
  * LSIZE		logical size of payload, in bytes
  *			note that 25 bits is enough to store the largest
  *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
  * log. birth		transaction group in which the block was logically born
  *
  * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
  * bp's they are stored in units of SPA_MINBLOCKSHIFT.
  * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
  * The B, D, X, lvl, type, and comp fields are stored the same as with normal
  * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
  * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
  * other macros, as they assert that they are only used on BP's of the correct
  * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use
  * the payload space for encryption parameters (see the comment above on
  * how encryption parameters are stored).
  */
 
 /*
  * Accessors for the fields that exist only in embedded block pointers.
  * Each one asserts BP_IS_EMBEDDED(bp) before touching blk_prop.  The getters
  * are comma expressions and can be used as values; the setters are
  * statements.
  *
  * etype is read from the same blk_prop position (40, 8) that
  * BP_GET_CHECKSUM() reads for non-embedded block pointers.
  */
 #define	BPE_GET_ETYPE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BPE_SET_ETYPE(bp, t)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, t); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BPE_GET_LSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
 #define	BPE_SET_LSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BPE_GET_PSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
 #define	BPE_SET_PSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /*
  * Values stored in the etype field of an embedded block pointer
  * (see BPE_GET_ETYPE()).  NUM_BP_EMBEDDED_TYPES is the number of types.
  */
 typedef enum bp_embedded_type {
 	BP_EMBEDDED_TYPE_DATA = 0,
 	/* Reserved for Delphix byteswap feature. */
 	BP_EMBEDDED_TYPE_RESERVED = 1,
 	BP_EMBEDDED_TYPE_REDACTED = 2,
 	NUM_BP_EMBEDDED_TYPES
 } bp_embedded_type_t;
 
 /*
  * An embedded block pointer uses every 64-bit word of the blkptr_t as
  * payload except blk_prop and blk_birth, which BPE_IS_PAYLOADWORD() excludes.
  * blkptr_t is 128 bytes (16 words), which leaves BPE_NUM_WORDS = 14.
  * 'wp' is a pointer to one of the words of 'bp'.
  */
 #define	BPE_NUM_WORDS 14
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
 	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 #define	SPA_SYNC_MIN_VDEVS 3		/* min vdevs to update during sync */
 
 /*
  * A block is a hole when it either 1) has never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the
  * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
  * done through the BP_IS_HOLE macro. The logical size, level, DMU object
  * type, and birth times are also stored for holes that were written to
  * at some point (i.e. were punched after having been filled).
  */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
 	/* stored as 0 when equal to blk_birth; see BP_SET_BIRTH() */
 	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
 	uint64_t	blk_birth;	/* transaction group at birth	    */
 	/* encrypted bp's: fill count in field (0, 32), IV2 in (32, 32) */
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
 
 /*
  * Macros to get and set fields in a bp or DVA.
  */
 
 /*
  * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
  * this gang DVA including its children BP's.  The space allocated at this
  * DVA's vdev/offset is vdev_gang_header_asize(vdev).
  */
 #define	DVA_GET_ASIZE(dva)	\
 	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
 
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
 #define	DVA_SET_VDEV(dva, x)	\
 	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
 
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_OFFSET(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 /*
  * Logical and physical size.  For an embedded bp, BP_GET_LSIZE() yields the
  * embedded LSIZE only when the etype is BP_EMBEDDED_TYPE_DATA and 0 for any
  * other etype, and BP_GET_PSIZE() always yields 0.  The setters assert that
  * the bp is not embedded; embedded bp's use BPE_SET_LSIZE/BPE_SET_PSIZE.
  */
 #define	BP_GET_LSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ?	\
 	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
 	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_LSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_PSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_COMPRESS(bp)		\
 	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
 #define	BP_SET_COMPRESS(bp, x)		\
 	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
 /* The single-bit E (embedded) flag in blk_prop. */
 #define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
 /*
  * Embedded bp's have no checksum field: the (40, 8) field of blk_prop holds
  * the etype instead (see BPE_GET_ETYPE()).  BP_GET_CHECKSUM() therefore
  * reports ZIO_CHECKSUM_OFF for them, and BP_SET_CHECKSUM() asserts that the
  * bp is not embedded.
  */
 #define	BP_GET_CHECKSUM(bp)		\
 	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BP_SET_CHECKSUM(bp, x)		do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
 /*
  * Encrypted, authenticated, and MAC cksum bps use the same bit.  How the
  * bit is interpreted depends on the level and the DMU object type:
  *
  *	level <= 0 and DMU_OT_IS_ENCRYPTED(type)	encrypted
  *	level <= 0 and !DMU_OT_IS_ENCRYPTED(type)	authenticated
  *	level > 0					indirect MAC cksum
  *
  * BP_IS_PROTECTED() is true for the first two cases only.
  */
 #define	BP_USES_CRYPT(bp)		BF64_GET((bp)->blk_prop, 61, 1)
 #define	BP_SET_CRYPT(bp, x)		BF64_SET((bp)->blk_prop, 61, 1, x)
 
 #define	BP_IS_ENCRYPTED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_IS_AUTHENTICATED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	!DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_HAS_INDIRECT_MAC_CKSUM(bp)		\
 	(BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0)
 
 #define	BP_IS_PROTECTED(bp)			\
 	(BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp))
 
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
 #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
 /*
  * blk_phys_birth is stored as 0 when the physical birth txg equals the
  * logical one (see BP_SET_BIRTH()), so BP_PHYSICAL_BIRTH() falls back to
  * blk_birth in that case.  It evaluates to 0 for embedded bp's.
  *
  * BP_SET_BIRTH() asserts that the bp is not embedded.  It expands to a bare
  * { } block rather than do/while (0), and evaluates 'logical' and 'physical'
  * twice each.
  */
 #define	BP_PHYSICAL_BIRTH(bp)		\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
 /*
  * Fill count.  For encrypted bp's only the (0, 32) field of blk_fill holds
  * the fill count; the (32, 32) field holds IV2, which BP_GET_IV2() and
  * BP_SET_IV2() access after asserting that the bp is encrypted.  For
  * unencrypted bp's BP_GET_FILL() yields 1 when the bp is embedded and the
  * whole blk_fill word otherwise.
  *
  * BP_SET_FILL() and BP_SET_IV2() expand to bare { } blocks rather than
  * do/while (0).
  */
 #define	BP_GET_FILL(bp)				\
 	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
 	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
 
 #define	BP_SET_FILL(bp, fill)			\
 {						\
 	if (BP_IS_ENCRYPTED(bp))			\
 		BF64_SET((bp)->blk_fill, 0, 32, fill); \
 	else					\
 		(bp)->blk_fill = fill;		\
 }
 
 #define	BP_GET_IV2(bp)				\
 	(ASSERT(BP_IS_ENCRYPTED(bp)),		\
 	BF64_GET((bp)->blk_fill, 32, 32))
 #define	BP_SET_IV2(bp, iv2)			\
 {						\
 	ASSERT(BP_IS_ENCRYPTED(bp));		\
 	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
 }
 
 #define	BP_IS_METADATA(bp)	\
 	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 
 /*
  * BP_GET_ASIZE(), BP_GET_NDVAS() and BP_COUNT_GANG() evaluate to 0 for
  * embedded bp's.  For encrypted bp's the blk_dva[2] term is multiplied by
  * !BP_IS_ENCRYPTED(bp), so only blk_dva[0] and blk_dva[1] contribute.
  */
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 /* Physical size for metadata bp's, logical size for everything else. */
 #define	BP_GET_UCSIZE(bp)	\
 	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 /* Number of DVAs with a non-zero allocated size. */
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 /* Number of DVAs that have the gang bit set. */
 #define	BP_COUNT_GANG(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
 	(DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
 	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
 	(bp1)->blk_birth == (bp2)->blk_birth &&			\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
 
 
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 /*
  * BP_IDENTITY() yields a pointer to the first DVA and asserts that the bp
  * is not embedded; BP_IS_GANG() and BP_IS_HOLE() therefore test
  * BP_IS_EMBEDDED() before using it.  An embedded bp is never reported as a
  * gang block or as a hole.
  */
 #define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		\
 	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
 #define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
 				(dva)->dva_word[1] == 0ULL)
 #define	BP_IS_HOLE(bp) \
 	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 /*
  * A redacted bp is an embedded bp whose etype is BP_EMBEDDED_TYPE_REDACTED.
  * BP_SET_EMBEDDED() must come first because BPE_SET_ETYPE() asserts that the
  * bp is already embedded.  BP_SET_REDACTED() expands to a bare { } block
  * rather than do/while (0).
  */
 #define	BP_SET_REDACTED(bp) \
 {							\
 	BP_SET_EMBEDDED(bp, B_TRUE);			\
 	BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED);	\
 }
 #define	BP_IS_REDACTED(bp) \
 	(BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED)
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
 /*
  * Clear every field of the block pointer.  The result satisfies
  * BP_IS_HOLE(): the embedded flag in blk_prop is clear and blk_dva[0] is
  * empty.  Expands to a bare { } block rather than do/while (0).
  */
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
 	(bp)->blk_dva[1].dva_word[0] = 0;	\
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
 	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
 /*
  * Byteorder value of the host, compared against BP_GET_BYTEORDER() by
  * BP_SHOULD_BYTESWAP(): 0 when _ZFS_BIG_ENDIAN is defined, 1 otherwise.
  */
 #if !defined(_ZFS_BIG_ENDIAN)
 #define	ZFS_HOST_BYTEORDER	(1ULL)
 #else
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
 #define	BP_SPRINTF_LEN	400
 
 /*
  * This macro allows code sharing between zfs, libzpool, and mdb.
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  *
  * Formats 'bp' into 'buf' (of 'size' bytes) in one of five shapes:
  * "<NULL>", HOLE, REDACTED, EMBEDDED, or the full DVA/property listing.
  * 'type', 'checksum' and 'compress' are caller-supplied strings printed
  * with %s.  The expansion is a brace-enclosed block that declares its own
  * locals, and it asserts that the formatted length is below 'size'.
  *
  * BP_IS_REDACTED() implies BP_IS_EMBEDDED(), so the redacted case has to be
  * tested before the generic embedded case or it can never be reached.
  */
 
 #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
 	int len = 0;							\
 	int copies = 0;							\
 	const char *crypt_type = NULL;					\
 	if (bp != NULL) {						\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			crypt_type = "encrypted";			\
 			/* LINTED E_SUSPICIOUS_COMPARISON */		\
 		} else if (BP_IS_AUTHENTICATED(bp)) {			\
 			crypt_type = "authenticated";			\
 		} else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {		\
 			crypt_type = "indirect-MAC";			\
 		} else {						\
 			crypt_type = "unencrypted";			\
 		}							\
 	}								\
 	if (bp == NULL) {						\
 		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
 		len += func(buf + len, size - len,			\
 		    "HOLE [L%llu %s] "					\
 		    "size=%llxL birth=%lluL",				\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_REDACTED(bp)) {				\
 		/* redacted bps are embedded, so test them first */	\
 		len += func(buf + len, size - len,			\
 		    "REDACTED [L%llu %s] size=%llxL birth=%lluL",	\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
 		    "size=%llxL/%llxP birth=%lluL",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (unsigned int)BPE_GET_ETYPE(bp),			\
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
 			if (DVA_IS_VALID(dva))				\
 				copies++;				\
 			len += func(buf + len, size - len,		\
 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
 			    ws);					\
 		}							\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			len += func(buf + len, size - len,		\
 			    "salt=%llx iv=%llx:%llx%c",			\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[0],	\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[1],	\
 			    (u_longlong_t)BP_GET_IV2(bp),		\
 			    ws);					\
 		}							\
 		if (BP_IS_GANG(bp) &&					\
 		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
 		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
 			copies--;					\
 		len += func(buf + len, size - len,			\
 		    "[L%llu %s] %s %s %s %s %s %s %s%c"			\
 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
 		    "cksum=%llx:%llx:%llx:%llx",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    checksum,						\
 		    compress,						\
 		    crypt_type,						\
 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
 		    copyname[copies],					\
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
 	}								\
 	ASSERT(len < size);						\
 }
 
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 /* Import type selector; enumerators are numbered from 0. */
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING = 0,
 	SPA_IMPORT_ASSEMBLE = 1
 } spa_import_type_t;
 
 typedef enum spa_mode {
 	SPA_MODE_UNINIT = 0,
 	SPA_MODE_READ = 1,
 	SPA_MODE_WRITE = 2,
 } spa_mode_t;
 
 /*
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
  * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON = 1,
 	/* the default is ON only inside the FreeBSD base sources */
 #ifndef IN_FREEBSD_BASE
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
 #else
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
 #endif
 } spa_autotrim_t;
 
 /*
  * Reason TRIM command was issued, used internally for accounting purposes.
  */
 typedef enum trim_type {
 	TRIM_TYPE_MANUAL = 0,
 	TRIM_TYPE_AUTO = 1,
 	TRIM_TYPE_SIMPLE = 2
 } trim_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, struct dsl_crypto_params *dcp);
 extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(const char *pool);
 extern int spa_checkpoint(const char *pool);
 extern int spa_checkpoint_discard(const char *pool);
 extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(const char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern int spa_async_tasks(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 #define	SPA_ASYNC_CONFIG_UPDATE			0x01
 #define	SPA_ASYNC_REMOVE			0x02
 #define	SPA_ASYNC_PROBE				0x04
 #define	SPA_ASYNC_RESILVER_DONE			0x08
 #define	SPA_ASYNC_RESILVER			0x10
 #define	SPA_ASYNC_AUTOEXPAND			0x20
 #define	SPA_ASYNC_REMOVE_DONE			0x40
 #define	SPA_ASYNC_REMOVE_STOP			0x80
 #define	SPA_ASYNC_INITIALIZE_RESTART		0x100
 #define	SPA_ASYNC_TRIM_RESTART			0x200
 #define	SPA_ASYNC_AUTOTRIM_RESTART		0x400
 #define	SPA_ASYNC_L2CACHE_REBUILD		0x800
 #define	SPA_ASYNC_L2CACHE_TRIM			0x1000
 #define	SPA_ASYNC_REBUILD_DONE			0x2000
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist);
 extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 extern int zfs_sync_pass_deferred_free;
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
 /*
  * SPA configuration functions in spa_config.c
  */
 
 #define	SPA_CONFIG_UPDATE_POOL	0
 #define	SPA_CONFIG_UPDATE_VDEVS	1
 
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern nvlist_t *spa_all_configs(uint64_t *);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
 extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
     vdev_t *parent, uint_t id, int atype);
 
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 /*
  * Lock-selection bits, one bit per lock.  SCL_LOCKS is the number of bits
  * and SCL_ALL is the mask with all of them set.
  */
 #define	SCL_NONE	0
 #define	SCL_CONFIG	(1 << 0)
 #define	SCL_STATE	(1 << 1)
 #define	SCL_L2ARC	(1 << 2)	/* hack until L2ARC 2.0 */
 #define	SCL_ALLOC	(1 << 3)
 #define	SCL_ZIO		(1 << 4)
 #define	SCL_FREE	(1 << 5)
 #define	SCL_VDEV	(1 << 6)
 #define	SCL_LOCKS	7
 #define	SCL_ALL		((1 << SCL_LOCKS) - 1)
 #define	SCL_STATE_ALL	(SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
 /* Historical pool statistics */
 typedef struct spa_history_kstat {
 	kmutex_t		lock;
 	uint64_t		count;
 	uint64_t		size;
 	kstat_t			*kstat;
 	void			*priv;
 	list_t			list;
 } spa_history_kstat_t;
 
 typedef struct spa_history_list {
 	uint64_t		size;
 	procfs_list_t		procfs_list;
 } spa_history_list_t;
 
 typedef struct spa_stats {
 	spa_history_list_t	read_history;
 	spa_history_list_t	txg_history;
 	spa_history_kstat_t	tx_assign_histogram;
 	spa_history_list_t	mmp_history;
 	spa_history_kstat_t	state;		/* pool state */
 	spa_history_kstat_t	iostats;
 } spa_stats_t;
 
 typedef enum txg_state {
 	TXG_STATE_BIRTH		= 0,
 	TXG_STATE_OPEN		= 1,
 	TXG_STATE_QUIESCED	= 2,
 	TXG_STATE_WAIT_FOR_SYNC	= 3,
 	TXG_STATE_SYNCED	= 4,
 	TXG_STATE_COMMITTED	= 5,
 } txg_state_t;
 
 typedef struct txg_stat {
 	vdev_stat_t		vs1;
 	vdev_stat_t		vs2;
 	uint64_t		txg;
 	uint64_t		ndirty;
 } txg_stat_t;
 
 /* Assorted pool IO kstats */
 typedef struct spa_iostats {
 	kstat_named_t	trim_extents_written;
 	kstat_named_t	trim_bytes_written;
 	kstat_named_t	trim_extents_skipped;
 	kstat_named_t	trim_bytes_skipped;
 	kstat_named_t	trim_extents_failed;
 	kstat_named_t	trim_bytes_failed;
 	kstat_named_t	autotrim_extents_written;
 	kstat_named_t	autotrim_bytes_written;
 	kstat_named_t	autotrim_extents_skipped;
 	kstat_named_t	autotrim_bytes_skipped;
 	kstat_named_t	autotrim_extents_failed;
 	kstat_named_t	autotrim_bytes_failed;
 	kstat_named_t	simple_trim_extents_written;
 	kstat_named_t	simple_trim_bytes_written;
 	kstat_named_t	simple_trim_extents_skipped;
 	kstat_named_t	simple_trim_bytes_skipped;
 	kstat_named_t	simple_trim_extents_failed;
 	kstat_named_t	simple_trim_bytes_failed;
 } spa_iostats_t;
 
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
     uint32_t aflags);
 extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time);
 extern int spa_txg_history_set(spa_t *spa,  uint64_t txg,
     txg_state_t completed_state, hrtime_t completed_time);
 extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
     struct dsl_pool *);
 extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
 extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
 extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
 extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
     hrtime_t duration);
 extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
 extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_written, uint64_t bytes_written,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed);
 extern void spa_import_progress_add(spa_t *spa);
 extern void spa_import_progress_remove(uint64_t spa_guid);
 extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining);
 extern int spa_import_progress_set_max_txg(uint64_t pool_guid,
     uint64_t max_txg);
 extern int spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t spa_load_state);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw);
 extern void spa_config_exit(spa_t *spa, int locks, const void *tag);
 extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
 extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Log state */
 typedef enum spa_log_state {
 	/* unknown log state */
 	SPA_LOG_UNKNOWN = 0,
 	/* missing log(s) */
 	SPA_LOG_MISSING = 1,
 	/* clear the log(s) */
 	SPA_LOG_CLEAR = 2,
 	/* log(s) are good */
 	SPA_LOG_GOOD = 3,
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
 extern int spa_reset_logs(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 extern void spa_deadman(void *);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
 extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
 extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
     dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
 
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern uint64_t spa_get_failmode(spa_t *spa);
 extern uint64_t spa_get_deadman_failmode(spa_t *spa);
 extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern space_map_t *spa_syncing_log_sm(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
 extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
 
 /* Miscellaneous support routines */
-extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
-extern void spa_load_note(spa_t *spa, const char *fmt, ...);
+extern void spa_load_failed(spa_t *spa, const char *fmt, ...)
+    __attribute__((format(printf, 2, 3)));
+extern void spa_load_note(spa_t *spa, const char *fmt, ...)
+    __attribute__((format(printf, 2, 3)));
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern int spa_maxdnodesize(spa_t *spa);
 extern boolean_t spa_has_checkpoint(spa_t *spa);
 extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
 extern boolean_t spa_suspend_async_destroy(spa_t *spa);
 extern uint64_t spa_min_claim_txg(spa_t *spa);
 extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
     const blkptr_t *bp);
 typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg);
 extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
     spa_remap_cb_t callback, void *arg);
 extern uint64_t spa_get_last_removal_txg(spa_t *spa);
 extern boolean_t spa_trust_config(spa_t *spa);
 extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern uint64_t spa_total_metaslabs(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
 extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 extern boolean_t spa_livelist_delete_check(spa_t *spa);
 
 extern spa_mode_t spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf);
 extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
 extern void spa_history_log_version(spa_t *spa, const char *operation,
     dmu_tx_t *tx);
 extern void spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
     dmu_tx_t *tx, const char *fmt, ...)  __printflike(4, 5);
 extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 
 extern const char *spa_state_to_name(spa_t *spa);
 
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
 extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
 extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
     zio_t *zio);
 extern void zfs_ereport_taskq_fini(void);
 extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
     const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
 extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
 extern void vdev_cache_stat_fini(void);
 
 /* vdev mirror */
 extern void vdev_mirror_stat_init(void);
 extern void vdev_mirror_stat_fini(void);
 
 /* Initialization and termination */
 extern void spa_init(spa_mode_t mode);
 extern void spa_fini(void);
 extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
     const char *name);
 
 /* waiting for pool activities to complete */
 extern int spa_wait(const char *pool, zpool_wait_activity_t activity,
     boolean_t *waited);
 extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
     uint64_t tag, boolean_t *waited);
 extern void spa_notify_waiters(spa_t *spa);
 extern void spa_wake_waiters(spa_t *spa);
 
 /* module param call functions */
 int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
 int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * dprintf() variant that appends a formatted block pointer to the message.
  * The BP_SPRINTF_LEN-byte buffer is only allocated and formatted when
  * ZFS_DEBUG_DPRINTF is set in zfs_flags, and it is freed before returning.
  *
  * 'fmt' must be a string literal (it is concatenated with " %s\n"), and
  * because __VA_ARGS__ is followed by a comma in the expansion, callers must
  * pass at least one argument after 'fmt'.  Without ZFS_DEBUG the macro
  * expands to nothing, so its arguments are not evaluated.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
 extern spa_mode_t spa_mode_global;
 extern int zfs_deadman_enabled;
 extern unsigned long zfs_deadman_synctime_ms;
 extern unsigned long zfs_deadman_ziotime_ms;
 extern unsigned long zfs_deadman_checktime_ms;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_SPA_H */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index f235bfc8cc19..0a81713a44d0 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -1,225 +1,226 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
 #define	_SYS_VDEV_H
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/metaslab.h>
 #include <sys/fs/zfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 typedef enum vdev_dtl_type {
 	DTL_MISSING,	/* 0% replication: no copies of the data */
 	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
 	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
 	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
 	DTL_TYPES
 } vdev_dtl_type_t;
 
 extern int zfs_nocacheflush;
 
 typedef boolean_t vdev_open_children_func_t(vdev_t *vd);
 
-extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
+extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+    __attribute__((format(printf, 2, 3)));
 extern void vdev_dbgmsg_print_tree(vdev_t *, int);
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
 extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *);
 extern int vdev_validate(vdev_t *);
 extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
 extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
 extern void vdev_reopen(vdev_t *);
 extern int vdev_validate_aux(vdev_t *vd);
 extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_concrete(vdev_t *vd);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
 extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done);
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
     dmu_tx_t *tx);
 extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
     uint64_t size);
 extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
     uint64_t offset, uint64_t size, dmu_tx_t *tx);
 extern boolean_t vdev_replace_in_progress(vdev_t *vdev);
 
 extern void vdev_hold(vdev_t *);
 extern void vdev_rele(vdev_t *);
 
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd, char *tag);
 
 typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs);
 
 extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs);
 extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs);
 extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg);
 
 extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 
 extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
 extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 extern boolean_t vdev_children_are_offline(vdev_t *vd);
 
 extern void vdev_space_update(vdev_t *vd,
     int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
 
 extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
 
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
 /*
  * Return the amount of space allocated for a gang block header.
  */
 static inline uint64_t
 vdev_gang_header_asize(vdev_t *vd)
 {
 	return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
 }
 
 extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
     vdev_state_t *);
 extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
 extern void vdev_clear(spa_t *spa, vdev_t *vd);
 
 extern boolean_t vdev_is_dead(vdev_t *vd);
 extern boolean_t vdev_readable(vdev_t *vd);
 extern boolean_t vdev_writeable(vdev_t *vd);
 extern boolean_t vdev_allocatable(vdev_t *vd);
 extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
 extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
 
 extern void vdev_cache_init(vdev_t *vd);
 extern void vdev_cache_fini(vdev_t *vd);
 extern boolean_t vdev_cache_read(zio_t *zio);
 extern void vdev_cache_write(zio_t *zio);
 extern void vdev_cache_purge(vdev_t *vd);
 
 extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
 extern int vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
 
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
 extern void vdev_defer_resilver(vdev_t *vd);
 extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
 
 typedef enum vdev_config_flag {
 	VDEV_CONFIG_SPARE = 1 << 0,
 	VDEV_CONFIG_L2CACHE = 1 << 1,
 	VDEV_CONFIG_REMOVING = 1 << 2,
 	VDEV_CONFIG_MOS = 1 << 3,
 	VDEV_CONFIG_MISSING = 1 << 4
 } vdev_config_flag_t;
 
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
     boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
  */
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
 extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
 extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
     offset, uint64_t size, zio_done_func_t *done, void *priv, int flags);
 extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
 
 typedef enum {
 	VDEV_LABEL_CREATE,	/* create/add a new device */
 	VDEV_LABEL_REPLACE,	/* replace an existing device */
 	VDEV_LABEL_SPARE,	/* add a new hot spare */
 	VDEV_LABEL_REMOVE,	/* remove an existing device */
 	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
 	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
 } vdev_labeltype_t;
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_VDEV_H */
diff --git a/lib/libuutil/uu_pname.c b/lib/libuutil/uu_pname.c
index a6a0f22661e5..28c4a8a9cf7b 100644
--- a/lib/libuutil/uu_pname.c
+++ b/lib/libuutil/uu_pname.c
@@ -1,207 +1,201 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 
 
 #include "libuutil_common.h"
 
 #include <libintl.h>
 #include <limits.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <errno.h>
 #include <wchar.h>
 #include <unistd.h>
 
-static const char PNAME_FMT[] = "%s: ";
-static const char ERRNO_FMT[] = ": %s\n";
-
 static const char *pname;
 
 static void
 uu_die_internal(int status, const char *format, va_list alist) __NORETURN;
 
 int uu_exit_ok_value = EXIT_SUCCESS;
 int uu_exit_fatal_value = EXIT_FAILURE;
 int uu_exit_usage_value = 2;
 
 int *
 uu_exit_ok(void)
 {
 	return (&uu_exit_ok_value);
 }
 
 int *
 uu_exit_fatal(void)
 {
 	return (&uu_exit_fatal_value);
 }
 
 int *
 uu_exit_usage(void)
 {
 	return (&uu_exit_usage_value);
 }
 
 void
 uu_alt_exit(int profile)
 {
 	switch (profile) {
 	case UU_PROFILE_DEFAULT:
 		uu_exit_ok_value = EXIT_SUCCESS;
 		uu_exit_fatal_value = EXIT_FAILURE;
 		uu_exit_usage_value = 2;
 		break;
 	case UU_PROFILE_LAUNCHER:
 		uu_exit_ok_value = EXIT_SUCCESS;
 		uu_exit_fatal_value = 124;
 		uu_exit_usage_value = 125;
 		break;
 	}
 }
 
-static void
+static __attribute__((format(printf, 2, 0))) void
 uu_warn_internal(int err, const char *format, va_list alist)
 {
 	if (pname != NULL)
-		(void) fprintf(stderr, PNAME_FMT, pname);
+		(void) fprintf(stderr, "%s: ", pname);
 
 	(void) vfprintf(stderr, format, alist);
 
 	if (strrchr(format, '\n') == NULL)
-		(void) fprintf(stderr, ERRNO_FMT, strerror(err));
+		(void) fprintf(stderr, ": %s\n", strerror(err));
 }
 
 void
 uu_vwarn(const char *format, va_list alist)
 {
 	uu_warn_internal(errno, format, alist);
 }
 
-/*PRINTFLIKE1*/
 void
 uu_warn(const char *format, ...)
 {
 	va_list alist;
 	va_start(alist, format);
 	uu_warn_internal(errno, format, alist);
 	va_end(alist);
 }
 
-static void
+static __attribute__((format(printf, 2, 0))) __NORETURN void
 uu_die_internal(int status, const char *format, va_list alist)
 {
 	uu_warn_internal(errno, format, alist);
 #ifdef DEBUG
 	{
 		char *cp;
 
 		if (!issetugid()) {
 			cp = getenv("UU_DIE_ABORTS");
 			if (cp != NULL && *cp != '\0')
 				abort();
 		}
 	}
 #endif
 	exit(status);
 }
 
 void
 uu_vdie(const char *format, va_list alist)
 {
 	uu_die_internal(UU_EXIT_FATAL, format, alist);
 }
 
-/*PRINTFLIKE1*/
 void
 uu_die(const char *format, ...)
 {
 	va_list alist;
 	va_start(alist, format);
 	uu_die_internal(UU_EXIT_FATAL, format, alist);
 	va_end(alist);
 }
 
 void
 uu_vxdie(int status, const char *format, va_list alist)
 {
 	uu_die_internal(status, format, alist);
 }
 
-/*PRINTFLIKE2*/
 void
 uu_xdie(int status, const char *format, ...)
 {
 	va_list alist;
 	va_start(alist, format);
 	uu_die_internal(status, format, alist);
 	va_end(alist);
 }
 
 const char *
 uu_setpname(char *arg0)
 {
 	/*
 	 * Having a NULL argv[0], while uncommon, is possible.  It
 	 * makes more sense to handle this event in uu_setpname rather
 	 * than in each of its consumers.
 	 */
 	if (arg0 == NULL) {
 		pname = getexecname();
 		if (pname == NULL)
 			pname = "unknown_command";
 		return (pname);
 	}
 
 	/*
 	 * Guard against '/' at end of command invocation.
 	 */
 	for (;;) {
 		char *p = strrchr(arg0, '/');
 		if (p == NULL) {
 			pname = arg0;
 			break;
 		} else {
 			if (*(p + 1) == '\0') {
 				*p = '\0';
 				continue;
 			}
 
 			pname = p + 1;
 			break;
 		}
 	}
 
 	return (pname);
 }
 
 const char *
 uu_getpname(void)
 {
 	return (pname);
 }
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 68e97e4830d8..88d6561a5fb4 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -1,2088 +1,2083 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2020 Joyent, Inc. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2020 The FreeBSD Foundation
  *
  * Portions of this software were developed by Allan Jude
  * under sponsorship from the FreeBSD Foundation.
  */
 
 /*
  * Internal utility routines for the ZFS library.
  */
 
 #include <errno.h>
 #include <fcntl.h>
 #include <libintl.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <math.h>
 #if LIBFETCH_DYNAMIC
 #include <dlfcn.h>
 #endif
 #include <sys/stat.h>
 #include <sys/mnttab.h>
 #include <sys/mntent.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 
 #include <libzfs.h>
 #include <libzfs_core.h>
 
 #include "libzfs_impl.h"
 #include "zfs_prop.h"
 #include "zfeature_common.h"
 #include <zfs_fletcher.h>
 #include <libzutil.h>
 
 /*
  * We only care about the scheme in order to match the scheme
  * with the handler. Each handler should validate the full URI
  * as necessary.
  */
 #define	URI_REGEX	"^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):"
 
 int
 libzfs_errno(libzfs_handle_t *hdl)
 {
 	return (hdl->libzfs_error);
 }
 
 const char *
 libzfs_error_action(libzfs_handle_t *hdl)
 {
 	return (hdl->libzfs_action);
 }
 
 const char *
 libzfs_error_description(libzfs_handle_t *hdl)
 {
 	if (hdl->libzfs_desc[0] != '\0')
 		return (hdl->libzfs_desc);
 
 	switch (hdl->libzfs_error) {
 	case EZFS_NOMEM:
 		return (dgettext(TEXT_DOMAIN, "out of memory"));
 	case EZFS_BADPROP:
 		return (dgettext(TEXT_DOMAIN, "invalid property value"));
 	case EZFS_PROPREADONLY:
 		return (dgettext(TEXT_DOMAIN, "read-only property"));
 	case EZFS_PROPTYPE:
 		return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
 		    "datasets of this type"));
 	case EZFS_PROPNONINHERIT:
 		return (dgettext(TEXT_DOMAIN, "property cannot be inherited"));
 	case EZFS_PROPSPACE:
 		return (dgettext(TEXT_DOMAIN, "invalid quota or reservation"));
 	case EZFS_BADTYPE:
 		return (dgettext(TEXT_DOMAIN, "operation not applicable to "
 		    "datasets of this type"));
 	case EZFS_BUSY:
 		return (dgettext(TEXT_DOMAIN, "pool or dataset is busy"));
 	case EZFS_EXISTS:
 		return (dgettext(TEXT_DOMAIN, "pool or dataset exists"));
 	case EZFS_NOENT:
 		return (dgettext(TEXT_DOMAIN, "no such pool or dataset"));
 	case EZFS_BADSTREAM:
 		return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
 	case EZFS_DSREADONLY:
 		return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
 	case EZFS_VOLTOOBIG:
 		return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
 		    "this system"));
 	case EZFS_INVALIDNAME:
 		return (dgettext(TEXT_DOMAIN, "invalid name"));
 	case EZFS_BADRESTORE:
 		return (dgettext(TEXT_DOMAIN, "unable to restore to "
 		    "destination"));
 	case EZFS_BADBACKUP:
 		return (dgettext(TEXT_DOMAIN, "backup failed"));
 	case EZFS_BADTARGET:
 		return (dgettext(TEXT_DOMAIN, "invalid target vdev"));
 	case EZFS_NODEVICE:
 		return (dgettext(TEXT_DOMAIN, "no such device in pool"));
 	case EZFS_BADDEV:
 		return (dgettext(TEXT_DOMAIN, "invalid device"));
 	case EZFS_NOREPLICAS:
 		return (dgettext(TEXT_DOMAIN, "no valid replicas"));
 	case EZFS_RESILVERING:
 		return (dgettext(TEXT_DOMAIN, "currently resilvering"));
 	case EZFS_BADVERSION:
 		return (dgettext(TEXT_DOMAIN, "unsupported version or "
 		    "feature"));
 	case EZFS_POOLUNAVAIL:
 		return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
 	case EZFS_DEVOVERFLOW:
 		return (dgettext(TEXT_DOMAIN, "too many devices in one vdev"));
 	case EZFS_BADPATH:
 		return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
 	case EZFS_CROSSTARGET:
 		return (dgettext(TEXT_DOMAIN, "operation crosses datasets or "
 		    "pools"));
 	case EZFS_ZONED:
 		return (dgettext(TEXT_DOMAIN, "dataset in use by local zone"));
 	case EZFS_MOUNTFAILED:
 		return (dgettext(TEXT_DOMAIN, "mount failed"));
 	case EZFS_UMOUNTFAILED:
 		return (dgettext(TEXT_DOMAIN, "unmount failed"));
 	case EZFS_UNSHARENFSFAILED:
 		return (dgettext(TEXT_DOMAIN, "NFS share removal failed"));
 	case EZFS_SHARENFSFAILED:
 		return (dgettext(TEXT_DOMAIN, "NFS share creation failed"));
 	case EZFS_UNSHARESMBFAILED:
 		return (dgettext(TEXT_DOMAIN, "SMB share removal failed"));
 	case EZFS_SHARESMBFAILED:
 		return (dgettext(TEXT_DOMAIN, "SMB share creation failed"));
 	case EZFS_PERM:
 		return (dgettext(TEXT_DOMAIN, "permission denied"));
 	case EZFS_NOSPC:
 		return (dgettext(TEXT_DOMAIN, "out of space"));
 	case EZFS_FAULT:
 		return (dgettext(TEXT_DOMAIN, "bad address"));
 	case EZFS_IO:
 		return (dgettext(TEXT_DOMAIN, "I/O error"));
 	case EZFS_INTR:
 		return (dgettext(TEXT_DOMAIN, "signal received"));
 	case EZFS_ISSPARE:
 		return (dgettext(TEXT_DOMAIN, "device is reserved as a hot "
 		    "spare"));
 	case EZFS_INVALCONFIG:
 		return (dgettext(TEXT_DOMAIN, "invalid vdev configuration"));
 	case EZFS_RECURSIVE:
 		return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
 	case EZFS_NOHISTORY:
 		return (dgettext(TEXT_DOMAIN, "no history available"));
 	case EZFS_POOLPROPS:
 		return (dgettext(TEXT_DOMAIN, "failed to retrieve "
 		    "pool properties"));
 	case EZFS_POOL_NOTSUP:
 		return (dgettext(TEXT_DOMAIN, "operation not supported "
 		    "on this type of pool"));
 	case EZFS_POOL_INVALARG:
 		return (dgettext(TEXT_DOMAIN, "invalid argument for "
 		    "this pool operation"));
 	case EZFS_NAMETOOLONG:
 		return (dgettext(TEXT_DOMAIN, "dataset name is too long"));
 	case EZFS_OPENFAILED:
 		return (dgettext(TEXT_DOMAIN, "open failed"));
 	case EZFS_NOCAP:
 		return (dgettext(TEXT_DOMAIN,
 		    "disk capacity information could not be retrieved"));
 	case EZFS_LABELFAILED:
 		return (dgettext(TEXT_DOMAIN, "write of label failed"));
 	case EZFS_BADWHO:
 		return (dgettext(TEXT_DOMAIN, "invalid user/group"));
 	case EZFS_BADPERM:
 		return (dgettext(TEXT_DOMAIN, "invalid permission"));
 	case EZFS_BADPERMSET:
 		return (dgettext(TEXT_DOMAIN, "invalid permission set name"));
 	case EZFS_NODELEGATION:
 		return (dgettext(TEXT_DOMAIN, "delegated administration is "
 		    "disabled on pool"));
 	case EZFS_BADCACHE:
 		return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
 	case EZFS_ISL2CACHE:
 		return (dgettext(TEXT_DOMAIN, "device is in use as a cache"));
 	case EZFS_VDEVNOTSUP:
 		return (dgettext(TEXT_DOMAIN, "vdev specification is not "
 		    "supported"));
 	case EZFS_NOTSUP:
 		return (dgettext(TEXT_DOMAIN, "operation not supported "
 		    "on this dataset"));
 	case EZFS_IOC_NOTSUPPORTED:
 		return (dgettext(TEXT_DOMAIN, "operation not supported by "
 		    "zfs kernel module"));
 	case EZFS_ACTIVE_SPARE:
 		return (dgettext(TEXT_DOMAIN, "pool has active shared spare "
 		    "device"));
 	case EZFS_UNPLAYED_LOGS:
 		return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
 		    "logs"));
 	case EZFS_REFTAG_RELE:
 		return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
 	case EZFS_REFTAG_HOLD:
 		return (dgettext(TEXT_DOMAIN, "tag already exists on this "
 		    "dataset"));
 	case EZFS_TAGTOOLONG:
 		return (dgettext(TEXT_DOMAIN, "tag too long"));
 	case EZFS_PIPEFAILED:
 		return (dgettext(TEXT_DOMAIN, "pipe create failed"));
 	case EZFS_THREADCREATEFAILED:
 		return (dgettext(TEXT_DOMAIN, "thread create failed"));
 	case EZFS_POSTSPLIT_ONLINE:
 		return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
 		    "into a new one"));
 	case EZFS_SCRUB_PAUSED:
 		return (dgettext(TEXT_DOMAIN, "scrub is paused; "
 		    "use 'zpool scrub' to resume"));
 	case EZFS_SCRUBBING:
 		return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
 		    "use 'zpool scrub -s' to cancel current scrub"));
 	case EZFS_NO_SCRUB:
 		return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
 	case EZFS_DIFF:
 		return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
 	case EZFS_DIFFDATA:
 		return (dgettext(TEXT_DOMAIN, "invalid diff data"));
 	case EZFS_POOLREADONLY:
 		return (dgettext(TEXT_DOMAIN, "pool is read-only"));
 	case EZFS_NO_PENDING:
 		return (dgettext(TEXT_DOMAIN, "operation is not "
 		    "in progress"));
 	case EZFS_CHECKPOINT_EXISTS:
 		return (dgettext(TEXT_DOMAIN, "checkpoint exists"));
 	case EZFS_DISCARDING_CHECKPOINT:
 		return (dgettext(TEXT_DOMAIN, "currently discarding "
 		    "checkpoint"));
 	case EZFS_NO_CHECKPOINT:
 		return (dgettext(TEXT_DOMAIN, "checkpoint does not exist"));
 	case EZFS_DEVRM_IN_PROGRESS:
 		return (dgettext(TEXT_DOMAIN, "device removal in progress"));
 	case EZFS_VDEV_TOO_BIG:
 		return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
 	case EZFS_ACTIVE_POOL:
 		return (dgettext(TEXT_DOMAIN, "pool is imported on a "
 		    "different host"));
 	case EZFS_CRYPTOFAILED:
 		return (dgettext(TEXT_DOMAIN, "encryption failure"));
 	case EZFS_TOOMANY:
 		return (dgettext(TEXT_DOMAIN, "argument list too long"));
 	case EZFS_INITIALIZING:
 		return (dgettext(TEXT_DOMAIN, "currently initializing"));
 	case EZFS_NO_INITIALIZE:
 		return (dgettext(TEXT_DOMAIN, "there is no active "
 		    "initialization"));
 	case EZFS_WRONG_PARENT:
 		return (dgettext(TEXT_DOMAIN, "invalid parent dataset"));
 	case EZFS_TRIMMING:
 		return (dgettext(TEXT_DOMAIN, "currently trimming"));
 	case EZFS_NO_TRIM:
 		return (dgettext(TEXT_DOMAIN, "there is no active trim"));
 	case EZFS_TRIM_NOTSUP:
 		return (dgettext(TEXT_DOMAIN, "trim operations are not "
 		    "supported by this device"));
 	case EZFS_NO_RESILVER_DEFER:
 		return (dgettext(TEXT_DOMAIN, "this action requires the "
 		    "resilver_defer feature"));
 	case EZFS_EXPORT_IN_PROGRESS:
 		return (dgettext(TEXT_DOMAIN, "pool export in progress"));
 	case EZFS_REBUILDING:
 		return (dgettext(TEXT_DOMAIN, "currently sequentially "
 		    "resilvering"));
 	case EZFS_UNKNOWN:
 		return (dgettext(TEXT_DOMAIN, "unknown error"));
 	default:
 		assert(hdl->libzfs_error == 0);
 		return (dgettext(TEXT_DOMAIN, "no error"));
 	}
 }
 
-/*PRINTFLIKE2*/
 void
 zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	(void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc),
 	    fmt, ap);
 	hdl->libzfs_desc_active = 1;
 
 	va_end(ap);
 }
 
 static void
 zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap)
 {
 	(void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action),
 	    fmt, ap);
 	hdl->libzfs_error = error;
 
 	if (hdl->libzfs_desc_active)
 		hdl->libzfs_desc_active = 0;
 	else
 		hdl->libzfs_desc[0] = '\0';
 
 	if (hdl->libzfs_printerr) {
 		if (error == EZFS_UNKNOWN) {
 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal "
 			    "error: %s: %s\n"), hdl->libzfs_action,
 			    libzfs_error_description(hdl));
 			abort();
 		}
 
 		(void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action,
 		    libzfs_error_description(hdl));
 		if (error == EZFS_NOMEM)
 			exit(1);
 	}
 }
 
 int
 zfs_error(libzfs_handle_t *hdl, int error, const char *msg)
 {
 	return (zfs_error_fmt(hdl, error, "%s", msg));
 }
 
-/*PRINTFLIKE3*/
 int
 zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	zfs_verror(hdl, error, fmt, ap);
 
 	va_end(ap);
 
 	return (-1);
 }
 
 static int
 zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
     va_list ap)
 {
 	switch (error) {
 	case EPERM:
 	case EACCES:
 		zfs_verror(hdl, EZFS_PERM, fmt, ap);
 		return (-1);
 
 	case ECANCELED:
 		zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap);
 		return (-1);
 
 	case EIO:
 		zfs_verror(hdl, EZFS_IO, fmt, ap);
 		return (-1);
 
 	case EFAULT:
 		zfs_verror(hdl, EZFS_FAULT, fmt, ap);
 		return (-1);
 
 	case EINTR:
 		zfs_verror(hdl, EZFS_INTR, fmt, ap);
 		return (-1);
 	}
 
 	return (0);
 }
 
 int
 zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
 {
 	return (zfs_standard_error_fmt(hdl, error, "%s", msg));
 }
 
-/*PRINTFLIKE3*/
 int
 zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	if (zfs_common_error(hdl, error, fmt, ap) != 0) {
 		va_end(ap);
 		return (-1);
 	}
 
 	switch (error) {
 	case ENXIO:
 	case ENODEV:
 	case EPIPE:
 		zfs_verror(hdl, EZFS_IO, fmt, ap);
 		break;
 
 	case ENOENT:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset does not exist"));
 		zfs_verror(hdl, EZFS_NOENT, fmt, ap);
 		break;
 
 	case ENOSPC:
 	case EDQUOT:
 		zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
 		break;
 
 	case EEXIST:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
 		break;
 
 	case EBUSY:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is busy"));
 		zfs_verror(hdl, EZFS_BUSY, fmt, ap);
 		break;
 	case EROFS:
 		zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
 		break;
 	case ENAMETOOLONG:
 		zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
 		break;
 	case ENOTSUP:
 		zfs_verror(hdl, EZFS_BADVERSION, fmt, ap);
 		break;
 	case EAGAIN:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool I/O is currently suspended"));
 		zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
 		break;
 	case EREMOTEIO:
 		zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap);
 		break;
 	case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE:
 	case ZFS_ERR_IOC_CMD_UNAVAIL:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
 		    "module does not support this operation. A reboot may "
 		    "be required to enable this operation."));
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	case ZFS_ERR_IOC_ARG_UNAVAIL:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
 		    "module does not support an option for this operation. "
 		    "A reboot may be required to enable this option."));
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	case ZFS_ERR_IOC_ARG_REQUIRED:
 	case ZFS_ERR_IOC_ARG_BADTYPE:
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	case ZFS_ERR_WRONG_PARENT:
 		zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap);
 		break;
 	case ZFS_ERR_BADPROP:
 		zfs_verror(hdl, EZFS_BADPROP, fmt, ap);
 		break;
 	default:
 		zfs_error_aux(hdl, "%s", strerror(error));
 		zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
 		break;
 	}
 
 	va_end(ap);
 	return (-1);
 }
 
 void
 zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
     char *errbuf)
 {
 	switch (err) {
 
 	case ENOSPC:
 		/*
 		 * For quotas and reservations, ENOSPC indicates
 		 * something different; setting a quota or reservation
 		 * doesn't use any disk space.
 		 */
 		switch (prop) {
 		case ZFS_PROP_QUOTA:
 		case ZFS_PROP_REFQUOTA:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "size is less than current used or "
 			    "reserved space"));
 			(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 			break;
 
 		case ZFS_PROP_RESERVATION:
 		case ZFS_PROP_REFRESERVATION:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "size is greater than available space"));
 			(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 			break;
 
 		default:
 			(void) zfs_standard_error(hdl, err, errbuf);
 			break;
 		}
 		break;
 
 	case EBUSY:
 		(void) zfs_standard_error(hdl, EBUSY, errbuf);
 		break;
 
 	case EROFS:
 		(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
 		break;
 
 	case E2BIG:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "property value too long"));
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		break;
 
 	case ENOTSUP:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool and or dataset must be upgraded to set this "
 		    "property or value"));
 		(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 		break;
 
 	case ERANGE:
 		if (prop == ZFS_PROP_COMPRESSION ||
 		    prop == ZFS_PROP_DNODESIZE ||
 		    prop == ZFS_PROP_RECORDSIZE) {
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
 		} else if (prop == ZFS_PROP_CHECKSUM ||
 		    prop == ZFS_PROP_DEDUP) {
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "root pools"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
 		break;
 
 	case EINVAL:
 		if (prop == ZPROP_INVAL) {
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
 		break;
 
 	case ZFS_ERR_BADPROP:
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		break;
 
 	case EACCES:
 		if (prop == ZFS_PROP_KEYLOCATION) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "keylocation may only be set on encryption roots"));
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
 		break;
 
 	case EOVERFLOW:
 		/*
 		 * This platform can't address a volume this big.
 		 */
 #ifdef _ILP32
 		if (prop == ZFS_PROP_VOLSIZE) {
 			(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
 			break;
 		}
 #endif
 		/* FALLTHROUGH */
 	default:
 		(void) zfs_standard_error(hdl, err, errbuf);
 	}
 }
 
 /*
  * Report a pool-related error whose description is the plain string 'msg'.
  * The message is routed through a literal "%s" format, so it is never
  * interpreted as a format string itself.  Returns whatever
  * zpool_standard_error_fmt() returns.
  */
 int
 zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg)
 {
 	return (zpool_standard_error_fmt(hdl, error, "%s", msg));
 }
 
-/*PRINTFLIKE3*/
 /*
  * Translate an errno-style (or ZFS_ERR_*) 'error' returned by a pool
  * operation into the matching EZFS_* code and report it on 'hdl', using the
  * printf-style 'fmt' and trailing arguments as the operation description.
  *
  * zfs_common_error() gets the first chance to handle the error; if it
  * returns non-zero, nothing further is reported here.  Some cases also attach
  * an auxiliary explanation with zfs_error_aux() before calling zfs_verror().
  *
  * Always returns -1.  Every path that reaches the switch leaves through the
  * single va_end() at the bottom, so the va_list is always released.
  */
 int
 zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	if (zfs_common_error(hdl, error, fmt, ap) != 0) {
 		va_end(ap);
 		return (-1);
 	}
 
 	switch (error) {
 	case ENODEV:
 		zfs_verror(hdl, EZFS_NODEVICE, fmt, ap);
 		break;
 
 	case ENOENT:
 		zfs_error_aux(hdl,
 		    dgettext(TEXT_DOMAIN, "no such pool or dataset"));
 		zfs_verror(hdl, EZFS_NOENT, fmt, ap);
 		break;
 
 	case EEXIST:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool already exists"));
 		zfs_verror(hdl, EZFS_EXISTS, fmt, ap);
 		break;
 
 	case EBUSY:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy"));
 		zfs_verror(hdl, EZFS_BUSY, fmt, ap);
 		break;
 
 	/* There is no pending operation to cancel */
 	case ENOTACTIVE:
 		zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
 		break;
 
 	case ENXIO:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "one or more devices is currently unavailable"));
 		zfs_verror(hdl, EZFS_BADDEV, fmt, ap);
 		break;
 
 	case ENAMETOOLONG:
 		zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap);
 		break;
 
 	case ENOTSUP:
 		zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap);
 		break;
 
 	case EINVAL:
 		zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap);
 		break;
 
 	case ENOSPC:
 	case EDQUOT:
 		zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
 		/*
 		 * Leave through the common exit so that va_end() runs; the
 		 * previous early return skipped it.
 		 */
 		break;
 
 	case EAGAIN:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool I/O is currently suspended"));
 		zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
 		break;
 
 	case EROFS:
 		zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
 		break;
 	case EDOM:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "block size out of range or does not match"));
 		zfs_verror(hdl, EZFS_BADPROP, fmt, ap);
 		break;
 	case EREMOTEIO:
 		zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap);
 		break;
 	case ZFS_ERR_CHECKPOINT_EXISTS:
 		zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap);
 		break;
 	case ZFS_ERR_DISCARDING_CHECKPOINT:
 		zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap);
 		break;
 	case ZFS_ERR_NO_CHECKPOINT:
 		zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap);
 		break;
 	case ZFS_ERR_DEVRM_IN_PROGRESS:
 		zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap);
 		break;
 	case ZFS_ERR_VDEV_TOO_BIG:
 		zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
 		break;
 	case ZFS_ERR_EXPORT_IN_PROGRESS:
 		zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap);
 		break;
 	case ZFS_ERR_RESILVER_IN_PROGRESS:
 		zfs_verror(hdl, EZFS_RESILVERING, fmt, ap);
 		break;
 	case ZFS_ERR_REBUILD_IN_PROGRESS:
 		zfs_verror(hdl, EZFS_REBUILDING, fmt, ap);
 		break;
 	case ZFS_ERR_BADPROP:
 		zfs_verror(hdl, EZFS_BADPROP, fmt, ap);
 		break;
 	case ZFS_ERR_IOC_CMD_UNAVAIL:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
 		    "module does not support this operation. A reboot may "
 		    "be required to enable this operation."));
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	case ZFS_ERR_IOC_ARG_UNAVAIL:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
 		    "module does not support an option for this operation. "
 		    "A reboot may be required to enable this option."));
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	case ZFS_ERR_IOC_ARG_REQUIRED:
 	case ZFS_ERR_IOC_ARG_BADTYPE:
 		zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
 		break;
 	default:
 		/* Unrecognized code: fall back to the system's description. */
 		zfs_error_aux(hdl, "%s", strerror(error));
 		zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
 	}
 
 	va_end(ap);
 	return (-1);
 }
 
 /*
  * Report an out of memory condition on 'hdl' by raising EZFS_NOMEM with the
  * description "internal error", and return zfs_error()'s result.
  *
  * NOTE(review): the historical comment here said this aborts the program;
  * whether zfs_error() terminates for EZFS_NOMEM is not visible in this
  * function -- confirm against zfs_verror().
  */
 int
 no_memory(libzfs_handle_t *hdl)
 {
 	return (zfs_error(hdl, EZFS_NOMEM, "internal error"));
 }
 
 /*
  * Allocate 'size' bytes of zero-filled memory.  If the allocation fails the
  * condition is reported through no_memory() and NULL is returned.
  */
 void *
 zfs_alloc(libzfs_handle_t *hdl, size_t size)
 {
 	void *buf = calloc(1, size);
 
 	if (buf != NULL)
 		return (buf);
 
 	/* Allocation failed: report it and hand NULL back to the caller. */
 	(void) no_memory(hdl);
 	return (NULL);
 }
 
 /*
  * A wrapper around vasprintf() that reports an out of memory error through
  * no_memory() and returns NULL if the formatted string cannot be allocated.
  */
-/*PRINTFLIKE2*/
 char *
 zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
 {
 	va_list args;
 	char *str = NULL;
 	int nbytes;
 
 	va_start(args, fmt);
 	nbytes = vasprintf(&str, fmt, args);
 	va_end(args);
 
 	/* vasprintf() returns a negative value when it cannot allocate. */
 	if (nbytes >= 0)
 		return (str);
 
 	(void) no_memory(hdl);
 	return (NULL);
 }
 
 /*
  * Resize 'ptr' from 'oldsize' to 'newsize' bytes and zero any newly added
  * space.
  *
  * On failure the error is reported through no_memory() and NULL is returned;
  * realloc() leaves the original allocation untouched in that case, so the
  * caller still owns 'ptr'.
  *
  * Only the region beyond 'oldsize' is cleared, and only when the buffer
  * actually grew.  Without that guard a shrinking request would make
  * (newsize - oldsize) wrap around and zero memory far past the allocation.
  */
 void *
 zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize)
 {
 	void *ret;
 
 	if ((ret = realloc(ptr, newsize)) == NULL) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	if (newsize > oldsize)
 		bzero((char *)ret + oldsize, (newsize - oldsize));
 	return (ret);
 }
 
 /*
  * Duplicate 'str' with strdup().  If the copy cannot be allocated the
  * condition is reported through no_memory() and NULL is returned.
  */
 char *
 zfs_strdup(libzfs_handle_t *hdl, const char *str)
 {
 	char *copy = strdup(str);
 
 	if (copy != NULL)
 		return (copy);
 
 	(void) no_memory(hdl);
 	return (NULL);
 }
 
 /*
  * Store 'printerr' in the handle's libzfs_printerr flag.
  * NOTE(review): presumably this controls whether errors are printed by the
  * library when they are raised -- confirm against zfs_verror().
  */
 void
 libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
 {
 	hdl->libzfs_printerr = printerr;
 }
 
 /*
  * Read lines from an open file descriptor and store them in an array of
  * strings until EOF.  lines[] will be allocated and populated with all the
  * lines read.  A trailing newline, if present, is replaced with a NUL
  * terminator for convenience.  lines[] must be freed after use with
  * libzfs_free_str_array().
  *
  * The descriptor is consumed: it is closed before returning, whether or not
  * it could be wrapped in a stdio stream.
  *
  * *lines is always written, and is NULL when no line could be stored, so the
  * caller never sees an uninitialized pointer.  If memory runs out part way
  * through, the lines gathered so far are returned.
  *
  * Returns the number of lines read.
  */
 static int
 libzfs_read_stdout_from_fd(int fd, char **lines[])
 {
 
 	FILE *fp;
 	int lines_cnt = 0;
 	size_t len = 0;
 	ssize_t nread;
 	char *line = NULL;
 	char **tmp_lines = NULL, **tmp;
 
 	*lines = NULL;
 
 	fp = fdopen(fd, "r");
 	if (fp == NULL) {
 		close(fd);
 		return (0);
 	}
 	while ((nread = getline(&line, &len, fp)) != -1) {
 		tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1));
 		if (tmp == NULL) {
 			/* Return the lines we were able to process */
 			break;
 		}
 		tmp_lines = tmp;
 
 		/*
 		 * Remove newline if not EOF.  Use the length reported by
 		 * getline() rather than strlen(): a line that begins with an
 		 * embedded NUL has strlen() == 0, and indexing with
 		 * strlen() - 1 would then read before the buffer.
 		 */
 		if (nread > 0 && line[nread - 1] == '\n')
 			line[nread - 1] = '\0';
 
 		tmp_lines[lines_cnt] = strdup(line);
 		if (tmp_lines[lines_cnt] == NULL)
 			break;
 		++lines_cnt;
 	}
 	free(line);
 	fclose(fp);
 	*lines = tmp_lines;
 	return (lines_cnt);
 }
 
 /*
  * Fork and exec 'path' with arguments 'argv' and, when 'env' is non-NULL,
  * environment 'env', then wait for the child to finish.
  *
  * Child setup, as driven by 'flags' and 'lines':
  *  - stdout goes to /dev/null unless STDOUT_VERBOSE is set or 'lines' is
  *    non-NULL; when 'lines' is non-NULL stdout is sent to a pipe instead.
  *  - stderr goes to /dev/null unless STDERR_VERBOSE is set.
  *  - NO_DEFAULT_PATH selects execv()/execve() (no $PATH search); otherwise
  *    execvp()/execvpe() are used.
  *
  * When 'lines' is non-NULL the child's captured stdout is read after the
  * child has exited and stored in *lines, with the count in *lines_cnt.
  *
  * Returns -EPIPE if the pipe cannot be created, -1 if fork() or waitpid()
  * fails or the child did not exit normally, and the child's exit status
  * otherwise.  Both pipe descriptors are closed on every failure path so a
  * failed run does not leak them.
  */
 static int
 libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags,
     char **lines[], int *lines_cnt)
 {
 	pid_t pid;
 	int error, devnull_fd;
 	int link[2];
 
 	/*
 	 * Setup a pipe between our child and parent process if we're
 	 * reading stdout.
 	 */
 	if (lines != NULL && pipe2(link, O_NONBLOCK | O_CLOEXEC) == -1)
 		return (-EPIPE);
 
 	pid = fork();
 	if (pid == 0) {
 		/* Child process */
 		devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC);
 
 		if (devnull_fd < 0)
 			_exit(-1);
 
 		if (!(flags & STDOUT_VERBOSE) && (lines == NULL))
 			(void) dup2(devnull_fd, STDOUT_FILENO);
 		else if (lines != NULL) {
 			/* Save the output to lines[] */
 			dup2(link[1], STDOUT_FILENO);
 		}
 
 		if (!(flags & STDERR_VERBOSE))
 			(void) dup2(devnull_fd, STDERR_FILENO);
 
 		if (flags & NO_DEFAULT_PATH) {
 			if (env == NULL)
 				execv(path, argv);
 			else
 				execve(path, argv, env);
 		} else {
 			if (env == NULL)
 				execvp(path, argv);
 			else
 				execvpe(path, argv, env);
 		}
 
 		/* Only reached if the exec call itself failed. */
 		_exit(-1);
 	} else if (pid > 0) {
 		/* Parent process */
 		int status;
 
 		while ((error = waitpid(pid, &status, 0)) == -1 &&
 		    errno == EINTR)
 			;
 		if (error < 0 || !WIFEXITED(status)) {
 			if (lines != NULL) {
 				(void) close(link[0]);
 				(void) close(link[1]);
 			}
 			return (-1);
 		}
 
 		if (lines != NULL) {
 			close(link[1]);
 			*lines_cnt = libzfs_read_stdout_from_fd(link[0], lines);
 		}
 		return (WEXITSTATUS(status));
 	}
 
 	/* fork() failed: release the pipe before reporting the error. */
 	if (lines != NULL) {
 		(void) close(link[0]);
 		(void) close(link[1]);
 	}
 	return (-1);
 }
 
 /*
  * Run 'path' with 'argv' using the caller's environment and without capturing
  * stdout.  'flags' is passed through to libzfs_run_process_impl(), whose
  * return value is returned unchanged.
  */
 int
 libzfs_run_process(const char *path, char *argv[], int flags)
 {
 	return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL));
 }
 
 /*
  * Run a command and store its stdout lines in an array of strings (lines[]).
  * lines[] is allocated and populated for you, and the number of lines is set in
  * lines_cnt.  lines[] must be freed after use with libzfs_free_str_array().
  * The trailing newline (\n) of each line is replaced with a NUL terminator
  * for convenience.
  *
  * The command is run with flags == 0; the return value is that of
  * libzfs_run_process_impl().
  */
 int
 libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[],
     char **lines[], int *lines_cnt)
 {
 	return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt));
 }
 
 /*
  * Same as libzfs_run_process_get_stdout(), but run with NO_DEFAULT_PATH so
  * that $PATH is not searched.  This means that *path needs to be the full
  * path to the executable.
  */
 int
 libzfs_run_process_get_stdout_nopath(const char *path, char *argv[],
     char *env[], char **lines[], int *lines_cnt)
 {
 	return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH,
 	    lines, lines_cnt));
 }
 
 /*
  * Release an array of 'count' heap-allocated strings: every element is
  * freed, last to first, and then the array itself.
  */
 void
 libzfs_free_str_array(char **strs, int count)
 {
 	int idx;
 
 	for (idx = count - 1; idx >= 0; idx--)
 		free(strs[idx]);
 
 	free(strs);
 }
 
 /*
  * Decide whether the environment variable 'envvar' is switched on.
  *
  * Returns 1 if the variable exists and its value either parses (via
  * strtoul() with automatic base detection) to a non-zero number, or is
  * exactly "yes" or "on" in any letter case.
  *
  * Returns 0 otherwise, including when the variable is not set at all.
  */
 int
 libzfs_envvar_is_set(char *envvar)
 {
 	char *val = getenv(envvar);
 
 	if (val == NULL)
 		return (0);
 
 	if (strtoul(val, NULL, 0) > 0)
 		return (1);
 
 	/* strnlen() with limit+1 rejects longer strings such as "yesterday". */
 	if (strncasecmp(val, "YES", 3) == 0 && strnlen(val, 4) == 3)
 		return (1);
 
 	if (strncasecmp(val, "ON", 2) == 0 && strnlen(val, 3) == 2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Create and initialize a libzfs handle.
  *
  * Steps, in order: load the kernel module, allocate the handle, compile the
  * URI regular expression, open ZFS_DEV, initialize libzfs_core, and set up
  * the property/feature tables, the mnttab cache and fletcher-4.  Two
  * environment variables are then consulted:
  *  - ZFS_PROP_DEBUG: if present, enables libzfs_prop_debug on the handle.
  *  - ZFS_SENDRECV_MAX_NVLIST: if present, parsed with zfs_nicestrtonum()
  *    into libzfs_max_nvlist; otherwise SPA_MAXBLOCKSIZE * 4 is used.
  *
  * Returns the new handle, or NULL on failure.  Every failure path releases
  * what had been acquired up to that point (the compiled regex included), and
  * errno from a failing open() or libzfs_core_init() is preserved across that
  * cleanup.
  */
 libzfs_handle_t *
 libzfs_init(void)
 {
 	libzfs_handle_t *hdl;
 	int error;
 	char *env;
 
 	if ((error = libzfs_load_module()) != 0) {
 		errno = error;
 		return (NULL);
 	}
 
 	if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
 		return (NULL);
 	}
 
 	if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) {
 		free(hdl);
 		return (NULL);
 	}
 
 	if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL|O_CLOEXEC)) < 0) {
 		error = errno;
 		regfree(&hdl->libzfs_urire);
 		free(hdl);
 		errno = error;
 		return (NULL);
 	}
 
 	if (libzfs_core_init() != 0) {
 		error = errno;
 		(void) close(hdl->libzfs_fd);
 		regfree(&hdl->libzfs_urire);
 		free(hdl);
 		errno = error;
 		return (NULL);
 	}
 
 	zfs_prop_init();
 	zpool_prop_init();
 	zpool_feature_init();
 	libzfs_mnttab_init(hdl);
 	fletcher_4_init();
 
 	if (getenv("ZFS_PROP_DEBUG") != NULL) {
 		hdl->libzfs_prop_debug = B_TRUE;
 	}
 	if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) {
 		if ((error = zfs_nicestrtonum(hdl, env,
 		    &hdl->libzfs_max_nvlist))) {
 			/*
 			 * Everything is initialized by now, so undo it the
 			 * same way libzfs_fini() does before giving up.
 			 * errno is assigned last so that the teardown calls
 			 * cannot overwrite it.
 			 */
 			(void) close(hdl->libzfs_fd);
 			libzfs_mnttab_fini(hdl);
 			libzfs_core_fini();
 			regfree(&hdl->libzfs_urire);
 			fletcher_4_fini();
 			free(hdl);
 			errno = error;
 			return (NULL);
 		}
 	} else {
 		hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4);
 	}
 
 	/*
 	 * For testing, remove some settable properties and features
 	 */
 	if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) {
 		zprop_desc_t *proptbl;
 
 		proptbl = zpool_prop_get_table();
 		proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE;
 
 		proptbl = zfs_prop_get_table();
 		proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE;
 
 		zfeature_info_t *ftbl = spa_feature_table;
 		ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE;
 	}
 
 	return (hdl);
 }
 
 /*
  * Tear down a handle created by libzfs_init(): close the device descriptor,
  * free cached pool handles and the namespace cache, shut down the mnttab
  * cache, libzfs_core and fletcher-4, release the compiled URI regex and
  * finally free the handle itself.  'hdl' must not be used afterwards.
  */
 void
 libzfs_fini(libzfs_handle_t *hdl)
 {
 	(void) close(hdl->libzfs_fd);
 	zpool_free_handles(hdl);
 	namespace_clear(hdl);
 	libzfs_mnttab_fini(hdl);
 	libzfs_core_fini();
 	regfree(&hdl->libzfs_urire);
 	fletcher_4_fini();
 #if LIBFETCH_DYNAMIC
 	/* (void *)-1 is treated as "no library to close", as is NULL. */
 	if (hdl->libfetch != (void *)-1 && hdl->libfetch != NULL)
 		(void) dlclose(hdl->libfetch);
 	free(hdl->libfetch_load_error);
 #endif
 	free(hdl);
 }
 
 /* Return the libzfs handle stored in pool handle 'zhp'. */
 libzfs_handle_t *
 zpool_get_handle(zpool_handle_t *zhp)
 {
 	return (zhp->zpool_hdl);
 }
 
 /* Return the libzfs handle stored in dataset handle 'zhp'. */
 libzfs_handle_t *
 zfs_get_handle(zfs_handle_t *zhp)
 {
 	return (zhp->zfs_hdl);
 }
 
 /* Return the pool handle stored in dataset handle 'zhp'. */
 zpool_handle_t *
 zfs_get_pool_handle(const zfs_handle_t *zhp)
 {
 	return (zhp->zpool_hdl);
 }
 
 /*
  * Open a dataset given either a filesystem path or a dataset name.
  *
  * 'path' is treated as a filesystem path when it starts with '/' or "./".
  * In that case the mount entry covering it is looked up with getextmntent()
  * and, provided the entry is of type ZFS, its mnt_special is opened as a
  * ZFS_TYPE_FILESYSTEM.  Anything else is assumed to be an
  * fs/vol/snap/bkmark name and is opened with the caller's 'argtype'.
  *
  * Returns NULL if the mount lookup fails or the path is not on ZFS; the
  * latter also prints a message to stderr.
  */
 zfs_handle_t *
 zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype)
 {
 	struct stat64 sb;
 	struct extmnttab mnt;
 	boolean_t looks_like_path;
 
 	looks_like_path = (path[0] == '/' ||
 	    strncmp(path, "./", strlen("./")) == 0);
 
 	/* Not a path: hand the name to zfs_open() as type 'argtype'. */
 	if (!looks_like_path)
 		return (zfs_open(hdl, path, argtype));
 
 	if (getextmntent(path, &mnt, &sb) != 0)
 		return (NULL);
 
 	if (strcmp(mnt.mnt_fstype, MNTTYPE_ZFS) == 0)
 		return (zfs_open(hdl, mnt.mnt_special, ZFS_TYPE_FILESYSTEM));
 
 	(void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
 	    path);
 	return (NULL);
 }
 
 /*
  * Prepare the zc_nvlist_dst member of 'zc' to receive an nvlist from an
  * ioctl() by allocating a zeroed buffer of 'len' bytes for it.  A 'len' of
  * zero selects a 256 KiB buffer.  Returns 0 on success, or -1 if the buffer
  * could not be allocated.
  */
 int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
 	void *buf;
 
 	zc->zc_nvlist_dst_size = (len != 0) ? len : 256 * 1024;
 
 	buf = zfs_alloc(hdl, zc->zc_nvlist_dst_size);
 	zc->zc_nvlist_dst = (uint64_t)(uintptr_t)buf;
 
 	return (zc->zc_nvlist_dst == 0 ? -1 : 0);
 }
 
 /*
  * Called when an ioctl() which returns an nvlist fails with ENOMEM.  The old
  * destination buffer is discarded and a new zeroed one is allocated using
  * the size now recorded in 'zc_nvlist_dst_size', which the kernel filled in
  * to indicate the space actually required.  Returns 0 on success, or -1 if
  * the new buffer could not be allocated.
  */
 int
 zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc)
 {
 	void *bigger;
 
 	free((void *)(uintptr_t)zc->zc_nvlist_dst);
 
 	bigger = zfs_alloc(hdl, zc->zc_nvlist_dst_size);
 	zc->zc_nvlist_dst = (uint64_t)(uintptr_t)bigger;
 
 	return (zc->zc_nvlist_dst == 0 ? -1 : 0);
 }
 
 /*
  * Free the conf, src and dst nvlist buffers referenced by the command
  * structure and reset each field to 0 so that stale pointers are not reused.
  */
 void
 zcmd_free_nvlists(zfs_cmd_t *zc)
 {
 	free((void *)(uintptr_t)zc->zc_nvlist_conf);
 	zc->zc_nvlist_conf = 0;
 
 	free((void *)(uintptr_t)zc->zc_nvlist_src);
 	zc->zc_nvlist_src = 0;
 
 	free((void *)(uintptr_t)zc->zc_nvlist_dst);
 	zc->zc_nvlist_dst = 0;
 }
 
 /*
  * Pack 'nvl' in native encoding into a freshly allocated buffer.  On success
  * the buffer address is stored in *outnv (as a 64-bit integer) and its
  * length in *outlen, and 0 is returned.  Returns -1 if the buffer could not
  * be allocated.  Sizing and packing failures trip verify().
  */
 static int
 zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen,
     nvlist_t *nvl)
 {
 	size_t nbytes;
 	char *buf;
 
 	verify(nvlist_size(nvl, &nbytes, NV_ENCODE_NATIVE) == 0);
 
 	buf = zfs_alloc(hdl, nbytes);
 	if (buf == NULL)
 		return (-1);
 
 	/* Pack into the buffer allocated above. */
 	verify(nvlist_pack(nvl, &buf, &nbytes, NV_ENCODE_NATIVE, 0) == 0);
 
 	*outlen = nbytes;
 	*outnv = (uint64_t)(uintptr_t)buf;
 
 	return (0);
 }
 
 /*
  * Pack 'nvl' and attach it to 'zc' as the conf nvlist (zc_nvlist_conf and
  * zc_nvlist_conf_size).  Returns 0 on success, -1 on allocation failure.
  */
 int
 zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf,
 	    &zc->zc_nvlist_conf_size, nvl));
 }
 
 /*
  * Pack 'nvl' and attach it to 'zc' as the src nvlist (zc_nvlist_src and
  * zc_nvlist_src_size).  Returns 0 on success, -1 on allocation failure.
  */
 int
 zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src,
 	    &zc->zc_nvlist_src_size, nvl));
 }
 
 /*
  * Unpack the nvlist held in the dst buffer of the ZFS ioctl command
  * structure into *nvlp.  Returns 0 on success; any unpack failure is
  * reported as out of memory and the result of no_memory() is returned.
  */
 int
 zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp)
 {
 	void *packed = (void *)(uintptr_t)zc->zc_nvlist_dst;
 
 	if (nvlist_unpack(packed, zc->zc_nvlist_dst_size, nvlp, 0) == 0)
 		return (0);
 
 	return (no_memory(hdl));
 }
 
 /*
  * ================================================================
  * API shared by zfs and zpool property management
  * ================================================================
  */
 
 /*
  * Compute the column widths for 'zfs get' / 'zpool get' style output and
  * print the header line.
  *
  * cb_first is cleared unconditionally.  In scripted mode nothing else is
  * done: no widths are computed and no header is printed.
  */
 static void
 zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 {
 	const int name_prop = (type == ZFS_TYPE_POOL) ?
 	    ZPOOL_PROP_NAME : ZFS_PROP_NAME;
 	zprop_list_t *entry;
 	size_t len;
 	int col;
 
 	cbp->cb_first = B_FALSE;
 	if (cbp->cb_scripted)
 		return;
 
 	/*
 	 * Each column is at least as wide as its (translated) title.
 	 */
 	cbp->cb_colwidths[GET_COL_NAME] =
 	    strlen(dgettext(TEXT_DOMAIN, "NAME"));
 	cbp->cb_colwidths[GET_COL_PROPERTY] =
 	    strlen(dgettext(TEXT_DOMAIN, "PROPERTY"));
 	cbp->cb_colwidths[GET_COL_VALUE] =
 	    strlen(dgettext(TEXT_DOMAIN, "VALUE"));
 	cbp->cb_colwidths[GET_COL_RECVD] =
 	    strlen(dgettext(TEXT_DOMAIN, "RECEIVED"));
 	cbp->cb_colwidths[GET_COL_SOURCE] =
 	    strlen(dgettext(TEXT_DOMAIN, "SOURCE"));
 
 	/* first property is always NAME */
 	assert(cbp->cb_proplist->pl_prop ==
 	    ((type == ZFS_TYPE_POOL) ?  ZPOOL_PROP_NAME : ZFS_PROP_NAME));
 
 	/*
 	 * Widen the columns to fit every property in the list.  The 'source'
 	 * column is sized for the worst case of inheriting from the longest
 	 * name; that is acceptable because 'SOURCE' is usually the last
 	 * column and its width then goes unused.  The 'VALUE' column can end
 	 * up oversized if a property name is much longer than any value.
 	 */
 	for (entry = cbp->cb_proplist; entry != NULL; entry = entry->pl_next) {
 		const char *propname;
 
 		/* 'PROPERTY' column: native name, or the user property. */
 		if (entry->pl_prop == ZPROP_INVAL)
 			propname = entry->pl_user_prop;
 		else if (type == ZFS_TYPE_POOL)
 			propname = zpool_prop_to_name(entry->pl_prop);
 		else
 			propname = zfs_prop_to_name(entry->pl_prop);
 
 		len = strlen(propname);
 		if (len > cbp->cb_colwidths[GET_COL_PROPERTY])
 			cbp->cb_colwidths[GET_COL_PROPERTY] = len;
 
 		/*
 		 * 'VALUE' and 'RECEIVED' columns.  The head of the list is
 		 * the 'name' property tacked on by /sbin/zfs's zfs_do_get()
 		 * or by zprop_expand_list(), so its width is ignored.  If
 		 * the user asked for the name property explicitly it appears
 		 * again later in the list.
 		 */
 		if (entry != cbp->cb_proplist) {
 			if (entry->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
 				cbp->cb_colwidths[GET_COL_VALUE] =
 				    entry->pl_width;
 
 			if (entry->pl_recvd_width >
 			    cbp->cb_colwidths[GET_COL_RECVD])
 				cbp->cb_colwidths[GET_COL_RECVD] =
 				    entry->pl_recvd_width;
 		}
 
 		/* 'NAME' and 'SOURCE' columns */
 		if (entry->pl_prop == name_prop &&
 		    entry->pl_width > cbp->cb_colwidths[GET_COL_NAME]) {
 			cbp->cb_colwidths[GET_COL_NAME] = entry->pl_width;
 			cbp->cb_colwidths[GET_COL_SOURCE] = entry->pl_width +
 			    strlen(dgettext(TEXT_DOMAIN, "inherited from"));
 		}
 	}
 
 	/*
 	 * Emit the titles.  The last visible column is printed without
 	 * padding so the line carries no trailing blanks.
 	 */
 	for (col = 0; col < ZFS_GET_NCOLS; col++) {
 		char *title = NULL;
 
 		switch (cbp->cb_columns[col]) {
 		case GET_COL_NAME:
 			title = dgettext(TEXT_DOMAIN, "NAME");
 			break;
 		case GET_COL_PROPERTY:
 			title = dgettext(TEXT_DOMAIN, "PROPERTY");
 			break;
 		case GET_COL_VALUE:
 			title = dgettext(TEXT_DOMAIN, "VALUE");
 			break;
 		case GET_COL_RECVD:
 			title = dgettext(TEXT_DOMAIN, "RECEIVED");
 			break;
 		case GET_COL_SOURCE:
 			title = dgettext(TEXT_DOMAIN, "SOURCE");
 			break;
 		default:
 			break;
 		}
 
 		if (title == NULL)
 			continue;
 
 		if (col == (ZFS_GET_NCOLS - 1) ||
 		    cbp->cb_columns[col + 1] == GET_COL_NONE) {
 			(void) printf("%s", title);
 		} else {
 			(void) printf("%-*s  ",
 			    cbp->cb_colwidths[cbp->cb_columns[col]],
 			    title);
 		}
 	}
 	(void) printf("\n");
 }
 
 /*
  * Produce the text for the 'SOURCE' column.  For inherited properties the
  * text is formatted into 'buf' (of 'buflen' bytes) and 'buf' is returned;
  * every other source type yields a string constant.  An unknown source type
  * trips an assertion and yields NULL.
  */
 static const char *
 zprop_source_column_text(zprop_source_t sourcetype, const char *source,
     char *buf, size_t buflen)
 {
 	switch (sourcetype) {
 	case ZPROP_SRC_NONE:
 		return ("-");
 
 	case ZPROP_SRC_DEFAULT:
 		return ("default");
 
 	case ZPROP_SRC_LOCAL:
 		return ("local");
 
 	case ZPROP_SRC_TEMPORARY:
 		return ("temporary");
 
 	case ZPROP_SRC_INHERITED:
 		(void) snprintf(buf, buflen, "inherited from %s", source);
 		return (buf);
 
 	case ZPROP_SRC_RECEIVED:
 		return ("received");
 
 	default:
 		assert(!"unhandled zprop_source_t");
 		return (NULL);
 	}
 }
 
 /*
  * Display a single line of output, according to the settings in the callback
  * structure.
  *
  * Nothing is printed when 'sourcetype' is not among the sources selected in
  * cb_sources.  The header is printed first if this is the first line
  * (cb_first).  Columns are tab-separated in scripted mode and padded to the
  * computed widths otherwise; the last visible column is never padded.
  */
 void
 zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
     const char *propname, const char *value, zprop_source_t sourcetype,
     const char *source, const char *recvd_value)
 {
 	char srcbuf[128];
 	int col;
 
 	/*
 	 * Ignore those source types that the user has chosen to ignore.
 	 */
 	if ((sourcetype & cbp->cb_sources) == 0)
 		return;
 
 	if (cbp->cb_first)
 		zprop_print_headers(cbp, cbp->cb_type);
 
 	for (col = 0; col < ZFS_GET_NCOLS; col++) {
 		const char *text;
 
 		switch (cbp->cb_columns[col]) {
 		case GET_COL_NAME:
 			text = name;
 			break;
 
 		case GET_COL_PROPERTY:
 			text = propname;
 			break;
 
 		case GET_COL_VALUE:
 			text = value;
 			break;
 
 		case GET_COL_SOURCE:
 			text = zprop_source_column_text(sourcetype, source,
 			    srcbuf, sizeof (srcbuf));
 			break;
 
 		case GET_COL_RECVD:
 			text = (recvd_value == NULL ? "-" : recvd_value);
 			break;
 
 		default:
 			/* Unused column slot: nothing to print. */
 			continue;
 		}
 
 		if (col == (ZFS_GET_NCOLS - 1) ||
 		    cbp->cb_columns[col + 1] == GET_COL_NONE) {
 			(void) printf("%s", text);
 		} else if (cbp->cb_scripted) {
 			(void) printf("%s\t", text);
 		} else {
 			(void) printf("%-*s  ",
 			    cbp->cb_colwidths[cbp->cb_columns[col]],
 			    text);
 		}
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Given a numeric suffix, convert the value into a number of bits that the
  * resulting value must be shifted.
  *
  * The first character selects the magnitude from "BKMGTPEZ" (10 bits per
  * step, so 'B' is 0 and 'K' is 10), case-insensitively.  An empty suffix
  * means no shift.  Returns -1 for an unrecognized suffix, after recording an
  * auxiliary error on 'hdl' when 'hdl' is non-NULL.
  *
  * Characters are converted to unsigned char before being handed to
  * toupper(): passing a negative plain char is undefined behavior.
  */
 static int
 str2shift(libzfs_handle_t *hdl, const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	const int nends = (int)strlen(ends);
 	int first;
 	int i;
 
 	if (buf[0] == '\0')
 		return (0);
 
 	first = toupper((unsigned char)buf[0]);
 	for (i = 0; i < nends; i++) {
 		if (first == ends[i])
 			break;
 	}
 	if (i == nends) {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid numeric suffix '%s'"), buf);
 		return (-1);
 	}
 
 	/*
 	 * Allow 'G' = 'GB' = 'GiB', case-insensitively.
 	 * However, 'BB' and 'BiB' are disallowed.
 	 */
 	if (buf[1] == '\0' ||
 	    (first != 'B' &&
 	    ((toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0') ||
 	    (toupper((unsigned char)buf[1]) == 'I' &&
 	    toupper((unsigned char)buf[2]) == 'B' &&
 	    buf[3] == '\0'))))
 		return (10 * i);
 
 	if (hdl)
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid numeric suffix '%s'"), buf);
 	return (-1);
 }
 
 /*
  * Convert a string of the form '100G' into a real number.  Used when setting
  * properties or creating a volume.
  *
  * 'value' must begin with a digit or '.'.  The numeric part may be followed
  * by a suffix understood by str2shift().  Values containing a decimal point
  * are scaled in floating point and truncated; all others use integer
  * arithmetic.
  *
  * On success the result is stored in *num and 0 is returned.  On failure -1
  * is returned and, when 'hdl' is non-NULL, an extended error message is
  * recorded on it with zfs_error_aux().
  */
 int
 zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
 {
 	char *suffix;
 	double scaled;
 	int shift;
 
 	*num = 0;
 
 	/* Check to see if this looks like a number.  */
 	if (!(value[0] >= '0' && value[0] <= '9') && value[0] != '.') {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "bad numeric value '%s'"), value);
 		return (-1);
 	}
 
 	/*
 	 * Rely on strtoull() to process the numeric portion.  ERANGE means
 	 * the digits alone do not fit in a 64-bit value.
 	 */
 	errno = 0;
 	*num = strtoull(value, &suffix, 10);
 	if (errno == ERANGE)
 		goto toobig;
 
 	if (*suffix != '.') {
 		/* Plain integer: apply the suffix as a left shift. */
 		shift = str2shift(hdl, suffix);
 		if (shift == -1)
 			return (-1);
 
 		/* Check for overflow */
 		if (shift >= 64 || (*num << shift) >> shift != *num)
 			goto toobig;
 
 		*num <<= shift;
 		return (0);
 	}
 
 	/*
 	 * Decimal value: reparse the whole string as a double and do the
 	 * scaling with floating point arithmetic.
 	 */
 	scaled = strtod(value, &suffix);
 
 	shift = str2shift(hdl, suffix);
 	if (shift == -1)
 		return (-1);
 
 	scaled *= pow(2, shift);
 
 	/*
 	 * UINT64_MAX is not exactly representable as a double.
 	 * The closest representation is UINT64_MAX + 1, so we
 	 * use a >= comparison instead of > for the bounds check.
 	 */
 	if (scaled >= (double)UINT64_MAX)
 		goto toobig;
 
 	*num = (uint64_t)scaled;
 	return (0);
 
 toobig:
 	if (hdl)
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "numeric value is too large"));
 	return (-1);
 }
 
 /*
  * Given a propname=value nvpair to set, parse any numeric properties
  * (index, boolean, etc) if they are specified as strings and add the
  * resulting nvpair to the returned nvlist.
  *
  * At the DSL layer, all properties are either 64-bit numbers or strings.
  * We want the user to be able to ignore this fact and specify properties
  * as native values (numbers, for example) or as strings (to simplify
  * command line utilities).  This also handles converting index types
  * (compression, checksum, etc) from strings to their on-disk index.
  *
  * Arguments:
  *   elem    the nvpair holding the value supplied by the user
  *   prop    property number, interpreted as a pool property when 'type' is
  *           ZFS_TYPE_POOL and as a dataset property otherwise
  *   ret     nvlist that receives the converted propname=value pair
  *   svalp   set to the string value for string properties, NULL otherwise
  *   ivalp   set to the numeric value for number and index properties
  *   errbuf  description passed to zfs_error() when the value is rejected
  *
  * Returns 0 on success.  Returns -1 after raising EZFS_BADPROP (with an
  * auxiliary explanation) when the value is invalid, or after calling
  * no_memory() when the result cannot be added to 'ret'.
  */
 int
 zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
     zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp,
     const char *errbuf)
 {
 	data_type_t datatype = nvpair_type(elem);
 	zprop_type_t proptype;
 	const char *propname;
 	char *value;
 	boolean_t isnone = B_FALSE;
 	boolean_t isauto = B_FALSE;
 	int err = 0;
 
 	if (type == ZFS_TYPE_POOL) {
 		proptype = zpool_prop_get_type(prop);
 		propname = zpool_prop_to_name(prop);
 	} else {
 		proptype = zfs_prop_get_type(prop);
 		propname = zfs_prop_to_name(prop);
 	}
 
 	/*
 	 * Convert any properties to the internal DSL value types.
 	 */
 	*svalp = NULL;
 	*ivalp = 0;
 
 	switch (proptype) {
 	case PROP_TYPE_STRING:
 		if (datatype != DATA_TYPE_STRING) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be a string"), nvpair_name(elem));
 			goto error;
 		}
 		err = nvpair_value_string(elem, svalp);
 		if (err != 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is invalid"), nvpair_name(elem));
 			goto error;
 		}
 		if (strlen(*svalp) >= ZFS_MAXPROPLEN) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is too long"), nvpair_name(elem));
 			goto error;
 		}
 		break;
 
 	case PROP_TYPE_NUMBER:
 		/*
 		 * Numbers may arrive as strings ("none", "auto" or anything
 		 * zfs_nicestrtonum() accepts) or as native uint64 values.
 		 */
 		if (datatype == DATA_TYPE_STRING) {
 			(void) nvpair_value_string(elem, &value);
 			if (strcmp(value, "none") == 0) {
 				isnone = B_TRUE;
 			} else if (strcmp(value, "auto") == 0) {
 				isauto = B_TRUE;
 			} else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) {
 				goto error;
 			}
 		} else if (datatype == DATA_TYPE_UINT64) {
 			(void) nvpair_value_uint64(elem, ivalp);
 		} else {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be a number"), nvpair_name(elem));
 			goto error;
 		}
 
 		/*
 		 * Quota special: force 'none' and don't allow 0.
 		 */
 		if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
 		    (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "use 'none' to disable quota/refquota"));
 			goto error;
 		}
 
 		/*
 		 * Special handling for "*_limit=none". In this case it's not
 		 * 0 but UINT64_MAX.
 		 */
 		if ((type & ZFS_TYPE_DATASET) && isnone &&
 		    (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
 		    prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
 			*ivalp = UINT64_MAX;
 		}
 
 		/*
 		 * Special handling for setting 'refreservation' to 'auto'.  Use
 		 * UINT64_MAX to tell the caller to use zfs_fix_auto_resv().
 		 * 'auto' is only allowed on volumes.
 		 */
 		if (isauto) {
 			switch (prop) {
 			case ZFS_PROP_REFRESERVATION:
 				if ((type & ZFS_TYPE_VOLUME) == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s=auto' only allowed on "
 					    "volumes"), nvpair_name(elem));
 					goto error;
 				}
 				*ivalp = UINT64_MAX;
 				break;
 			default:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'auto' is invalid value for '%s'"),
 				    nvpair_name(elem));
 				goto error;
 			}
 		}
 
 		break;
 
 	case PROP_TYPE_INDEX:
 		if (datatype != DATA_TYPE_STRING) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be a string"), nvpair_name(elem));
 			goto error;
 		}
 
 		(void) nvpair_value_string(elem, &value);
 
 		/* Map the symbolic value onto its numeric index. */
 		if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be one of '%s'"), propname,
 			    zprop_values(prop, type));
 			goto error;
 		}
 		break;
 
 	default:
 		/* Unknown property type: treated as a programming error. */
 		abort();
 	}
 
 	/*
 	 * Add the result to our return set of properties.
 	 */
 	if (*svalp != NULL) {
 		if (nvlist_add_string(ret, propname, *svalp) != 0) {
 			(void) no_memory(hdl);
 			return (-1);
 		}
 	} else {
 		if (nvlist_add_uint64(ret, propname, *ivalp) != 0) {
 			(void) no_memory(hdl);
 			return (-1);
 		}
 	}
 
 	return (0);
 error:
 	/* The specific reason was already recorded with zfs_error_aux(). */
 	(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 	return (-1);
 }
 
 /*
  * Allocate a property list entry for 'propname' and store it in *listp.
  *
  * A name with no usable native property (unknown, or not valid for 'type')
  * is kept as a user property, provided it is one of the non-native forms
  * accepted for 'type'; otherwise EZFS_BADPROP is raised and its result
  * returned.  Returns 0 on success and -1 on allocation failure.
  */
 static int
 addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
     zfs_type_t type)
 {
 	zprop_list_t *node;
 	int prop = zprop_name_to_prop(propname, type);
 
 	if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE))
 		prop = ZPROP_INVAL;
 
 	/*
 	 * When no property table entry can be found, return failure if
 	 * this is a pool property or if this isn't a user-defined
 	 * dataset property,
 	 */
 	if (prop == ZPROP_INVAL) {
 		if ((type == ZFS_TYPE_POOL &&
 		    !zpool_prop_feature(propname) &&
 		    !zpool_prop_unsupported(propname)) ||
 		    (type == ZFS_TYPE_DATASET &&
 		    !zfs_prop_user(propname) &&
 		    !zfs_prop_userquota(propname) &&
 		    !zfs_prop_written(propname))) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), propname);
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 	}
 
 	node = zfs_alloc(hdl, sizeof (zprop_list_t));
 	if (node == NULL)
 		return (-1);
 
 	node->pl_prop = prop;
 	if (prop != ZPROP_INVAL) {
 		node->pl_width = zprop_width(prop, &node->pl_fixed,
 		    type);
 	} else {
 		/* Non-native property: keep a private copy of its name. */
 		node->pl_user_prop = zfs_strdup(hdl, propname);
 		if (node->pl_user_prop == NULL) {
 			free(node);
 			return (-1);
 		}
 		node->pl_width = strlen(propname);
 	}
 
 	*listp = node;
 
 	return (0);
 }
 
 /*
  * Given a comma-separated list of properties, construct a property list
  * containing both user-defined and native properties.  This function will
  * return a NULL list if 'all' is specified, which can later be expanded
  * by zprop_expand_list().
  *
  * Note that 'props' is modified in place: each separating comma is
  * overwritten with a NUL terminator while its name is processed and is not
  * restored afterwards.
  *
  * Returns 0 on success.  On failure a non-zero value is returned (the result
  * of zfs_error() for a malformed list, or -1 from addlist()); entries added
  * before the failure remain linked from the caller's list head.
  */
 int
 zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp,
     zfs_type_t type)
 {
 	*listp = NULL;
 
 	/*
 	 * If 'all' is specified, return a NULL list.
 	 */
 	if (strcmp(props, "all") == 0)
 		return (0);
 
 	/*
 	 * If no props were specified, return an error.
 	 */
 	if (props[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no properties specified"));
 		return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
 		    "bad property list")));
 	}
 
 	/*
 	 * It would be nice to use getsubopt() here, but the inclusion of column
 	 * aliases makes this more effort than it's worth.
 	 */
 	while (*props != '\0') {
 		size_t len;
 		char *p;
 		char c;
 
 		/* 'p' ends up at the comma ending this name, or at the NUL. */
 		if ((p = strchr(props, ',')) == NULL) {
 			len = strlen(props);
 			p = props + len;
 		} else {
 			len = p - props;
 		}
 
 		/*
 		 * Check for empty options.
 		 */
 		if (len == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "empty property name"));
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 
 		/*
 		 * Check all regular property names.
 		 */
 		/* Remember the delimiter, then terminate the name in place. */
 		c = props[len];
 		props[len] = '\0';
 
 		if (strcmp(props, "space") == 0) {
 			/* 'space' is shorthand for this fixed set of columns. */
 			static char *spaceprops[] = {
 				"name", "avail", "used", "usedbysnapshots",
 				"usedbydataset", "usedbyrefreservation",
 				"usedbychildren", NULL
 			};
 			int i;
 
 			for (i = 0; spaceprops[i]; i++) {
 				if (addlist(hdl, spaceprops[i], listp, type))
 					return (-1);
 				listp = &(*listp)->pl_next;
 			}
 		} else {
 			if (addlist(hdl, props, listp, type))
 				return (-1);
 			/* Advance to the tail so the next entry is appended. */
 			listp = &(*listp)->pl_next;
 		}
 
 		/* Step past the delimiter only if it was a comma. */
 		props = p;
 		if (c == ',')
 			props++;
 	}
 
 	return (0);
 }
 
 /*
  * Free every entry of the property list 'pl', including each entry's
  * pl_user_prop string.  A NULL list is accepted.
  */
 void
 zprop_free_list(zprop_list_t *pl)
 {
 	zprop_list_t *cur, *following;
 
 	for (cur = pl; cur != NULL; cur = following) {
 		/* Grab the link before the entry is released. */
 		following = cur->pl_next;
 		free(cur->pl_user_prop);
 		free(cur);
 	}
 }
 
 /* State handed to zprop_expand_list_cb() while expanding an 'all' list. */
 typedef struct expand_data {
 	zprop_list_t	**last;	/* where the next new entry gets linked in */
 	libzfs_handle_t	*hdl;	/* handle used for allocations */
 	zfs_type_t type;	/* type passed to zprop_width() */
 } expand_data_t;
 
 /*
  * Property iteration callback: append an entry for 'prop' to the list being
  * built in the expand_data_t passed as 'cb'.  Returns ZPROP_CONT to keep
  * iterating, or ZPROP_INVAL if the entry could not be allocated.
  */
 static int
 zprop_expand_list_cb(int prop, void *cb)
 {
 	expand_data_t *edp = cb;
 	zprop_list_t *node;
 
 	node = zfs_alloc(edp->hdl, sizeof (zprop_list_t));
 	if (node == NULL)
 		return (ZPROP_INVAL);
 
 	node->pl_all = B_TRUE;
 	node->pl_prop = prop;
 	node->pl_width = zprop_width(prop, &node->pl_fixed, edp->type);
 
 	/* Link the entry at the tail and move the tail pointer past it. */
 	*(edp->last) = node;
 	edp->last = &node->pl_next;
 
 	return (ZPROP_CONT);
 }
 
 /*
  * Expand an empty ('all') property list in place.  If *plp is NULL it is
  * populated with an entry for every property visited by
  * zprop_iter_common(), with the 'name' property placed at the front.  A list
  * that is already populated is left untouched.  Returns 0 on success and -1
  * on allocation failure.
  */
 int
 zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type)
 {
 	zprop_list_t *name_entry;
 	expand_data_t exp;
 
 	/* Nothing to do unless this is a still-unexpanded 'all' list. */
 	if (*plp != NULL)
 		return (0);
 
 	/*
 	 * This is the very first time we've been called for an 'all'
 	 * specification: expand the list to include all native properties.
 	 */
 	exp.last = plp;
 	exp.hdl = hdl;
 	exp.type = type;
 
 	if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE,
 	    B_FALSE, type) == ZPROP_INVAL)
 		return (-1);
 
 	/*
 	 * Add 'name' to the beginning of the list, which is handled
 	 * specially.
 	 */
 	name_entry = zfs_alloc(hdl, sizeof (zprop_list_t));
 	if (name_entry == NULL)
 		return (-1);
 
 	if (type == ZFS_TYPE_POOL)
 		name_entry->pl_prop = ZPOOL_PROP_NAME;
 	else
 		name_entry->pl_prop = ZFS_PROP_NAME;
 	name_entry->pl_width = zprop_width(name_entry->pl_prop,
 	    &name_entry->pl_fixed, type);
 	name_entry->pl_all = B_TRUE;
 	name_entry->pl_next = *plp;
 	*plp = name_entry;
 
 	return (0);
 }
 
 /*
  * Public entry point for property iteration: forwards every argument
  * unchanged to zprop_iter_common() and returns its result.
  */
 int
 zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered,
     zfs_type_t type)
 {
 	return (zprop_iter_common(func, cb, show_all, ordered, type));
 }
 
 /*
  * Fill given version buffer with zfs userland version (ZFS_META_ALIAS).
  * 'len' is the size of 'version' and is passed to strlcpy() as the copy
  * limit.
  */
 void
 zfs_version_userland(char *version, int len)
 {
 	(void) strlcpy(version, ZFS_META_ALIAS, len);
 }
 
 /*
  * Print the zfs userland version followed by the kernel module version,
  * each on its own line on stdout.
  *
  * Returns 0 on success.  If the kernel version cannot be obtained, a
  * message is written to stderr and -1 is returned; the userland version has
  * already been printed by then.
  */
 int
 zfs_version_print(void)
 {
 	char userland[128];
 	char kernel[128];
 
 	zfs_version_userland(userland, sizeof (userland));
 	(void) printf("%s\n", userland);
 
 	if (zfs_version_kernel(kernel, sizeof (kernel)) != -1) {
 		(void) printf("zfs-kmod-%s\n", kernel);
 		return (0);
 	}
 
 	fprintf(stderr, "zfs_version_kernel() failed: %s\n",
 	    strerror(errno));
 	return (-1);
 }
 
 /*
  * Return 1 if the user requested ANSI color output, and our terminal supports
  * it.  Return 0 for no color.
  *
  * The decision is made once and cached in a function-local static for the
  * rest of the process, which keeps repeated calls cheap.  It is assumed that
  * the relevant settings (ZFS_COLOR, for example) do not change while the
  * command is running.
  */
 static int
 use_color(void)
 {
 	static int cached = -1;
 	const char *term;
 
 	/* Already decided on an earlier call. */
 	if (cached != -1)
 		return (cached);
 
 	term = getenv("TERM");
 
 	/*
 	 * Color is used only if ZFS_COLOR is set, NO_COLOR
 	 * (https://no-color.org/) is not set, stdout is a terminal, and TERM
 	 * names a terminal other than "dumb" or "unknown".
 	 */
 	cached = (libzfs_envvar_is_set("ZFS_COLOR") &&
 	    !libzfs_envvar_is_set("NO_COLOR") &&
 	    isatty(STDOUT_FILENO) && term != NULL &&
 	    strcmp("dumb", term) != 0 &&
 	    strcmp("unknown", term) != 0) ? 1 : 0;
 
 	return (cached);
 }
 
 /*
  * color_start() and color_end() are used for when you want to colorize a block
  * of text.  For example:
  *
  * color_start(ANSI_RED_FG)
  * printf("hello");
  * printf("world");
  * color_end();
  *
  * color_start() writes the escape sequence 'color' to stdout, but only when
  * use_color() says color output is enabled.
  */
 void
 color_start(char *color)
 {
 	if (!use_color())
 		return;
 
 	printf("%s", color);
 }
 
 /*
  * Write ANSI_RESET to stdout to end a colorized block started with
  * color_start().  Does nothing when color output is disabled.
  */
 void
 color_end(void)
 {
 	if (!use_color())
 		return;
 
 	printf(ANSI_RESET);
 }
 
 /*
  * printf() with a color.  If color is NULL, then do a normal printf.
  * Returns the value returned by vprintf() for the formatted text; the color
  * escape sequences are not included in it.
  */
 int
 printf_color(char *color, char *format, ...)
 {
 	va_list args;
 	int nprinted;
 
 	if (color != NULL)
 		color_start(color);
 
 	va_start(args, format);
 	nprinted = vprintf(format, args);
 	va_end(args);
 
 	if (color != NULL)
 		color_end();
 
 	return (nprinted);
 }
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 25f58f156bf9..ef75706fa6e3 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1,1377 +1,1376 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  */
 
 #include <assert.h>
 #include <fcntl.h>
 #include <libgen.h>
 #include <poll.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <libzutil.h>
 #include <sys/crypto/icp.h>
 #include <sys/processor.h>
 #include <sys/rrwlock.h>
 #include <sys/spa.h>
 #include <sys/stat.h>
 #include <sys/systeminfo.h>
 #include <sys/time.h>
 #include <sys/utsname.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zstd/zstd.h>
 #include <sys/zvol.h>
 #include <zfs_fletcher.h>
 #include <zlib.h>
 
 /*
  * Emulation of kernel services in userland.
  */
 
 uint64_t physmem;
 char hw_serial[HW_HOSTID_LEN];
 struct utsname hw_utsname;
 
 /* If set, all blocks read will be copied to the specified directory. */
 char *vn_dumpdir = NULL;
 
 /* this only exists to have its address taken */
 struct proc p0;
 
 /*
  * =========================================================================
  * threads
  * =========================================================================
  *
  * TS_STACK_MIN is dictated by the minimum allowed pthread stack size.  While
  * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for
  * the expected stack depth while small enough to avoid exhausting address
  * space with high thread counts.
  */
 #define	TS_STACK_MIN	MAX(PTHREAD_STACK_MIN, 32768)
 #define	TS_STACK_MAX	(256 * 1024)
 
 /*
  * Start a pthread running func(arg) and return its thread id cast to a
  * kthread_t pointer.  The thread is created detached unless TS_JOINABLE is
  * set in state.  A stksize of zero selects the default stack size.
  */
 /*ARGSUSED*/
 kthread_t *
 zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
 {
 	pthread_attr_t attr;
 	pthread_t tid;
 	int detachstate;

 	VERIFY0(pthread_attr_init(&attr));

 	detachstate = (state & TS_JOINABLE) ?
 	    PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED;
 	VERIFY0(pthread_attr_setdetachstate(&attr, detachstate));

 	/*
 	 * When the caller does not request a specific stack size, the
 	 * default may be overridden through the ZFS_STACK_SIZE environment
 	 * variable, which makes it convenient to observe and debug stack
 	 * overruns in user space.  Explicitly requested sizes are honored.
 	 * See the ENVIRONMENT VARIABLES section of the ztest(1) man page.
 	 */
 	if (stksize == 0) {
 		const char *env_size = getenv("ZFS_STACK_SIZE");

 		if (env_size != NULL)
 			stksize = MAX(atoi(env_size), TS_STACK_MIN);
 		else
 			stksize = TS_STACK_MAX;
 	}

 	VERIFY3S(stksize, >, 0);
 	stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);

 	/*
 	 * A failure here may mean the stack size is not a multiple of the
 	 * system page size.
 	 */
 	VERIFY0(pthread_attr_setstacksize(&attr, stksize));
 	VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));

 	VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg));
 	VERIFY0(pthread_attr_destroy(&attr));

 	return ((void *)(uintptr_t)tid);
 }
 
 /*
  * =========================================================================
  * kstats
  * =========================================================================
  */
 /* Stub: ignores all arguments and always returns NULL. */
 /*ARGSUSED*/
 kstat_t *
 kstat_create(const char *module, int instance, const char *name,
     const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
 {
 	return (NULL);
 }
 
 /* Stub: does nothing. */
 /*ARGSUSED*/
 void
 kstat_install(kstat_t *ksp)
 {}
 
 /* Stub: does nothing. */
 /*ARGSUSED*/
 void
 kstat_delete(kstat_t *ksp)
 {}
 
 /* Stub: the supplied callbacks are ignored. */
 void
 kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index))
 {}
 
 /*
  * =========================================================================
  * mutexes
  * =========================================================================
  */
 
 /*
  * Initialize mp as a pthread mutex with default attributes and clear its
  * recorded owner.  The name, type and cookie arguments are not used.
  */
 void
 mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
 {
 	VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 }
 
 /* Destroy the pthread mutex backing mp; any failure is fatal. */
 void
 mutex_destroy(kmutex_t *mp)
 {
 	VERIFY0(pthread_mutex_destroy(&mp->m_lock));
 }
 
 /*
  * Acquire mp, blocking in pthread_mutex_lock(), then record the calling
  * thread as the owner.
  */
 void
 mutex_enter(kmutex_t *mp)
 {
 	VERIFY0(pthread_mutex_lock(&mp->m_lock));
 	mp->m_owner = pthread_self();
 }
 
 /*
  * Try to acquire mp without blocking.  Returns 1 and records the caller as
  * owner on success, 0 if the mutex is busy.  Any other error is fatal.
  */
 int
 mutex_tryenter(kmutex_t *mp)
 {
 	int rc = pthread_mutex_trylock(&mp->m_lock);

 	if (rc != 0) {
 		/* Contention is the only failure that is tolerated. */
 		VERIFY3S(rc, ==, EBUSY);
 		return (0);
 	}

 	mp->m_owner = pthread_self();
 	return (1);
 }
 
 /*
  * Release mp.  The recorded owner is cleared before the unlock, while the
  * caller still holds the mutex.
  */
 void
 mutex_exit(kmutex_t *mp)
 {
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	VERIFY0(pthread_mutex_unlock(&mp->m_lock));
 }
 
 /*
  * =========================================================================
  * rwlocks
  * =========================================================================
  */
 
 /*
  * Initialize rwlp as a pthread rwlock with default attributes, with no
  * readers and no recorded writer.  name, type and arg are not used.
  */
 void
 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
 {
 	VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
 	rwlp->rw_readers = 0;
 	rwlp->rw_owner = 0;
 }
 
 /* Destroy the pthread rwlock backing rwlp; any failure is fatal. */
 void
 rw_destroy(krwlock_t *rwlp)
 {
 	VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
 }
 
 /*
  * Acquire rwlp, blocking as needed.  RW_READER takes the lock shared and
  * bumps the reader count; any other mode takes it exclusively and records
  * the calling thread as the writer.
  */
 void
 rw_enter(krwlock_t *rwlp, krw_t rw)
 {
 	if (rw != RW_READER) {
 		VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
 		rwlp->rw_owner = pthread_self();
 		return;
 	}

 	VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
 	atomic_inc_uint(&rwlp->rw_readers);
 }
 
 /*
  * Release rwlp.  The bookkeeping (reader count or writer owner) is undone
  * first, while the lock is still held, and then the pthread rwlock is
  * unlocked.
  */
 void
 rw_exit(krwlock_t *rwlp)
 {
 	if (RW_READ_HELD(rwlp))
 		atomic_dec_uint(&rwlp->rw_readers);
 	else
 		rwlp->rw_owner = 0;
 
 	VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
 }
 
 /*
  * Try to acquire rwlp in the requested mode without blocking.  Returns 1
  * on success (after updating the reader count or writer owner) and 0 if
  * the lock is busy.  Any other error is fatal.
  */
 int
 rw_tryenter(krwlock_t *rwlp, krw_t rw)
 {
 	int as_reader = (rw == RW_READER);
 	int rc;

 	rc = as_reader ? pthread_rwlock_tryrdlock(&rwlp->rw_lock) :
 	    pthread_rwlock_trywrlock(&rwlp->rw_lock);

 	if (rc != 0) {
 		VERIFY3S(rc, ==, EBUSY);
 		return (0);
 	}

 	if (as_reader)
 		atomic_inc_uint(&rwlp->rw_readers);
 	else
 		rwlp->rw_owner = pthread_self();

 	return (1);
 }
 
 /*
  * Return the emulated system hostid, parsed as a decimal number from the
  * hw_serial string.  The zonep argument is not used.
  */
 /* ARGSUSED */
 uint32_t
 zone_get_hostid(void *zonep)
 {
 	unsigned long hostid = strtoul(hw_serial, NULL, 10);

 	return (hostid);
 }
 
 /* Stub: always returns 0 and never touches rwlp. */
 int
 rw_tryupgrade(krwlock_t *rwlp)
 {
 	return (0);
 }
 
 /*
  * =========================================================================
  * condition variables
  * =========================================================================
  */
 
 /*
  * Initialize cv as a pthread condition variable with default attributes.
  * The name, type and arg arguments are not used.
  */
 void
 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
 {
 	VERIFY0(pthread_cond_init(cv, NULL));
 }
 
 /* Destroy the pthread condition variable cv; any failure is fatal. */
 void
 cv_destroy(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_destroy(cv));
 }
 
 /*
  * Wait on cv with mp held.  The recorded owner of mp is cleared before the
  * wait and set back to the calling thread once pthread_cond_wait()
  * returns.
  */
 void
 cv_wait(kcondvar_t *cv, kmutex_t *mp)
 {
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
 	mp->m_owner = pthread_self();
 }
 
 /* Same as cv_wait(), but always returns 1. */
 int
 cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
 {
 	cv_wait(cv, mp);
 	return (1);
 }
 
 /*
  * Wait on cv until signalled or until the lbolt tick value abstime is
  * reached.  Returns -1 if abstime has already passed or the wait times
  * out, 1 otherwise.  Any other pthread error is fatal.
  */
 int
 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
 {
 	struct timeval now;
 	struct timespec deadline;
 	clock_t ticks_left = abstime - ddi_get_lbolt();
 	int rc;

 	if (ticks_left <= 0)
 		return (-1);

 	VERIFY(gettimeofday(&now, NULL) == 0);

 	/* Convert the remaining ticks into a wall-clock deadline. */
 	deadline.tv_sec = now.tv_sec + ticks_left / hz;
 	deadline.tv_nsec = now.tv_usec * NSEC_PER_USEC +
 	    (ticks_left % hz) * (NANOSEC / hz);
 	if (deadline.tv_nsec >= NANOSEC) {
 		deadline.tv_sec++;
 		deadline.tv_nsec -= NANOSEC;
 	}

 	/* The mutex is released during the wait, so drop the owner too. */
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	rc = pthread_cond_timedwait(cv, &mp->m_lock, &deadline);
 	mp->m_owner = pthread_self();

 	if (rc == ETIMEDOUT)
 		return (-1);

 	VERIFY0(rc);

 	return (1);
 }
 
 /*
  * Wait on cv for a time given in nanoseconds.  With flag == 0, tim is a
  * relative interval; with CALLOUT_FLAG_ABSOLUTE it is compared against
  * gethrtime().  Returns -1 if no time remains or the wait times out, 1
  * otherwise.  The res argument is not used.
  */
 /*ARGSUSED*/
 int
 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
     int flag)
 {
 	struct timeval now;
 	struct timespec deadline;
 	hrtime_t nsec_left = tim;
 	int rc;

 	ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);

 	if (flag & CALLOUT_FLAG_ABSOLUTE)
 		nsec_left -= gethrtime();

 	if (nsec_left <= 0)
 		return (-1);

 	VERIFY0(gettimeofday(&now, NULL));

 	/* Convert the remaining nanoseconds into a wall-clock deadline. */
 	deadline.tv_sec = now.tv_sec + nsec_left / NANOSEC;
 	deadline.tv_nsec = now.tv_usec * NSEC_PER_USEC +
 	    (nsec_left % NANOSEC);
 	if (deadline.tv_nsec >= NANOSEC) {
 		deadline.tv_sec++;
 		deadline.tv_nsec -= NANOSEC;
 	}

 	/* The mutex is released during the wait, so drop the owner too. */
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	rc = pthread_cond_timedwait(cv, &mp->m_lock, &deadline);
 	mp->m_owner = pthread_self();

 	if (rc == ETIMEDOUT)
 		return (-1);

 	VERIFY0(rc);

 	return (1);
 }
 
 /* Signal cv via pthread_cond_signal(); any failure is fatal. */
 void
 cv_signal(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_signal(cv));
 }
 
 /* Broadcast cv via pthread_cond_broadcast(); any failure is fatal. */
 void
 cv_broadcast(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_broadcast(cv));
 }
 
 /*
  * =========================================================================
  * procfs list
  * =========================================================================
  */
 
 /* Stub: the format and arguments are discarded. */
 void
 seq_printf(struct seq_file *m, const char *fmt, ...)
 {}
 
 /*
  * Set up the in-memory state of procfs_list: its lock, its list, the next
  * id to hand out (starting at 1) and the offset of the procfs_list_node_t
  * embedded in each element.  The module, submodule, name, mode and the
  * show, show_header and clear callbacks are not used here.
  */
 void
 procfs_list_install(const char *module,
     const char *submodule,
     const char *name,
     mode_t mode,
     procfs_list_t *procfs_list,
     int (*show)(struct seq_file *f, void *p),
     int (*show_header)(struct seq_file *f),
     int (*clear)(procfs_list_t *procfs_list),
     size_t procfs_list_node_off)
 {
 	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&procfs_list->pl_list,
 	    procfs_list_node_off + sizeof (procfs_list_node_t),
 	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
 	procfs_list->pl_next_id = 1;
 	procfs_list->pl_node_offset = procfs_list_node_off;
 }
 
 /* Stub: does nothing. */
 void
 procfs_list_uninstall(procfs_list_t *procfs_list)
 {}
 
 /*
  * Destroy the list and lock set up by procfs_list_install().  The list is
  * asserted to be empty first.
  */
 void
 procfs_list_destroy(procfs_list_t *procfs_list)
 {
 	ASSERT(list_is_empty(&procfs_list->pl_list));
 	list_destroy(&procfs_list->pl_list);
 	mutex_destroy(&procfs_list->pl_lock);
 }
 
 /*
  * The pln_id field of the procfs_list_node_t located pl_node_offset bytes
  * into obj.  Expands to an lvalue.
  */
 #define	NODE_ID(procfs_list, obj) \
 		(((procfs_list_node_t *)(((char *)obj) + \
 		(procfs_list)->pl_node_offset))->pln_id)
 
 /*
  * Append p to procfs_list, stamping it with the next id.  The caller is
  * asserted to hold pl_lock.
  */
 void
 procfs_list_add(procfs_list_t *procfs_list, void *p)
 {
 	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
 	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
 	list_insert_tail(&procfs_list->pl_list, p);
 }
 
 /*
  * =========================================================================
  * vnode operations
  * =========================================================================
  */
 
 /*
  * =========================================================================
  * Figure out which debugging statements to print
  * =========================================================================
  */
 
 static char *dprintf_string;
 static int dprintf_print_all;
 
 /*
  * Return 1 if string appears as a complete entry in the comma-separated
  * debug string (format: file1.c,function_name1,file2.c,file3.c), and 0
  * otherwise, including when no debug string has been set.
  */
 int
 dprintf_find_string(const char *string)
 {
 	int len = strlen(string);
 	char *entry;

 	for (entry = dprintf_string; entry != NULL; ) {
 		/* Match only whole entries, not prefixes of longer ones. */
 		if (strncmp(entry, string, len) == 0 &&
 		    (entry[len] == ',' || entry[len] == '\0'))
 			return (1);

 		/* Advance to the character after the next comma, if any. */
 		entry = strchr(entry, ',');
 		if (entry != NULL)
 			entry++;
 	}

 	return (0);
 }
 
 /*
  * Decide which debugging statements to print.
  *
  * Debugging can be specified two ways: by setting the environment
  * variable ZFS_DEBUG, or by including a "debug=..." argument on the
  * command line.  The command line setting overrides the environment
  * variable.  Every "debug=..." argument is removed from argv and *argc is
  * adjusted to match; if several are given, the last one wins.
  */
 void
 dprintf_setup(int *argc, char **argv)
 {
 	const int len = strlen("debug=");
 	int i, j;

 	/* First look for a command line argument */
 	for (i = 1; i < *argc; i++) {
 		if (strncmp("debug=", argv[i], len) != 0)
 			continue;

 		dprintf_string = argv[i] + len;

 		/* Remove from args */
 		for (j = i; j < *argc; j++)
 			argv[j] = argv[j+1];
 		argv[j] = NULL;
 		(*argc)--;

 		/*
 		 * The next argument has been shifted into slot i, so look
 		 * at this slot again rather than skipping over it.
 		 */
 		i--;
 	}

 	if (dprintf_string == NULL) {
 		/* Look for ZFS_DEBUG environment variable */
 		dprintf_string = getenv("ZFS_DEBUG");
 	}

 	/*
 	 * Are we just turning on all debugging?
 	 */
 	if (dprintf_find_string("on"))
 		dprintf_print_all = 1;

 	if (dprintf_string != NULL)
 		zfs_flags |= ZFS_DEBUG_DPRINTF;
 }
 
 /*
  * =========================================================================
  * debug printfs
  * =========================================================================
  */
 /*
  * Emit or log one debug message.
  *
  * dprint - when set, the message is printed to stdout immediately,
  *	provided all debugging is on or the file or function name is listed
  *	in the debug string; otherwise the message is formatted into a
  *	buffer and handed to __zfs_dbgmsg().
  * file, func, line - location of the caller
  * fmt, ... - printf-style message
  */
 void
 __dprintf(boolean_t dprint, const char *file, const char *func,
     int line, const char *fmt, ...)
 {
 	/* Get rid of annoying "../common/" prefix to filename. */
 	const char *newfile = zfs_basename(file);

 	va_list adx;
 	if (dprint) {
 		/* dprintf messages are printed immediately */

 		if (!dprintf_print_all &&
 		    !dprintf_find_string(newfile) &&
 		    !dprintf_find_string(func))
 			return;

 		/*
 		 * Optional prefixes, each enabled by its keyword in the
 		 * debug string.  stdout is locked so the pieces of one
 		 * message are not interleaved with other threads' output.
 		 */
 		flockfile(stdout);
 		if (dprintf_find_string("pid"))
 			(void) printf("%d ", getpid());
 		if (dprintf_find_string("tid"))
 			(void) printf("%ju ",
 			    (uintmax_t)(uintptr_t)pthread_self());
 		if (dprintf_find_string("cpu"))
 			(void) printf("%u ", getcpuid());
 		if (dprintf_find_string("time"))
 			(void) printf("%llu ", (u_longlong_t)gethrtime());
 		if (dprintf_find_string("long"))
 			(void) printf("%s, line %d: ", newfile, line);
 		(void) printf("dprintf: %s: ", func);
 		va_start(adx, fmt);
 		(void) vprintf(fmt, adx);
 		va_end(adx);
 		funlockfile(stdout);
 	} else {
 		/* zfs_dbgmsg is logged for dumping later */
 		size_t size;
 		char *buf;
 		int i;

 		size = 1024;
 		buf = umem_alloc(size, UMEM_NOFAIL);
 		i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);

 		/* Append the message only if the prefix left room for it. */
 		if (i < size) {
 			va_start(adx, fmt);
 			(void) vsnprintf(buf + i, size - i, fmt, adx);
 			va_end(adx);
 		}

 		__zfs_dbgmsg(buf);

 		umem_free(buf, size);
 	}
 }
 
 /*
  * =========================================================================
  * cmn_err() and panic()
  * =========================================================================
  */
 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
 
 /*
  * Print "error: " followed by the formatted message and a newline to
  * stderr, then abort().  Does not return.
  */
 void
 vpanic(const char *fmt, va_list adx)
 {
 	(void) fprintf(stderr, "error: ");
 	(void) vfprintf(stderr, fmt, adx);
 	(void) fprintf(stderr, "\n");
 
 	abort();	/* think of it as a "user-level crash dump" */
 }
 
 /* Variadic front end to vpanic(). */
 void
 panic(const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	vpanic(fmt, adx);
 	va_end(adx);
 }
 
 void
 vcmn_err(int ce, const char *fmt, va_list adx)
 {
 	if (ce == CE_PANIC)
 		vpanic(fmt, adx);
 	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
 		(void) fprintf(stderr, "%s", ce_prefix[ce]);
 		(void) vfprintf(stderr, fmt, adx);
 		(void) fprintf(stderr, "%s", ce_suffix[ce]);
 	}
 }
 
-/*PRINTFLIKE2*/
 /* Variadic front end to vcmn_err(). */
 void
 cmn_err(int ce, const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	vcmn_err(ce, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * =========================================================================
  * misc routines
  * =========================================================================
  */
 
 /*
  * Sleep for the given number of clock ticks by calling poll() with no
  * descriptors and a timeout of ticks * (1000 / hz) milliseconds.
  */
 void
 delay(clock_t ticks)
 {
 	clock_t timeout_ms = ticks * (1000 / hz);

 	(void) poll(0, 0, timeout_ms);
 }
 
 /*
  * Find highest one bit set.
  * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
  * The __builtin_clzll() function is supported by both GCC and Clang.
  */
 int
 highbit64(uint64_t i)
 {
 	/* __builtin_clzll() is undefined for 0, so handle it up front. */
 	if (i == 0)
 		return (0);

 	return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
 }
 
 /*
  * Find lowest one bit set.
  * Returns bit number + 1 of lowest bit that is set, or 0 when no bit is
  * set.  The __builtin_ffsll() function is supported by both GCC and Clang.
  */
 int
 lowbit64(uint64_t i)
 {
 	return (i != 0 ? __builtin_ffsll(i) : 0);
 }
 
 const char *random_path = "/dev/random";
 const char *urandom_path = "/dev/urandom";
 static int random_fd = -1, urandom_fd = -1;
 
 /*
  * Open random_path and urandom_path read-only with close-on-exec and keep
  * the descriptors in random_fd and urandom_fd.  Failure to open either is
  * fatal.
  */
 void
 random_init(void)
 {
 	VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1);
 	VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1);
 }
 
 /*
  * Close the descriptors opened by random_init() and mark both as closed
  * by resetting them to -1.
  */
 void
 random_fini(void)
 {
 	close(random_fd);
 	random_fd = -1;

 	close(urandom_fd);
 	urandom_fd = -1;
 }
 
 /*
  * Fill ptr[0..len) with bytes read from fd, looping until the full length
  * has been read.  A read interrupted by a signal (EINTR) is retried; any
  * other read failure trips the assertion.  Always returns 0.
  */
 static int
 random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
 {
 	size_t resid = len;
 	ssize_t bytes;

 	ASSERT(fd != -1);

 	while (resid != 0) {
 		bytes = read(fd, ptr, resid);
 		/*
 		 * Retry an interrupted read.  Without this, a -1 return
 		 * would move ptr backwards and grow resid when assertions
 		 * are compiled out.
 		 */
 		if (bytes < 0 && errno == EINTR)
 			continue;
 		ASSERT3S(bytes, >=, 0);
 		ptr += bytes;
 		resid -= bytes;
 	}

 	return (0);
 }
 
 /* Fill ptr with len bytes read from random_fd (opened on random_path). */
 int
 random_get_bytes(uint8_t *ptr, size_t len)
 {
 	return (random_get_bytes_common(ptr, len, random_fd));
 }
 
 /* Fill ptr with len bytes read from urandom_fd (opened on urandom_path). */
 int
 random_get_pseudo_bytes(uint8_t *ptr, size_t len)
 {
 	return (random_get_bytes_common(ptr, len, urandom_fd));
 }
 
 /*
  * Convert str to an unsigned long using strtoul().
  *
  * str - string to parse
  * nptr - if not NULL, receives a pointer to the first unparsed character
  * base - numeric base passed to strtoul()
  * result - receives the converted value
  *
  * Returns 0 on success, or the errno value set by strtoul() (for example
  * ERANGE on overflow).
  */
 int
 ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
 {
 	char *end;

 	/*
 	 * strtoul() reports failure only through errno, so clear it first;
 	 * otherwise a stale value would be mistaken for a parse error.
 	 */
 	errno = 0;
 	*result = strtoul(str, &end, base);
 	if (nptr != NULL)
 		*nptr = end;

 	return (errno);
 }
 
 /*
  * Convert str to an unsigned long long using strtoull().
  *
  * str - string to parse
  * nptr - if not NULL, receives a pointer to the first unparsed character
  * base - numeric base passed to strtoull()
  * result - receives the converted value
  *
  * Returns 0 on success, or the errno value set by strtoull() (for example
  * ERANGE on overflow).
  */
 int
 ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
 {
 	char *end;

 	/*
 	 * strtoull() reports failure only through errno, so clear it first;
 	 * otherwise a stale value would be mistaken for a parse error.
 	 */
 	errno = 0;
 	*result = strtoull(str, &end, base);
 	if (nptr != NULL)
 		*nptr = end;

 	return (errno);
 }
 
 /* Return a pointer to the global hw_utsname structure. */
 utsname_t *
 utsname(void)
 {
 	return (&hw_utsname);
 }
 
 /*
  * =========================================================================
  * kernel emulation setup & teardown
  * =========================================================================
  */
 /*
  * Out-of-memory callback registered with umem_nofail_callback(): report
  * the condition on stderr and abort().  The return statement is never
  * reached; it only satisfies the callback signature.
  */
 static int
 umem_out_of_memory(void)
 {
 	(void) fputs("out of memory -- generating core dump\n", stderr);
 	abort();
 	return (0);
 }
 
 /*
  * Bring up the userland kernel emulation.  mode is tested against
  * SPA_MODE_WRITE when choosing the hostid and is passed on to spa_init().
  */
 void
 kernel_init(int mode)
 {
 	extern uint_t rrw_tsd_key;
 
 	/* Abort with a message if a UMEM_NOFAIL allocation cannot be met. */
 	umem_nofail_callback(umem_out_of_memory);
 
 	physmem = sysconf(_SC_PHYS_PAGES);
 
 	dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,
 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
 
 	/* The real hostid is recorded only in write mode; otherwise 0. */
 	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
 	    (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0);
 
 	random_init();
 
 	VERIFY0(uname(&hw_utsname));
 
 	system_taskq_init();
 	icp_init();
 
 	zstd_init();
 
 	spa_init((spa_mode_t)mode);
 
 	fletcher_4_init();
 
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 }
 
 /*
  * Tear down the subsystems started by kernel_init(), in the reverse of
  * the order in which they were initialized.
  */
 void
 kernel_fini(void)
 {
 	fletcher_4_fini();
 	spa_fini();
 
 	zstd_fini();
 
 	icp_fini();
 	system_taskq_fini();
 
 	random_fini();
 }
 
 /* Stub: always returns 0; cr is not examined. */
 uid_t
 crgetuid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0; cr is not examined. */
 uid_t
 crgetruid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0; cr is not examined. */
 gid_t
 crgetgid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always reports zero groups. */
 int
 crgetngroups(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns NULL. */
 gid_t *
 crgetgroups(cred_t *cr)
 {
 	return (NULL);
 }
 
 /* Stub: always returns 0 without examining its arguments. */
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0 without examining its arguments. */
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0 without examining its arguments. */
 int
 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0 without examining its arguments. */
 int
 secpolicy_zfs(const cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns 0 without examining its arguments. */
 int
 secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
 {
 	return (0);
 }
 
 /*
  * Allocate a zeroed ksiddomain_t whose kd_name is a copy of dom made with
  * spa_strdup().  Release it with ksiddomain_rele().
  */
 ksiddomain_t *
 ksid_lookupdomain(const char *dom)
 {
 	ksiddomain_t *domain = umem_zalloc(sizeof (*domain), UMEM_NOFAIL);

 	domain->kd_name = spa_strdup(dom);

 	return (domain);
 }
 
 /* Free ksid and its kd_name string. */
 void
 ksiddomain_rele(ksiddomain_t *ksid)
 {
 	spa_strfree(ksid->kd_name);
 	umem_free(ksid, sizeof (ksiddomain_t));
 }
 
 /*
  * Format into a newly allocated string using vasprintf().  The work is
  * done on a copy of adx, so the caller's va_list is not consumed.
  * Formatting failure is fatal.
  */
 char *
 kmem_vasprintf(const char *fmt, va_list adx)
 {
 	va_list args;
 	char *str = NULL;

 	va_copy(args, adx);
 	VERIFY(vasprintf(&str, fmt, args) != -1);
 	va_end(args);

 	return (str);
 }
 
 /*
  * Format into a newly allocated string using vasprintf().  Formatting
  * failure is fatal.
  */
 char *
 kmem_asprintf(const char *fmt, ...)
 {
 	va_list args;
 	char *str = NULL;

 	va_start(args, fmt);
 	VERIFY(vasprintf(&str, fmt, args) != -1);
 	va_end(args);

 	return (str);
 }
 
 /* Stub: sets *minorp to 0 and returns NULL; fd is not used. */
 /* ARGSUSED */
 zfs_file_t *
 zfs_onexit_fd_hold(int fd, minor_t *minorp)
 {
 	*minorp = 0;
 	return (NULL);
 }
 
 /* Stub: does nothing. */
 /* ARGSUSED */
 void
 zfs_onexit_fd_rele(zfs_file_t *fp)
 {
 }
 
 /* Stub: the callback is not recorded; always returns 0. */
 /* ARGSUSED */
 int
 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
     uint64_t *action_handle)
 {
 	return (0);
 }
 
 /* Stub: always returns a zero cookie. */
 fstrans_cookie_t
 spl_fstrans_mark(void)
 {
 	return ((fstrans_cookie_t)0);
 }
 
 /* Stub: does nothing with the cookie. */
 void
 spl_fstrans_unmark(fstrans_cookie_t cookie)
 {
 }
 
 /* Stub: always returns 0. */
 int
 __spl_pf_fstrans_check(void)
 {
 	return (0);
 }
 
 /* Stub: always returns 0. */
 int
 kmem_cache_reap_active(void)
 {
 	return (0);
 }
 
 void *zvol_tag = "zvol_tag";
 
 /* Stub: does nothing. */
 void
 zvol_create_minor(const char *name)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_create_minors_recursive(const char *name)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
     boolean_t async)
 {
 }
 
 /*
  * Open file
  *
  * path - fully qualified path to file
  * flags - open(2) flags (O_RDONLY / O_WRONLY / O_CREAT / O_EXCL ...)
  * mode - permission bits used when O_CREAT creates the file
  * fpp - pointer to return file pointer
  *
  * Without O_CREAT the file must already exist, and O_DIRECT is added when
  * it is a block device.  With O_CREAT the umask is cleared for the
  * duration of the open so that mode is applied as given.  If vn_dumpdir
  * is set, a file with the same base name is also opened in that directory
  * and recorded as the dump descriptor.
  *
  * Returns 0 on success underlying error on failure.
  */
 int
 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
 {
 	int fd = -1;
 	int dump_fd = -1;
 	int err;
 	int old_umask = 0;
 	zfs_file_t *fp;
 	struct stat64 st;

 	if (!(flags & O_CREAT)) {
 		if (stat64(path, &st) == -1)
 			return (errno);

 		if (S_ISBLK(st.st_mode))
 			flags |= O_DIRECT;
 	}

 	if (flags & O_CREAT)
 		old_umask = umask(0);

 	fd = open64(path, flags, mode);
 	err = errno;

 	/*
 	 * Restore the umask whether or not the open succeeded, so that a
 	 * failed create does not leave the process running with umask 0.
 	 */
 	if (flags & O_CREAT)
 		(void) umask(old_umask);

 	if (fd == -1)
 		return (err);

 	if (vn_dumpdir != NULL) {
 		char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
 		const char *inpath = zfs_basename(path);

 		(void) snprintf(dumppath, MAXPATHLEN,
 		    "%s/%s", vn_dumpdir, inpath);
 		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
 		umem_free(dumppath, MAXPATHLEN);
 		if (dump_fd == -1) {
 			/* Save errno before close() can overwrite it. */
 			err = errno;
 			close(fd);
 			return (err);
 		}
 	} else {
 		dump_fd = -1;
 	}

 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);

 	fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
 	fp->f_fd = fd;
 	fp->f_dump_fd = dump_fd;
 	*fpp = fp;

 	return (0);
 }
 
 /*
  * Close the file descriptor held by fp, and the dump descriptor if one is
  * open, then free fp.
  */
 void
 zfs_file_close(zfs_file_t *fp)
 {
 	close(fp->f_fd);
 	if (fp->f_dump_fd != -1)
 		close(fp->f_dump_fd);
 
 	umem_free(fp, sizeof (zfs_file_t));
 }
 
 /*
  * Stateful write - use os internal file pointer to determine where to
  * write and update on successful completion.
  *
  * fp -  pointer to file (pipe, socket, etc) to write to
  * buf - buffer to write
  * count - # of bytes to write
  * resid -  pointer to count of unwritten bytes (if short write); when
  *	NULL, a short write is reported as EIO instead
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 {
 	ssize_t nwritten = write(fp->f_fd, buf, count);

 	if (nwritten < 0)
 		return (errno);

 	if (resid) {
 		*resid = count - nwritten;
 		return (0);
 	}

 	return (nwritten != count ? EIO : 0);
 }
 
 /*
  * Stateless write - os internal file pointer is not updated.
  *
  * fp -  pointer to file (pipe, socket, etc) to write to
  * buf - buffer to write
  * count - # of bytes to write
  * pos - file offset to write to (only valid for seekable types)
  * resid -  pointer to count of unwritten bytes; when NULL, a short write
  *	is reported as EIO instead
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf,
     size_t count, loff_t pos, ssize_t *resid)
 {
 	ssize_t rc, split, done;
 	int sectors;
 
 	/*
 	 * To simulate partial disk writes, we split writes into two
 	 * system calls so that the process can be killed in between.
 	 * This is used by ztest to simulate realistic failure modes.
 	 */
 	sectors = count >> SPA_MINBLOCKSHIFT;
 	/* Split at a random sector boundary (0 if under one sector). */
 	split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT;
 	rc = pwrite64(fp->f_fd, buf, split, pos);
 	if (rc != -1) {
 		/* 'done' is assigned only here; it is read only if rc >= 0 */
 		done = rc;
 		rc = pwrite64(fp->f_fd, (char *)buf + split,
 		    count - split, pos + split);
 	}
 #ifdef __linux__
 	if (rc == -1 && errno == EINVAL) {
 		/*
 		 * Under Linux, this most likely means an alignment issue
 		 * (memory or disk) due to O_DIRECT, so we abort() in order
 		 * to catch the offender.
 		 */
 		abort();
 	}
 #endif
 
 	if (rc < 0)
 		return (errno);
 
 	/* Total written is the sum of both halves. */
 	done += rc;
 
 	if (resid) {
 		*resid = count - done;
 	} else if (done != count) {
 		return (EIO);
 	}
 
 	return (0);
 }
 
 /*
  * Stateful read - use os internal file pointer to determine where to
  * read and update on successful completion.
  *
  * fp -  pointer to file (pipe, socket, etc) to read from
  * buf - buffer to read into
  * count - # of bytes to read
  * resid -  pointer to count of unread bytes (if short read); when NULL,
  *	a short read is reported as EIO instead
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
 {
 	/* read() returns ssize_t; an int would truncate large counts. */
 	ssize_t rc;

 	rc = read(fp->f_fd, buf, count);
 	if (rc < 0)
 		return (errno);

 	if (resid) {
 		*resid = count - rc;
 	} else if (rc != count) {
 		return (EIO);
 	}

 	return (0);
 }
 
 /*
  * Stateless read - os internal file pointer is not updated.
  *
  * fp -  pointer to file (pipe, socket, etc) to read from
  * buf - buffer to read into
  * count - # of bytes to read
  * off - file offset to read from (only valid for seekable types)
  * resid -  pointer to count of unread bytes (if short read); when NULL,
  *	a short read is reported as EIO instead
  *
  * If fp has a dump descriptor, the bytes read are also written to it at
  * the same offset.
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
     ssize_t *resid)
 {
 	ssize_t rc;
 
 	rc = pread64(fp->f_fd, buf, count, off);
 	if (rc < 0) {
 #ifdef __linux__
 		/*
 		 * Under Linux, this most likely means an alignment issue
 		 * (memory or disk) due to O_DIRECT, so we abort() in order to
 		 * catch the offender.
 		 */
 		if (errno == EINVAL)
 			abort();
 #endif
 		return (errno);
 	}
 
 	if (fp->f_dump_fd != -1) {
 		int status;
 
 		/* Mirror what was just read into the dump file. */
 		status = pwrite64(fp->f_dump_fd, buf, rc, off);
 		ASSERT(status != -1);
 	}
 
 	if (resid) {
 		*resid = count - rc;
 	} else if (rc != count) {
 		return (EIO);
 	}
 
 	return (0);
 }
 
 /*
  * lseek - set / get file pointer
  *
  * fp -  pointer to file (pipe, socket, etc) to seek on
  * offp - on entry the offset to seek by; on success, updated to the
  *	resulting file position returned by lseek()
  * whence - see man pages for standard lseek whence values
  *
  * Returns 0 on success errno on failure (ESPIPE for non seekable types)
  */
 int
 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
 {
 	loff_t newpos = lseek(fp->f_fd, *offp, whence);

 	/* Leave *offp untouched when the seek fails. */
 	if (newpos < 0)
 		return (errno);

 	*offp = newpos;

 	return (0);
 }
 
 /*
  * Get file attributes
  *
  * fp - file pointer
  * zfattr - pointer to file attr structure
  *
  * Currently only used for fetching size and file mode
  *
  * Returns 0 on success or error code of underlying getattr call on failure.
  */
 int
 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
 {
 	struct stat64 st;
 
 	if (fstat64_blk(fp->f_fd, &st) == -1)
 		return (errno);
 
 	/* Only the size and mode are copied out. */
 	zfattr->zfa_size = st.st_size;
 	zfattr->zfa_mode = st.st_mode;
 
 	return (0);
 }
 
 /*
  * Sync file to disk
  *
  * fp - file pointer
  * flags - O_SYNC and or O_DSYNC (not used here; fsync() is always called)
  *
  * Returns 0 on success or error code of underlying sync call on failure.
  */
 int
 zfs_file_fsync(zfs_file_t *fp, int flags)
 {
 	if (fsync(fp->f_fd) < 0)
 		return (errno);

 	return (0);
 }
 
 /*
  * fallocate - allocate or free space on disk
  *
  * fp - file pointer
  * mode (non-standard options for hole punching etc)
  * offset - offset to start allocating or freeing from
  * len - length to free / allocate
  *
  * Returns 0 on success, errno on failure.  Platforms other than Linux
  * always return EOPNOTSUPP.
  *
  * OPTIONAL
  */
 int
 zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
 {
 #ifdef __linux__
 	/*
 	 * fallocate() returns -1 and sets errno on failure.  Translate that
 	 * into an errno return, as the non-Linux branch already does.
 	 */
 	if (fallocate(fp->f_fd, mode, offset, len) < 0)
 		return (errno);

 	return (0);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Request current file pointer offset
  *
  * fp - pointer to file
  *
  * Returns current file offset, or -1 if lseek() fails.
  */
 loff_t
 zfs_file_off(zfs_file_t *fp)
 {
 	/*
 	 * A zero-byte seek relative to the current position reports the
 	 * offset without moving it.  Note the lseek() argument order:
 	 * offset first, then whence.
 	 */
 	return (lseek(fp->f_fd, 0, SEEK_CUR));
 }
 
 /*
  * unlink file
  *
  * path - fully qualified file path
  *
  * Returns 0 on success; on failure the return value of remove() is passed
  * through unchanged.
  *
  * OPTIONAL
  */
 int
 zfs_file_unlink(const char *path)
 {
 	return (remove(path));
 }
 
 /*
  * Get reference to file pointer
  *
  * fd - input file descriptor
  *
  * Returns pointer to file struct or NULL.
  * Unsupported in user space: calling this function abort()s the process,
  * so the return statement is never reached.
  */
 zfs_file_t *
 zfs_file_get(int fd)
 {
 	abort();
 
 	return (NULL);
 }
 /*
  * Drop reference to file pointer
  *
  * fp - pointer to file struct
  *
  * Unsupported in user space: calling this function abort()s the process.
  */
 void
 zfs_file_put(zfs_file_t *fp)
 {
 	abort();
 }
 
 /* Stub: does nothing. */
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 }
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
index b5b2d7dbed91..277698aa0ebe 100644
--- a/lib/libzutil/zutil_import.c
+++ b/lib/libzutil/zutil_import.c
@@ -1,1861 +1,1859 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright 2015 RackTop Systems.
  * Copyright (c) 2016, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  */
 
 /*
  * Pool import support functions.
  *
  * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
  * these commands are expected to run in the global zone, we can assume
  * that the devices are all readable when called.
  *
  * To import a pool, we rely on reading the configuration information from the
  * ZFS label of each device.  If we successfully read the label, then we
  * organize the configuration information in the following hierarchy:
  *
  *	pool guid -> toplevel vdev guid -> label txg
  *
  * Duplicate entries matching this same tuple will be discarded.  Once we have
  * examined every device, we pick the best label txg config for each toplevel
  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
  * update any paths that have changed.  Finally, we attempt to import the pool
  * using our derived config, and record the results.
  */
 
 #include <aio.h>
 #include <ctype.h>
 #include <dirent.h>
 #include <errno.h>
 #include <libintl.h>
 #include <libgen.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/dktp/fdisk.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 
 #include <thread_pool.h>
 #include <libzutil.h>
 #include <libnvpair.h>
 
 #include "zutil_import.h"
 
 /*
  * Format an auxiliary error description into hdl->lpc_desc and mark it
  * active.  The next zutil_verror() call on this handle reports that
  * description in place of its generic error string.
  *
  * hdl - import handle that owns the description buffer
  * fmt - printf-style format, followed by its arguments
  */
-/*PRINTFLIKE2*/
-static void
+static __attribute__((format(printf, 2, 3))) void
 zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	/* Output is bounded by the buffer size, so long text is truncated. */
 	(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap);
 	hdl->lpc_desc_active = B_TRUE;
 
 	va_end(ap);
 }
 
 /*
  * Report an error on stderr as "<action>: <reason>".
  *
  * The action text is built from fmt/ap.  The reason is the pending auxiliary
  * description when one was recorded, otherwise the caller's 'error' string.
  * Any pending description is consumed by this call.  Nothing is printed
  * unless hdl->lpc_printerr is set.
  */
 static void
 zutil_verror(libpc_handle_t *hdl, const char *error, const char *fmt,
     va_list ap)
 {
 	char action[1024];
 	const char *reason = error;
 
 	(void) vsnprintf(action, sizeof (action), fmt, ap);
 
 	/* A description that was never activated is stale: discard it. */
 	if (!hdl->lpc_desc_active)
 		hdl->lpc_desc[0] = '\0';
 	hdl->lpc_desc_active = B_FALSE;
 
 	if (!hdl->lpc_printerr)
 		return;
 
 	if (hdl->lpc_desc[0] != '\0')
 		reason = hdl->lpc_desc;
 
 	(void) fprintf(stderr, "%s: %s\n", action, reason);
 }
 
 /*
  * Variadic front end for zutil_verror().
  *
  * hdl   - import handle
  * error - generic error string used when no auxiliary description is pending
  * fmt   - printf-style format describing the failed action, plus arguments
  *
  * Always returns -1 so callers can write "return (zutil_error_fmt(...))".
  */
-/*PRINTFLIKE3*/
-static int
+static __attribute__((format(printf, 3, 4))) int
 zutil_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 
 	zutil_verror(hdl, error, fmt, ap);
 
 	va_end(ap);
 
 	return (-1);
 }
 
 /*
  * Report an error whose action text is a plain string rather than a format.
  * Returns whatever zutil_error_fmt() returns.
  */
 static int
 zutil_error(libpc_handle_t *hdl, const char *error, const char *msg)
 {
 	int rc = zutil_error_fmt(hdl, error, "%s", msg);
 
 	return (rc);
 }
 
 /*
  * Out-of-memory handler: report the failure and terminate the process with
  * exit status 1.  Control never returns to the caller.
  */
 static int
 zutil_no_memory(libpc_handle_t *hdl)
 {
 	(void) zutil_error(hdl, EZFS_NOMEM, "internal error");
 	exit(1);
 }
 
 /*
  * Allocate 'size' bytes of zero-filled memory.  An allocation failure is
  * handed to zutil_no_memory().
  */
 void *
 zutil_alloc(libpc_handle_t *hdl, size_t size)
 {
 	void *data = calloc(1, size);
 
 	if (data == NULL)
 		(void) zutil_no_memory(hdl);
 
 	return (data);
 }
 
 /*
  * Duplicate a string.  An allocation failure is handed to zutil_no_memory().
  */
 char *
 zutil_strdup(libpc_handle_t *hdl, const char *str)
 {
 	char *copy = strdup(str);
 
 	if (copy == NULL)
 		(void) zutil_no_memory(hdl);
 
 	return (copy);
 }
 
 /*
  * Duplicate at most 'n' bytes of a string.  An allocation failure is handed
  * to zutil_no_memory().
  */
 static char *
 zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n)
 {
 	char *copy = strndup(str, n);
 
 	if (copy == NULL)
 		(void) zutil_no_memory(hdl);
 
 	return (copy);
 }
 
 /*
  * Intermediate structures used to gather configuration information.
  *
  * add_config() files every label it is given into singly linked lists
  * nested as pool guid -> toplevel vdev guid -> label txg, and separately
  * records which device path each vdev guid was seen on.
  */
 
 /* One label config for a toplevel vdev, keyed by its txg. */
 typedef struct config_entry {
 	uint64_t		ce_txg;
 	nvlist_t		*ce_config;
 	struct config_entry	*ce_next;
 } config_entry_t;
 
 /* One toplevel vdev (ve_guid is the label's top guid) and its configs. */
 typedef struct vdev_entry {
 	uint64_t		ve_guid;
 	config_entry_t		*ve_configs;
 	struct vdev_entry	*ve_next;
 } vdev_entry_t;
 
 /* One pool, identified by pool guid, and its toplevel vdevs. */
 typedef struct pool_entry {
 	uint64_t		pe_guid;
 	vdev_entry_t		*pe_vdevs;
 	struct pool_entry	*pe_next;
 } pool_entry_t;
 
 /*
  * A vdev guid -> device path mapping.  ne_order and ne_num_labels are
  * used by fix_paths() to choose between several paths for the same guid.
  */
 typedef struct name_entry {
 	char			*ne_name;
 	uint64_t		ne_guid;
 	uint64_t		ne_order;
 	uint64_t		ne_num_labels;
 	struct name_entry	*ne_next;
 } name_entry_t;
 
 /* Heads of the pool list and the name list built during a scan. */
 typedef struct pool_list {
 	pool_entry_t		*pools;
 	name_entry_t		*names;
 } pool_list_t;
 
 /*
  * Walk a vdev configuration tree and refresh the path and/or devid
  * information of every leaf using the collected guid -> name list.
  *
  * Returns 0 on success and -1 if an updated path could not be stored.
  */
 static int
 fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
 {
 	nvlist_t **kids;
 	uint_t i, nkids;
 	uint64_t guid;
 	name_entry_t *cur, *best = NULL;
 	char *path;
 
 	/* Interior vdev: recurse into the children, stopping on error. */
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids) == 0) {
 		for (i = 0; i < nkids; i++) {
 			if (fix_paths(hdl, kids[i], names) != 0)
 				return (-1);
 		}
 		return (0);
 	}
 
 	/*
 	 * This is a leaf (file or disk) vdev.  In either case, go through
 	 * the name list and see if we find a matching guid.  If so, replace
 	 * the path and see if we can calculate a new devid.
 	 *
 	 * There may be multiple names associated with a particular guid, in
 	 * which case we have overlapping partitions or multiple paths to the
 	 * same disk.  In this case we prefer to use the path name which
 	 * matches the ZPOOL_CONFIG_PATH.  If no matching entry is found we
 	 * use the lowest order device which corresponds to the first match
 	 * while traversing the ZPOOL_IMPORT_PATH search path.
 	 */
 	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
 		path = NULL;
 
 	for (cur = names; cur != NULL; cur = cur->ne_next) {
 		if (cur->ne_guid != guid)
 			continue;
 
 		/*
 		 * With no configured path the first guid match wins; an
 		 * exact path match always wins.  Either way stop searching.
 		 */
 		if (path == NULL || strcmp(path, cur->ne_name) == 0) {
 			best = cur;
 			break;
 		}
 
 		/*
 		 * Otherwise prefer paths with more vdev labels, and among
 		 * equals the path earlier in the search order.
 		 */
 		if (best == NULL ||
 		    cur->ne_num_labels > best->ne_num_labels ||
 		    (cur->ne_num_labels == best->ne_num_labels &&
 		    cur->ne_order < best->ne_order))
 			best = cur;
 	}
 
 	if (best == NULL)
 		return (0);
 
 	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
 		return (-1);
 
 	update_vdev_config_dev_strs(nv);
 
 	return (0);
 }
 
 /*
  * Prepend a vdev guid -> device path mapping to the pool list's name list.
  * Returns 0 on success, -1 if memory could not be obtained.
  */
 static int
 add_config_name_entry(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
     int order, int num_labels, uint64_t vdev_guid)
 {
 	name_entry_t *entry = zutil_alloc(hdl, sizeof (name_entry_t));
 
 	if (entry == NULL)
 		return (-1);
 
 	entry->ne_name = zutil_strdup(hdl, path);
 	if (entry->ne_name == NULL) {
 		free(entry);
 		return (-1);
 	}
 
 	entry->ne_guid = vdev_guid;
 	entry->ne_order = order;
 	entry->ne_num_labels = num_labels;
 	entry->ne_next = pl->names;
 	pl->names = entry;
 
 	return (0);
 }
 
 /*
  * Add the given configuration to the list of known devices.
  *
  * The label is filed under pool guid -> toplevel vdev guid -> txg (an
  * already-known txg is not stored twice), and the device path is recorded
  * against the vdev guid.  Returns 0 on success or when the label is
  * ignored, -1 on allocation failure.
  */
 static int
 add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
     int order, int num_labels, nvlist_t *config)
 {
 	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
 	pool_entry_t *pool;
 	vdev_entry_t *top;
 	config_entry_t *cfg;
 
 	/*
 	 * If this is a hot spare not currently in use or level 2 cache
 	 * device, add it to the list of names to translate, but don't do
 	 * anything else.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &state) == 0 &&
 	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
 		return (add_config_name_entry(hdl, pl, path, order,
 		    num_labels, vdev_guid));
 	}
 
 	/*
 	 * If we have a valid config but cannot read any of these fields, then
 	 * it means we have a half-initialized label.  In vdev_label_init()
 	 * we write a label with txg == 0 so that we can identify the device
 	 * in case the user refers to the same disk later on.  If we fail to
 	 * create the pool, we'll be left with a label in this state
 	 * which should not be considered part of a valid pool.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0 ||
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
 	    &vdev_guid) != 0 ||
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
 	    &top_guid) != 0 ||
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &txg) != 0 || txg == 0) {
 		return (0);
 	}
 
 	/*
 	 * First, see if we know about this pool.  If not, then add it to the
 	 * list of known pools.
 	 */
 	pool = pl->pools;
 	while (pool != NULL && pool->pe_guid != pool_guid)
 		pool = pool->pe_next;
 
 	if (pool == NULL) {
 		pool = zutil_alloc(hdl, sizeof (pool_entry_t));
 		if (pool == NULL)
 			return (-1);
 		pool->pe_guid = pool_guid;
 		pool->pe_next = pl->pools;
 		pl->pools = pool;
 	}
 
 	/*
 	 * Second, see if we know about this toplevel vdev.  Add it if it is
 	 * missing.
 	 */
 	top = pool->pe_vdevs;
 	while (top != NULL && top->ve_guid != top_guid)
 		top = top->ve_next;
 
 	if (top == NULL) {
 		top = zutil_alloc(hdl, sizeof (vdev_entry_t));
 		if (top == NULL)
 			return (-1);
 		top->ve_guid = top_guid;
 		top->ve_next = pool->pe_vdevs;
 		pool->pe_vdevs = top;
 	}
 
 	/*
 	 * Third, see if we have a config with a matching transaction group.  If
 	 * so, then we do nothing.  Otherwise, add it to the list of known
 	 * configs.
 	 */
 	cfg = top->ve_configs;
 	while (cfg != NULL && cfg->ce_txg != txg)
 		cfg = cfg->ce_next;
 
 	if (cfg == NULL) {
 		cfg = zutil_alloc(hdl, sizeof (config_entry_t));
 		if (cfg == NULL)
 			return (-1);
 		cfg->ce_txg = txg;
 		cfg->ce_config = fnvlist_dup(config);
 		cfg->ce_next = top->ve_configs;
 		top->ve_configs = cfg;
 	}
 
 	/*
 	 * At this point we've successfully added our config to the list of
 	 * known configs.  The last thing to do is add the vdev guid -> path
 	 * mappings so that we can fix up the configuration as necessary before
 	 * doing the import.
 	 */
 	return (add_config_name_entry(hdl, pl, path, order, num_labels,
 	    vdev_guid));
 }
 
 /*
  * Ask the handle's pco_pool_active callback whether the named pool is
  * active.  The answer is stored in *isactive; the callback's return value
  * is passed back unchanged.
  */
 static int
 zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
     boolean_t *isactive)
 {
 	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);
 
 	return (hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
 	    guid, isactive));
 }
 
 /*
  * Pass a candidate config to the handle's pco_refresh_config callback and
  * return whatever nvlist the callback produces.
  */
 static nvlist_t *
 zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
 {
 	nvlist_t *refreshed;
 
 	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);
 
 	refreshed = hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
 	    tryconfig);
 
 	return (refreshed);
 }
 
 /*
  * Determine if the vdev id is a hole in the namespace.
  *
  * hole_array - ids of the top-level vdevs that are holes
  * holes      - number of entries in hole_array
  * id         - top-level vdev id to look for
  *
  * Returns B_TRUE when id appears in hole_array, B_FALSE otherwise.
  */
 static boolean_t
 vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
 {
 	/* Unsigned index, matching the type of the element count. */
 	uint_t c;
 
 	for (c = 0; c < holes; c++) {
 		/* Top-level is a hole */
 		if (hole_array[c] == id)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Convert our list of pools into the definitive set of configurations.  We
  * start by picking the best config for each toplevel vdev.  Once that's done,
  * we assemble the toplevel vdevs into a full config for the pool.  We make a
  * pass to fix up any incorrect paths, and then add it to the main list to
  * return to the user.
  *
  * hdl       - import handle (callbacks and error reporting)
  * pl        - pools and guid -> name mappings gathered by add_config()
  * active_ok - when set, pools are added without the active check and
  *             without being refreshed
  * policy    - optional load policy added to each config before refresh
  *
  * Returns an nvlist holding one config per pool, keyed by pool name, or
  * NULL on failure.
  */
 static nvlist_t *
 get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
     nvlist_t *policy)
 {
 	pool_entry_t *pe;
 	vdev_entry_t *ve;
 	config_entry_t *ce;
 	nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t i, nspares, nl2cache;
 	boolean_t config_seen;
 	uint64_t best_txg;
 	char *name, *hostname = NULL;
 	uint64_t guid;
 	uint_t children = 0;
 	nvlist_t **child = NULL;
 	uint_t holes;
 	uint64_t *hole_array, max_id;
 	uint_t c;
 	boolean_t isactive;
 	uint64_t hostid;
 	nvlist_t *nvl;
 	boolean_t valid_top_config = B_FALSE;
 
 	if (nvlist_alloc(&ret, 0, 0) != 0)
 		goto nomem;
 
 	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
 		uint64_t id, max_txg = 0;
 
 		if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
 			goto nomem;
 		config_seen = B_FALSE;
 
 		/*
 		 * Iterate over all toplevel vdevs.  Grab the pool configuration
 		 * from the first one we find, and then go through the rest and
 		 * add them as necessary to the 'vdevs' member of the config.
 		 */
 		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
 
 			/*
 			 * Determine the best configuration for this vdev by
 			 * selecting the config with the latest transaction
 			 * group.
 			 */
 			best_txg = 0;
 			for (ce = ve->ve_configs; ce != NULL;
 			    ce = ce->ce_next) {
 
 				if (ce->ce_txg > best_txg) {
 					tmp = ce->ce_config;
 					best_txg = ce->ce_txg;
 				}
 			}
 
 			/*
 			 * We rely on the fact that the max txg for the
 			 * pool will contain the most up-to-date information
 			 * about the valid top-levels in the vdev namespace.
 			 */
 			if (best_txg > max_txg) {
 				/* Drop what an older txg contributed. */
 				(void) nvlist_remove(config,
 				    ZPOOL_CONFIG_VDEV_CHILDREN,
 				    DATA_TYPE_UINT64);
 				(void) nvlist_remove(config,
 				    ZPOOL_CONFIG_HOLE_ARRAY,
 				    DATA_TYPE_UINT64_ARRAY);
 
 				max_txg = best_txg;
 				hole_array = NULL;
 				holes = 0;
 				max_id = 0;
 				valid_top_config = B_FALSE;
 
 				if (nvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
 					verify(nvlist_add_uint64(config,
 					    ZPOOL_CONFIG_VDEV_CHILDREN,
 					    max_id) == 0);
 					valid_top_config = B_TRUE;
 				}
 
 				if (nvlist_lookup_uint64_array(tmp,
 				    ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
 				    &holes) == 0) {
 					verify(nvlist_add_uint64_array(config,
 					    ZPOOL_CONFIG_HOLE_ARRAY,
 					    hole_array, holes) == 0);
 				}
 			}
 
 			if (!config_seen) {
 				/*
 				 * Copy the relevant pieces of data to the pool
 				 * configuration:
 				 *
 				 *	version
 				 *	pool guid
 				 *	name
 				 *	comment (if available)
 				 *	compatibility features (if available)
 				 *	pool state
 				 *	hostid (if available)
 				 *	hostname (if available)
 				 */
 				uint64_t state, version;
 				char *comment = NULL;
 				char *compatibility = NULL;
 
 				version = fnvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_VERSION);
 				fnvlist_add_uint64(config,
 				    ZPOOL_CONFIG_VERSION, version);
 				guid = fnvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_POOL_GUID);
 				fnvlist_add_uint64(config,
 				    ZPOOL_CONFIG_POOL_GUID, guid);
 				name = fnvlist_lookup_string(tmp,
 				    ZPOOL_CONFIG_POOL_NAME);
 				fnvlist_add_string(config,
 				    ZPOOL_CONFIG_POOL_NAME, name);
 
 				if (nvlist_lookup_string(tmp,
 				    ZPOOL_CONFIG_COMMENT, &comment) == 0)
 					fnvlist_add_string(config,
 					    ZPOOL_CONFIG_COMMENT, comment);
 
 				if (nvlist_lookup_string(tmp,
 				    ZPOOL_CONFIG_COMPATIBILITY,
 				    &compatibility) == 0)
 					fnvlist_add_string(config,
 					    ZPOOL_CONFIG_COMPATIBILITY,
 					    compatibility);
 
 				state = fnvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_POOL_STATE);
 				fnvlist_add_uint64(config,
 				    ZPOOL_CONFIG_POOL_STATE, state);
 
 				/*
 				 * hostid stays 0 when the label has none; the
 				 * restore step further down keys off that.
 				 */
 				hostid = 0;
 				if (nvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 					fnvlist_add_uint64(config,
 					    ZPOOL_CONFIG_HOSTID, hostid);
 					hostname = fnvlist_lookup_string(tmp,
 					    ZPOOL_CONFIG_HOSTNAME);
 					fnvlist_add_string(config,
 					    ZPOOL_CONFIG_HOSTNAME, hostname);
 				}
 
 				config_seen = B_TRUE;
 			}
 
 			/*
 			 * Add this top-level vdev to the child array.
 			 */
 			verify(nvlist_lookup_nvlist(tmp,
 			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
 			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
 			    &id) == 0);
 
 			/*
 			 * Grow the array so that index 'id' exists.  The new
 			 * array comes from zutil_alloc(), which zero-fills,
 			 * so slots not copied over start out NULL.
 			 */
 			if (id >= children) {
 				nvlist_t **newchild;
 
 				newchild = zutil_alloc(hdl, (id + 1) *
 				    sizeof (nvlist_t *));
 				if (newchild == NULL)
 					goto nomem;
 
 				for (c = 0; c < children; c++)
 					newchild[c] = child[c];
 
 				free(child);
 				child = newchild;
 				children = id + 1;
 			}
 			if (nvlist_dup(nvtop, &child[id], 0) != 0)
 				goto nomem;
 
 		}
 
 		/*
 		 * If we have information about all the top-levels then
 		 * clean up the nvlist which we've constructed. This
 		 * means removing any extraneous devices that are
 		 * beyond the valid range or adding devices to the end
 		 * of our array which appear to be missing.
 		 */
 		if (valid_top_config) {
 			if (max_id < children) {
 				for (c = max_id; c < children; c++)
 					nvlist_free(child[c]);
 				children = max_id;
 			} else if (max_id > children) {
 				nvlist_t **newchild;
 
 				newchild = zutil_alloc(hdl, (max_id) *
 				    sizeof (nvlist_t *));
 				if (newchild == NULL)
 					goto nomem;
 
 				for (c = 0; c < children; c++)
 					newchild[c] = child[c];
 
 				free(child);
 				child = newchild;
 				children = max_id;
 			}
 		}
 
 		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 		    &guid) == 0);
 
 		/*
 		 * The vdev namespace may contain holes as a result of
 		 * device removal. We must add them back into the vdev
 		 * tree before we process any missing devices.
 		 */
 		if (holes > 0) {
 			ASSERT(valid_top_config);
 
 			for (c = 0; c < children; c++) {
 				nvlist_t *holey;
 
 				if (child[c] != NULL ||
 				    !vdev_is_hole(hole_array, holes, c))
 					continue;
 
 				if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
 				    0) != 0)
 					goto nomem;
 
 				/*
 				 * Holes in the namespace are treated as
 				 * "hole" top-level vdevs and have a
 				 * special flag set on them.
 				 */
 				if (nvlist_add_string(holey,
 				    ZPOOL_CONFIG_TYPE,
 				    VDEV_TYPE_HOLE) != 0 ||
 				    nvlist_add_uint64(holey,
 				    ZPOOL_CONFIG_ID, c) != 0 ||
 				    nvlist_add_uint64(holey,
 				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
 					nvlist_free(holey);
 					goto nomem;
 				}
 				child[c] = holey;
 			}
 		}
 
 		/*
 		 * Look for any missing top-level vdevs.  If this is the case,
 		 * create a faked up 'missing' vdev as a placeholder.  We cannot
 		 * simply compress the child array, because the kernel performs
 		 * certain checks to make sure the vdev IDs match their location
 		 * in the configuration.
 		 */
 		for (c = 0; c < children; c++) {
 			if (child[c] == NULL) {
 				nvlist_t *missing;
 				if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
 				    0) != 0)
 					goto nomem;
 				if (nvlist_add_string(missing,
 				    ZPOOL_CONFIG_TYPE,
 				    VDEV_TYPE_MISSING) != 0 ||
 				    nvlist_add_uint64(missing,
 				    ZPOOL_CONFIG_ID, c) != 0 ||
 				    nvlist_add_uint64(missing,
 				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
 					nvlist_free(missing);
 					goto nomem;
 				}
 				child[c] = missing;
 			}
 		}
 
 		/*
 		 * Put all of this pool's top-level vdevs into a root vdev.
 		 */
 		if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
 			goto nomem;
 		if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_ROOT) != 0 ||
 		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
 		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
 		    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 		    child, children) != 0) {
 			nvlist_free(nvroot);
 			goto nomem;
 		}
 
 		/*
 		 * Release this pool's child array and reset the bookkeeping
 		 * so that the next pool starts from an empty array.
 		 */
 		for (c = 0; c < children; c++)
 			nvlist_free(child[c]);
 		free(child);
 		children = 0;
 		child = NULL;
 
 		/*
 		 * Go through and fix up any paths and/or devids based on our
 		 * known list of vdev GUID -> path mappings.
 		 */
 		if (fix_paths(hdl, nvroot, pl->names) != 0) {
 			nvlist_free(nvroot);
 			goto nomem;
 		}
 
 		/*
 		 * Add the root vdev to this pool's configuration.
 		 */
 		if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 		    nvroot) != 0) {
 			nvlist_free(nvroot);
 			goto nomem;
 		}
 		nvlist_free(nvroot);
 
 		/*
 		 * zdb uses this path to report on active pools that were
 		 * imported or created using -R.
 		 */
 		if (active_ok)
 			goto add_pool;
 
 		/*
 		 * Determine if this pool is currently active, in which case we
 		 * can't actually import it.
 		 */
 		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &name) == 0);
 		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 		    &guid) == 0);
 
 		if (zutil_pool_active(hdl, name, guid, &isactive) != 0)
 			goto error;
 
 		/* Active pools are left out of the result. */
 		if (isactive) {
 			nvlist_free(config);
 			config = NULL;
 			continue;
 		}
 
 		if (policy != NULL) {
 			if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
 			    policy) != 0)
 				goto nomem;
 		}
 
 		/* A pool whose config cannot be refreshed is left out too. */
 		if ((nvl = zutil_refresh_config(hdl, config)) == NULL) {
 			nvlist_free(config);
 			config = NULL;
 			continue;
 		}
 
 		nvlist_free(config);
 		config = nvl;
 
 		/*
 		 * Go through and update the paths for spares, now that we have
 		 * them.
 		 */
 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 		    &nvroot) == 0);
 		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    &spares, &nspares) == 0) {
 			for (i = 0; i < nspares; i++) {
 				if (fix_paths(hdl, spares[i], pl->names) != 0)
 					goto nomem;
 			}
 		}
 
 		/*
 		 * Update the paths for l2cache devices.
 		 */
 		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 		    &l2cache, &nl2cache) == 0) {
 			for (i = 0; i < nl2cache; i++) {
 				if (fix_paths(hdl, l2cache[i], pl->names) != 0)
 					goto nomem;
 			}
 		}
 
 		/*
 		 * Restore the original information read from the actual label.
 		 */
 		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
 		    DATA_TYPE_UINT64);
 		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
 		    DATA_TYPE_STRING);
 		if (hostid != 0) {
 			verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
 			    hostid) == 0);
 			verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
 			    hostname) == 0);
 		}
 
 add_pool:
 		/*
 		 * Add this pool to the list of configs.
 		 */
 		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &name) == 0);
 
 		if (nvlist_add_nvlist(ret, name, config) != 0)
 			goto nomem;
 
 		nvlist_free(config);
 		config = NULL;
 	}
 
 	return (ret);
 
 	/*
 	 * zutil_no_memory() ends in exit(1), so the fall through from
 	 * 'nomem' into 'error' is not expected to execute.
 	 */
 nomem:
 	(void) zutil_no_memory(hdl);
 error:
 	nvlist_free(config);
 	nvlist_free(ret);
 	for (c = 0; c < children; c++)
 		nvlist_free(child[c]);
 	free(child);
 
 	return (NULL);
 }
 
 /*
  * Return the byte offset of label 'l' on a device of 'size' bytes.  The
  * first half of the labels is laid out from the start of the device, the
  * second half is placed so that the label area ends at 'size'.
  */
 static uint64_t
 label_offset(uint64_t size, int l)
 {
 	uint64_t off;
 
 	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
 
 	off = l * sizeof (vdev_label_t);
 	if (l >= VDEV_LABELS / 2)
 		off += size - VDEV_LABELS * sizeof (vdev_label_t);
 
 	return (off);
 }
 
 /*
  * The same description applies as to zpool_read_label below,
  * except here we do it without aio, presumably because an aio call
  * errored out in a way we think not using it could circumvent.
  *
  * Returns 0 (with *config NULL when nothing usable was found) or -1 if the
  * read buffer could not be allocated.
  */
 static int
 zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels)
 {
 	struct stat64 statbuf;
 	vdev_phys_t *label;
 	nvlist_t *expected_config = NULL;
 	uint64_t expected_guid = 0, size;
 	int l, count = 0;
 
 	*config = NULL;
 
 	if (fstat64_blk(fd, &statbuf) == -1)
 		return (0);
 	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
 
 	if (posix_memalign((void **)&label, PAGESIZE, sizeof (*label)) != 0)
 		return (-1);
 
 	for (l = 0; l < VDEV_LABELS; l++) {
 		uint64_t state, guid, txg;
 		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;
 
 		/* Short reads and undecodable labels are simply skipped. */
 		if (pread64(fd, label, sizeof (vdev_phys_t),
 		    offset) != sizeof (vdev_phys_t))
 			continue;
 
 		if (nvlist_unpack(label->vp_nvlist,
 		    sizeof (label->vp_nvlist), config, 0) != 0)
 			continue;
 
 		/*
 		 * A usable label has a non-zero guid, a known pool state
 		 * and, unless it describes a spare or l2cache device, a
 		 * non-zero txg.
 		 */
 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
 		    &guid) != 0 || guid == 0 ||
 		    nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0 || state > POOL_STATE_L2CACHE ||
 		    (state != POOL_STATE_SPARE &&
 		    state != POOL_STATE_L2CACHE &&
 		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0))) {
 			nvlist_free(*config);
 			continue;
 		}
 
 		/* The first usable label is the one handed back. */
 		if (expected_guid == 0) {
 			expected_config = *config;
 			expected_guid = guid;
 			count++;
 			continue;
 		}
 
 		/* Later labels only count when they carry the same guid. */
 		if (expected_guid == guid)
 			count++;
 
 		nvlist_free(*config);
 	}
 
 	if (num_labels != NULL)
 		*num_labels = count;
 
 	free(label);
 	*config = expected_config;
 
 	return (0);
 }
 
 /*
  * Given a file descriptor, read the label information and return an nvlist
  * describing the configuration, if there is one.  The number of valid
  * labels found will be returned in num_labels when non-NULL.
  *
  * All VDEV_LABELS labels are read with one lio_listio() batch.  If that
  * batch fails and at least one request reports EOPNOTSUPP or ENOSYS, the
  * read is retried through zpool_read_label_slow().
  *
  * Returns 0 when the labels were examined (*config stays NULL if none was
  * usable or the device could not be stat'ed) and -1 on allocation or I/O
  * submission failure.
  */
 int
 zpool_read_label(int fd, nvlist_t **config, int *num_labels)
 {
 	struct stat64 statbuf;
 	struct aiocb aiocbs[VDEV_LABELS];
 	struct aiocb *aiocbps[VDEV_LABELS];
 	vdev_phys_t *labels;
 	nvlist_t *expected_config = NULL;
 	uint64_t expected_guid = 0, size;
 	int error, l, count = 0;
 
 	*config = NULL;
 
 	if (fstat64_blk(fd, &statbuf) == -1)
 		return (0);
 	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
 
 	error = posix_memalign((void **)&labels, PAGESIZE,
 	    VDEV_LABELS * sizeof (*labels));
 	if (error)
 		return (-1);
 
 	/* One read request per label, each into its own slot of 'labels'. */
 	memset(aiocbs, 0, sizeof (aiocbs));
 	for (l = 0; l < VDEV_LABELS; l++) {
 		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;
 
 		aiocbs[l].aio_fildes = fd;
 		aiocbs[l].aio_offset = offset;
 		aiocbs[l].aio_buf = &labels[l];
 		aiocbs[l].aio_nbytes = sizeof (vdev_phys_t);
 		aiocbs[l].aio_lio_opcode = LIO_READ;
 		aiocbps[l] = &aiocbs[l];
 	}
 
 	if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {
 		/* Keep the original errno; the cleanup below overwrites it. */
 		int saved_errno = errno;
 		boolean_t do_slow = B_FALSE;
 		error = -1;
 
 		if (errno == EAGAIN || errno == EINTR || errno == EIO) {
 			/*
 			 * A portion of the requests may have been submitted.
 			 * Clean them up.
 			 */
 			for (l = 0; l < VDEV_LABELS; l++) {
 				errno = 0;
 				switch (aio_error(&aiocbs[l])) {
 				case EINVAL:
 					break;
 				case EINPROGRESS:
 					// This shouldn't be possible to
 					// encounter, die if we do.
 					ASSERT(B_FALSE);
 					/* no break: continues into the next case */
 				case EOPNOTSUPP:
 				case ENOSYS:
 					do_slow = B_TRUE;
 					/* no break: the request is still reaped */
 				case 0:
 				default:
 					(void) aio_return(&aiocbs[l]);
 				}
 			}
 		}
 		if (do_slow) {
 			/*
 			 * At least some IO involved access unsafe-for-AIO
 			 * files. Let's try again, without AIO this time.
 			 */
 			error = zpool_read_label_slow(fd, config, num_labels);
 			saved_errno = errno;
 		}
 		free(labels);
 		errno = saved_errno;
 		return (error);
 	}
 
 	for (l = 0; l < VDEV_LABELS; l++) {
 		uint64_t state, guid, txg;
 
 		/* Skip labels that were not read in full. */
 		if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t))
 			continue;
 
 		if (nvlist_unpack(labels[l].vp_nvlist,
 		    sizeof (labels[l].vp_nvlist), config, 0) != 0)
 			continue;
 
 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
 		    &guid) != 0 || guid == 0) {
 			nvlist_free(*config);
 			continue;
 		}
 
 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0 || state > POOL_STATE_L2CACHE) {
 			nvlist_free(*config);
 			continue;
 		}
 
 		/* Spares and l2cache devices are accepted without a txg. */
 		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0)) {
 			nvlist_free(*config);
 			continue;
 		}
 
 		/*
 		 * The first usable label is kept as the result; later ones
 		 * are counted only when their guid matches it, and freed.
 		 */
 		if (expected_guid) {
 			if (expected_guid == guid)
 				count++;
 
 			nvlist_free(*config);
 		} else {
 			expected_config = *config;
 			expected_guid = guid;
 			count++;
 		}
 	}
 
 	if (num_labels != NULL)
 		*num_labels = count;
 
 	free(labels);
 	*config = expected_config;
 
 	return (0);
 }
 
 /*
  * Sorted by full path and then vdev guid to allow for multiple entries with
  * the same full path name.  This is required because it's possible to
  * have multiple block devices with labels that refer to the same
  * ZPOOL_CONFIG_PATH yet have different vdev guids.  In this case both
  * entries need to be added to the cache.  Scenarios where this can occur
  * include overwritten pool labels, devices which are visible from multiple
  * hosts and multipath devices.
  */
 int
 slice_cache_compare(const void *arg1, const void *arg2)
 {
 	const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
 	const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
 	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
 	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
 	int rv;
 
 	rv = TREE_ISIGN(strcmp(nm1, nm2));
 	if (rv)
 		return (rv);
 
 	return (TREE_CMP(guid1, guid2));
 }
 
 /*
  * Search a vdev tree for the leaf whose guid equals vdev_guid and, when it
  * is found, point *path and *devid at its ZPOOL_CONFIG_PATH and
  * ZPOOL_CONFIG_DEVID strings (each only if present).
  *
  * Returns 0, or the first non-zero value returned for a child.
  */
 static int
 label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
     uint64_t vdev_guid, char **path, char **devid)
 {
 	nvlist_t **kids;
 	uint_t i, nkids;
 	uint64_t guid;
 	char *str;
 
 	/* Interior vdev: visit each child in turn. */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids) == 0) {
 		for (i = 0; i < nkids; i++) {
 			int rc = label_paths_impl(hdl, kids[i], pool_guid,
 			    vdev_guid, path, devid);
 
 			if (rc != 0)
 				return (rc);
 		}
 		return (0);
 	}
 
 	if (nvroot == NULL)
 		return (0);
 
 	/* Not the vdev we are looking for. */
 	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid) != 0 ||
 	    guid != vdev_guid)
 		return (0);
 
 	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &str) == 0)
 		*path = str;
 
 	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &str) == 0)
 		*devid = str;
 
 	return (0);
 }
 
 /*
  * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
  * and store these strings as config_path and devid_path respectively.
  * The returned pointers are only valid as long as label remains valid.
  *
  * Both outputs are reset to NULL first.  Returns ENOENT when the label has
  * no vdev tree, pool guid or vdev guid.
  */
 int
 label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid)
 {
 	nvlist_t *nvroot;
 	uint64_t pool_guid;
 	uint64_t vdev_guid;
 
 	*path = NULL;
 	*devid = NULL;
 
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0)
 		return (ENOENT);
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0)
 		return (ENOENT);
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
 		return (ENOENT);
 
 	return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
 	    devid));
 }
 
 /*
  * Build a cache node for the device "path/name" and insert it into the AVL
  * cache under 'lock'.  A node that is already present is discarded, as is
  * one whose name cannot be formatted.
  */
 static void
 zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
     avl_tree_t *cache, const char *path, const char *name, int order)
 {
 	avl_index_t where;
 	rdsk_node_t *node = zutil_alloc(hdl, sizeof (rdsk_node_t));
 
 	if (asprintf(&node->rn_name, "%s/%s", path, name) == -1) {
 		free(node);
 		return;
 	}
 
 	node->rn_vdev_guid = 0;
 	node->rn_lock = lock;
 	node->rn_avl = cache;
 	node->rn_hdl = hdl;
 	node->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
 	node->rn_labelpaths = B_FALSE;
 
 	pthread_mutex_lock(lock);
 	if (avl_find(cache, node, &where) == NULL) {
 		avl_insert(cache, node, where);
 	} else {
 		/* Already cached: drop the duplicate. */
 		free(node->rn_name);
 		free(node);
 	}
 	pthread_mutex_unlock(lock);
 }
 
 /*
  * Add every candidate device found directly inside 'dir' to the cache.
  *
  * The directory is resolved with realpath() first.  A directory that does
  * not exist is not an error.  Only entries whose d_type is one of the
  * cases listed in the switch below are added.
  *
  * Returns 0 on success, or the errno from realpath()/opendir() after
  * reporting it through the handle.
  */
 static int
 zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
     avl_tree_t *cache, const char *dir, int order)
 {
 	int error;
 	char path[MAXPATHLEN];
 	struct dirent64 *dp;
 	DIR *dirp;
 
 	if (realpath(dir, path) == NULL) {
 		error = errno;
 		if (error == ENOENT)
 			return (0);
 
-		zutil_error_aux(hdl, strerror(error));
+		zutil_error_aux(hdl, "%s", strerror(error));
 		(void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(
 		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
 		return (error);
 	}
 
 	dirp = opendir(path);
 	if (dirp == NULL) {
 		error = errno;
-		zutil_error_aux(hdl, strerror(error));
+		zutil_error_aux(hdl, "%s", strerror(error));
 		(void) zutil_error_fmt(hdl, EZFS_BADPATH,
 		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 		return (error);
 	}
 
 	while ((dp = readdir64(dirp)) != NULL) {
 		const char *name = dp->d_name;
 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
 			continue;
 
 		/* Entries of any other type are skipped. */
 		switch (dp->d_type) {
 		case DT_UNKNOWN:
 		case DT_BLK:
 		case DT_LNK:
 #ifdef __FreeBSD__
 		case DT_CHR:
 #endif
 		case DT_REG:
 			break;
 		default:
 			continue;
 		}
 
 		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
 		    order);
 	}
 
 	(void) closedir(dirp);
 	return (0);
 }
 
 /*
  * Add the single path 'dir' (despite the name, a non-directory entry) to
  * the slice cache.  Only the directory component is canonicalized with
  * realpath(); the final component is kept verbatim.
  *
  * Returns 0 on success or when realpath() reports ENOENT.  Any other
  * realpath() failure is recorded on the handle as EZFS_BADPATH and its
  * errno is returned.
  */
 static int
 zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
     avl_tree_t *cache, const char *dir, int order)
 {
 	int error = 0;
 	char path[MAXPATHLEN];
 	char *d = NULL;
 	ssize_t dl;
 	const char *dpath, *name;
 
 	/*
 	 * Separate the directory and the basename.
 	 * We do this so that we can get the realpath of
 	 * the directory. We don't get the realpath on the
 	 * whole path because if it's a symlink, we want the
 	 * path of the symlink not where it points to.
 	 */
 	name = zfs_basename(dir);
 	if ((dl = zfs_dirnamelen(dir)) == -1)
 		dpath = ".";
 	else
 		dpath = d = zutil_strndup(hdl, dir, dl);
 
 	if (realpath(dpath, path) == NULL) {
 		error = errno;
 		/* A missing parent directory is treated as "nothing to add". */
 		if (error == ENOENT) {
 			error = 0;
 			goto out;
 		}
 
-		zutil_error_aux(hdl, strerror(error));
+		zutil_error_aux(hdl, "%s", strerror(error));
 		(void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(
 		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
 		goto out;
 	}
 
 	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
 
 out:
 	/* 'd' is NULL when "." was used, and free(NULL) is a no-op. */
 	free(d);
 	return (error);
 }
 
 /*
  * Scan a list of directories for zfs devices.
  *
  * 'dir' holds 'dirs' entries; each may be a directory (all of its entries
  * are added) or any other kind of path (added by itself).  Entries for
  * which stat() reports ENOENT are skipped.  The index of the entry in
  * 'dir' is passed down as the ordering hint.
  *
  * On success returns 0 and hands a newly allocated AVL tree back through
  * *slice_cache.  On failure the partially built tree is torn down,
  * *slice_cache is left NULL and the error is returned.
  */
 static int
 zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
     avl_tree_t **slice_cache, const char * const *dir, size_t dirs)
 {
 	avl_tree_t *cache;
 	rdsk_node_t *slice;
 	void *cookie;
 	int i, error;
 
 	*slice_cache = NULL;
 	cache = zutil_alloc(hdl, sizeof (avl_tree_t));
 	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
 	    offsetof(rdsk_node_t, rn_node));
 
 	/* NOTE(review): 'i' is int while 'dirs' is size_t (mixed-sign compare). */
 	for (i = 0; i < dirs; i++) {
 		struct stat sbuf;
 
 		if (stat(dir[i], &sbuf) != 0) {
 			error = errno;
 			if (error == ENOENT)
 				continue;
 
-			zutil_error_aux(hdl, strerror(error));
+			zutil_error_aux(hdl, "%s", strerror(error));
 			(void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(
 			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
 			goto error;
 		}
 
 		/*
 		 * If dir[i] is a directory, we walk through it and add all
 		 * the entries to the cache. If it's not a directory, we just
 		 * add it to the cache.
 		 */
 		if (S_ISDIR(sbuf.st_mode)) {
 			if ((error = zpool_find_import_scan_dir(hdl, lock,
 			    cache, dir[i], i)) != 0)
 				goto error;
 		} else {
 			if ((error = zpool_find_import_scan_path(hdl, lock,
 			    cache, dir[i], i)) != 0)
 				goto error;
 		}
 	}
 
 	*slice_cache = cache;
 	return (0);
 
 error:
 	/* Release every node added so far, then the tree itself. */
 	cookie = NULL;
 	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
 		free(slice->rn_name);
 		free(slice);
 	}
 	free(cache);
 
 	return (error);
 }
 
 /*
  * Given a cache of candidate devices, find all pools stored on them.  This
  * includes partial pools which are not available to import.
  * poolname or guid (but not both) are provided by the caller when trying
  * to import a specific pool.
  *
  * Every node in 'cache' is dispatched to zpool_open_func() on a thread
  * pool.  Afterwards the nodes whose rn_config was filled in are filtered
  * against iarg->poolname / iarg->guid and handed to add_config().  The
  * cache and all of its nodes are destroyed and freed here, so the caller
  * must not touch 'cache' again.  Returns whatever get_configs() produces.
  */
 static nvlist_t *
 zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg,
     pthread_mutex_t *lock, avl_tree_t *cache)
 {
 	nvlist_t *ret = NULL;
 	pool_list_t pools = { 0 };
 	pool_entry_t *pe, *penext;
 	vdev_entry_t *ve, *venext;
 	config_entry_t *ce, *cenext;
 	name_entry_t *ne, *nenext;
 	rdsk_node_t *slice;
 	void *cookie;
 	tpool_t *t;
 
 	verify(iarg->poolname == NULL || iarg->guid == 0);
 
 	/*
 	 * Create a thread pool to parallelize the process of reading and
 	 * validating labels, a large number of threads can be used due to
 	 * minimal contention.
 	 */
 	/* NOTE(review): the tpool_create() result is used without a check. */
 	t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
 	for (slice = avl_first(cache); slice;
 	    (slice = avl_walk(cache, slice, AVL_AFTER)))
 		(void) tpool_dispatch(t, zpool_open_func, slice);
 
 	tpool_wait(t);
 	tpool_destroy(t);
 
 	/*
 	 * Process the cache, filtering out any entries which are not
 	 * for the specified pool then adding matching label configs.
 	 */
 	cookie = NULL;
 	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
 		if (slice->rn_config != NULL) {
 			nvlist_t *config = slice->rn_config;
 			boolean_t matched = B_TRUE;
 			boolean_t aux = B_FALSE;
 			int fd;
 
 			/*
 			 * Check if it's a spare or l2cache device. If it is,
 			 * we need to skip the name and guid check since they
 			 * don't exist on aux device label.
 			 */
 			if (iarg->poolname != NULL || iarg->guid != 0) {
 				uint64_t state;
 				aux = nvlist_lookup_uint64(config,
 				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
 				    (state == POOL_STATE_SPARE ||
 				    state == POOL_STATE_L2CACHE);
 			}
 
 			if (iarg->poolname != NULL && !aux) {
 				char *pname;
 
 				matched = nvlist_lookup_string(config,
 				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
 				    strcmp(iarg->poolname, pname) == 0;
 			} else if (iarg->guid != 0 && !aux) {
 				uint64_t this_guid;
 
 				matched = nvlist_lookup_uint64(config,
 				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
 				    iarg->guid == this_guid;
 			}
 			if (matched) {
 				/*
 				 * Verify all remaining entries can be opened
 				 * exclusively. This will prune all underlying
 				 * multipath devices which otherwise could
 				 * result in the vdev appearing as UNAVAIL.
 				 *
 				 * Under zdb, this step isn't required and
 				 * would prevent a zdb -e of active pools with
 				 * no cachefile.
 				 */
 				fd = open(slice->rn_name,
 				    O_RDONLY | O_EXCL | O_CLOEXEC);
 				if (fd >= 0 || iarg->can_be_active) {
 					if (fd >= 0)
 						close(fd);
 					add_config(hdl, &pools,
 					    slice->rn_name, slice->rn_order,
 					    slice->rn_num_labels, config);
 				}
 			}
 			/* The label config is released whether or not it matched. */
 			nvlist_free(config);
 		}
 		free(slice->rn_name);
 		free(slice);
 	}
 	avl_destroy(cache);
 	free(cache);
 
 	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
 
 	/* Tear down the pool -> vdev -> config lists built by add_config(). */
 	for (pe = pools.pools; pe != NULL; pe = penext) {
 		penext = pe->pe_next;
 		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
 			venext = ve->ve_next;
 			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
 				cenext = ce->ce_next;
 				nvlist_free(ce->ce_config);
 				free(ce);
 			}
 			free(ve);
 		}
 		free(pe);
 	}
 
 	for (ne = pools.names; ne != NULL; ne = nenext) {
 		nenext = ne->ne_next;
 		free(ne->ne_name);
 		free(ne);
 	}
 
 	return (ret);
 }
 
 /*
  * Given a config, discover the paths for the devices which
  * exist in the config.
  *
  * The vdev tree rooted at 'nv' is walked depth first.  For every vdev that
  * carries a ZPOOL_CONFIG_PATH, the directory containing that path is
  * scanned and its entries are added to 'cache'.  Failures while scanning
  * children are deliberately ignored (best effort); only the result of
  * scanning this vdev's own directory is returned, or 0 if it has no path.
  *
  * The path string lives inside 'nv', so it is only truncated temporarily
  * and is restored before returning; the caller's config is left intact.
  */
 static int
 discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv,
     avl_tree_t *cache, pthread_mutex_t *lock)
 {
 	char *path = NULL;
 	ssize_t dl;
 	uint_t children;
 	nvlist_t **child;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) == 0) {
 		for (uint_t c = 0; c < children; c++) {
 			(void) discover_cached_paths(hdl, child[c], cache,
 			    lock);
 		}
 	}
 
 	/*
 	 * Once we have the path, we need to add the directory to
 	 * our directory cache.
 	 */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
 		int error;
 		char saved;
 
 		/* No directory component: scan the current directory. */
 		if ((dl = zfs_dirnamelen(path)) == -1) {
 			return (zpool_find_import_scan_dir(hdl, lock, cache,
 			    ".", 0));
 		}
 
 		/*
 		 * nvlist_lookup_string() hands back a pointer into the
 		 * nvlist itself, so cut the string in place only for the
 		 * duration of the scan and then put the byte back.
 		 */
 		saved = path[dl];
 		path[dl] = '\0';
 		error = zpool_find_import_scan_dir(hdl, lock, cache, path, 0);
 		path[dl] = saved;
 
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Given a cache file, return the contents as a list of importable pools.
  * poolname or guid (but not both) are provided by the caller when trying
  * to import a specific pool.
  *
  * Pools which zutil_pool_active() reports as active are skipped.  When
  * iarg->scan is set, the directories named in the cached config are
  * rescanned and the config found on disk is used; a pool that cannot be
  * found that way is simply omitted from the result.  Otherwise the cached
  * config is refreshed with zutil_refresh_config().
  *
  * Returns a newly allocated nvlist (possibly empty) which the caller must
  * free, or NULL on failure.
  */
 static nvlist_t *
 zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg)
 {
 	char *buf;
 	int fd;
 	struct stat64 statbuf;
 	nvlist_t *raw, *src, *dst;
 	nvlist_t *pools;
 	nvpair_t *elem;
 	char *name;
 	uint64_t this_guid;
 	boolean_t active;
 
 	verify(iarg->poolname == NULL || iarg->guid == 0);
 
 	if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) {
 		zutil_error_aux(hdl, "%s", strerror(errno));
 		(void) zutil_error(hdl, EZFS_BADCACHE,
 		    dgettext(TEXT_DOMAIN, "failed to open cache file"));
 		return (NULL);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		zutil_error_aux(hdl, "%s", strerror(errno));
 		(void) close(fd);
 		(void) zutil_error(hdl, EZFS_BADCACHE,
 		    dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
 		return (NULL);
 	}
 
 	if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) {
 		(void) close(fd);
 		return (NULL);
 	}
 
 	/* A short read is treated the same as a failed read. */
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) close(fd);
 		free(buf);
 		(void) zutil_error(hdl, EZFS_BADCACHE,
 		    dgettext(TEXT_DOMAIN,
 		    "failed to read cache file contents"));
 		return (NULL);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
 		free(buf);
 		(void) zutil_error(hdl, EZFS_BADCACHE,
 		    dgettext(TEXT_DOMAIN,
 		    "invalid or corrupt cache file contents"));
 		return (NULL);
 	}
 
 	free(buf);
 
 	/*
 	 * Go through and get the current state of the pools and refresh their
 	 * state.
 	 */
 	if (nvlist_alloc(&pools, 0, 0) != 0) {
 		(void) zutil_no_memory(hdl);
 		nvlist_free(raw);
 		return (NULL);
 	}
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
 		src = fnvpair_value_nvlist(elem);
 
 		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
 		if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0)
 			continue;
 
 		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
 		if (iarg->guid != 0 && iarg->guid != this_guid)
 			continue;
 
 		if (zutil_pool_active(hdl, name, this_guid, &active) != 0) {
 			nvlist_free(raw);
 			nvlist_free(pools);
 			return (NULL);
 		}
 
 		if (active)
 			continue;
 
 		if (iarg->scan) {
 			uint64_t saved_guid = iarg->guid;
 			const char *saved_poolname = iarg->poolname;
 			pthread_mutex_t lock;
 			nvpair_t *pair = NULL;
 
 			/*
 			 * Create the device cache that will hold the
 			 * devices we will scan based on the cachefile.
 			 * This will get destroyed and freed by
 			 * zpool_find_import_impl.
 			 */
 			avl_tree_t *cache = zutil_alloc(hdl,
 			    sizeof (avl_tree_t));
 			avl_create(cache, slice_cache_compare,
 			    sizeof (rdsk_node_t),
 			    offsetof(rdsk_node_t, rn_node));
 			nvlist_t *nvroot = fnvlist_lookup_nvlist(src,
 			    ZPOOL_CONFIG_VDEV_TREE);
 
 			/*
 			 * We only want to find the pool with this_guid.
 			 * The caller's values are put back right after the
 			 * search below.
 			 */
 			iarg->guid = this_guid;
 			iarg->poolname = NULL;
 
 			/*
 			 * We need to build up a cache of devices that exists
 			 * in the paths pointed to by the cachefile. This allows
 			 * us to preserve the device namespace that was
 			 * originally specified by the user but also lets us
 			 * scan devices in those directories in case they had
 			 * been renamed.
 			 */
 			pthread_mutex_init(&lock, NULL);
 			discover_cached_paths(hdl, nvroot, cache, &lock);
 			nvlist_t *nv = zpool_find_import_impl(hdl, iarg,
 			    &lock, cache);
 			pthread_mutex_destroy(&lock);
 
 			/* Restore the caller's filter before anything else. */
 			iarg->guid = saved_guid;
 			iarg->poolname = saved_poolname;
 
 			/*
 			 * zpool_find_import_impl will return back
 			 * a list of pools that it found based on the
 			 * device cache. There should be at most one pool
 			 * since we're looking for a specific guid.
 			 * We will use that pool to build up the final
 			 * pool nvlist which is returned back to the
 			 * caller.  The search may also come back with
 			 * nothing (NULL or an empty list), for example when
 			 * the devices are gone; such a pool is skipped
 			 * instead of dereferencing a NULL pair.
 			 */
 			if (nv != NULL)
 				pair = nvlist_next_nvpair(nv, NULL);
 			if (pair != NULL) {
 				fnvlist_add_nvlist(pools, nvpair_name(pair),
 				    fnvpair_value_nvlist(pair));
 
 				VERIFY3P(nvlist_next_nvpair(nv, pair), ==,
 				    NULL);
 			}
 
 			/* 'pools' holds its own copy; drop the search result. */
 			nvlist_free(nv);
 			continue;
 		}
 
 		if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
 		    iarg->cachefile) != 0) {
 			(void) zutil_no_memory(hdl);
 			nvlist_free(raw);
 			nvlist_free(pools);
 			return (NULL);
 		}
 
 		if ((dst = zutil_refresh_config(hdl, src)) == NULL) {
 			nvlist_free(raw);
 			nvlist_free(pools);
 			return (NULL);
 		}
 
 		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
 			(void) zutil_no_memory(hdl);
 			nvlist_free(dst);
 			nvlist_free(raw);
 			nvlist_free(pools);
 			return (NULL);
 		}
 		nvlist_free(dst);
 	}
 	nvlist_free(raw);
 	return (pools);
 }
 
 /*
  * Discover importable pools without a cache file.
  *
  * Candidate devices are collected either by scanning directories (when
  * iarg->scan is set or explicit search paths were supplied) or through
  * zpool_find_import_blkid().  The resulting AVL tree is consumed and
  * destroyed by zpool_find_import_impl().  Returns NULL if the device
  * discovery step fails.
  */
 static nvlist_t *
 zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg)
 {
 	pthread_mutex_t lock;
 	avl_tree_t *cache;
 	nvlist_t *pools = NULL;
 	int error;
 
 	verify(iarg->poolname == NULL || iarg->guid == 0);
 	pthread_mutex_init(&lock, NULL);
 
 	if (!iarg->scan && iarg->paths == 0) {
 		error = zpool_find_import_blkid(hdl, &lock, &cache);
 	} else {
 		size_t ndirs = iarg->paths;
 		const char * const *dirv = (const char * const *)iarg->path;
 
 		/* Fall back to the built-in search path list. */
 		if (ndirs == 0)
 			dirv = zpool_default_search_paths(&ndirs);
 
 		error = zpool_find_import_scan(hdl, &lock, &cache, dirv,
 		    ndirs);
 	}
 
 	if (error == 0)
 		pools = zpool_find_import_impl(hdl, iarg, &lock, cache);
 
 	pthread_mutex_destroy(&lock);
 	return (pools);
 }
 
 
 /*
  * Public entry point for pool discovery.  Wraps the caller's library
  * handle and config ops in a libpc_handle_t, then searches either the
  * cache file named in 'import' or the devices on the system.  If nothing
  * was found, an access error was noted on the handle and the effective
  * uid is not 0, EZFS_EACESS is reported.  Returns the nvlist produced by
  * the search, which may be NULL.
  */
 nvlist_t *
 zpool_search_import(void *hdl, importargs_t *import,
     const pool_config_ops_t *pco)
 {
 	libpc_handle_t handle = { 0 };
 	nvlist_t *found;
 	boolean_t nothing;
 
 	handle.lpc_printerr = B_TRUE;
 	handle.lpc_ops = pco;
 	handle.lpc_lib_handle = hdl;
 
 	verify(import->poolname == NULL || import->guid == 0);
 
 	found = (import->cachefile == NULL) ?
 	    zpool_find_import(&handle, import) :
 	    zpool_find_import_cached(&handle, import);
 
 	nothing = (found == NULL || nvlist_empty(found)) ? B_TRUE : B_FALSE;
 	if (nothing && handle.lpc_open_access_error && geteuid() != 0) {
 		(void) zutil_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN,
 		    "no pools found"));
 	}
 
 	return (found);
 }
 
 /*
  * Decide whether config 'cfg' describes the pool named by 'tgt'.  When
  * 'tgt' parses (strtoull, base auto-detected) to a non-zero number it is
  * compared against the pool guid, otherwise it is compared against the
  * pool name.  A config lacking the relevant field never matches.
  */
 static boolean_t
 pool_match(nvlist_t *cfg, char *tgt)
 {
 	uint64_t want_guid = strtoull(tgt, NULL, 0);
 
 	if (want_guid == 0) {
 		char *pool_name;
 
 		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
 		    &pool_name) != 0)
 			return (B_FALSE);
 		return (strcmp(pool_name, tgt) == 0);
 	}
 
 	uint64_t have_guid;
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID,
 	    &have_guid) != 0)
 		return (B_FALSE);
 	return (have_guid == want_guid);
 }
 
 /*
  * Locate the single importable pool identified by 'target'.
  *
  * 'target' may be a pool name or guid, optionally followed by a dataset
  * or snapshot suffix; everything from the first '/' or '@' on is ignored.
  * The pools returned by zpool_search_import() are matched with
  * pool_match().
  *
  * Returns 0 and stores a duplicate of the matching config in *configp
  * (owned by the caller), ENOENT when nothing matches, EINVAL when more
  * than one pool matches, or ENOMEM when 'target' cannot be duplicated.
  * *configp is NULL on every failure.
  */
 int
 zpool_find_config(void *hdl, const char *target, nvlist_t **configp,
     importargs_t *args, const pool_config_ops_t *pco)
 {
 	nvlist_t *pools;
 	nvlist_t *match = NULL;
 	nvlist_t *config = NULL;
 	char *sepp = NULL;
 	int count = 0;
 	char *targetdup;
 
 	*configp = NULL;
 
 	/* strpbrk() on a NULL string would crash, so fail cleanly instead. */
 	if ((targetdup = strdup(target)) == NULL)
 		return (ENOMEM);
 
 	if ((sepp = strpbrk(targetdup, "/@")) != NULL)
 		*sepp = '\0';
 
 	pools = zpool_search_import(hdl, args, pco);
 
 	if (pools != NULL) {
 		nvpair_t *elem = NULL;
 		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
 			VERIFY0(nvpair_value_nvlist(elem, &config));
 			if (!pool_match(config, targetdup))
 				continue;
 
 			/* Keep only the first match; just count the rest. */
 			count++;
 			if (match == NULL)
 				match = fnvlist_dup(config);
 		}
 		fnvlist_free(pools);
 	}
 
 	free(targetdup);
 
 	if (count == 0)
 		return (ENOENT);
 
 	if (count > 1) {
 		fnvlist_free(match);
 		return (EINVAL);
 	}
 
 	*configp = match;
 
 	return (0);
 }
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index c56fd3a6ff21..59d062ebe2a6 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1,928 +1,930 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <linux/blkpg.h>
 #include <linux/msdos_fs.h>
 #include <linux/vfs_compat.h>
 
 /*
  * Per-vdev private state, stored in vdev_t's vdev_tsd by vdev_disk_open()
  * and freed by vdev_disk_close().
  */
 typedef struct vdev_disk {
 	struct block_device		*vd_bdev;	/* open device or NULL */
 	krwlock_t			vd_lock;	/* held as writer in open */
 } vdev_disk_t;
 
 /*
  * Unique identifier for the exclusive vdev holder.  It is passed as the
  * holder argument whenever a device is opened with FMODE_EXCL.
  */
 static void *zfs_vdev_holder = VDEV_HOLDER;
 
 /*
  * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
  * device is missing. The missing path may be transient since the links
  * can be briefly removed and recreated in response to udev events.
  * vdev_disk_open() doubles this after it has re-read a partition table.
  */
 static unsigned zfs_vdev_open_timeout_ms = 1000;
 
 /*
  * Size of the "reserved" partition, in blocks.  bdev_max_capacity()
  * converts it to bytes by shifting left by SECTOR_BITS.
  */
 #define	EFI_MIN_RESV_SIZE	(16 * 1024)
 
 /*
  * Bookkeeping for one zio that has been split into one or more bios.
  * The structure is allocated with room for dr_bio_count trailing bio
  * pointers (see vdev_disk_dio_alloc()).  When the last reference is
  * dropped in vdev_disk_dio_put() the request is freed and the parent zio,
  * if any, is completed with dr_error.
  */
 typedef struct dio_request {
 	zio_t			*dr_zio;	/* Parent ZIO */
 	atomic_t		dr_ref;		/* References */
 	int			dr_error;	/* Bio error */
 	int			dr_bio_count;	/* Count of bio's */
 	struct bio		*dr_bio[0];	/* Attached bio's */
 } dio_request_t;
 
 /*
  * Translate the SPA open mode bits into the block layer's fmode_t:
  * SPA_MODE_READ maps to FMODE_READ and SPA_MODE_WRITE to FMODE_WRITE.
  */
 static fmode_t
 vdev_bdev_mode(spa_mode_t spa_mode)
 {
 	fmode_t rd = (spa_mode & SPA_MODE_READ) ? FMODE_READ : 0;
 	fmode_t wr = (spa_mode & SPA_MODE_WRITE) ? FMODE_WRITE : 0;
 
 	return (rd | wr);
 }
 
 /*
  * Returns the usable capacity (in bytes) for the partition or disk, as
  * reported by the size of the block device's inode.
  */
 static uint64_t
 bdev_capacity(struct block_device *bdev)
 {
 	struct inode *ino = bdev->bd_inode;
 
 	return (i_size_read(ino));
 }
 
 #if !defined(HAVE_BDEV_WHOLE)
 /*
  * Compatibility helper for kernels without bdev_whole(): return the block
  * device that contains 'bdev' (bd_contains).  Callers in this file compare
  * the result against 'bdev' to tell a partition from a whole device.
  */
 static inline struct block_device *
 bdev_whole(struct block_device *bdev)
 {
 	return (bdev->bd_contains);
 }
 #endif
 
 /*
  * Returns the maximum expansion capacity of the block device (in bytes).
  *
  * It is possible to expand a vdev when it has been created as a wholedisk
  * and the containing block device has increased in capacity.  Or when the
  * partition containing the pool has been manually increased in size.
  *
  * This function is only responsible for calculating the potential expansion
  * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
  * responsible for verifying the expected partition layout in the wholedisk
  * case, and updating the partition table if appropriate.  Once the partition
  * size has been increased the additional capacity will be visible using
  * bdev_capacity().
  *
  * The returned maximum expansion capacity is always expected to be larger, or
  * at the very least equal, to its usable capacity to prevent overestimating
  * the pool expandsize.
  *
  * 'wholedisk' is non-zero when the vdev was created on a whole disk; the
  * estimate is only computed when 'bdev' is a partition of a larger device.
  */
 static uint64_t
 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
 {
 	uint64_t psize = bdev_capacity(bdev);
 	int64_t available;
 
 	if (wholedisk && bdev != bdev_whole(bdev)) {
 		/*
 		 * When reporting maximum expansion capacity for a wholedisk
 		 * deduct any capacity which is expected to be lost due to
 		 * alignment restrictions.  Over reporting this value isn't
 		 * harmful and would only result in slightly less capacity
 		 * than expected post expansion.
 		 * The estimated available space may be slightly smaller than
 		 * bdev_capacity() for devices where the number of sectors is
 		 * not a multiple of the alignment size and the partition layout
 		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
 		 * "reserved" EFI partition: in such cases return the device
 		 * usable capacity.
 		 */
 		available = i_size_read(bdev_whole(bdev)->bd_inode) -
 		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
 		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
 
 		/*
 		 * 'available' is signed and goes negative when the whole
 		 * device is smaller than the reserved overhead.  Comparing
 		 * it directly against an unsigned capacity would convert
 		 * that negative value into an enormous one, so only use it
 		 * when it is positive and really the larger of the two.
 		 */
 		if (available > 0 && (uint64_t)available > psize)
 			psize = (uint64_t)available;
 	}
 
 	return (psize);
 }
 
 /*
  * Log a warning describing a failed zio: pool name, vdev path, error,
  * I/O type, offset, size and flags.
  */
 static void
 vdev_disk_error(zio_t *zio)
 {
 	u_longlong_t off = (u_longlong_t)zio->io_offset;
 	u_longlong_t len = (u_longlong_t)zio->io_size;
 
 	/*
 	 * printk() is used because it is safe from any context, and this
 	 * function can be reached in interrupt context, for instance while
 	 * handling IRQs coming from a misbehaving disk device.
 	 */
 	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
 	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
 	    zio->io_vd->vdev_path, zio->io_error, zio->io_type, off, len,
 	    zio->io_flags);
 }
 
 /*
  * Open (or reopen) the block device backing vdev 'v' and report its
  * geometry.
  *
  * On success returns 0 and fills in *psize (bdev_capacity()), *max_psize
  * (bdev_max_capacity()) and the logical/physical ashift derived from the
  * device block sizes, each treated as at least SPA_MINBLOCKSIZE.  Returns
  * EINVAL when vdev_path is missing or not absolute, or the positive errno
  * from blkdev_get_by_path() when the device cannot be opened; in the
  * latter case v->vdev_tsd is still set, with vd_bdev left NULL.
  */
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	struct block_device *bdev;
 	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
 	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
 	vdev_disk_t *vd;
 
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		vdev_dbgmsg(v, "invalid vdev_path");
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it is currently open.  When expanding a
 	 * partition force re-scanning the partition table if userland
 	 * did not take care of this already. We need to do this while closed
 	 * in order to get an accurate updated block device size.  Then
 	 * since udev may need to recreate the device links increase the
 	 * open retry timeout before reporting the device as unavailable.
 	 */
 	vd = v->vdev_tsd;
 	if (vd) {
 		/* "/dev/" prefix; bdevname() fills in the rest at offset 5. */
 		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
 		boolean_t reread_part = B_FALSE;
 
 		rw_enter(&vd->vd_lock, RW_WRITER);
 		bdev = vd->vd_bdev;
 		vd->vd_bdev = NULL;
 
 		if (bdev) {
 			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
 				bdevname(bdev_whole(bdev), disk_name + 5);
 				/*
 				 * If userland has BLKPG_RESIZE_PARTITION,
 				 * then it should have updated the partition
 				 * table already. We can detect this by
 				 * comparing our current physical size
 				 * with that of the device. If they are
 				 * the same, then we must not have
 				 * BLKPG_RESIZE_PARTITION or it failed to
 				 * update the partition table online. We
 				 * fallback to rescanning the partition
 				 * table from the kernel below. However,
 				 * if the capacity already reflects the
 				 * updated partition, then we skip
 				 * rescanning the partition table here.
 				 */
 				if (v->vdev_psize == bdev_capacity(bdev))
 					reread_part = B_TRUE;
 			}
 
 			blkdev_put(bdev, mode | FMODE_EXCL);
 		}
 
 		if (reread_part) {
 			/* Open the containing disk just to re-read its table. */
 			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
 			    zfs_vdev_holder);
 			if (!IS_ERR(bdev)) {
 				int error = vdev_bdev_reread_part(bdev);
 				blkdev_put(bdev, mode | FMODE_EXCL);
 				if (error == 0) {
 					timeout = MSEC2NSEC(
 					    zfs_vdev_open_timeout_ms * 2);
 				}
 			}
 		}
 	} else {
 		/* First open: allocate the private state and take its lock. */
 		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
 
 		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
 		rw_enter(&vd->vd_lock, RW_WRITER);
 	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be re-cabled without an issue.  If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
 	 * locations to maximize the systems tolerance to component failure.
 	 *
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 *
 	 * The specified paths may be briefly removed and recreated in
 	 * response to udev events.  This should be exceptionally unlikely
 	 * because the zpool command makes every effort to verify these paths
 	 * have already settled prior to reaching this point.  Therefore,
 	 * a ENOENT failure at this point is highly likely to be transient
 	 * and it is reasonable to sleep and retry before giving up.  In
 	 * practice delays have been observed to be on the order of 100ms.
 	 */
 	hrtime_t start = gethrtime();
 	bdev = ERR_PTR(-ENXIO);
 	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
 		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
 		    zfs_vdev_holder);
 		/*
 		 * Only ENOENT is retried; any other error ends the loop.
 		 * NOTE(review): no task state is set before
 		 * schedule_timeout(); confirm this really sleeps for 10ms
 		 * rather than just yielding.
 		 */
 		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
 			schedule_timeout(MSEC_TO_TICK(10));
 		} else if (IS_ERR(bdev)) {
 			break;
 		}
 	}
 
 	if (IS_ERR(bdev)) {
 		int error = -PTR_ERR(bdev);
 		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
 		    (u_longlong_t)(gethrtime() - start),
 		    (u_longlong_t)timeout);
 		/* Keep the private state attached so a later open can reuse it. */
 		vd->vd_bdev = NULL;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 		return (SET_ERROR(error));
 	} else {
 		vd->vd_bdev = bdev;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 	}
 
 	struct request_queue *q = bdev_get_queue(vd->vd_bdev);
 
 	/*  Determine the physical block size */
 	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
 
 	/*  Determine the logical block size */
 	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
 
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
 
 	/* Set when device reports it supports TRIM. */
 	v->vdev_has_trim = !!blk_queue_discard(q);
 
 	/* Set when device reports it supports secure TRIM. */
 	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
 
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(q);
 
 	/* Physical volume size in bytes for the partition */
 	*psize = bdev_capacity(vd->vd_bdev);
 
 	/* Physical volume size in bytes including possible expansion space */
 	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*physical_ashift = highbit64(MAX(physical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	*logical_ashift = highbit64(MAX(logical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	return (0);
 }
 
 /*
  * Release the block device and private state of vdev 'v'.  Nothing is
  * done while the vdev is being reopened or when it has no private state.
  */
 static void
 vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 
 	if (v->vdev_reopening)
 		return;
 	if (disk == NULL)
 		return;
 
 	if (disk->vd_bdev != NULL) {
 		fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
 
 		blkdev_put(disk->vd_bdev, mode | FMODE_EXCL);
 	}
 
 	rw_destroy(&disk->vd_lock);
 	kmem_free(disk, sizeof (vdev_disk_t));
 	v->vdev_tsd = NULL;
 }
 
 static dio_request_t *
 vdev_disk_dio_alloc(int bio_count)
 {
 	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
 	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 	atomic_set(&dr->dr_ref, 0);
 	dr->dr_bio_count = bio_count;
 	dr->dr_error = 0;
 
 	for (int i = 0; i < dr->dr_bio_count; i++)
 		dr->dr_bio[i] = NULL;
 
 	return (dr);
 }
 
 /*
  * Drop the reference held on every attached bio and free the request,
  * using the same size computation as vdev_disk_dio_alloc().
  */
 static void
 vdev_disk_dio_free(dio_request_t *dr)
 {
 	const int nbios = dr->dr_bio_count;
 
 	for (int slot = 0; slot < nbios; slot++) {
 		struct bio *b = dr->dr_bio[slot];
 
 		if (b != NULL)
 			bio_put(b);
 	}
 
 	kmem_free(dr, sizeof (dio_request_t) +
 	    sizeof (struct bio *) * nbios);
 }
 
 /*
  * Take one reference on the request; it is dropped again with
  * vdev_disk_dio_put().
  */
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
 	atomic_inc(&dr->dr_ref);
 }
 
 /*
  * Drop one reference on the request and return the number remaining.
  *
  * When the last reference goes away the request is freed and, if it has
  * a parent zio, that zio is completed exactly once: its io_error is set
  * from the request, a warning is logged on error, and it is passed to
  * zio_delay_interrupt().
  */
 static int
 vdev_disk_dio_put(dio_request_t *dr)
 {
 	int remaining = atomic_dec_return(&dr->dr_ref);
 
 	if (remaining != 0)
 		return (remaining);
 
 	/* Capture what is needed before the request memory is released. */
 	zio_t *parent = dr->dr_zio;
 	int err = dr->dr_error;
 
 	vdev_disk_dio_free(dr);
 
 	if (parent == NULL)
 		return (remaining);
 
 	parent->io_error = err;
 	ASSERT3S(parent->io_error, >=, 0);
 	if (parent->io_error)
 		vdev_disk_error(parent);
 
 	zio_delay_interrupt(parent);
 
 	return (remaining);
 }
 
 /*
  * Bio completion callback.  Records the first error seen for the request
  * (later errors do not overwrite it) and drops the reference that was
  * taken for this bio; dropping the last reference completes the zio.
  */
 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
 {
 	dio_request_t *dr = bio->bi_private;
 
 	if (dr->dr_error == 0) {
 #ifdef HAVE_1ARG_BIO_END_IO_T
 		dr->dr_error = BIO_END_IO_ERROR(bio);
 #else
 		/* The kernel reports a negative errno; store it positive. */
 		if (error)
 			dr->dr_error = -(error);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			dr->dr_error = EIO;
 #endif
 	}
 
 	/* Drop reference acquired by __vdev_disk_physio */
 	(void) vdev_disk_dio_put(dr);
 }
 
 /*
  * Submit a bio using whichever submit_bio() signature the kernel provides:
  * the one-argument form when HAVE_1ARG_SUBMIT_BIO is defined, otherwise
  * the two-argument form with 0 as the first argument.
  */
 static inline void
 vdev_submit_bio_impl(struct bio *bio)
 {
 #ifdef HAVE_1ARG_SUBMIT_BIO
 	submit_bio(bio);
 #else
 	submit_bio(0, bio);
 #endif
 }
 
 /*
  * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
  * replace it with preempt_schedule when all three of CONFIG_ARM64,
  * CONFIG_PREEMPTION and CONFIG_BLK_CGROUP are defined:
  */
 #if defined(CONFIG_ARM64) && \
     defined(CONFIG_PREEMPTION) && \
     defined(CONFIG_BLK_CGROUP)
 #define	preempt_schedule_notrace(x) preempt_schedule(x)
 #endif
 
 #ifdef HAVE_BIO_SET_DEV
 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
 /*
  * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
  * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
  * As a side effect the function was converted to GPL-only.  Define our
  * own version when needed which uses rcu_read_lock_sched().
  */
 #if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
 /*
  * Try to take a reference on 'blkg' inside an rcu_read_lock_sched()
  * section.  Returns true when a reference was taken.
  */
 static inline bool
 vdev_blkg_tryget(struct blkcg_gq *blkg)
 {
 	struct percpu_ref *ref = &blkg->refcnt;
 	unsigned long __percpu *count;
 	bool rc;
 
 	rcu_read_lock_sched();
 
 	if (__ref_is_percpu(ref, &count)) {
 		/* Per-cpu mode: bump this CPU's counter; cannot fail. */
 		this_cpu_inc(*count);
 		rc = true;
 	} else {
 		/* Atomic mode: only succeed if the count is not already zero. */
 #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
 		rc = atomic_long_inc_not_zero(&ref->data->count);
 #else
 		rc = atomic_long_inc_not_zero(&ref->count);
 #endif
 	}
 
 	rcu_read_unlock_sched();
 
 	return (rc);
 }
 #elif defined(HAVE_BLKG_TRYGET)
 #define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
 #endif
 /*
  * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
  * GPL-only bio_associate_blkg() symbol thus inadvertently converting
  * the entire macro.  Provide a minimal version which always assigns the
  * request queue's root_blkg to the bio.
  */
 /*
  * Associate 'bio' with the root blkg of its request queue.  The bio must
  * not have a blkg yet.  If the queue has no root blkg, or a reference on
  * it cannot be taken, the bio is left unassociated.
  */
 static inline void
 vdev_bio_associate_blkg(struct bio *bio)
 {
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *queue = bio->bi_bdev->bd_disk->queue;
 #else
 	struct request_queue *queue = bio->bi_disk->queue;
 #endif
 
 	ASSERT3P(queue, !=, NULL);
 	ASSERT3P(bio->bi_blkg, ==, NULL);
 
 	if (!queue->root_blkg)
 		return;
 	if (!vdev_blkg_tryget(queue->root_blkg))
 		return;
 
 	bio->bi_blkg = queue->root_blkg;
 }
 #define	bio_associate_blkg vdev_bio_associate_blkg
 #endif
 #else
 /*
  * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
  */
 /* Fallback used when HAVE_BIO_SET_DEV is not defined: assign bi_bdev. */
 static inline void
 bio_set_dev(struct bio *bio, struct block_device *bdev)
 {
 	bio->bi_bdev = bdev;
 }
 #endif /* HAVE_BIO_SET_DEV */
 
 /*
  * Submit 'bio' with current->bio_list temporarily cleared, restoring the
  * previous value afterwards.
  * NOTE(review): presumably this keeps the block layer from deferring the
  * bio onto the caller's pending list; confirm against the kernel's
  * submit_bio() behaviour.
  */
 static inline void
 vdev_submit_bio(struct bio *bio)
 {
 	struct bio_list *saved_list = current->bio_list;
 
 	current->bio_list = NULL;
 	vdev_submit_bio_impl(bio);
 	current->bio_list = saved_list;
 }
 
 /*
  * Issue the read or write described by 'zio' to 'bdev' as one or more bio's.
  *
  *	bdev		block device to submit to
  *	zio		the I/O; its io_abd supplies the data pages
  *	io_size		number of bytes to transfer
  *	io_offset	byte offset on the device
  *	rw		operation, passed to bio_set_op_attrs()
  *	flags		extra flags, passed to bio_set_op_attrs(); failfast bits
  *			may be added here via bio_set_flags_failfast()
  *
  * Returns EIO when the range extends past the device size, ENOMEM when
  * bio_alloc() returns NULL, and otherwise 0 once every bio has been
  * submitted.  Each bio completes through vdev_disk_physio_completion().
  *
  * NOTE(review): the range check below reads bd_inode->i_size directly while
  * the message printed for the same condition uses i_size_read().
  * NOTE(review): 'zio' is NULL-checked before the failfast flags are applied
  * but is dereferenced unconditionally elsewhere in this function.
  */
 static int
 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
     size_t io_size, uint64_t io_offset, int rw, int flags)
 {
 	dio_request_t *dr;
 	uint64_t abd_offset;
 	uint64_t bio_offset;
 	int bio_size;
 	/* Initial number of bio slots; doubled each time we have to retry. */
 	int bio_count = 16;
 	/* Never assigned after this point, so the final return yields 0. */
 	int error = 0;
 	struct blk_plug plug;
 
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
 	if (io_offset + io_size > bdev->bd_inode->i_size) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
-		    io_offset, io_size, i_size_read(bdev->bd_inode));
+		    (u_longlong_t)io_offset,
+		    (u_longlong_t)io_size,
+		    (u_longlong_t)i_size_read(bdev->bd_inode));
 		return (SET_ERROR(EIO));
 	}
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
 
 	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
 		bio_set_flags_failfast(bdev, &flags);
 
 	dr->dr_zio = zio;
 
 	/*
 	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
 	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
 	 * can cover at least 128KB and at most 1MB.  When the required number
 	 * of iovec's exceeds this, we are forced to break the IO in multiple
 	 * bio's and wait for them all to complete.  This is likely if the
 	 * recordsize property is increased beyond 1MB.  The default
 	 * bio_count=16 should typically accommodate the maximum-size zio of
 	 * 16MB.
 	 */
 
 	abd_offset = 0;
 	bio_offset = io_offset;
 	bio_size = io_size;
 	/*
 	 * The loop bound is inclusive on purpose: the extra iteration at
 	 * i == dr_bio_count is where "data left but no slot left" is noticed.
 	 */
 	for (int i = 0; i <= dr->dr_bio_count; i++) {
 
 		/* Finished constructing bio's for given buffer */
 		if (bio_size <= 0)
 			break;
 
 		/*
 		 * If additional bio's are required, we have to retry, but
 		 * this should be rare - see the comment above.
 		 */
 		if (dr->dr_bio_count == i) {
 			vdev_disk_dio_free(dr);
 			bio_count *= 2;
 			goto retry;
 		}
 
 		/* bio_alloc() with __GFP_WAIT never returns NULL */
 #ifdef HAVE_BIO_MAX_SEGS
 		dr->dr_bio[i] = bio_alloc(GFP_NOIO, bio_max_segs(
 		    abd_nr_pages_off(zio->io_abd, bio_size, abd_offset)));
 #else
 		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
 		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
 		    BIO_MAX_PAGES));
 #endif
 		if (unlikely(dr->dr_bio[i] == NULL)) {
 			vdev_disk_dio_free(dr);
 			return (SET_ERROR(ENOMEM));
 		}
 
 		/* Matching put called by vdev_disk_physio_completion */
 		vdev_disk_dio_get(dr);
 
 		bio_set_dev(dr->dr_bio[i], bdev);
 		/* Byte offset to sector number (>> 9 divides by 512). */
 		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
 		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
 
 		/* Remaining size is returned to become the new size */
 		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
 		    bio_size, abd_offset);
 
 		/* Advance in buffer and construct another bio if needed */
 		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 	}
 
 	/* Extra reference to protect dio_request during vdev_submit_bio */
 	vdev_disk_dio_get(dr);
 
 	/* Plugging is only used when more than one bio slot was allocated. */
 	if (dr->dr_bio_count > 1)
 		blk_start_plug(&plug);
 
 	/* Submit all bio's associated with this dio */
 	for (int i = 0; i < dr->dr_bio_count; i++) {
 		if (dr->dr_bio[i])
 			vdev_submit_bio(dr->dr_bio[i]);
 	}
 
 	if (dr->dr_bio_count > 1)
 		blk_finish_plug(&plug);
 
 	/* Drop the extra reference taken before submission. */
 	(void) vdev_disk_dio_put(dr);
 
 	return (error);
 }
 
 /*
  * End-of-I/O callback for the flush bio built by vdev_disk_io_flush().
  * Stores the bio's result in the zio, marks the vdev as having no write
  * cache when the device answered EOPNOTSUPP, releases the bio and then
  * hands the zio back via zio_interrupt().
  */
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
 {
 	zio_t *zio = bio->bi_private;
 
 #ifdef HAVE_1ARG_BIO_END_IO_T
 	zio->io_error = BIO_END_IO_ERROR(bio);
 #else
 	zio->io_error = -error;
 #endif
 
 	/* Remember that this device does not support cache flushes. */
 	if (zio->io_error == EOPNOTSUPP)
 		zio->io_vd->vdev_nowritecache = B_TRUE;
 
 	bio_put(bio);
 	ASSERT3S(zio->io_error, >=, 0);
 
 	if (zio->io_error != 0)
 		vdev_disk_error(zio);
 
 	zio_interrupt(zio);
 }
 
 /*
  * Build and submit an empty flush bio for 'bdev' on behalf of 'zio'.
  * Returns ENXIO when the device has no request queue, ENOMEM when the
  * bio cannot be allocated, and 0 once the bio has been submitted; in the
  * success case the zio is completed by vdev_disk_io_flush_completion().
  */
 static int
 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 {
 	struct bio *flush_bio;
 
 	if (bdev_get_queue(bdev) == NULL)
 		return (SET_ERROR(ENXIO));
 
 	/* bio_alloc() with __GFP_WAIT never returns NULL */
 	flush_bio = bio_alloc(GFP_NOIO, 0);
 	if (unlikely(flush_bio == NULL))
 		return (SET_ERROR(ENOMEM));
 
 	flush_bio->bi_end_io = vdev_disk_io_flush_completion;
 	flush_bio->bi_private = zio;
 	bio_set_dev(flush_bio, bdev);
 	bio_set_flush(flush_bio);
 
 	vdev_submit_bio(flush_bio);
 	invalidate_bdev(bdev);
 
 	return (0);
 }
 
 /*
  * Start the I/O described by 'zio' on its disk vdev.
  *
  * The request is dispatched on zio->io_type:
  *	ZIO_TYPE_IOCTL	only DKIOCFLUSHWRITECACHE is handled (as a flush bio);
  *			any other command fails with ENOTSUP
  *	ZIO_TYPE_WRITE,
  *	ZIO_TYPE_READ	issued through __vdev_disk_physio()
  *	ZIO_TYPE_TRIM	issued synchronously with blkdev_issue_discard()
  *	anything else	fails with ENOTSUP
  *
  * vd_lock is taken as a reader for the dispatch and is dropped on every
  * return path.  When a flush or a read/write was submitted successfully
  * this function returns without completing the zio; the bio completion
  * callbacks installed by the helpers do that.
  */
 static void
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	unsigned long trim_flags = 0;
 	int rw, error;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vd == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	rw_enter(&vd->vd_lock, RW_READER);
 
 	/*
 	 * If the vdev is closed, it's likely due to a failed reopen and is
 	 * in the UNAVAIL state.  Nothing to be done here but return failure.
 	 */
 	if (vd->vd_bdev == NULL) {
 		rw_exit(&vd->vd_lock);
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_IOCTL:
 
 		if (!vdev_readable(v)) {
 			rw_exit(&vd->vd_lock);
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
 		}
 
 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
 
 			/* Flushes disabled: complete with io_error untouched. */
 			if (zfs_nocacheflush)
 				break;
 
 			/* Set by the flush completion after EOPNOTSUPP. */
 			if (v->vdev_nowritecache) {
 				zio->io_error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			error = vdev_disk_io_flush(vd->vd_bdev, zio);
 			if (error == 0) {
 				/* Submitted; the bio callback finishes the zio. */
 				rw_exit(&vd->vd_lock);
 				return;
 			}
 
 			zio->io_error = error;
 
 			break;
 
 		default:
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
 		/* Every ioctl path that did not submit a bio ends up here. */
 		rw_exit(&vd->vd_lock);
 		zio_execute(zio);
 		return;
 	case ZIO_TYPE_WRITE:
 		rw = WRITE;
 		break;
 
 	case ZIO_TYPE_READ:
 		rw = READ;
 		break;
 
 	case ZIO_TYPE_TRIM:
 #if defined(BLKDEV_DISCARD_SECURE)
 		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
 			trim_flags |= BLKDEV_DISCARD_SECURE;
 #endif
 		/*
 		 * Offset and size are converted from bytes to 512-byte
 		 * sectors; the negated return value becomes io_error.
 		 */
 		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
 		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
 		    trim_flags);
 
 		rw_exit(&vd->vd_lock);
 		zio_interrupt(zio);
 		return;
 
 	default:
 		rw_exit(&vd->vd_lock);
 		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
 	}
 
 	/* Only reads and writes reach this point. */
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 	error = __vdev_disk_physio(vd->vd_bdev, zio,
 	    zio->io_size, zio->io_offset, rw, 0);
 	rw_exit(&vd->vd_lock);
 
 	/* Submission failed, so no bio callback will complete the zio. */
 	if (error) {
 		zio->io_error = error;
 		zio_interrupt(zio);
 		return;
 	}
 }
 
 /*
  * Post-I/O hook.  Only an EIO result is of interest: the media is then
  * revalidated and, if it turns out to have changed, the asynchronous
  * removal of the device from the configuration is requested.
  */
 static void
 vdev_disk_io_done(zio_t *zio)
 {
 	vdev_t *v;
 	vdev_disk_t *vd;
 
 	if (zio->io_error != EIO)
 		return;
 
 	v = zio->io_vd;
 	vd = v->vdev_tsd;
 
 	if (!zfs_check_media_change(vd->vd_bdev))
 		return;
 
 	invalidate_bdev(vd->vd_bdev);
 	v->vdev_remove_wanted = B_TRUE;
 	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 }
 
 /*
  * Hold callback for disk vdevs.  After asserting that SCL_STATE is held
  * as writer it only evaluates two early-exit conditions; no further work
  * follows them in this implementation.
  */
 static void
 vdev_disk_hold(vdev_t *vd)
 {
 	const char *path;
 
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* A pathname is required, and it has to be absolute. */
 	path = vd->vdev_path;
 	if (path == NULL || path[0] != '/')
 		return;
 
 	/*
 	 * Path and devid info is only prefetched for a device which
 	 * has never been opened.
 	 */
 	if (vd->vdev_tsd != NULL)
 		return;
 
 }
 
 /*
  * Release callback for disk vdevs.  Currently a stub: it only asserts that
  * the SCL_STATE config lock is held as writer and releases nothing.
  */
 static void
 vdev_disk_rele(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* XXX: Implement me as a vnode rele for the device */
 }
 
 /*
  * Operations vector for disk vdevs.  The open/close, I/O start/done and
  * hold/rele entries point at the vdev_disk_* functions in this file, the
  * asize, min_asize and xlate entries use the vdev_default_* helpers, and
  * every remaining callback is left NULL.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_disk_io_start,
 	.vdev_op_io_done = vdev_disk_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_disk_hold,
 	.vdev_op_rele = vdev_disk_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * The zfs_vdev_scheduler module option is deprecated and setting it has
  * no effect.  It is still accepted so that the module keeps loading when
  * the option is present in /etc/modprobe.d/zfs.conf; a successful set
  * merely logs the notice below.
  */
 static int
 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
 {
 	int rc = param_set_charp(val, kp);
 
 	if (rc != 0)
 		return (rc);
 
 	printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
 	    "is not supported.\n");
 
 	return (0);
 }
 
 char *zfs_vdev_scheduler = "unused";
 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
     param_get_charp, &zfs_vdev_scheduler, 0644);
 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
 
 /*
  * Module parameter setter for the minimum auto ashift.  The string is
  * parsed first so the value can be range checked: it must be at least
  * ASHIFT_MIN and no larger than zfs_vdev_max_auto_ashift, otherwise
  * -EINVAL is returned.  Accepted values are stored via param_set_ulong().
  */
 int
 param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint64_t ashift;
 	int rc;
 
 	rc = kstrtoull(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift >= ASHIFT_MIN && ashift <= zfs_vdev_max_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	rc = param_set_ulong(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 /*
  * Module parameter setter for the maximum auto ashift.  The string is
  * parsed first so the value can be range checked: it must not exceed
  * ASHIFT_MAX and must be at least zfs_vdev_min_auto_ashift, otherwise
  * -EINVAL is returned.  Accepted values are stored via param_set_ulong().
  */
 int
 param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint64_t ashift;
 	int rc;
 
 	rc = kstrtoull(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift <= ASHIFT_MAX && ashift >= zfs_vdev_min_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	rc = param_set_ulong(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 02663e8e2e5d..75b1dcc82b5a 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1,11058 +1,11058 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc.
  * Copyright (c) 2011, 2020, Delphix. All rights reserved.
  * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
  * This also makes the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, it's simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal ARC algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * ARC list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each ARC state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an ARC list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
  * It is also possible to register a callback which is run when the
  * arc_meta_limit is reached and no buffers can be safely evicted.  In
  * this case the arc user should drop a reference on some arc buffers so
  * they can be reclaimed and the arc_meta_limit honored.  For example,
  * when using the ZPL each dentry holds a reference on a znode.  These
  * dentries must be pruned before the arc buffer holding the znode can
  * be safely evicted.
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 /*
  * ARC operation:
  *
  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
  * This structure can point either to a block that is still in the cache or to
  * one that is only accessible in an L2 ARC device, or it can provide
  * information about a block that was recently evicted. If a block is
  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
  * information to retrieve it from the L2ARC device. This information is
  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
  * that is in this state cannot access the data directly.
  *
  * Blocks that are actively being referenced or have not been evicted
  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
  * the arc_buf_hdr_t that will point to the data block in memory. A block can
  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
  *
  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
  * ability to store the physical data (b_pabd) associated with the DVA of the
  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
  * it will match its on-disk compression characteristics. This behavior can be
  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
  * compressed ARC functionality is disabled, the b_pabd will point to an
  * uncompressed version of the on-disk data.
  *
  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
  * consumer. The ARC will provide references to this data and will keep it
  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
  * data block and will evict any arc_buf_t that is no longer referenced. The
  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
  * "overhead_size" kstat.
  *
  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
  * compressed form. The typical case is that consumers will want uncompressed
  * data, and when that happens a new data buffer is allocated where the data is
  * decompressed for them to use. Currently the only consumer who wants
  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
  * with the arc_buf_hdr_t.
  *
  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
  * first one is owned by a compressed send consumer (and therefore references
  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
  * used by any other consumer (and has its own uncompressed copy of the data
  * buffer).
  *
  *   arc_buf_hdr_t
  *   +-----------+
  *   | fields    |
  *   | common to |
  *   | L1- and   |
  *   | L2ARC     |
  *   +-----------+
  *   | l2arc_buf_hdr_t
  *   |           |
  *   +-----------+
  *   | l1arc_buf_hdr_t
  *   |           |              arc_buf_t
  *   | b_buf     +------------>+-----------+      arc_buf_t
  *   | b_pabd    +-+           |b_next     +---->+-----------+
  *   +-----------+ |           |-----------|     |b_next     +-->NULL
  *                 |           |b_comp = T |     +-----------+
  *                 |           |b_data     +-+   |b_comp = F |
  *                 |           +-----------+ |   |b_data     +-+
  *                 +->+------+               |   +-----------+ |
  *        compressed  |      |               |                 |
  *           data     |      |<--------------+                 | uncompressed
  *                    +------+          compressed,            |     data
  *                                        shared               +-->+------+
  *                                         data                    |      |
  *                                                                 |      |
  *                                                                 +------+
  *
  * When a consumer reads a block, the ARC must first look to see if the
  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
  * arc_buf_t and either copies uncompressed data into a new data buffer from an
  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
  * hdr is compressed and the desired compression characteristics of the
  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
  * be anywhere in the hdr's list.
  *
  * The diagram below shows an example of an uncompressed ARC hdr that is
  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
  * the last element in the buf list):
  *
  *                arc_buf_hdr_t
  *                +-----------+
  *                |           |
  *                |           |
  *                |           |
  *                +-----------+
  * l2arc_buf_hdr_t|           |
  *                |           |
  *                +-----------+
  * l1arc_buf_hdr_t|           |
  *                |           |                 arc_buf_t    (shared)
  *                |    b_buf  +------------>+---------+      arc_buf_t
  *                |           |             |b_next   +---->+---------+
  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
  *                +-----------+ |           |         |     +---------+
  *                              |           |b_data   +-+   |         |
  *                              |           +---------+ |   |b_data   +-+
  *                              +->+------+             |   +---------+ |
  *                                 |      |             |               |
  *                   uncompressed  |      |             |               |
  *                        data     +------+             |               |
  *                                    ^                 +->+------+     |
  *                                    |       uncompressed |      |     |
  *                                    |           data     |      |     |
  *                                    |                    +------+     |
  *                                    +---------------------------------+
  *
  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
  * since the physical block is about to be rewritten. The new data contents
  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
  * it may compress the data before writing it to disk. The ARC will be called
  * with the transformed data and will bcopy the transformed on-disk block into
  * a newly allocated b_pabd. Writes are always done into buffers which have
  * either been loaned (and hence are new and don't have other readers) or
  * buffers which have been released (and hence have their own hdr, if there
  * were originally other readers of the buf's original hdr). This ensures that
  * the ARC only needs to update a single buf and its hdr after a write occurs.
  *
  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
  * that when compressed ARC is enabled that the L2ARC blocks are identical
  * to the on-disk block in the main data pool. This provides a significant
  * advantage since the ARC can leverage the bp's checksum when reading from the
  * L2ARC to determine if the contents are valid. However, if the compressed
  * ARC is disabled, then the L2ARC's block must be transformed to look
  * like the physical block in the main data pool before comparing the
  * checksum and determining its validity.
  *
  * The L1ARC has a slightly different system for storing encrypted data.
  * Raw (encrypted + possibly compressed) data has a few subtle differences from
  * data that is just compressed. The biggest difference is that it is not
  * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
  * The other difference is that encryption cannot be treated as a suggestion.
  * If a caller would prefer compressed data, but they actually wind up with
  * uncompressed data the worst thing that could happen is there might be a
  * performance hit. If the caller requests encrypted data, however, we must be
  * sure they actually get it or else secret information could be leaked. Raw
  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
  * may have both an encrypted version and a decrypted version of its data at
  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
  * copied out of this header. To avoid complications with b_pabd, raw buffers
  * cannot be shared.
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/zfs_refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/multilist.h>
 #include <sys/abd.h>
 #include <sys/zil.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_zfs.h>
 #include <sys/aggsum.h>
 #include <sys/wmsum.h>
 #include <cityhash.h>
 #include <sys/vdev_trim.h>
 #include <sys/zfs_racct.h>
 #include <sys/zstd/zstd.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 #endif
 
 /*
  * This thread's job is to keep enough free memory in the system, by
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
 static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
 static zthr_t *arc_evict_zthr;
 
 static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 
 /*
  * Count of bytes evicted since boot.
  */
 static uint64_t arc_evict_count;
 
 /*
  * List of arc_evict_waiter_t's, representing threads waiting for the
  * arc_evict_count to reach specific values.
  */
 static list_t arc_evict_waiters;
 
 /*
  * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
  * the requested amount of data to be evicted.  For example, by default for
  * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
  * Since this is above 100%, it ensures that progress is made towards getting
  * arc_size under arc_c.  Since this is finite, it ensures that allocations
  * can still happen, even during the potentially long time that arc_size is
  * more than arc_c.
  */
 int zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
  * dropping the sublist lock and evicting from another sublist. A lower
  * value means we're more likely to evict the "correct" header (i.e. the
  * oldest header in the arc state), but comes with higher overhead
  * (i.e. more invocations of arc_evict_state_impl()).
  */
 int zfs_arc_evict_batch_limit = 10;
 
 /* number of seconds before growing cache again */
 int arc_grow_retry = 5;
 
 /*
  * Minimum time between calls to arc_kmem_reap_soon().
  */
 int arc_kmem_cache_reap_retry_ms = 1000;
 
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 int zfs_arc_overflow_shift = 8;
 
 /* shift of arc_c for calculating both min and max arc_p */
 int arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
 int arc_shrink_shift = 7;
 
 /* percent of pagecache to reclaim arc to */
 #ifdef _KERNEL
 uint_t zfs_arc_pc_percent = 0;
 #endif
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
  * we will still not allow it to grow.
  */
 int			arc_no_grow_shift = 5;
 
 
 /*
  * Minimum lifespan of a prefetch block, and of a prescient prefetch
  * block (initialized in arc_init()).
  * NOTE(review): this comment used to give the unit as clock ticks, but
  * the "_ms" suffix of both names suggests milliseconds -- confirm
  * against arc_init() and the users of these variables.
  */
 static int		arc_min_prefetch_ms;
 static int		arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
  */
 int arc_lotsfree_percent = 10;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  *
  * All but zfs_arc_dnode_reduce_percent and zfs_arc_average_blocksize
  * start out as 0.
  * NOTE(review): presumably 0 means "not set, keep the built-in value"
  * (e.g. zfs_arc_grow_retry vs. arc_grow_retry, zfs_arc_shrink_shift vs.
  * arc_shrink_shift, zfs_arc_p_min_shift vs. arc_p_min_shift) -- confirm
  * against the code which consumes them.
  */
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
 unsigned long zfs_arc_meta_min = 0;
 unsigned long zfs_arc_dnode_limit = 0;
 unsigned long zfs_arc_dnode_reduce_percent = 10;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
 /*
  * ARC dirty data constraints for arc_tempreserve_space() throttle.
  */
 unsigned long zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
 unsigned long zfs_arc_anon_limit_percent = 25;	/* anon block dirty limit */
 unsigned long zfs_arc_pool_dirty_percent = 20;	/* each pool's anon allowance */
 
 /*
  * Enable or disable compressed arc buffers.
  */
 int zfs_compressed_arc_enabled = B_TRUE;
 
 /*
  * ARC will evict meta buffers that exceed arc_meta_limit. This
  * tunable makes arc_meta_limit adjustable for different workloads.
  */
 unsigned long zfs_arc_meta_limit_percent = 75;
 
 /*
  * Percentage that can be consumed by dnodes of ARC meta buffers.
  */
 unsigned long zfs_arc_dnode_limit_percent = 10;
 
 /*
  * These tunables are Linux specific
  *
  * NOTE(review): zfs_arc_min_prefetch_ms, zfs_arc_min_prescient_prefetch_ms
  * and zfs_arc_lotsfree_percent look like the user-settable counterparts
  * of arc_min_prefetch_ms, arc_min_prescient_prefetch_ms and
  * arc_lotsfree_percent defined earlier in this file -- confirm where
  * they are applied.
  */
 unsigned long zfs_arc_sys_free = 0;
 int zfs_arc_min_prefetch_ms = 0;
 int zfs_arc_min_prescient_prefetch_ms = 0;
 int zfs_arc_p_dampener_disable = 1;
 int zfs_arc_meta_prune = 10000;
 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 int zfs_arc_meta_adjust_restarts = 4096;
 int zfs_arc_lotsfree_percent = 10;
 
 /*
  * The 6 states: one global arc_state_t object per ARC state.
  * NOTE(review): going by the names these are the anonymous, MRU, MRU
  * ghost, MFU, MFU ghost and L2ARC-only states -- confirm against the
  * arc_state_t documentation.
  */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
 arc_state_t ARC_mru_ghost;
 arc_state_t ARC_mfu;
 arc_state_t ARC_mfu_ghost;
 arc_state_t ARC_l2c_only;
 
 /*
  * Static initializer for the ARC statistics table: one { name, data type }
  * pair per statistic.  Every entry is KSTAT_DATA_UINT64 except
  * "memory_available_bytes", which is signed (KSTAT_DATA_INT64).  The
  * "other_size" entry exists only when COMPAT_FREEBSD11 is defined.
  * NOTE(review): the initializer is positional, so the entry order must
  * match the field order of arc_stats_t -- confirm against the struct
  * definition whenever an entry is added, removed or moved.
  */
 arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "access_skip",		KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mfu",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mru",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "p",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "compressed_size",		KSTAT_DATA_UINT64 },
 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
 	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "dbuf_size",			KSTAT_DATA_UINT64 },
 	{ "dnode_size",			KSTAT_DATA_UINT64 },
 	{ "bonus_size",			KSTAT_DATA_UINT64 },
 #if defined(COMPAT_FREEBSD11)
 	{ "other_size",			KSTAT_DATA_UINT64 },
 #endif
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_size",			KSTAT_DATA_UINT64 },
 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "mfu_size",			KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mru_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mfu_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_data_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_metadata_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_avg_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_count",		KSTAT_DATA_UINT64 },
 	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_success",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_dh_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_cksum_lb_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "memory_all_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_free_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_available_bytes",	KSTAT_DATA_INT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
 	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 },
 	{ "cached_only_in_progress",	KSTAT_DATA_UINT64 },
 	{ "abd_chunk_waste_size",	KSTAT_DATA_UINT64 },
 };
 
 /* NOTE(review): presumably the live counters behind arc_stats -- confirm. */
 arc_sums_t arc_sums;
 
 /*
  * Raise the 64-bit value of arc_stats.<stat> to `val' if `val' is larger.
  * The current value is re-read and atomic_cas_64() retried for as long as
  * `val' is still greater than what is stored and the swap did not land.
  * `val' is evaluated more than once, so it must have no side effects.
  * The expansion is a bare { } block rather than do { } while (0); take
  * care when using it as the body of an unbraced if/else.
  */
 #define	ARCSTAT_MAX(stat, val) {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 }
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  *
  * The counter that gets bumped is arcstat_<a>_<b>_<stat>, where <a> is
  * stat1 when cond1 is true and notstat1 otherwise, and <b> is stat2 when
  * cond2 is true and notstat2 otherwise.  cond1 and cond2 are each
  * evaluated exactly once.  The expansion is a bare if/else statement (not
  * wrapped in do { } while (0)), so brace any enclosing if/else.
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 /*
  * This macro allows us to use kstats as floating averages. Each time we
  * update this kstat, we first factor it and the update value by
  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the
  * overall average:
  *
  *	new = old - old / ARCSTAT_F_AVG_FACTOR + value / ARCSTAT_F_AVG_FACTOR
  *
  * (integer division).  This macro assumes that integer loads and stores
  * are atomic, but is not safe for multiple writers updating the kstat in
  * parallel (only the last writer's update will remain).
  */
 #define	ARCSTAT_F_AVG_FACTOR	3
 #define	ARCSTAT_F_AVG(stat, value) \
 	do { \
 		uint64_t x = ARCSTAT(stat); \
 		x = x - x / ARCSTAT_F_AVG_FACTOR + \
 		    (value) / ARCSTAT_F_AVG_FACTOR; \
 		ARCSTAT(stat) = x; \
 		_NOTE(CONSTCOND) \
 	} while (0)
 
 /* NOTE(review): presumably the handle of the installed ARC kstat -- confirm. */
 kstat_t			*arc_ksp;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  *
  * Each name below expands to ARCSTAT(<field>), so it can be used wherever
  * ARCSTAT() itself is valid (ARCSTAT_F_AVG() above both reads and assigns
  * through it).
  */
 #define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
 #define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 /* max size for dnodes */
 #define	arc_dnode_size_limit	ARCSTAT(arcstat_dnode_limit)
 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 /*
  * Grow-time stamp and prune bookkeeping.
  * NOTE(review): presumably arc_prune_mtx protects arc_prune_list and
  * arc_prune_taskq runs the registered prune callbacks -- confirm against
  * the code that uses them.
  */
 hrtime_t arc_growtime;
 list_t arc_prune_list;
 kmutex_t arc_prune_mtx;
 taskq_t *arc_prune_taskq;
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
 #define	HDR_L2_READING(hdr)	\
 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 #define	HDR_HAS_RABD(hdr)	\
 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
 	(hdr)->b_crypt_hdr.b_rabd != NULL)
 #define	HDR_ENCRYPTED(hdr)	\
 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 #define	HDR_AUTHENTICATED(hdr)	\
 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 
 /*
  * For storing compression mode in b_flags.  The compression value occupies
  * SPA_COMPRESSBITS bits starting at bit HDR_COMPRESS_OFFSET, which is
  * derived from the position of ARC_FLAG_COMPRESS_0.
  */
 #define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
 
 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 /*
  * Note: the expansion of HDR_SET_COMPRESS() already ends in a semicolon,
  * so it must not be used as the unbraced body of an if that has an else.
  */
 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 
 /* Per-buffer tests: last buf in the b_next chain, and arc_buf_t flag bits. */
 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 
 /*
  * Other sizes
  *
  * The three header flavours are prefixes of arc_buf_hdr_t:
  *   HDR_FULL_CRYPT_SIZE - the whole structure;
  *   HDR_FULL_SIZE       - everything before b_crypt_hdr;
  *   HDR_L2ONLY_SIZE     - everything before b_l1hdr.
  * buf_init() uses them as the object sizes of the header kmem caches.
  */
 
 #define	HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
 /* Number of hash locks; a power of two, since BUF_HASH_LOCK() masks with it. */
 #define	BUF_LOCKS 2048
 typedef struct buf_hash_table {
 	/* table size minus one (set in buf_init()); ANDed with the hash */
 	uint64_t ht_mask;
 	/* array of chain heads; chains are linked through b_hash_next */
 	arc_buf_hdr_t **ht_table;
 	/* a bucket's lock is ht_locks[index & (BUF_LOCKS - 1)] */
 	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 /*
  * BUF_HASH_INDEX() maps a block identity to its bucket in ht_table.
  * BUF_HASH_LOCK() maps a bucket index to the mutex covering it; several
  * buckets share one lock because only the low bits of the index are used.
  * HDR_LOCK() combines the two for a header's current identity.
  *
  * Macro arguments are fully parenthesized so that expression arguments
  * (e.g. "a | b" or "cond ? h1 : h2") are handled correctly.
  */
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash((spa), (dva), (birth)) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK(idx)	\
 	(&buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
 	(hdr)->b_birth)))
 
 /* CRC-64 lookup table, one entry per byte value; filled in by buf_init(). */
 uint64_t zfs_crc64_table[256];
 
 /*
  * Level 2 ARC
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		2			/* num of writes */
 
 /*
  * If we discover during ARC scan any buffers to be compressed, we boost
  * our headroom for the next scanning cycle by this percentage multiple.
  */
 #define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 /*
  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
  * and each state has two types, data and metadata (2 x 2 = 4).
  */
 #define	L2ARC_FEED_TYPES	4
 
 /* L2ARC Performance Tunables (defaults come from the constants above) */
 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 unsigned long l2arc_headroom = L2ARC_HEADROOM;		/* # of dev writes */
 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
 int l2arc_norw = B_FALSE;			/* no reads during writes */
 int l2arc_meta_percent = 33;			/* limit on headers size */
 
 /*
  * L2ARC Internals
  *
  * NOTE(review): each upper-case list object is paired with a lower-case
  * pointer; presumably the pointer is set to the object's address during
  * L2ARC initialization and all other code goes through it -- confirm.
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 /*
  * State carried along with an L2ARC read.
  * NOTE(review): presumably handed to l2arc_read_done() as the zio's
  * private data -- confirm at the call site.
  */
 typedef struct l2arc_read_callback {
 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	abd_t			*l2rcb_abd;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 /*
  * One deferred free: an abd, its size and its content type, linked onto a
  * list through l2df_list_node.
  * NOTE(review): presumably queued on l2arc_free_on_write and released by
  * l2arc_do_free_on_write() -- confirm.
  */
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	abd_t		*l2df_abd;
 	size_t		l2df_size;
 	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
 /* Bit flags (each a distinct power of two) that may be ORed together. */
 typedef enum arc_fill_flags {
 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
 } arc_fill_flags_t;
 
 /* Overflow severity, in increasing order (ARC_OVF_NONE is 0). */
 typedef enum arc_ovf_level {
 	ARC_OVF_NONE,			/* ARC within target size. */
 	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
 	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
 } arc_ovf_level_t;
 
 /*
  * Synchronization for the L2ARC feed thread.
  * NOTE(review): presumably l2arc_thread_exit is set to ask the feed thread
  * to stop and the cv is used to wake it / wait for it -- confirm.
  */
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 /* Synchronization for the L2ARC rebuild thread(s). */
 static kmutex_t l2arc_rebuild_thr_lock;
 static kcondvar_t l2arc_rebuild_thr_cv;
 
 /*
  * Bit flags, combinable with OR.
  * NOTE(review): presumably the `int' argument of arc_hdr_alloc_abd()
  * declared below -- confirm.
  */
 enum arc_hdr_alloc_flags {
 	ARC_HDR_ALLOC_RDATA = 0x1,
 	ARC_HDR_DO_ADAPT = 0x2,
 };
 
 
 /* Forward declarations of file-local helpers defined later in this file. */
 
 /* data buffer allocation and release */
 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 static void arc_buf_watch(arc_buf_t *);
 
 /* header type and flag helpers */
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 /* L2ARC helpers */
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only);
 
 /*
  * Convenience wrappers around l2arc_hdr_arcstats_update(hdr, incr,
  * state_only): "increment"/"decrement" select incr = B_TRUE/B_FALSE and
  * the "_state" variants pass state_only = B_TRUE.
  */
 #define	l2arc_hdr_arcstats_increment(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 #define	l2arc_hdr_arcstats_decrement(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 #define	l2arc_hdr_arcstats_increment_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 #define	l2arc_hdr_arcstats_decrement_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 
 /*
  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
  * 		metadata and data are cached from ARC into L2ARC.
  * 		The default is 0.  NOTE(review): presumably 0 means that
  * 		both MRU and MFU buffers are eligible -- confirm.
  */
 int l2arc_mfuonly = 0;
 
 /*
  * L2ARC TRIM
  * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
  * 		the current write size (l2arc_write_max) we should TRIM if we
  * 		have filled the device. It is defined as a percentage of the
  * 		write size. If set to 100 we trim twice the space required to
  * 		accommodate upcoming writes. A minimum of 64MB will be trimmed.
  * 		It also enables TRIM of the whole L2ARC device upon creation or
  * 		addition to an existing pool or if the header of the device is
  * 		invalid upon importing a pool or onlining a cache device. The
  * 		default is 0, which disables TRIM on L2ARC altogether as it can
  * 		put significant stress on the underlying storage devices. This
  * 		will vary depending on how well the specific device handles
  * 		these commands.
  */
 unsigned long l2arc_trim_ahead = 0;
 
 /*
  * Performance tuning of L2ARC persistence:
  *
  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
  * 		an L2ARC device (either at pool import or later) will attempt
  * 		to rebuild L2ARC buffer contents.
  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
  * 		whether log blocks are written to the L2ARC device. If the L2ARC
  * 		device is less than 1GB, the amount of data l2arc_evict()
  * 		evicts is significant compared to the amount of restored L2ARC
  * 		data. In this case do not write log blocks in L2ARC in order
  * 		not to waste space.
  *
  * The default threshold below is 1024 * 1024 * 1024 bytes, i.e. the 1GB
  * mentioned above.
  */
 int l2arc_rebuild_enabled = B_TRUE;
 unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
 
 /*
  * Declarations for the L2ARC persistence code.  Everything is file-local
  * except l2arc_rebuild_vdev(), l2arc_log_blkptr_valid() and
  * l2arc_range_check_overlap(), which have external linkage.
  */
 
 /* L2ARC persistence rebuild control routines. */
 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
 static void l2arc_dev_rebuild_thread(void *arg);
 static int l2arc_rebuild(l2arc_dev_t *dev);
 
 /* L2ARC persistence read I/O routines. */
 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
 static int l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io);
 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
 static void l2arc_log_blk_fetch_abort(zio_t *zio);
 
 /* L2ARC persistence block restoration routines. */
 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
     l2arc_dev_t *dev);
 
 /* L2ARC persistence write I/O routines. */
 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     l2arc_write_callback_t *cb);
 
 /* L2ARC persistence auxiliary routines. */
 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *lbp);
 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
     const arc_buf_hdr_t *ab);
 boolean_t l2arc_range_check_overlap(uint64_t bottom,
     uint64_t top, uint64_t check);
 static void l2arc_blk_fetch_done(zio_t *zio);
 static inline uint64_t
     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
 
 /*
  * Hash a block identity (spa, the two DVA words, birth) with Cityhash.
  * Cityhash is fast and has good hash properties without requiring any
  * large static buffers.
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	const uint64_t word0 = dva->dva_word[0];
 	const uint64_t word1 = dva->dva_word[1];
 
 	return (cityhash4(spa, word0, word1, birth));
 }
 
 #define	HDR_EMPTY(hdr)						\
 	((hdr)->b_dva.dva_word[0] == 0 &&			\
 	(hdr)->b_dva.dva_word[1] == 0)
 
 #define	HDR_EMPTY_OR_LOCKED(hdr)				\
 	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 
 /*
  * True when hdr's identity (both DVA words, birth and spa) matches the
  * given one.  The whole expansion and every argument are parenthesized so
  * the macro can be negated or combined with other operators, and so that
  * expression arguments are compared as a unit.
  */
 #define	HDR_EQUAL(spa, dva, birth, hdr)				\
 	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
 
 /*
  * Clear a header's identity: zero both DVA words and the birth value.
  * b_spa is left untouched.
  */
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	dva_t *dva = &hdr->b_dva;
 
 	dva->dva_word[0] = 0;
 	dva->dva_word[1] = 0;
 	hdr->b_birth = 0;
 }
 
 /*
  * Look up the header matching (spa, bp) in the hash table.
  *
  * On a hit the header is returned with its hash lock still held, and
  * *lockp is set to that lock so the caller can release it.  On a miss
  * the lock is dropped, *lockp is set to NULL and NULL is returned.
  */
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *cur;
 
 	mutex_enter(hash_lock);
 
 	/* Walk the chain until it ends or an identical identity is found. */
 	cur = buf_hash_table.ht_table[idx];
 	while (cur != NULL && !(HDR_EQUAL(spa, dva, birth, cur)))
 		cur = cur->b_hash_next;
 
 	if (cur == NULL) {
 		mutex_exit(hash_lock);
 		*lockp = NULL;
 		return (NULL);
 	}
 
 	*lockp = hash_lock;
 	return (cur);
 }
 
 /*
  * Add hdr to the hash table.
  *
  * If the table already holds a header with the same identity, that
  * header is returned and hdr is not inserted; otherwise hdr is put at
  * the head of its chain and NULL is returned.
  *
  * When lockp is non-NULL the hash lock is taken here and handed back
  * through *lockp (it stays held on both return paths).  When lockp is
  * NULL the caller must already hold the hash lock.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *cur;
 	uint32_t chain_len = 0;
 
 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 	ASSERT(hdr->b_birth != 0);
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (lockp == NULL) {
 		ASSERT(MUTEX_HELD(hash_lock));
 	} else {
 		*lockp = hash_lock;
 		mutex_enter(hash_lock);
 	}
 
 	/* Look for a duplicate while measuring the current chain length. */
 	cur = buf_hash_table.ht_table[idx];
 	while (cur != NULL) {
 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, cur))
 			return (cur);
 		cur = cur->b_hash_next;
 		chain_len++;
 	}
 
 	/* No duplicate: push hdr onto the front of the chain. */
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/*
 	 * Hash table statistics: a non-empty bucket means a collision, and
 	 * a bucket that held exactly one entry has just become a chain.
 	 */
 	if (chain_len > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		if (chain_len == 1)
 			ARCSTAT_BUMP(arcstat_hash_chains);
 
 		ARCSTAT_MAX(arcstat_hash_chain_max, chain_len);
 	}
 	uint64_t nelem = atomic_inc_64_nv(
 	    &arc_stats.arcstat_hash_elements.value.ui64);
 	ARCSTAT_MAX(arcstat_hash_elements_max, nelem);
 
 	return (NULL);
 }
 
 /*
  * Unlink hdr from its hash chain.  The caller must hold the hash lock
  * for hdr's bucket and hdr must currently be in the table.
  */
 static void
 buf_hash_remove(arc_buf_hdr_t *hdr)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	arc_buf_hdr_t **linkp;
 	arc_buf_hdr_t *head;
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
 	/* Find the link (bucket head or a b_hash_next) that points at hdr. */
 	for (linkp = &buf_hash_table.ht_table[idx]; *linkp != hdr;
 	    linkp = &(*linkp)->b_hash_next) {
 		ASSERT3P(*linkp, !=, NULL);
 	}
 	*linkp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64);
 
 	/* A bucket left with exactly one entry is no longer a chain. */
 	head = buf_hash_table.ht_table[idx];
 	if (head != NULL && head->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  *
  * The four caches below are created in buf_init() and destroyed in
  * buf_fini(): full headers, full headers with the crypt part, L2-only
  * headers, and arc_buf_t structures.
  */
 
 static kmem_cache_t *hdr_full_cache;
 static kmem_cache_t *hdr_full_crypt_cache;
 static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 /*
  * Tear down what buf_init() set up: free the hash table, destroy the
  * hash locks and destroy the header/buf kmem caches.
  */
 static void
 buf_fini(void)
 {
 	size_t ht_bytes = (buf_hash_table.ht_mask + 1) * sizeof (void *);
 
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_free() in the linux kernel.
 	 */
 	vmem_free(buf_hash_table.ht_table, ht_bytes);
 #else
 	kmem_free(buf_hash_table.ht_table, ht_bytes);
 #endif
 
 	for (int lock = 0; lock < BUF_LOCKS; lock++)
 		mutex_destroy(BUF_HASH_LOCK(lock));
 
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_full_crypt_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * Constructor callback for the full-header cache - called when the cache
  * is empty and a new header is requested.  Zeroes the first HDR_FULL_SIZE
  * bytes, initializes the L1 synchronization objects and list links, and
  * charges the header to ARC_SPACE_HDRS.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *h = vbuf;
 
 	bzero(h, HDR_FULL_SIZE);
 	h->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	/* synchronization objects of the L1 part */
 	cv_init(&h->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
 	zfs_refcount_create(&h->b_l1hdr.b_refcnt);
 	mutex_init(&h->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* list linkage (b_arc_node is initialized as both kinds of link) */
 	list_link_init(&h->b_l1hdr.b_arc_node);
 	list_link_init(&h->b_l2hdr.b_l2node);
 	multilist_link_init(&h->b_l1hdr.b_arc_node);
 
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Constructor for full headers that carry the crypt part: runs the plain
  * full-header constructor, then zeroes b_crypt_hdr and accounts for its
  * extra space.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *h = vbuf;
 
 	(void) hdr_full_cons(vbuf, unused, kmflag);
 
 	bzero(&h->b_crypt_hdr, sizeof (h->b_crypt_hdr));
 	arc_space_consume(sizeof (h->b_crypt_hdr), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Constructor for L2-only headers: zero the HDR_L2ONLY_SIZE bytes of the
  * object and charge them to ARC_SPACE_L2HDRS.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *h = vbuf;
 
 	bzero(h, HDR_L2ONLY_SIZE);
 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 
 	return (0);
 }
 
 /*
  * Constructor for arc_buf_t objects: zero the structure, initialize its
  * eviction lock and charge its size to ARC_SPACE_HDRS.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *b = vbuf;
 
 	bzero(b, sizeof (*b));
 	mutex_init(&b->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_space_consume(sizeof (*b), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Destructor callback for the full-header cache - called when a cached
  * header is no longer required.  The header must already be empty and off
  * any multilist; this undoes what hdr_full_cons() set up and returns the
  * space charged there.
  */
 /* ARGSUSED */
 static void
 hdr_full_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *h = vbuf;
 
 	ASSERT(HDR_EMPTY(h));
 
 	cv_destroy(&h->b_l1hdr.b_cv);
 	zfs_refcount_destroy(&h->b_l1hdr.b_refcnt);
 	mutex_destroy(&h->b_l1hdr.b_freeze_lock);
 
 	ASSERT(!multilist_link_active(&h->b_l1hdr.b_arc_node));
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 /*
  * Destructor for full headers with the crypt part: run the plain
  * full-header destructor, then return the space of b_crypt_hdr.
  */
 /* ARGSUSED */
 static void
 hdr_full_crypt_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *h = vbuf;
 
 	hdr_full_dest(vbuf, unused);
 
 	arc_space_return(sizeof (h->b_crypt_hdr), ARC_SPACE_HDRS);
 }
 
 /* ARGSUSED */
 static void
 hdr_l2only_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr __maybe_unused = vbuf;
 
 	ASSERT(HDR_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /*
  * Destructor for arc_buf_t objects: destroy the eviction lock and return
  * the space charged by buf_cons().
  */
 /* ARGSUSED */
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *b = vbuf;
 
 	mutex_destroy(&b->b_evict_lock);
 	arc_space_return(sizeof (*b), ARC_SPACE_HDRS);
 }
 
 /*
  * Set up the ARC hash table, the header/buf kmem caches, the CRC-64
  * lookup table and the hash locks.
  */
 static void
 buf_init(void)
 {
 	uint64_t hsize = 1ULL << 12;
 	int i, j;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory())
 		hsize <<= 1;
 
 	/* Allocate the table, halving the size after each failed attempt. */
 	for (;;) {
 		buf_hash_table.ht_mask = hsize - 1;
 #if defined(_KERNEL)
 		/*
 		 * Large allocations which do not require contiguous pages
 		 * should be using the vmem allocator in the linux kernel
 		 */
 		buf_hash_table.ht_table =
 		    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 #else
 		buf_hash_table.ht_table =
 		    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 #endif
 		if (buf_hash_table.ht_table != NULL)
 			break;
 
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
 	}
 
 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
 	hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
 	    HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
 	    NULL, NULL, NULL, 0);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	/*
 	 * Build the CRC-64 table: for every byte value, shift right eight
 	 * times, XORing in the polynomial whenever the low bit was set.
 	 */
 	for (i = 0; i < 256; i++) {
 		uint64_t crc = i;
 
 		for (j = 0; j < 8; j++)
 			crc = (crc >> 1) ^ (-(crc & 1) & ZFS_CRC64_POLY);
 		zfs_crc64_table[i] = crc;
 	}
 
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
 }
 
 /* hz / 16 clock ticks, i.e. 1/16 of a second when hz is ticks per second. */
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 /*
  * Size that the buf occupies in memory: the physical (compressed) size
  * when the buf is compressed, the logical size otherwise.  Use this to
  * get a buf's size unless you explicitly need the logical size.
  */
 uint64_t
 arc_buf_size(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (ARC_BUF_COMPRESSED(buf))
 		return (HDR_GET_PSIZE(hdr));
 
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /* Logical size of the buf, as recorded in its header. */
 uint64_t
 arc_buf_lsize(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Returns B_TRUE if the buffer is encrypted in memory, B_FALSE otherwise.
  * An encrypted buffer can be decrypted by calling arc_untransform().
  */
 boolean_t
 arc_is_encrypted(arc_buf_t *buf)
 {
 	if (ARC_BUF_ENCRYPTED(buf))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Returns B_TRUE if the buffer's header is flagged NOAUTH, i.e. the data
  * it represents has not had its MAC verified yet.
  */
 boolean_t
 arc_is_unauthenticated(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (HDR_NOAUTH(hdr))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Copy the raw crypto parameters of a protected buffer out of its header:
  * salt (ZIO_DATA_SALT_LEN bytes), iv (ZIO_DATA_IV_LEN bytes) and mac
  * (ZIO_DATA_MAC_LEN bytes).  *byteorder is set to the host byte order
  * when the header's byteswap value is DMU_BSWAP_NUMFUNCS, and to the
  * opposite order otherwise.
  */
 void
 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
     uint8_t *iv, uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 
 	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS)
 		*byteorder = ZFS_HOST_BYTEORDER;
 	else
 		*byteorder = !ZFS_HOST_BYTEORDER;
 }
 
 /*
  * Indicates how this buffer is compressed in memory: the header's
  * compression value when the buf is compressed, ZIO_COMPRESS_OFF when it
  * is not.  A compressed buffer can be made normally readable with
  * arc_untransform() as long as it is also unencrypted.
  */
 enum zio_compress
 arc_get_compression(arc_buf_t *buf)
 {
 	if (!ARC_BUF_COMPRESSED(buf))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(buf->b_hdr));
 }
 
 /*
  * Return the compression algorithm used to store this data in the ARC. If
  * ARC compression is enabled or this is an encrypted block, this will be
  * the same as what's used to store it on-disk. Otherwise, this will be
  * ZIO_COMPRESS_OFF.
  */
 static inline enum zio_compress
 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
 {
 	if (!HDR_COMPRESSION_ENABLED(hdr))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(hdr));
 }
 
 /* Compression level recorded in the buf's header. */
 uint8_t
 arc_get_complevel(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (hdr->b_complevel);
 }
 
 /*
  * A buf shares its data with the header when it has data, the header has
  * a linear b_pabd, and the buf's data pointer is that abd's buffer.  When
  * that is the case the shared flags on both hdr and buf must be set, and
  * the buf must be compressed or the last one on the hdr.
  */
 static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
 	boolean_t shared = B_FALSE;
 
 	if (buf->b_data != NULL) {
 		abd_t *pabd = buf->b_hdr->b_l1hdr.b_pabd;
 
 		if (pabd != NULL && abd_is_linear(pabd) &&
 		    buf->b_data == abd_to_buf(pabd))
 			shared = B_TRUE;
 	}
 
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
 	IMPLY(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 
 	/*
 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
 	 * already being shared" requirement prevents us from doing that.
 	 */
 
 	return (shared);
 }
 
 /*
  * Release the freeze checksum attached to this header, if there is one,
  * and clear the pointer.  Done under b_freeze_lock; a header without a
  * checksum is left untouched.
  */
 static inline void
 arc_cksum_free(arc_buf_hdr_t *hdr)
 {
 	kmutex_t *freeze_lock;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	freeze_lock = &hdr->b_l1hdr.b_freeze_lock;
 	mutex_enter(freeze_lock);
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
 		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_l1hdr.b_freeze_cksum = NULL;
 	}
 	mutex_exit(freeze_lock);
 }
 
 /*
  * Return true iff at least one of the bufs on hdr is not compressed.
  * Encrypted buffers count as compressed.
  */
 static boolean_t
 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
 {
 	arc_buf_t *b;
 
 	ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
 
 	/* Skip over compressed bufs; stop at the first uncompressed one. */
 	b = hdr->b_l1hdr.b_buf;
 	while (b != NULL && ARC_BUF_COMPRESSED(b))
 		b = b->b_next;
 
 	return (b != NULL ? B_TRUE : B_FALSE);
 }
 
 
 /*
  * With ZFS_DEBUG_MODIFY set, recompute the checksum of the buf's data and
  * panic if it differs from the one stored in the hdr.  Nothing is done
  * when the debug flag is clear, the buf is compressed, the hdr has no
  * stored checksum, or the hdr is flagged with an I/O error.
  */
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *freeze_lock;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY) || ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	freeze_lock = &hdr->b_l1hdr.b_freeze_lock;
 	mutex_enter(freeze_lock);
 
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL && !HDR_IO_ERROR(hdr)) {
 		zio_cksum_t zc;
 
 		fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
 		if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 			panic("buffer modified while frozen!");
 	}
 
 	mutex_exit(freeze_lock);
 }
 
 /*
  * This function makes the assumption that data stored in the L2ARC
  * will be transformed exactly as it is in the main pool. Because of
  * this we can verify the checksum against the reading process's bp.
  * Returns B_TRUE when the checksum of the zio's data matches.
  */
 static boolean_t
 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
 	int error;
 
 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
 
 	/*
 	 * Block pointers always store the checksum for the logical data.
 	 * If the block pointer has the gang bit set, then the checksum
 	 * it represents is for the reconstituted data and not for an
 	 * individual gang member. The zio pipeline, however, must be able to
 	 * determine the checksum of each of the gang constituents so it
 	 * treats the checksum comparison differently than what we need
 	 * for l2arc blocks. This prevents us from using the
 	 * zio_checksum_error() interface directly. Instead we must call the
 	 * zio_checksum_error_impl() so that we can ensure the checksum is
 	 * generated using the correct checksum algorithm and accounts for the
 	 * logical I/O size and not just a gang fragment.
 	 */
 	error = zio_checksum_error_impl(zio->io_spa, zio->io_bp,
 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
 	    zio->io_offset, NULL);
 
 	return (error == 0);
 }
 
 /*
  * When ZFS_DEBUG_MODIFY is enabled, checksum the buf's data and store the
  * result on the hdr so that a later modification of the buf can be
  * detected. Only uncompressed bufs are checksummed; if the buf is
  * compressed or the hdr already carries a checksum, nothing is done.
  * After a checksum has been recorded the buf is handed to arc_buf_watch().
  */
 static void
 arc_cksum_compute(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t recorded = B_FALSE;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	if (hdr->b_l1hdr.b_freeze_cksum == NULL && !ARC_BUF_COMPRESSED(buf)) {
 		ASSERT(!ARC_BUF_ENCRYPTED(buf));
 		ASSERT(!ARC_BUF_COMPRESSED(buf));
 		hdr->b_l1hdr.b_freeze_cksum =
 		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 		fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
 		    hdr->b_l1hdr.b_freeze_cksum);
 		recorded = B_TRUE;
 	}
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 
 	/* The watch is armed only after the freeze lock has been dropped. */
 	if (recorded)
 		arc_buf_watch(buf);
 }
 
 #ifndef _KERNEL
 /*
  * SIGSEGV handler for userland (non-_KERNEL) builds: panics, reporting the
  * faulting address taken from the siginfo. The sig and unused arguments are
  * not examined.
  */
 void
 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 {
 	long fault_addr = (long)si->si_addr;
 
 	panic("Got SIGSEGV at address: 0x%lx\n", fault_addr);
 }
 #endif
 
 /*
  * Userland only: when arc_watch is enabled, make the buf's data readable
  * and writable again with mprotect(). Compiles to nothing in the kernel.
  *
  * The mprotect() call is wrapped in VERIFY0() rather than ASSERT0() so that
  * the protection change is still performed (and its result still checked)
  * in builds where assertions are compiled out.
  */
 /* ARGSUSED */
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		VERIFY0(mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ | PROT_WRITE));
 	}
 #endif
 }
 
 /*
  * Userland only: when arc_watch is enabled, make the buf's data read-only
  * with mprotect(). Compiles to nothing in the kernel.
  *
  * The mprotect() call is wrapped in VERIFY0() rather than ASSERT0() so that
  * the protection change is still performed (and its result still checked)
  * in builds where assertions are compiled out.
  */
 /* ARGSUSED */
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		VERIFY0(mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ));
 	}
 #endif
 }
 
 /*
  * Derive the buffer contents type (data vs. metadata) from the hdr's flags
  * and verify that it agrees with the type recorded in hdr->b_type.
  */
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
 	arc_buf_contents_t type =
 	    HDR_ISTYPE_METADATA(hdr) ? ARC_BUFC_METADATA : ARC_BUFC_DATA;
 
 	VERIFY3U(hdr->b_type, ==, type);
 	return (type);
 }
 
 /*
  * Return B_TRUE if the buf's hdr is flagged as holding metadata.
  */
 boolean_t
 arc_is_metadata(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (HDR_ISTYPE_METADATA(hdr) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Translate a buffer contents type into the matching hdr flag bits.
  * Panics on any type other than ARC_BUFC_DATA or ARC_BUFC_METADATA.
  */
 static uint32_t
 arc_bufc_to_flags(arc_buf_contents_t type)
 {
 	/* Normal data is represented by the metadata bit being clear. */
 	if (type == ARC_BUFC_DATA)
 		return (0);
 
 	if (type == ARC_BUFC_METADATA)
 		return (ARC_FLAG_BUFC_METADATA);
 
 	panic("undefined ARC buffer type!");
 	return ((uint32_t)-1);
 }
 
 /*
  * Thaw an anonymous buf: verify its data against the saved checksum and
  * then, for uncompressed bufs, drop the hdr's checksum and remove the
  * watch on the buf's data.
  */
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
 	arc_cksum_verify(buf);
 
 	/* Compressed buffers never touch b_freeze_cksum. */
 	if (!ARC_BUF_COMPRESSED(buf)) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 	}
 }
 
 /*
  * Freeze a buf: when ZFS_DEBUG_MODIFY is enabled and the buf is not
  * compressed, record a checksum of its data via arc_cksum_compute().
  */
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY) || ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
 	arc_cksum_compute(buf);
 }
 
 /*
  * b_flags of an arc_buf_hdr_t must never be written directly; go through
  * arc_hdr_set_flags() / arc_hdr_clear_flags() so that updates happen in a
  * thread-safe way. The caller must either hold the hash_lock or the hdr
  * must be undiscoverable, which guarantees no other thread is racing with
  * the update.
  *
  * arc_hdr_set_flags() turns on the given flag bits.
  */
 static inline void
 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags | flags;
 }
 
 /*
  * Turn off the given flag bits in the hdr's b_flags. The hdr must be empty
  * or its hash lock must be held (asserted).
  */
 static inline void
 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags & ~flags;
 }
 
 /*
  * Record the compression algorithm in the hdr's b_flags. The compression
  * bits need special handling because bits have to be cleared and set at
  * the same time, so consumers must use this function to keep the update
  * thread-safe.
  */
 static void
 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Holes and embedded blocks always have a psize of 0, so for them
 	 * the compression of the blkptr is ignored and the hdr is marked as
 	 * not holding compressed data. The same applies whenever compressed
 	 * ARC is disabled.
 	 */
 	if (zfs_compressed_arc_enabled && HDR_GET_PSIZE(hdr) != 0) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
 	}
 
 	/* The algorithm itself is always stored, whatever the branch above. */
 	HDR_SET_COMPRESS(hdr, cmp);
 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
 }
 
 /*
  * Search the hdr for a sibling buf that already holds decompressed data.
  * If one is found its data is copied into buf and B_TRUE is returned;
  * otherwise buf is left untouched and B_FALSE is returned.
  */
 static boolean_t
 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_buf_t *src;
 	boolean_t copied;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 
 	/*
 	 * Stop at the first buf that is not our own (we can't copy from
 	 * ourselves) and that is not compressed.
 	 */
 	for (src = hdr->b_l1hdr.b_buf; src != NULL; src = src->b_next) {
 		if (src != buf && !ARC_BUF_COMPRESSED(src))
 			break;
 	}
 
 	if (src != NULL) {
 		bcopy(src->b_data, buf->b_data, arc_buf_size(buf));
 		copied = B_TRUE;
 	} else {
 		copied = B_FALSE;
 	}
 
 	/*
 	 * If no decompressed buf was found, the hdr should not carry a
 	 * checksum either (and vice versa).
 	 */
 	if (zfs_flags & ZFS_DEBUG_MODIFY)
 		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
 
 	return (copied);
 }
 
 /*
  * Allocate an ARC buf header in an evicted, L2-cached state. l2arc
  * reconstruction uses this to create empty ARC buffers that bypass the
  * usual disk->arc->l2arc path and instead appear in the reverse order,
  * i.e. l2arc->arc.
  *
  * The header comes from hdr_l2only_cache and is populated from the
  * arguments; the spa guid is taken from the l2arc device's vdev.
  */
 static arc_buf_hdr_t *
 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
     dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
     enum zio_compress compress, uint8_t complevel, boolean_t protected,
     boolean_t prefetch, arc_state_type_t arcs_state)
 {
 	arc_buf_hdr_t *hdr;
 	arc_flags_t initial_flags;
 
 	ASSERT(size != 0);
 
 	/* Gather every flag up front so they can be applied in one call. */
 	initial_flags = arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR;
 	if (protected)
 		initial_flags |= ARC_FLAG_PROTECTED;
 	if (prefetch)
 		initial_flags |= ARC_FLAG_PREFETCH;
 
 	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
 	hdr->b_birth = birth;
 	hdr->b_type = type;
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, initial_flags);
 
 	/* The sizes must be in place before the compression bits are set. */
 	HDR_SET_LSIZE(hdr, size);
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = complevel;
 
 	hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
 
 	/* Identity is assigned last, after all flag updates are done. */
 	hdr->b_dva = dva;
 
 	hdr->b_l2hdr.b_arcs_state = arcs_state;
 	hdr->b_l2hdr.b_daddr = daddr;
 	hdr->b_l2hdr.b_dev = dev;
 
 	return (hdr);
 }
 
 /*
  * Return the size of the block (b_pabd) stored in the arc_buf_hdr_t: the
  * physical size when the hdr's data is kept compressed and psize is
  * non-zero, the logical size otherwise.
  */
 static uint64_t
 arc_hdr_size(arc_buf_hdr_t *hdr)
 {
 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 	    HDR_GET_PSIZE(hdr) > 0)
 		return (HDR_GET_PSIZE(hdr));
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Verify the MAC of an authenticated hdr's data (b_l1hdr.b_pabd). On
  * success ARC_FLAG_NOAUTH is cleared. An ENOENT result from the MAC
  * routines is tolerated and reported as success without clearing the flag
  * (authentication is best effort and only happens when the key is
  * available). Any other failure is returned to the caller.
  */
 static int
 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 {
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	abd_t *mac_abd = hdr->b_l1hdr.b_pabd;
 	void *scratch = NULL;
 	int ret;
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_AUTHENTICATED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * The MAC covers the compressed, on-disk form of the data. With
 	 * compressed arc disabled only the decompressed data is at hand, so
 	 * recompress it into a temporary abd for the MAC check. The cost is
 	 * expected to be small because most objects in an encrypted objset
 	 * are encrypted (rather than merely authenticated) anyway.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		uint64_t csize;
 
 		scratch = zio_buf_alloc(lsize);
 		mac_abd = abd_get_from_buf(scratch, lsize);
 		abd_take_ownership_of_buf(mac_abd, B_TRUE);
 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, scratch, lsize, hdr->b_complevel);
 		ASSERT3U(csize, <=, psize);
 		/* Zero the gap between the compressed size and psize. */
 		abd_zero_off(mac_abd, csize, psize - csize);
 	}
 
 	/* Objset blocks use a dedicated MAC routine; all others share one. */
 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
 		ASSERT3U(lsize, ==, psize);
 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj,
 		    mac_abd, psize,
 		    hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	} else {
 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, mac_abd,
 		    psize, hdr->b_crypt_hdr.b_mac);
 	}
 
 	if (ret == 0) {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
 	} else if (ret == ENOENT) {
 		/* Best effort: not being able to authenticate is no error. */
 		ret = 0;
 	}
 
 	/* The temporary abd exists only if we had to recompress above. */
 	if (scratch != NULL)
 		abd_free(mac_abd);
 
 	return (ret);
 }
 
 /*
  * This function will take a header that only has raw encrypted data in
  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
  * b_l1hdr.b_pabd. If designated in the header flags, this function will
  * also decompress the data.
  *
  * The hdr must be empty or its hash lock held, and it must be encrypted
  * (both asserted).
  *
  * Returns 0 on success. On failure the error from spa_do_crypt_abd() or
  * zio_decompress_data() is returned after arc_hdr_free_abd(hdr, B_FALSE)
  * has been called and any temporary decompression abd has been freed.
  */
 static int
 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 {
 	int ret;
 	abd_t *cabd = NULL;
 	void *tmp = NULL;
 	boolean_t no_crypt = B_FALSE;
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_ENCRYPTED(hdr));
 
 	arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
 	if (ret != 0)
 		goto error;
 
 	/*
 	 * When no_crypt comes back set, b_pabd is filled by copying the raw
 	 * data over verbatim.
 	 */
 	if (no_crypt) {
 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 	}
 
 	/*
 	 * If this header has disabled arc compression but the b_pabd is
 	 * compressed after decrypting it, we need to decompress the newly
 	 * decrypted data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		/*
 		 * We want to make sure that we are correctly honoring the
 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
 		 * and then loan a buffer from it, rather than allocating a
 		 * linear buffer and wrapping it in an abd later.
 		 */
 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
 		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
 			/* Give the loaned buffer back before freeing cabd. */
 			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
 			goto error;
 		}
 
 		/* Swap the compressed b_pabd for the decompressed abd. */
 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
 	}
 
 	return (0);
 
 error:
 	arc_hdr_free_abd(hdr, B_FALSE);
 	/*
 	 * cabd is an abd obtained from arc_get_data_abd(), so it must be
 	 * released with the abd variant of the free routine, not with
 	 * arc_free_data_buf() (which takes a raw data buffer).
 	 */
 	if (cabd != NULL)
 		arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
 
 	return (ret);
 }
 
 /*
  * Called from arc_buf_fill() to get the header's plaintext abd ready for
  * use: protected data is authenticated if the caller requires it, or
  * encrypted data is decrypted into the plaintext abd when only the raw
  * form is present.
  *
  * hash_lock may be NULL, in which case no locking is done here.
  * Returns 0 on success or the error from the authenticate/decrypt step.
  */
 static int
 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
     const zbookmark_phys_t *zb, boolean_t noauth)
 {
 	int ret = 0;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	if (hash_lock != NULL)
 		mutex_enter(hash_lock);
 
 	if (HDR_NOAUTH(hdr) && !noauth) {
 		/*
 		 * Authenticated data was requested but ours has not been
 		 * authenticated yet; verify the MAC now if we can.
 		 */
 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
 		/*
 		 * Only the encrypted copy is present but the unencrypted one
 		 * was asked for. Decrypt it and keep the result in the
 		 * header for future use.
 		 */
 		ret = arc_hdr_decrypt(hdr, spa, zb);
 	}
 
 	/* On success the plaintext abd is expected to be populated. */
 	if (ret == 0) {
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	}
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	return (ret);
 }
 
 /*
  * Used by the dbuf code to decrypt bonus buffers in place. The dbuf code
  * has no locking of its own for decrypting a shared dnode block, so the
  * hash lock is used to protect against concurrent calls to arc_buf_fill().
  * This function itself does not take hash_lock; it asserts that the hdr
  * is empty or locked.
  *
  * The bonus data is copied from the hdr's b_pabd into the buf, the buf's
  * ENCRYPTED and COMPRESSED flags are cleared and the hdr's encrypted-buf
  * count is decremented.
  */
 static void
 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
 
 	buf->b_flags &= ~(ARC_BUF_FLAG_ENCRYPTED | ARC_BUF_FLAG_COMPRESSED);
 	hdr->b_crypt_hdr.b_ebufcnt--;
 }
 
 /*
  * Given a buf that has a data buffer attached to it, this function will
  * efficiently fill the buf with data of the specified compression setting from
  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
  * are already sharing a data buf, no copy is performed.
  *
  * If the buf is marked as compressed but uncompressed data was requested, this
  * will allocate a new data buffer for the buf, remove that flag, and fill the
  * buf with uncompressed data. You can't request a compressed buf on a hdr with
  * uncompressed data, and (since we haven't added support for it yet) if you
  * want compressed data your buf must already be marked as compressed and have
  * the correct-sized data buffer.
  *
  * Flags consulted here:
  *   ARC_FILL_COMPRESSED - caller wants the data left compressed.
  *   ARC_FILL_ENCRYPTED  - caller wants the raw data copied from b_rabd.
  *   ARC_FILL_LOCKED     - this function does no hash-lock locking of its
  *                         own (hash_lock is NULL); presumably the caller
  *                         already holds the lock.
  *   ARC_FILL_NOAUTH     - forwarded to arc_fill_hdr_crypt() as "noauth".
  *   ARC_FILL_IN_PLACE   - untransform a dnode block's bonus buffers in the
  *                         existing b_data instead of replacing it.
  *
  * Returns 0 on success. A failure from arc_fill_hdr_crypt() is returned
  * as-is; a decompression failure is returned as EIO. In both cases the hdr
  * is flagged ARC_FLAG_IO_ERROR, except for EACCES on an in-place fill.
  */
 static int
 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     arc_fill_flags_t flags)
 {
 	int error = 0;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t hdr_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
 	/* NULL when ARC_FILL_LOCKED is set: every lock/unlock below is skipped */
 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, !ARC_BUF_SHARED(buf));
 
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
 	 * b_rabd and potentially byteswap it. We won't be able to do any
 	 * further transforms on it.
 	 */
 	if (encrypted) {
 		ASSERT(HDR_HAS_RABD(hdr));
 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 		goto byteswap;
 	}
 
 	/*
 	 * Adjust encrypted and authenticated headers to accommodate
 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
 	 * allowed to fail decryption due to keys not being loaded
 	 * without being marked as an IO error.
 	 */
 	if (HDR_PROTECTED(hdr)) {
 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
 		    zb, !!(flags & ARC_FILL_NOAUTH));
 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
 			return (error);
 		} else if (error != 0) {
 			/* Flag updates require the hash lock (when we manage it) */
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			return (error);
 		}
 	}
 
 	/*
 	 * There is a special case here for dnode blocks which are
 	 * decrypting their bonus buffers. These blocks may request to
 	 * be decrypted in-place. This is necessary because there may
 	 * be many dnodes pointing into this buffer and there is
 	 * currently no method to synchronize replacing the backing
 	 * b_data buffer and updating all of the pointers. Here we use
 	 * the hash lock to ensure there are no races. If the need
 	 * arises for other types to be decrypted in-place, they must
 	 * add handling here as well.
 	 */
 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
 		ASSERT(!hdr_compressed);
 		ASSERT(!compressed);
 		ASSERT(!encrypted);
 
 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_buf_untransform_in_place(buf, hash_lock);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 
 			/* Compute the hdr's checksum if necessary */
 			arc_cksum_compute(buf);
 		}
 
 		/* In-place fills never reach the byteswap step below */
 		return (0);
 	}
 
 	if (hdr_compressed == compressed) {
 		/*
 		 * The hdr already holds the data in the requested form; a
 		 * copy is only needed when the buf has its own data buffer.
 		 */
 		if (!arc_buf_is_shared(buf)) {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
 	} else {
 		/* Compressed hdr, uncompressed data requested */
 		ASSERT(hdr_compressed);
 		ASSERT(!compressed);
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
 
 		/*
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
 		if (arc_buf_is_shared(buf)) {
 			ASSERT(ARC_BUF_COMPRESSED(buf));
 
 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 
 			/* We increased the size of b_data; update overhead */
 			ARCSTAT_INCR(arcstat_overhead_size,
 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
 		}
 
 		/*
 		 * Regardless of the buf's previous compression settings, it
 		 * should not be compressed at the end of this function.
 		 */
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 
 		/*
 		 * Try copying the data from another buf which already has a
 		 * decompressed version. If that's not possible, it's time to
 		 * bite the bullet and decompress the data from the hdr.
 		 */
 		if (arc_buf_try_copy_decompressed_data(buf)) {
 			/* Skip byteswapping and checksumming (already done) */
 			return (0);
 		} else {
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 			    hdr->b_l1hdr.b_pabd, buf->b_data,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
 			    &hdr->b_complevel);
 
 			/*
 			 * Absent hardware errors or software bugs, this should
 			 * be impossible, but log it anyway so we can debug it.
 			 */
 			if (error != 0) {
 				zfs_dbgmsg(
 				    "hdr %px, compress %d, psize %d, lsize %d",
 				    hdr, arc_hdr_get_compress(hdr),
 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 				/* The decompressor's error code is replaced by EIO */
 				return (SET_ERROR(EIO));
 			}
 		}
 	}
 
 byteswap:
 	/* Byteswap the buf's data if necessary */
 	if (bswap != DMU_BSWAP_NUMFUNCS) {
 		ASSERT(!HDR_SHARED_DATA(hdr));
 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
 	}
 
 	/* Compute the hdr's checksum if necessary */
 	arc_cksum_compute(buf);
 
 	return (0);
 }
 
 /*
  * Untransform a buf via arc_buf_fill(), optionally in place.
  *
  * When this is called to decrypt an encrypted buffer or to verify an
  * authenticated one, the key must be loaded and a mapping must have been
  * made available in the keystore through spa_keystore_create_mapping() or
  * one of its callers.
  *
  * Returns the result of arc_buf_fill(), except that ECKSUM is logged,
  * reported as an authentication ereport and converted to EIO.
  */
 int
 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     boolean_t in_place)
 {
 	arc_fill_flags_t fill_flags = 0;
 	int err;
 
 	if (in_place)
 		fill_flags |= ARC_FILL_IN_PLACE;
 
 	err = arc_buf_fill(buf, spa, zb, fill_flags);
 	if (err != ECKSUM)
 		return (err);
 
 	/*
 	 * Authentication and decryption errors leave the ARC as EIO, with
 	 * an ereport generated on the way out.
 	 */
 	err = SET_ERROR(EIO);
 	spa_log_error(spa, zb);
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 	    spa, NULL, zb, NULL, 0);
 
 	return (err);
 }
 
 /*
  * Add the hdr's footprint to the evictable-space refcount of the given
  * arc_state_t. Space used by the hdr and by each arc buf is tracked under
  * separate holders so that they can be added and removed independently.
  */
 static void
 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	arc_buf_t *cur;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		/* A ghost hdr has no data; only its logical size is counted. */
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 	} else {
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_add_many(
 			    &state->arcs_esize[type], arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_add_many(
 			    &state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Shared bufs are skipped; unshared ones count separately. */
 		cur = hdr->b_l1hdr.b_buf;
 		while (cur != NULL) {
 			if (!arc_buf_is_shared(cur)) {
 				(void) zfs_refcount_add_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(cur), cur);
 			}
 			cur = cur->b_next;
 		}
 	}
 }
 
 /*
  * Remove the hdr's footprint from the evictable-space refcount of the
  * given arc_state_t. Space used by the hdr and by each arc buf is tracked
  * under separate holders so that they can be added and removed
  * independently.
  */
 static void
 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	arc_buf_t *cur;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		/* A ghost hdr has no data; only its logical size is counted. */
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 	} else {
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type], arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Shared bufs are skipped; unshared ones count separately. */
 		cur = hdr->b_l1hdr.b_buf;
 		while (cur != NULL) {
 			if (!arc_buf_is_shared(cur)) {
 				(void) zfs_refcount_remove_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(cur), cur);
 			}
 			cur = cur->b_next;
 		}
 	}
 }
 
 /*
  * Take a reference on this hdr on behalf of tag, indicating that someone
  * is actively using its memory. On the 0 -> 1 transition of a
  * non-anonymous hdr, the hdr is taken off its arc_state_t list so that it
  * is no longer evictable, and its prefetch flag is cleared.
  */
 static void
 add_reference(arc_buf_hdr_t *hdr, void *tag)
 {
 	arc_state_t *cur_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
 		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	}
 
 	cur_state = hdr->b_l1hdr.b_state;
 
 	/* Nothing more to do unless this was the first reference ... */
 	if (zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) != 1)
 		return;
 
 	/* ... on a hdr that lives in a real (non-anonymous) state. */
 	if (cur_state == arc_anon)
 		return;
 
 	/* The L2-only state list is never used. */
 	if (cur_state != arc_l2c_only) {
 		multilist_remove(&cur_state->arcs_list[arc_buf_type(hdr)],
 		    hdr);
 		arc_evictable_space_decrement(hdr, cur_state);
 	}
 
 	/*
 	 * Getting a reference clears the prefetch flag. For hdrs with an L2
 	 * header the l2arc state stats are decremented before and
 	 * re-incremented after the flag change.
 	 */
 	if (HDR_HAS_L2HDR(hdr))
 		l2arc_hdr_arcstats_decrement_state(hdr);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
 	if (HDR_HAS_L2HDR(hdr))
 		l2arc_hdr_arcstats_increment_state(hdr);
 }
 
 /*
  * Drop tag's reference on this hdr and return the remaining count. When
  * the count goes from 1 to 0 and the hdr is not anonymous, the hdr is
  * inserted into its arc_state_t's list, making it eligible for eviction.
  */
 static int
 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	int remaining;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 	ASSERT(!GHOST_STATE(state));
 
 	remaining = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
 	/*
 	 * No explicit arc_l2c_only check is needed to keep hdrs off the
 	 * arc_l2c_only list: it counts as a ghost state, which was ruled
 	 * out above.
 	 */
 	if (remaining == 0 && state != arc_anon) {
 		multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
 		arc_evictable_space_increment(hdr, state);
 	}
 
 	return (remaining);
 }
 
 /*
  * Returns detailed information about a specific arc buffer. When the
  * state_index argument is set the function is meant to calculate the arc
  * header list position for its arc state. Since this requires a linear
  * traversal, callers are strongly encouraged not to do this. However, it
  * can be helpful for targeted analysis so the functionality is provided.
  *
  * NOTE: this implementation never reads state_index.
  *
  * abi is always zeroed first; if the buf has no hdr it is left zeroed.
  */
 void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
 	arc_buf_hdr_t *hdr = ab->b_hdr;
 
 	memset(abi, 0, sizeof (arc_buf_info_t));
 
 	if (hdr == NULL)
 		return;
 
 	abi->abi_flags = hdr->b_flags;
 
 	/* Without an L1 header (or a state) the buffer reports as anonymous. */
 	abi->abi_state_type = ARC_STATE_ANON;
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		l1arc_buf_hdr_t *l1 = &hdr->b_l1hdr;
 
 		if (l1->b_state != NULL)
 			abi->abi_state_type = l1->b_state->arcs_state;
 
 		abi->abi_bufcnt = l1->b_bufcnt;
 		abi->abi_access = l1->b_arc_access;
 		abi->abi_mru_hits = l1->b_mru_hits;
 		abi->abi_mru_ghost_hits = l1->b_mru_ghost_hits;
 		abi->abi_mfu_hits = l1->b_mfu_hits;
 		abi->abi_mfu_ghost_hits = l1->b_mfu_ghost_hits;
 		abi->abi_holds = zfs_refcount_count(&l1->b_refcnt);
 	}
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_buf_hdr_t *l2 = &hdr->b_l2hdr;
 
 		abi->abi_l2arc_dattr = l2->b_daddr;
 		abi->abi_l2arc_hits = l2->b_hits;
 	}
 
 	abi->abi_state_contents = arc_buf_type(hdr);
 	abi->abi_size = arc_hdr_size(hdr);
 }
 
 /*
  * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  *
  * The work is done in this order:
  *   1. If the hdr is unreferenced (evictable), move it from the old
  *      state's list to the new state's list and adjust the evictable
  *      space of each.
  *   2. If the new state is arc_anon, remove the hdr from the hash table.
  *   3. Add the hdr's and bufs' sizes to the new state's arcs_size and
  *      remove them from the old state's (arc_l2c_only is ignored).
  *   4. Record the new state on the L1 hdr and, if there is an L2 hdr,
  *      update the l2arc per-state stats.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
     kmutex_t *hash_lock)
 {
 	arc_state_t *old_state;
 	int64_t refcnt;
 	uint32_t bufcnt;
 	boolean_t update_old, update_new;
 	arc_buf_contents_t buftype = arc_buf_type(hdr);
 
 	/*
 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
 	 * L1 hdr doesn't always exist when we change state to arc_anon before
 	 * destroying a header, in which case reallocating to add the L1 hdr is
 	 * pointless.
 	 */
 	if (HDR_HAS_L1HDR(hdr)) {
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
 		bufcnt = hdr->b_l1hdr.b_bufcnt;
 		/* Sizes only need adjusting if the hdr actually holds data */
 		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
 		    HDR_HAS_RABD(hdr));
 	} else {
 		/* No L1 hdr: treat the hdr as L2-only with nothing to account */
 		old_state = arc_l2c_only;
 		refcnt = 0;
 		bufcnt = 0;
 		update_old = B_FALSE;
 	}
 	update_new = update_old;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT3P(new_state, !=, old_state);
 	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
 	ASSERT(old_state != arc_anon || bufcnt <= 1);
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			multilist_remove(&old_state->arcs_list[buftype], hdr);
 
 			/*
 			 * A hdr leaving a ghost list holds no data, but its
 			 * size was still accounted there, so force the
 			 * old-state size update below.
 			 */
 			if (GHOST_STATE(old_state)) {
 				ASSERT0(bufcnt);
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 				update_old = B_TRUE;
 			}
 			arc_evictable_space_decrement(hdr, old_state);
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
 			/*
 			 * An L1 header always exists here, since if we're
 			 * moving to some L1-cached state (i.e. not l2c_only or
 			 * anonymous), we realloc the header to add an L1hdr
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			multilist_insert(&new_state->arcs_list[buftype], hdr);
 
 			/*
 			 * Likewise, a hdr entering a ghost list is accounted
 			 * there even though it holds no data, so force the
 			 * new-state size update below.
 			 */
 			if (GHOST_STATE(new_state)) {
 				ASSERT0(bufcnt);
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 				update_new = B_TRUE;
 			}
 			arc_evictable_space_increment(hdr, new_state);
 		}
 	}
 
 	ASSERT(!HDR_EMPTY(hdr));
 	/* Anonymous hdrs are not kept in the hash table */
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
 		buf_hash_remove(hdr);
 
 	/* adjust state sizes (ignore arc_l2c_only) */
 
 	if (update_new && new_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(new_state)) {
 			ASSERT0(bufcnt);
 
 			/*
 			 * When moving a header to a ghost state, we first
 			 * remove all arc buffers. Thus, we'll have a
 			 * bufcnt of zero, and no arc buffer to use for
 			 * the reference. As a result, we use the arc
 			 * header pointer for the reference.
 			 */
 			(void) zfs_refcount_add_many(&new_state->arcs_size,
 			    HDR_GET_LSIZE(hdr), hdr);
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 		} else {
 			uint32_t buffers = 0;
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must add each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 				ASSERT3U(bufcnt, !=, 0);
 				buffers++;
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
 				if (arc_buf_is_shared(buf))
 					continue;
 
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size,
 				    arc_buf_size(buf), buf);
 			}
 			ASSERT3U(bufcnt, ==, buffers);
 
 			/* The hdr's own data is held under the hdr pointer */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size,
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size,
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (update_old && old_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(old_state)) {
 			ASSERT0(bufcnt);
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 
 			/*
 			 * When moving a header off of a ghost state,
 			 * the header will not contain any arc buffers.
 			 * We use the arc header pointer for the reference
 			 * which is exactly what we did when we put the
 			 * header on the ghost state.
 			 */
 
 			(void) zfs_refcount_remove_many(&old_state->arcs_size,
 			    HDR_GET_LSIZE(hdr), hdr);
 		} else {
 			uint32_t buffers = 0;
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must remove each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 				ASSERT3U(bufcnt, !=, 0);
 				buffers++;
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * remove from the refcount if the arc_buf_t
 				 * is not shared.
 				 */
 				if (arc_buf_is_shared(buf))
 					continue;
 
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size, arc_buf_size(buf),
 				    buf);
 			}
 			ASSERT3U(bufcnt, ==, buffers);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 
 			/* The hdr's own data is held under the hdr pointer */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size, arc_hdr_size(hdr),
 				    hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size, HDR_GET_PSIZE(hdr),
 				    hdr);
 			}
 		}
 	}
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		hdr->b_l1hdr.b_state = new_state;
 
 		/*
 		 * The l2arc stats are tracked per ARC state: take the hdr
 		 * out of the stats, record the new state on the L2 hdr,
 		 * then put it back.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
 			l2arc_hdr_arcstats_decrement_state(hdr);
 			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
 			l2arc_hdr_arcstats_increment_state(hdr);
 		}
 	}
 
 	/*
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.
 	 */
 	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
 	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 /*
  * Charge `space` bytes to the ARC accounting: the per-type counter, the
  * overall ARC size, and (for every type other than data and ABD chunk
  * waste) the metadata-used total.
  */
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	boolean_t counts_as_meta;
 
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	/* Per-type counter first. */
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
 		aggsum_add(&arc_sums.arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		/*
 		 * Note: this includes space wasted by all scatter ABD's, not
 		 * just those allocated by the ARC.  But the vast majority of
 		 * scatter ABD's come from the ARC, because other users are
 		 * very short-lived.
 		 */
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
 		break;
 	default:
 		/* No per-type counter; only the totals below are touched. */
 		break;
 	}
 
 	/* Everything except data and chunk waste counts as metadata. */
 	counts_as_meta = !(type == ARC_SPACE_DATA ||
 	    type == ARC_SPACE_ABD_CHUNK_WASTE);
 	if (counts_as_meta)
 		aggsum_add(&arc_sums.arcstat_meta_used, space);
 
 	aggsum_add(&arc_sums.arcstat_size, space);
 }
 
 /*
  * Inverse of arc_space_consume(): give `space` bytes back to the ARC
  * accounting by lowering the per-type counter, the metadata-used total
  * (for every type other than data and ABD chunk waste) and the overall
  * ARC size.  The metadata high-water mark is refreshed before the
  * metadata-used total is lowered.
  */
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	boolean_t counts_as_meta;
 
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	/* Per-type counter first. */
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, -space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
 		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, -space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
 		break;
 	default:
 		/* No per-type counter; only the totals below are touched. */
 		break;
 	}
 
 	/* Everything except data and chunk waste counts as metadata. */
 	counts_as_meta = !(type == ARC_SPACE_DATA ||
 	    type == ARC_SPACE_ABD_CHUNK_WASTE);
 	if (counts_as_meta) {
 		ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used,
 		    space) >= 0);
 		ARCSTAT_MAX(arcstat_meta_max,
 		    aggsum_upper_bound(&arc_sums.arcstat_meta_used));
 		aggsum_add(&arc_sums.arcstat_meta_used, -space);
 	}
 
 	ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
 	aggsum_add(&arc_sums.arcstat_size, -space);
 }
 
 /*
  * Decide whether buf may use the hdr's b_pabd as its own b_data buffer.
  * Returns B_TRUE when sharing is allowed, B_FALSE otherwise.
  */
 static boolean_t
 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	/*
 	 * Sharing is allowed only when every one of these holds:
 	 *  - the buf is not encrypted
 	 *  - the buf and the hdr agree on being compressed or not
 	 *  - the hdr does not need to be byteswapped
 	 *  - the hdr is not already sharing its data
 	 *  - the buf is compressed, or it is the last buf on the hdr's list
 	 *
 	 * The final condition preserves the invariant that a shared
 	 * uncompressed buf is always the last entry of the hdr's b_buf list.
 	 * A compressed buf that was allocated first (and so sits at the end
 	 * of the list) does not break this: a hdr that has a compressed buf
 	 * must itself be compressed, so no shared uncompressed buf can exist
 	 * for it.  The "not already shared" condition alone is not enough
 	 * either, because (in the rare L2ARC write race mentioned in
 	 * arc_buf_alloc_impl()) an existing uncompressed buf may be shareable
 	 * now even though it was not when it was allocated.  Instead of
 	 * letting a new shared uncompressed buf be created and then
 	 * reshuffling the list to move it to the end, sharing is simply
 	 * refused unless the new buf is the first one added.
 	 */
 	ASSERT3P(buf->b_hdr, ==, hdr);
 
 	boolean_t hdr_is_compressed =
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
 	boolean_t buf_is_compressed = ARC_BUF_COMPRESSED(buf) != 0;
 
 	if (ARC_BUF_ENCRYPTED(buf))
 		return (B_FALSE);
 	if (buf_is_compressed != hdr_is_compressed)
 		return (B_FALSE);
 	if (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS)
 		return (B_FALSE);
 	if (HDR_SHARED_DATA(hdr))
 		return (B_FALSE);
 
 	return (ARC_BUF_LAST(buf) || buf_is_compressed);
 }
 
 /*
  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
  * copy was made successfully, or an error code otherwise.
  *
  * Arguments:
  *   hdr        - header the new buf is attached to; must have an L1 header
  *   spa, zb    - handed to arc_buf_fill() when fill is set (zb must then
  *                be non-NULL)
  *   tag        - reference tag passed to add_reference() for the hdr
  *   encrypted  - request a raw buf; implies compressed
  *   compressed - request a compressed buf; only honored if the hdr itself
  *                is compressed (or the buf is encrypted)
  *   noauth     - adds ARC_FILL_NOAUTH to the fill flags; must not be
  *                combined with encrypted
  *   fill       - populate the buf from the hdr via arc_buf_fill()
  *   ret        - out parameter; must point at a NULL arc_buf_t pointer. It
  *                receives the new buf before the fill step runs, so it is
  *                set even when a non-zero value is returned.
  */
 static int
 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
     void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
     boolean_t fill, arc_buf_t **ret)
 {
 	arc_buf_t *buf;
 	arc_fill_flags_t flags = ARC_FILL_LOCKED;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
 	    hdr->b_type == ARC_BUFC_METADATA);
 	ASSERT3P(ret, !=, NULL);
 	ASSERT3P(*ret, ==, NULL);
 	IMPLY(encrypted, compressed);
 
 	/* Start all of the hdr's hit counters from zero. */
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_l2_hits = 0;
 
 	/*
 	 * The new buf points at the current head of the hdr's list; it only
 	 * becomes the head itself further down, once b_data is set up.
 	 */
 	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_next = hdr->b_l1hdr.b_buf;
 	buf->b_flags = 0;
 
 	add_reference(hdr, tag);
 
 	/*
 	 * We're about to change the hdr's b_flags. We must either
 	 * hold the hash_lock or be undiscoverable.
 	 */
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Only honor requests for compressed bufs if the hdr is actually
 	 * compressed. This must be overridden if the buffer is encrypted since
 	 * encrypted buffers cannot be decompressed.
 	 */
 	if (encrypted) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
 		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
 	} else if (compressed &&
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		flags |= ARC_FILL_COMPRESSED;
 	}
 
 	if (noauth) {
 		ASSERT0(encrypted);
 		flags |= ARC_FILL_NOAUTH;
 	}
 
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
 	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
 	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
 	 * actively involved in an L2ARC write, because if this buf is used by
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
 	 * need to be ABD-aware.  It must be allocated via
 	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
 	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
 	 * page" buffers because the ABD code needs to handle freeing them
 	 * specially.
 	 */
 	boolean_t can_share = arc_can_share(hdr, buf) &&
 	    !HDR_L2_WRITING(hdr) &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
 		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	} else {
 		/* A private copy is accounted for as overhead. */
 		buf->b_data =
 		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
 		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 	}
 	VERIFY3P(buf->b_data, !=, NULL);
 
 	/* Publish the buf as the new head of the hdr's buf list. */
 	hdr->b_l1hdr.b_buf = buf;
 	hdr->b_l1hdr.b_bufcnt += 1;
 	if (encrypted)
 		hdr->b_crypt_hdr.b_ebufcnt += 1;
 
 	/*
 	 * If the user wants the data from the hdr, we need to either copy or
 	 * decompress the data.
 	 */
 	if (fill) {
 		ASSERT3P(zb, !=, NULL);
 		return (arc_buf_fill(buf, spa, zb, flags));
 	}
 
 	return (0);
 }
 
 /*
  * Reference tag used for the hdr reference that is held while a buffer is
  * out on loan (see the arc_loan_*() functions and arc_return_buf()).
  */
 static char *arc_onloan_tag = "onloan";
 
 /*
  * Adjust the global arc_loaned_bytes counter by delta; pass a negative
  * delta when a loaned buffer is returned.
  */
 static inline void
 arc_loaned_bytes_update(int64_t delta)
 {
 	atomic_add_64(&arc_loaned_bytes, delta);
 
 	/*
 	 * assert that it did not wrap around
 	 *
 	 * The counter is re-read with an atomic add of zero rather than by
 	 * folding delta into this call. NOTE(review): presumably ASSERT3S()
 	 * compiles to nothing in non-debug builds, which is why the real
 	 * update above has to stay a separate statement - confirm.
 	 */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 }
 
 /*
  * Hand out an anonymous arc buffer on loan. While on loan the buffer is not
  * treated as in-flight data by arc_tempreserve_space(); that only happens
  * once it has been "returned". A loaned buffer has to be given back to the
  * arc before the DMU may use it and before it may be freed.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
 {
 	arc_buf_contents_t type;
 	arc_buf_t *loaned;
 
 	if (is_metadata)
 		type = ARC_BUFC_METADATA;
 	else
 		type = ARC_BUFC_DATA;
 
 	/* The hdr reference is held under the on-loan tag. */
 	loaned = arc_alloc_buf(spa, arc_onloan_tag, type, size);
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 
 	return (loaned);
 }
 
 /*
  * Compressed counterpart of arc_loan_buf(): allocate a compressed buffer
  * under the on-loan tag and add its size to the loaned-bytes counter.
  */
 arc_buf_t *
 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *loaned;
 
 	loaned = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize,
 	    compression_type, complevel);
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 
 	return (loaned);
 }
 
 /*
  * Raw (encrypted) counterpart of arc_loan_buf(): allocate a raw buffer
  * under the on-loan tag and add psize to the loaned-bytes counter.
  *
  * All arguments other than spa are forwarded unchanged to
  * arc_alloc_raw_buf(). Returns the loaned buffer.
  */
 arc_buf_t *
 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
 	    complevel);
 
 	/*
 	 * Use the common helper, like every other loan/return function, so
 	 * that the wrap-around assertion also covers raw loans.
 	 * NOTE(review): arc_return_buf() later subtracts arc_buf_size(buf);
 	 * presumably that equals psize for a raw buf - confirm.
 	 */
 	arc_loaned_bytes_update(psize);
 
 	return (buf);
 }
 
 
 /*
  * Give a loaned arc buffer back to the arc. The hdr reference held under
  * the on-loan tag is replaced by one held under the caller's tag, and the
  * buffer's size is taken off the loaned-bytes counter.
  */
 void
 arc_return_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *owner = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(owner));
 
 	/* Take the new reference before dropping the on-loan one. */
 	(void) zfs_refcount_add(&owner->b_l1hdr.b_refcnt, tag);
 	(void) zfs_refcount_remove(&owner->b_l1hdr.b_refcnt, arc_onloan_tag);
 
 	arc_loaned_bytes_update(-arc_buf_size(buf));
 }
 
 /*
  * Detach an arc_buf from a dbuf (tag) and put it out on loan: the hdr
  * reference held under the caller's tag is replaced by one held under the
  * on-loan tag, and the buffer's size is added to the loaned-bytes counter.
  */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *owner = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(owner));
 
 	/* Take the on-loan reference before dropping the caller's one. */
 	(void) zfs_refcount_add(&owner->b_l1hdr.b_refcnt, arc_onloan_tag);
 	(void) zfs_refcount_remove(&owner->b_l1hdr.b_refcnt, tag);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 }
 
 /*
  * Record an ABD (with its size and contents type) in a newly allocated
  * entry and put that entry at the head of the l2arc_free_on_write list.
  * The list is modified under l2arc_free_on_write_mtx.
  */
 static void
 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
 {
 	l2arc_data_free_t *entry;
 
 	entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
 	entry->l2df_abd = abd;
 	entry->l2df_size = size;
 	entry->l2df_type = type;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	list_insert_head(l2arc_free_on_write, entry);
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * Undo the size accounting for one of the hdr's ABDs and queue that ABD on
  * the free-on-write list instead of freeing it here. With free_rdata set
  * the raw ABD (b_rabd, psize bytes) is handled; otherwise b_pabd
  * (arc_hdr_size() bytes) is.
  */
 static void
 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	uint64_t nbytes;
 	abd_t *abd;
 
 	if (free_rdata)
 		nbytes = HDR_GET_PSIZE(hdr);
 	else
 		nbytes = arc_hdr_size(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    nbytes, hdr);
 	}
 	(void) zfs_refcount_remove_many(&state->arcs_size, nbytes, hdr);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(nbytes, ARC_SPACE_DATA);
 	} else {
 		arc_space_return(nbytes, ARC_SPACE_META);
 	}
 
 	abd = free_rdata ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 	l2arc_free_abd_on_write(abd, nbytes, type);
 }
 
 /*
  * Make the hdr share the arc_buf_t's data buffer. Sharing moves the
  * refcount ownership from the buf to the hdr and adjusts the related
  * kstats. The hdr must not have a b_pabd of its own when this is called.
  */
 static void
 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	arc_state_t *state;
 	abd_t *shared_abd;
 
 	ASSERT(arc_can_share(hdr, buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * The hdr always owns the refcount while an arc_buf_t is shared,
 	 * so hand the buf's ownership over to it first.
 	 */
 	state = hdr->b_l1hdr.b_state;
 	zfs_refcount_transfer_ownership_many(&state->arcs_size,
 	    arc_hdr_size(hdr), buf, hdr);
 
 	/* Wrap the buf's data in an ABD owned by the hdr. */
 	shared_abd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
 	hdr->b_l1hdr.b_pabd = shared_abd;
 	abd_take_ownership_of_buf(shared_abd, HDR_ISTYPE_METADATA(hdr));
 
 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * The data now belongs to the hdr: count it in the compressed and
 	 * uncompressed kstats and stop counting it as overhead.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
 }
 
 /*
  * Stop sharing the data buffer between the hdr and the arc_buf_t. The
  * refcount ownership goes back to the buf, the hdr's ABD wrapper is
  * released and freed, b_pabd is cleared, and the kstats are adjusted the
  * opposite way from arc_share_buf().
  */
 static void
 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	arc_state_t *state;
 	abd_t *wrapper;
 
 	ASSERT(arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * The buffer is no longer shared, so its ownership goes back to
 	 * the rightful owner: the buf.
 	 */
 	state = hdr->b_l1hdr.b_state;
 	zfs_refcount_transfer_ownership_many(&state->arcs_size,
 	    arc_hdr_size(hdr), hdr, buf);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 
 	/* Drop the hdr's ABD wrapper around the buf's data. */
 	wrapper = hdr->b_l1hdr.b_pabd;
 	abd_release_ownership_of_buf(wrapper);
 	abd_free(wrapper);
 	hdr->b_l1hdr.b_pabd = NULL;
 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * The data is private to the arc buf again, so it counts as
 	 * overhead rather than as hdr data.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 }
 
 /*
  * Unlink an arc_buf_t from the hdr's buf list. The return value is the
  * arc_buf_t that ends up last on the list afterwards, or NULL when the
  * list is left empty.
  */
 static arc_buf_t *
 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	arc_buf_t **linkp = &hdr->b_l1hdr.b_buf;
 	arc_buf_t *tail = NULL;
 
 	/*
 	 * Walk the whole list once. linkp always addresses the pointer that
 	 * leads to the current element, which lets us splice buf out in
 	 * place; tail tracks the most recent element that stays on the list.
 	 */
 	for (;;) {
 		arc_buf_t *cur = *linkp;
 
 		if (cur == NULL)
 			break;
 
 		if (cur == buf) {
 			/* Splice buf out and continue with its successor. */
 			cur = buf->b_next;
 			*linkp = cur;
 			if (cur == NULL)
 				break;
 		}
 
 		tail = cur;
 		linkp = &cur->b_next;
 	}
 	buf->b_next = NULL;
 
 	ASSERT3P(tail, !=, buf);
 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, tail != NULL);
 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
 	IMPLY(tail != NULL, ARC_BUF_LAST(tail));
 
 	return (tail);
 }
 
 /*
  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
  * list and free it.
  *
  * If the destroyed buf was an uncompressed buf sharing the hdr's data, the
  * hdr is re-shared with the buf that is now last on the list (when there is
  * one and it is not encrypted). The hdr's checksum is freed once no
  * uncompressed buf remains. The buf itself goes back to buf_cache.
  */
 static void
 arc_buf_destroy_impl(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Free up the data associated with the buf but only if we're not
 	 * sharing this with the hdr. If we are sharing it with the hdr, the
 	 * hdr is responsible for doing the free.
 	 */
 	if (buf->b_data != NULL) {
 		/*
 		 * We're about to change the hdr's b_flags. We must either
 		 * hold the hash_lock or be undiscoverable.
 		 */
 		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		if (arc_buf_is_shared(buf)) {
 			/*
 			 * Only the hdr's flag is cleared here; the buf's own
 			 * b_flags are left untouched.
 			 */
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
 		buf->b_data = NULL;
 
 		/* The buf counts are only adjusted for bufs that had data. */
 		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 		hdr->b_l1hdr.b_bufcnt -= 1;
 
 		if (ARC_BUF_ENCRYPTED(buf)) {
 			hdr->b_crypt_hdr.b_ebufcnt -= 1;
 
 			/*
 			 * If we have no more encrypted buffers and we've
 			 * already gotten a copy of the decrypted data we can
 			 * free b_rabd to save some space.
 			 */
 			if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
 			    HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
 			    !HDR_IO_IN_PROGRESS(hdr)) {
 				arc_hdr_free_abd(hdr, B_TRUE);
 			}
 		}
 	}
 
 	/*
 	 * Unlink buf from the hdr's list; lastbuf is whatever is now last on
 	 * that list, or NULL if the list is empty.
 	 */
 	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 
 	/*
 	 * NOTE(review): ARC_BUF_SHARED() presumably tests buf->b_flags, which
 	 * has not been modified above, so it still reports whether this buf
 	 * was the shared one even though the hdr flag is already cleared -
 	 * confirm against the macro definition.
 	 */
 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 		/*
 		 * If the current arc_buf_t is sharing its data buffer with the
 		 * hdr, then reassign the hdr's b_pabd to share it with the new
 		 * buffer at the end of the list. The shared buffer is always
 		 * the last one on the hdr's buffer list.
 		 *
 		 * There is an equivalent case for compressed bufs, but since
 		 * they aren't guaranteed to be the last buf in the list and
 		 * that is an exceedingly rare case, we just allow that space be
 		 * wasted temporarily. We must also be careful not to share
 		 * encrypted buffers, since they cannot be shared.
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
 			VERIFY(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
 			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
 
 			/*
 			 * Drop the hdr's current b_pabd first;
 			 * arc_share_buf() asserts that it is NULL.
 			 */
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 			/*
 			 * We must setup a new shared block between the
 			 * last buffer and the hdr. The data would have
 			 * been allocated by the arc buf so we need to transfer
 			 * ownership to the hdr since it's now being shared.
 			 */
 			arc_share_buf(hdr, lastbuf);
 		}
 	} else if (HDR_SHARED_DATA(hdr)) {
 		/*
 		 * Uncompressed shared buffers are always at the end
 		 * of the list. Compressed buffers don't have the
 		 * same requirements. This makes it hard to
 		 * simply assert that the lastbuf is shared so
 		 * we rely on the hdr's compression flags to determine
 		 * if we have a compressed, shared buffer.
 		 */
 		ASSERT3P(lastbuf, !=, NULL);
 		ASSERT(arc_buf_is_shared(lastbuf) ||
 		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	}
 
 	/*
 	 * Free the checksum if we're removing the last uncompressed buf from
 	 * this hdr.
 	 */
 	if (!arc_hdr_has_uncompressed_buf(hdr)) {
 		arc_cksum_free(hdr);
 	}
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 /*
  * Allocate one of the hdr's ABDs and update the size kstats.
  *
  * With ARC_HDR_ALLOC_RDATA set in alloc_flags the raw ABD (b_rabd, psize
  * bytes) is allocated; otherwise b_pabd (arc_hdr_size() bytes) is.
  * ARC_HDR_DO_ADAPT is passed through to arc_get_data_abd() as a boolean.
  * The target pointer must be NULL on entry.
  */
 static void
 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
 {
 	boolean_t want_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
 	boolean_t adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
 	uint64_t nbytes;
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr) || want_rdata);
 	IMPLY(want_rdata, HDR_PROTECTED(hdr));
 
 	if (!want_rdata) {
 		nbytes = arc_hdr_size(hdr);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, nbytes, hdr,
 		    adapt);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	} else {
 		nbytes = HDR_GET_PSIZE(hdr);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, nbytes, hdr,
 		    adapt);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
 		/* Raw data is additionally tracked in its own kstat. */
 		ARCSTAT_INCR(arcstat_raw_size, nbytes);
 	}
 
 	ARCSTAT_INCR(arcstat_compressed_size, nbytes);
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Release one of the hdr's ABDs and update the size kstats.
  *
  * With free_rdata set the raw ABD (b_rabd, psize bytes) is released;
  * otherwise b_pabd (arc_hdr_size() bytes) is. The corresponding pointer is
  * cleared, and once the hdr holds neither ABD its byteswap setting is
  * reset to DMU_BSWAP_NUMFUNCS.
  */
 static void
 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	uint64_t nbytes = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
 
 	if (HDR_L2_WRITING(hdr)) {
 		/*
 		 * The hdr is currently being written to the l2arc, so the
 		 * free is deferred: the data goes onto the
 		 * l2arc_free_on_write list and the l2arc frees it once it
 		 * has finished writing it to the l2arc device.
 		 */
 		arc_hdr_free_on_write(hdr, free_rdata);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else {
 		abd_t *victim = free_rdata ?
 		    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 
 		arc_free_data_abd(hdr, victim, nbytes, hdr);
 	}
 
 	if (!free_rdata) {
 		hdr->b_l1hdr.b_pabd = NULL;
 	} else {
 		hdr->b_crypt_hdr.b_rabd = NULL;
 		ARCSTAT_INCR(arcstat_raw_size, -nbytes);
 	}
 
 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	ARCSTAT_INCR(arcstat_compressed_size, -nbytes);
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Allocate a full (L1) header, initialise it in the arc_anon state with no
  * bufs attached, and allocate its data ABD.
  *
  * A protected hdr comes from hdr_full_crypt_cache, any other from
  * hdr_full_cache. With alloc_rdata set the raw ABD is allocated instead of
  * b_pabd. Returns the new hdr, which holds no references yet.
  */
 static arc_buf_hdr_t *
 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
     boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
     arc_buf_contents_t type, boolean_t alloc_rdata)
 {
 	kmem_cache_t *cache;
 	arc_buf_hdr_t *hdr;
 	int abd_flags = ARC_HDR_DO_ADAPT;
 
 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
 
 	cache = protected ? hdr_full_crypt_cache : hdr_full_cache;
 	hdr = kmem_cache_alloc(cache, KM_PUSHPAGE);
 	if (alloc_rdata)
 		abd_flags |= ARC_HDR_ALLOC_RDATA;
 
 	ASSERT(HDR_EMPTY(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
 	/* Identity and flags common to every header. */
 	HDR_SET_PSIZE(hdr, psize);
 	HDR_SET_LSIZE(hdr, lsize);
 	hdr->b_spa = spa;
 	hdr->b_type = type;
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
 	arc_hdr_set_compress(hdr, compression_type);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	/* L1 portion: anonymous, never accessed, no bufs. */
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_bufcnt = 0;
 	hdr->b_l1hdr.b_buf = NULL;
 
 	/*
 	 * Allocate the hdr's buffer. Whether it will hold compressed or
 	 * uncompressed data depends on the block it references and on
 	 * compressed arc enablement.
 	 */
 	arc_hdr_alloc_abd(hdr, abd_flags);
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 
 	return (hdr);
 }
 
 /*
  * Transition between the two allocation states for the arc_buf_hdr struct.
  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
  * version is used when a cache buffer is only in the L2ARC in order to reduce
  * memory usage.
  *
  * Arguments:
  *   hdr - header to replace; must have an L2 header and its hash lock
  *         must be held
  *   old - cache hdr was allocated from
  *   new - cache to allocate the replacement from
  *
  * (old, new) must be (hdr_full_cache, hdr_l2only_cache) or the reverse;
  * the full cache is swapped for hdr_full_crypt_cache internally when the
  * hdr is protected.
  *
  * Returns the replacement header, which has taken hdr's place in the hash
  * table and on the device's l2ad_buflist. hdr itself is freed.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 {
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	arc_buf_hdr_t *nhdr;
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 
 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
 	    (old == hdr_l2only_cache && new == hdr_full_cache));
 
 	/*
 	 * If the caller wanted a new full header and the header is to be
 	 * encrypted we will actually allocate the header from the full crypt
 	 * cache instead. The same applies to freeing from the old cache.
 	 */
 	if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
 		new = hdr_full_crypt_cache;
 	if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
 		old = hdr_full_crypt_cache;
 
 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
 
 	/* The old header leaves the hash table before it is copied. */
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	buf_hash_remove(hdr);
 
 	/*
 	 * Only the first HDR_L2ONLY_SIZE bytes are copied.
 	 * NOTE(review): presumably this is the part of the struct that both
 	 * header layouts have in common - confirm.
 	 */
 	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
 
 	if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
 		 * arc_access and arc_change_state need to be aware that a
 		 * header has just come out of L2ARC, so we set its state to
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
 		/* Verify previous threads set to NULL before freeing */
 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 	} else {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
 		/*
 		 * If we've reached here, we must have been called from
 		 * arc_evict_hdr(), as such we should have already been
 		 * removed from any ghost list we were previously on
 		 * (which protects us from racing with arc_evict_state),
 		 * thus no locking is needed during this check.
 		 */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		/*
 		 * A buffer must not be moved into the arc_l2c_only
 		 * state if it's not finished being written out to the
 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
 		 * might try to be accessed, even though it was removed.
 		 */
 		VERIFY(!HDR_L2_WRITING(hdr));
 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 
 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 	}
 	/*
 	 * The header has been reallocated so we need to re-insert it into any
 	 * lists it was on.
 	 */
 	(void) buf_hash_insert(nhdr, NULL);
 
 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
 
 	mutex_enter(&dev->l2ad_mtx);
 
 	/*
 	 * We must place the realloc'ed header back into the list at
 	 * the same spot. Otherwise, if it's placed earlier in the list,
 	 * l2arc_write_buffers() could find it during the function's
 	 * write phase, and try to write it out to the l2arc.
 	 */
 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * Since we're using the pointer address as the tag when
 	 * incrementing and decrementing the l2ad_alloc refcount, we
 	 * must remove the old pointer (that we're about to destroy) and
 	 * add the new pointer to the refcount. Otherwise we'd remove
 	 * the wrong pointer address when calling arc_hdr_destroy() later.
 	 */
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 	    arc_hdr_size(nhdr), nhdr);
 
 	/* The old header is fully replaced; wipe its identity and free it. */
 	buf_discard_identity(hdr);
 	kmem_cache_free(old, hdr);
 
 	return (nhdr);
 }
 
 /*
  * This function allows an L1 header to be reallocated as a crypt
  * header and vice versa. If we are going to a crypt header, the
  * new fields will be zeroed out.
  *
  * Arguments:
  *   hdr        - anonymous L1 header without L2 data, not on any state
  *                list and not linked into a hash chain
  *   need_crypt - B_TRUE to move to a crypt header, B_FALSE to move to a
  *                plain full header; must differ from hdr's current
  *                protected status
  *
  * Returns the replacement header. All of hdr's bufs and references are
  * moved over to it, and hdr is cleared and freed back to its cache.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
 {
 	arc_buf_hdr_t *nhdr;
 	arc_buf_t *buf;
 	kmem_cache_t *ncache, *ocache;
 
 	/*
 	 * This function requires that hdr is in the arc_anon state.
 	 * Therefore it won't have any L2ARC data for us to worry
 	 * about copying.
 	 */
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_HAS_L2HDR(hdr));
 	ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 	ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 
 	/* Pick the cache to allocate from and the one to free back to. */
 	if (need_crypt) {
 		ncache = hdr_full_crypt_cache;
 		ocache = hdr_full_cache;
 	} else {
 		ncache = hdr_full_cache;
 		ocache = hdr_full_crypt_cache;
 	}
 
 	nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
 
 	/*
 	 * Copy all members that aren't locks or condvars to the new header.
 	 * No lists are pointing to us (as we asserted above), so we don't
 	 * need to worry about the list nodes.
 	 *
 	 * NOTE(review): b_complevel, which arc_hdr_alloc() sets, is not among
 	 * the members copied below - confirm that this is intentional.
 	 */
 	nhdr->b_dva = hdr->b_dva;
 	nhdr->b_birth = hdr->b_birth;
 	nhdr->b_type = hdr->b_type;
 	nhdr->b_flags = hdr->b_flags;
 	nhdr->b_psize = hdr->b_psize;
 	nhdr->b_lsize = hdr->b_lsize;
 	nhdr->b_spa = hdr->b_spa;
 	nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
 	nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
 	nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
 	nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
 	nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
 	nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
 	nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
 	nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
 	nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
 	nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
 	nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
 	nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
 
 	/*
 	 * This zfs_refcount_add() exists only to ensure that the individual
 	 * arc buffers always point to a header that is referenced, avoiding
 	 * a small race condition that could trigger ASSERTs.
 	 */
 	(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
 	nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
 	/* Repoint every buf at the new header, each under its evict lock. */
 	for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
 		mutex_enter(&buf->b_evict_lock);
 		buf->b_hdr = nhdr;
 		mutex_exit(&buf->b_evict_lock);
 	}
 
 	/*
 	 * Move the real references over, then drop the temporary FTAG hold
 	 * taken above. The old header must end up with no references.
 	 */
 	zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
 	(void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 
 	if (need_crypt) {
 		arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
 	} else {
 		arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
 	}
 
 	/* unset all members of the original hdr */
 	bzero(&hdr->b_dva, sizeof (dva_t));
 	hdr->b_birth = 0;
 	hdr->b_type = ARC_BUFC_INVALID;
 	hdr->b_flags = 0;
 	hdr->b_psize = 0;
 	hdr->b_lsize = 0;
 	hdr->b_spa = 0;
 	hdr->b_l1hdr.b_freeze_cksum = NULL;
 	hdr->b_l1hdr.b_buf = NULL;
 	hdr->b_l1hdr.b_bufcnt = 0;
 	hdr->b_l1hdr.b_byteswap = 0;
 	hdr->b_l1hdr.b_state = NULL;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_l2_hits = 0;
 	hdr->b_l1hdr.b_acb = NULL;
 	hdr->b_l1hdr.b_pabd = NULL;
 
 	/* A crypt header being freed also has its crypt fields cleared. */
 	if (ocache == hdr_full_crypt_cache) {
 		ASSERT(!HDR_HAS_RABD(hdr));
 		hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
 		hdr->b_crypt_hdr.b_ebufcnt = 0;
 		hdr->b_crypt_hdr.b_dsobj = 0;
 		bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 		bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 		bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 	}
 
 	buf_discard_identity(hdr);
 	kmem_cache_free(ocache, hdr);
 
 	return (nhdr);
 }
 
 /*
  * Turn a newly allocated arc_buf_t into one suitable for a raw encrypted
  * write. The send / receive code uses this, and it also allows the root
  * objset block to be updated without altering its embedded MACs. Both
  * block types are always uncompressed, so neither the compression type nor
  * psize needs attention here.
  *
  * The hdr is reallocated as a crypt header if it is not one already. salt,
  * iv and mac are each copied into the hdr only when non-NULL.
  */
 void
 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 
 	/* A raw buf is flagged as both compressed and encrypted. */
 	buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 	buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
 
 	if (!HDR_PROTECTED(hdr))
 		hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	/* No uncompressed buf left on the hdr: its checksum can go. */
 	if (!arc_hdr_has_uncompressed_buf(hdr))
 		arc_cksum_free(hdr);
 
 	if (salt != NULL)
 		bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 	if (iv != NULL)
 		bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 	if (mac != NULL)
 		bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 }
 
 /*
  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
  * The buf is returned thawed since we expect the consumer to modify it.
  */
 arc_buf_t *
 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf = NULL;
 
 	/* Compression is off, so 'size' is passed for both size arguments. */
 	hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE,
 	    ZIO_COMPRESS_OFF, 0, type, B_FALSE);
 
 	/* Attaching the first buf to a fresh header must not fail. */
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
  * for bufs containing metadata.
  */
 arc_buf_t *
 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf = NULL;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE,
 	    compression_type, complevel, ARC_BUFC_DATA, B_FALSE);
 
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
 	    B_TRUE, B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
 	/* Nothing more to do when the buf already shares the hdr's data. */
 	if (arc_buf_is_shared(buf))
 		return (buf);
 
 	/*
 	 * arc_untransform() may be called on this buf before it has been
 	 * written to disk, and the hdr must hold the correct data when that
 	 * happens. The simplest way to guarantee it is to drop the hdr's
 	 * own abd and make the hdr share the buf's data instead.
 	 */
 	arc_hdr_free_abd(hdr, B_FALSE);
 	arc_share_buf(hdr, buf);
 
 	return (buf);
 }
 
 /*
  * Allocate a new arc_buf_hdr_t and arc_buf_t for raw data, record the
  * supplied dataset object, object type, byte order, salt, IV and MAC in
  * the header, and return the thawed buf to the caller.
  */
 arc_buf_t *
 arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_contents_t type;
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf = NULL;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	/* The object type decides which content class the header is in. */
 	if (DMU_OT_IS_METADATA(ot))
 		type = ARC_BUFC_METADATA;
 	else
 		type = ARC_BUFC_DATA;
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
 	    compression_type, complevel, type, B_TRUE);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	/* Unlike arc_convert_to_raw(), all three are copied unconditionally. */
 	bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 	bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 	bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 
 	/*
 	 * The buf is treated as encrypted here regardless of whether ot is
 	 * an encrypted object type; for the other types it is turned into
 	 * an authenticated buf later, in arc_write_ready().
 	 */
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
 	return (buf);
 }
 
 /*
  * Add (incr is true) or subtract (incr is false) the sizes of hdr to or
  * from the L2ARC arcstats. When state_only is set only the prefetch /
  * MRU / MFU asize breakdown is adjusted; the psize, lsize and
  * per-content-type asize counters are left untouched.
  */
 static void
 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only)
 {
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	/* Signed deltas: positive when adding, negative when removing. */
 	int64_t lsize_s = incr ? lsize : -lsize;
 	int64_t psize_s = incr ? psize : -psize;
 	int64_t asize_s = incr ? asize : -asize;
 
 	if (HDR_PREFETCH(hdr)) {
 		/* Prefetched buffers are accounted separately. */
 		ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
 	} else {
 		/*
 		 * The ARC state is taken from the L2 header, where it was
 		 * stored when the buffer was first cached in L2ARC. It is
 		 * updated if an MRU/MRU_ghost buffer moves to MFU, although
 		 * the L2ARC metadata (log entry) cannot currently follow.
 		 * Reading it from the L2 header works even when there is no
 		 * L1 header, as with buffers restored from persistent L2ARC.
 		 */
 		switch (hdr->b_l2hdr.b_arcs_state) {
 		case ARC_STATE_MRU_GHOST:
 		case ARC_STATE_MRU:
 			ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
 			break;
 		case ARC_STATE_MFU_GHOST:
 		case ARC_STATE_MFU:
 			ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (state_only)
 		return;
 
 	ARCSTAT_INCR(arcstat_l2_psize, psize_s);
 	ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
 
 	/* Per-content-type breakdown of the allocated size. */
 	switch (hdr->b_type) {
 	case ARC_BUFC_DATA:
 		ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
 		break;
 	case ARC_BUFC_METADATA:
 		ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
 		break;
 	default:
 		break;
 	}
 }
 
 
 /*
  * Detach hdr from its L2ARC device: take it off the device's buflist,
  * undo its arcstats and vdev space accounting, drop its l2ad_alloc
  * reference and clear ARC_FLAG_HAS_L2HDR. The caller must hold the
  * device's l2ad_mtx.
  */
 static void
 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 	uint64_t asize =
 	    vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr));
 
 	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	/* Give back the space this header accounted for on the device. */
 	l2arc_hdr_arcstats_decrement(hdr);
 	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
 	    hdr);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 }
 
 /*
  * Free an arc_buf_hdr_t: tear down its L2ARC portion if it has one,
  * discard its identity, destroy any bufs and data still attached to the
  * L1 portion, and hand the header back to the kmem cache matching its
  * layout (l2only, full, or full crypt).
  */
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
 		    hdr->b_l1hdr.b_bufcnt > 0);
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	}
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_dev_t *l2dev = hdr->b_l2hdr.b_dev;
 		/* Only take l2ad_mtx if this thread doesn't hold it yet. */
 		boolean_t need_lock = !MUTEX_HELD(&l2dev->l2ad_mtx);
 
 		if (need_lock)
 			mutex_enter(&l2dev->l2ad_mtx);
 
 		/*
 		 * The flag has to be tested a second time under l2ad_mtx.
 		 * While this thread was waiting for the mutex, another
 		 * thread running l2arc_evict() may already have destroyed
 		 * this header's L2 portion, and it must not be destroyed
 		 * twice.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			arc_hdr_l2hdr_destroy(hdr);
 
 		if (need_lock)
 			mutex_exit(&l2dev->l2ad_mtx);
 	}
 
 	/*
 	 * A header's identity may only be discarded once nothing can find
 	 * the header any more, i.e. after it has left both the hash table
 	 * and the l2arc header list. From this point on the hash lock can
 	 * no longer be used to protect the header.
 	 */
 	if (!HDR_EMPTY(hdr))
 		buf_discard_identity(hdr);
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		arc_cksum_free(hdr);
 
 		/* Each call unlinks the buf at the head of the list. */
 		while (hdr->b_l1hdr.b_buf != NULL)
 			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
 
 		if (hdr->b_l1hdr.b_pabd != NULL)
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 
 	if (!HDR_HAS_L1HDR(hdr)) {
 		kmem_cache_free(hdr_l2only_cache, hdr);
 		return;
 	}
 
 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	/* Protected headers live in their own, crypt-sized cache. */
 	if (HDR_PROTECTED(hdr))
 		kmem_cache_free(hdr_full_crypt_cache, hdr);
 	else
 		kmem_cache_free(hdr_full_cache, hdr);
 }
 
 /*
  * Drop the reference held with 'tag' on buf's header and destroy the buf.
  * For a header in the arc_anon state the header itself is destroyed as
  * well; for any other state the work is done under the header's hash
  * lock and only the buf is destroyed here.
  */
 void
 arc_buf_destroy(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock;
 
 	if (hdr->b_l1hdr.b_state != arc_anon) {
 		hash_lock = HDR_LOCK(hdr);
 		mutex_enter(hash_lock);
 
 		/* Re-validate what was read before the lock was taken. */
 		ASSERT3P(hdr, ==, buf->b_hdr);
 		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 		ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
 		ASSERT3P(buf->b_data, !=, NULL);
 
 		(void) remove_reference(hdr, hash_lock, tag);
 		arc_buf_destroy_impl(buf);
 		mutex_exit(hash_lock);
 		return;
 	}
 
 	/* Anonymous header: no hash lock is passed to remove_reference(). */
 	ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	VERIFY0(remove_reference(hdr, NULL, tag));
 	arc_hdr_destroy(hdr);
 }
 
 /*
  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
  * state of the header is dependent on its state prior to entering this
  * function. The following transitions are possible:
  *
  *    - arc_mru -> arc_mru_ghost
  *    - arc_mfu -> arc_mfu_ghost
  *    - arc_mru_ghost -> arc_l2c_only
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
  *
  * Return total size of evicted data buffers for eviction progress tracking.
  * When evicting from ghost states return logical buffer size to make eviction
  * progress at the same (or at least comparable) rate as from non-ghost states.
  *
  * Return *real_evicted for actual ARC size reduction to wake up threads
  * waiting for it.  For non-ghost states it includes size of evicted data
  * buffers (the headers are not freed there).  For ghost states it includes
  * only the evicted headers size.
  */
 static int64_t
 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
 	/* Prescient prefetches are held to their own minimum lifetime. */
 	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
 	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	*real_evicted = 0;
 	state = hdr->b_l1hdr.b_state;
 	/*
 	 * Ghost states: the header carries no bufs, so evicting it means
 	 * either demoting it to an L2-only header or destroying it.
 	 */
 	if (GHOST_STATE(state)) {
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
 		/*
 		 * l2arc_write_buffers() relies on a header's L1 portion
 		 * (i.e. its b_pabd field) during its write phase.
 		 * Thus, we cannot push a header onto the arc_l2c_only
 		 * state (removing its L1 piece) until the header is
 		 * done being written to the l2arc.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
 			ARCSTAT_BUMP(arcstat_evict_l2_skip);
 			return (bytes_evicted);
 		}
 
 		ARCSTAT_BUMP(arcstat_deleted);
 		/* Progress is reported as the logical size of the buffer. */
 		bytes_evicted += HDR_GET_LSIZE(hdr);
 
 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
 		if (HDR_HAS_L2HDR(hdr)) {
 			ASSERT(hdr->b_l1hdr.b_pabd == NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 			/*
 			 * This buffer is cached on the 2nd Level ARC;
 			 * don't destroy the header.
 			 */
 			arc_change_state(arc_l2c_only, hdr, hash_lock);
 			/*
 			 * dropping from L1+L2 cached to L2-only,
 			 * realloc to remove the L1 header.
 			 */
 			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
 			    hdr_l2only_cache);
 			/* Only the header size difference was really freed. */
 			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
 		} else {
 			arc_change_state(arc_anon, hdr, hash_lock);
 			arc_hdr_destroy(hdr);
 			*real_evicted += HDR_FULL_SIZE;
 		}
 		return (bytes_evicted);
 	}
 
 	ASSERT(state == arc_mru || state == arc_mfu);
 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
 	/*
 	 * Skip headers with I/O in progress, and prefetch or indirect
 	 * headers that were accessed less than min_lifetime ago
 	 * (prefetch buffers have a minimum lifespan).
 	 */
 	if (HDR_IO_IN_PROGRESS(hdr) ||
 	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
 	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
 	    MSEC_TO_TICK(min_lifetime))) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
 
 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 	/*
 	 * Destroy the bufs one at a time from the head of the list. The
 	 * loop stops early at the first buf whose b_evict_lock cannot be
 	 * taken without blocking, leaving that buf and any after it.
 	 */
 	while (hdr->b_l1hdr.b_buf) {
 		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 		if (!mutex_tryenter(&buf->b_evict_lock)) {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 			break;
 		}
 		if (buf->b_data != NULL) {
 			bytes_evicted += HDR_GET_LSIZE(hdr);
 			*real_evicted += HDR_GET_LSIZE(hdr);
 		}
 		mutex_exit(&buf->b_evict_lock);
 		arc_buf_destroy_impl(buf);
 	}
 
 	/* Statistics only: classify the eviction with respect to L2ARC. */
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
 		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
 			    HDR_GET_LSIZE(hdr));
 
 			switch (state->arcs_state) {
 				case ARC_STATE_MRU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mru,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				case ARC_STATE_MFU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mfu,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				default:
 					break;
 			}
 		} else {
 			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
 			    HDR_GET_LSIZE(hdr));
 		}
 	}
 
 	/*
 	 * Only when every buf was destroyed above does the header itself
 	 * give up its data and move to the matching ghost state.
 	 */
 	if (hdr->b_l1hdr.b_bufcnt == 0) {
 		arc_cksum_free(hdr);
 
 		bytes_evicted += arc_hdr_size(hdr);
 		*real_evicted += arc_hdr_size(hdr);
 
 		/*
 		 * If this hdr is being evicted and has a compressed
 		 * buffer then we discard it here before we change states.
 		 * This ensures that the accounting is updated correctly
 		 * in arc_free_data_impl().
 		 */
 		if (hdr->b_l1hdr.b_pabd != NULL)
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 
 		arc_change_state(evicted_state, hdr, hash_lock);
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
 		/*
 		 * NOTE(review): this sets a flag the assertion just above
 		 * already expects to be set; looks redundant - confirm.
 		 */
 		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
 	}
 
 	return (bytes_evicted);
 }
 
 /*
  * Recompute arc_need_free as the larger of two values: how far free
  * memory is below arc_sys_free / 2, and how much more must be evicted
  * before the last waiter on arc_evict_waiters is satisfied (zero when
  * there are no waiters). The caller must hold arc_evict_lock.
  */
 static void
 arc_set_need_free(void)
 {
 	int64_t remaining;
 	int64_t waiter_need = 0;
 	arc_evict_waiter_t *last;
 
 	ASSERT(MUTEX_HELD(&arc_evict_lock));
 
 	remaining = arc_free_memory() - arc_sys_free / 2;
 
 	/* The tail waiter is the one used to size the outstanding need. */
 	last = list_tail(&arc_evict_waiters);
 	if (last != NULL)
 		waiter_need = (int64_t)(last->aew_count - arc_evict_count);
 
 	arc_need_free = MAX(-remaining, waiter_need);
 }
 
 /*
  * Evict headers from sublist 'idx' of 'ml', walking the sublist from
  * 'marker' via multilist_sublist_prev(). The walk ends when the sublist
  * is exhausted, when at least 'bytes' have been evicted, or when
  * zfs_arc_evict_batch_limit headers have been evicted. Headers whose
  * b_spa differs from 'spa' are skipped unless 'spa' is 0. Afterwards
  * the global eviction count is advanced and satisfied waiters are woken.
  * Returns the number of bytes evicted as reported by arc_evict_hdr().
  */
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, uint64_t bytes)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	int budget = zfs_arc_evict_batch_limit;
 
 	ASSERT3P(marker, !=, NULL);
 
 	mls = multilist_sublist_lock(ml, idx);
 
 	while (likely((hdr = multilist_sublist_prev(mls, marker)) != NULL)) {
 		kmutex_t *hash_lock;
 		uint64_t evicted, revicted;
 
 		if (budget <= 0 || bytes_evicted >= bytes)
 			break;
 
 		/*
 		 * Advance the marker rather than touching 'hdr', so our
 		 * position in the sublist is remembered. We do not hold
 		 * hdr's hash lock here, so 'hdr' itself must never be taken
 		 * off the sublist: other consumers rely on the hash lock to
 		 * serialize insertions and removals, and could otherwise
 		 * see multilist_link_active() report 'hdr' as unlinked.
 		 * multilist_sublist_move_forward() exists for exactly this
 		 * purpose; it removes and re-inserts only 'marker'.
 		 */
 		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * A b_spa of zero only ever appears on the marker headers
 		 * that arc_evict_state() inserts. Several threads may be in
 		 * arc_evict_state() at once (e.g. dsl_pool_close() and
 		 * zio_inject_fault()), so markers belonging to those other
 		 * threads can show up here and must be stepped over.
 		 */
 		if (hdr->b_spa == 0)
 			continue;
 
 		/* Restrict eviction to the requested spa, if one was given. */
 		if (spa != 0 && hdr->b_spa != spa) {
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * No caller of this function is expected to hold a hash
 		 * lock already. The assertion guards that assumption: if it
 		 * were ever violated, the tryenter below would fail on our
 		 * own lock and arcstat_mutex_miss would be bumped wrongly.
 		 */
 		ASSERT(!MUTEX_HELD(hash_lock));
 
 		if (!mutex_tryenter(hash_lock)) {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 			continue;
 		}
 
 		evicted = arc_evict_hdr(hdr, hash_lock, &revicted);
 		mutex_exit(hash_lock);
 
 		bytes_evicted += evicted;
 		real_evicted += revicted;
 
 		/*
 		 * A result of zero means arc_evict_hdr() chose to skip the
 		 * header, which does not count against the batch budget.
 		 */
 		if (evicted != 0)
 			budget--;
 	}
 
 	multilist_sublist_unlock(mls);
 
 	/*
 	 * Advance the evicted-bytes count and wake every waiter whose
 	 * target has now been reached. The waiter list is ordered by
 	 * ascending aew_count, so waiters are popped from the head until
 	 * the list is empty or the head is still beyond the current count.
 	 * This is done once here, after the loop, so that the global
 	 * arc_evict_lock is acquired as rarely as possible.
 	 *
 	 * Waiters are only woken while the system has enough free memory
 	 * (arc_sys_free/2, which by default is a bit more than 1/64th of
 	 * RAM).  See the comments in arc_wait_for_eviction().
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_count += real_evicted;
 
 	if (arc_free_memory() > arc_sys_free / 2) {
 		arc_evict_waiter_t *aw = list_head(&arc_evict_waiters);
 
 		while (aw != NULL && aw->aew_count <= arc_evict_count) {
 			list_remove(&arc_evict_waiters, aw);
 			cv_broadcast(&aw->aew_cv);
 			aw = list_head(&arc_evict_waiters);
 		}
 	}
 	arc_set_need_free();
 	mutex_exit(&arc_evict_lock);
 
 	/*
 	 * Shrinking the ARC from arc_c_max down to arc_c_min can keep
 	 * eviction on-CPU for many seconds, particularly when the average
 	 * cached block is small. Yield voluntarily so that other threads
 	 * which may be bound to this CPU can make progress.
 	 */
 	cond_resched();
 
 	return (bytes_evicted);
 }
 
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
  * appropriate evict state.
  *
  * This function makes a "best effort". It skips over any buffers
  * it can't get a hash_lock on, and so, may not catch all candidates.
  * It may also return without evicting as much space as requested.
  *
  * If bytes is specified using the special value ARC_EVICT_ALL, this
  * will evict all available (i.e. unlocked and evictable) buffers from
  * the given arc state; which is used by arc_flush().
  */
 static uint64_t
 arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
     arc_buf_contents_t type)
 {
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists = multilist_get_num_sublists(ml);
 	uint64_t total_evicted = 0;
 	arc_buf_hdr_t **markers;
 
 	/*
 	 * One marker per sublist. When a full pass over the sublists made
 	 * progress but did not reach the byte target, the next pass resumes
 	 * from each marker instead of restarting at the sublist's tail.
 	 */
 	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
 	for (int i = 0; i < num_sublists; i++) {
 		arc_buf_hdr_t *marker;
 		multilist_sublist_t *mls;
 
 		marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 
 		/*
 		 * Markers are recognised by a b_spa of 0; both
 		 * arc_evict_type() and arc_evict_state_impl() depend on
 		 * this to tell them apart from real headers.
 		 */
 		marker->b_spa = 0;
 		markers[i] = marker;
 
 		mls = multilist_sublist_lock(ml, i);
 		multilist_sublist_insert_tail(mls, marker);
 		multilist_sublist_unlock(mls);
 	}
 
 	/*
 	 * Keep scanning until the byte target is met. With ARC_EVICT_ALL
 	 * the loop only ends through the "nothing evicted" break below.
 	 */
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		uint64_t scan_evicted = 0;
 
 		/*
 		 * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
 		 * Request that 10% of the LRUs be scanned by the superblock
 		 * shrinker.
 		 */
 		if (type == ARC_BUFC_DATA && aggsum_compare(
 		    &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) {
 			arc_prune_async((aggsum_upper_bound(
 			    &arc_sums.arcstat_dnode_size) -
 			    arc_dnode_size_limit) / sizeof (dnode_t) /
 			    zfs_arc_dnode_reduce_percent);
 		}
 
 		/*
 		 * Visit every sublist once, beginning at a random index so
 		 * that eviction is spread evenly; always beginning at the
 		 * same index (e.g. 0) would favor some sublists over others.
 		 * The pass stops early once the byte target has been met.
 		 */
 		for (int i = 0;
 		    i < num_sublists && total_evicted < bytes; i++) {
 			uint64_t evicted;
 
 			evicted = arc_evict_state_impl(ml, sublist_idx,
 			    markers[sublist_idx], spa, bytes - total_evicted);
 
 			scan_evicted += evicted;
 			total_evicted += evicted;
 
 			/* Past the last sublist: wrap around to the first. */
 			sublist_idx++;
 			if (sublist_idx >= num_sublists)
 				sublist_idx = 0;
 		}
 
 		/*
 		 * A pass that evicted nothing gives no reason to expect
 		 * that another pass would do better, so stop here.
 		 */
 		if (scan_evicted == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);
 
 			/*
 			 * For ARC_EVICT_ALL an empty pass is the normal way
 			 * out of the loop and means enough was evicted, so
 			 * the "not enough" kstat is only bumped for real
 			 * byte targets.
 			 */
 			if (bytes != ARC_EVICT_ALL) {
 				ASSERT3S(total_evicted, <, bytes);
 				ARCSTAT_BUMP(arcstat_evict_not_enough);
 			}
 
 			break;
 		}
 	}
 
 	/* Unlink and free every marker, then the marker array itself. */
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 
 		kmem_cache_free(hdr_full_cache, markers[i]);
 	}
 	kmem_free(markers, sizeof (*markers) * num_sublists);
 
 	return (total_evicted);
 }
 
 /*
  * Flush all "evictable" data of the given type from the arc state
  * specified. This will not evict any "active" buffers (i.e. referenced).
  *
  * When 'retry' is set to B_FALSE, the function will make a single pass
  * over the state and evict any buffers that it can. Since it doesn't
  * continually retry the eviction, it might end up leaving some buffers
  * in the ARC due to lock misses.
  *
  * When 'retry' is set to B_TRUE, the function will continually retry the
  * eviction until *all* evictable buffers have been removed from the
  * state. As a result, if concurrent insertions into the state are
  * allowed (e.g. if the ARC isn't shutting down), this function might
  * wind up in an infinite loop, continually trying to evict buffers.
  */
 static uint64_t
 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
     boolean_t retry)
 {
 	uint64_t flushed = 0;
 
 	/* Nothing evictable of this type: no pass is made at all. */
 	if (zfs_refcount_count(&state->arcs_esize[type]) == 0)
 		return (flushed);
 
 	/*
 	 * Always make one pass; further passes happen only when 'retry'
 	 * is set and evictable bytes of this type remain.
 	 */
 	do {
 		flushed += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
 	} while (retry && zfs_refcount_count(&state->arcs_esize[type]) != 0);
 
 	return (flushed);
 }
 
 /*
  * Evict the specified number of bytes from the state specified,
  * restricting eviction to the spa and type given. This function
  * prevents us from trying to evict more from a state's list than
  * is "evictable", and to skip evicting altogether when passed a
  * negative value for "bytes". In contrast, arc_evict_state() will
  * evict everything it can, when passed a negative value for "bytes".
  */
 static uint64_t
 arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
     arc_buf_contents_t type)
 {
 	uint64_t delta;
 
 	/* Non-positive request or nothing evictable: evict nothing. */
 	if (bytes <= 0 || zfs_refcount_count(&state->arcs_esize[type]) <= 0)
 		return (0);
 
 	/* Never ask for more than the state reports as evictable. */
 	delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
 	    bytes);
 
 	return (arc_evict_state(state, spa, delta, type));
 }
 
 /*
  * The goal of this function is to evict enough meta data buffers from the
  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
  * more complicated than it appears because it is common for data buffers
  * to have holds on meta data buffers.  In addition, dnode meta data buffers
  * will be held by the dnodes in the block preventing them from being freed.
  * This means we can't simply traverse the ARC and expect to always find
  * enough unheld meta data buffer to release.
  *
  * Therefore, this function has been updated to make alternating passes
  * over the ARC releasing data buffers and then newly unheld meta data
  * buffers.  This ensures forward progress is maintained and meta_used
  * will decrease.  Normally this is sufficient, but if required the ARC
  * will call the registered prune callbacks causing dentry and inodes to
  * be dropped from the VFS cache.  This will make dnode meta data buffers
  * available for reclaim.
  */
 static uint64_t
 arc_evict_meta_balanced(uint64_t meta_used)
 {
 	int64_t delta, prune = 0, adjustmnt;
 	uint64_t total_evicted = 0;
 	/* The first pass evicts data buffers; passes then alternate. */
 	arc_buf_contents_t type = ARC_BUFC_DATA;
 	/* A negative tunable is treated as "no restarts". */
 	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
 
 restart:
 	/*
 	 * This slightly differs than the way we evict from the mru in
 	 * arc_evict because we don't have a "target" value (i.e. no
 	 * "meta" arc_p). As a result, I think we can completely
 	 * cannibalize the metadata in the MRU before we evict the
 	 * metadata from the MFU. I think we probably need to implement a
 	 * "metadata arc_p" value to do this properly.
 	 */
 	adjustmnt = meta_used - arc_meta_limit;
 
 	if (adjustmnt > 0 &&
 	    zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
 		delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
 		    adjustmnt);
 		total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
 		adjustmnt -= delta;
 	}
 
 	/*
 	 * We can't afford to recalculate adjustmnt here. If we do,
 	 * new metadata buffers can sneak into the MRU or ANON lists,
 	 * thus penalize the MFU metadata. Although the fudge factor is
 	 * small, it has been empirically shown to be significant for
 	 * certain workloads (e.g. creating many empty directories). As
 	 * such, we use the original calculation for adjustmnt, and
 	 * simply decrement the amount of data evicted from the MRU.
 	 */
 
 	if (adjustmnt > 0 &&
 	    zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
 		delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
 		    adjustmnt);
 		total_evicted += arc_evict_impl(arc_mfu, 0, delta, type);
 	}
 
 	/* The ghost lists get the full overage again as their budget. */
 	adjustmnt = meta_used - arc_meta_limit;
 
 	if (adjustmnt > 0 &&
 	    zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
 		delta = MIN(adjustmnt,
 		    zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
 		total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type);
 		adjustmnt -= delta;
 	}
 
 	if (adjustmnt > 0 &&
 	    zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
 		delta = MIN(adjustmnt,
 		    zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
 		total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type);
 	}
 
 	/*
 	 * If after attempting to make the requested adjustment to the ARC
 	 * the meta limit is still being exceeded then request that the
 	 * higher layers drop some cached objects which have holds on ARC
 	 * meta buffers.  Requests to the upper layers will be made with
 	 * increasingly large scan sizes until the ARC is below the limit.
 	 *
 	 * NOTE(review): meta_used is a by-value parameter that is never
 	 * reassigned in this function, so unless arc_meta_limit changes
 	 * this comparison gives the same answer on every pass and all
 	 * 'restarts' passes are taken once it is true - confirm that this
 	 * is intended rather than re-reading the current metadata usage.
 	 */
 	if (meta_used > arc_meta_limit) {
 		if (type == ARC_BUFC_DATA) {
 			type = ARC_BUFC_METADATA;
 		} else {
 			type = ARC_BUFC_DATA;
 
 			/* Each data+metadata round grows the prune request. */
 			if (zfs_arc_meta_prune) {
 				prune += zfs_arc_meta_prune;
 				arc_prune_async(prune);
 			}
 		}
 
 		if (restarts > 0) {
 			restarts--;
 			goto restart;
 		}
 	}
 	return (total_evicted);
 }
 
 /*
  * Evict metadata buffers from the cache, such that arcstat_meta_used is
  * capped by the arc_meta_limit tunable.
  */
 static uint64_t
 arc_evict_meta_only(uint64_t meta_used)
 {
 	int64_t mru_target, mfu_target;
 	uint64_t mru_evicted, mfu_evicted;
 
 	/*
 	 * When over the meta limit, evict enough metadata to get back
 	 * under it, but not so much that the MRU drops below arc_p. If
 	 * the overage on the meta limit is larger than the overage on
 	 * arc_p, part is taken from the MRU here and part from the MFU
 	 * below.
 	 */
 	mru_target = MIN((int64_t)(meta_used - arc_meta_limit),
 	    (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
 	    zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
 
 	mru_evicted = arc_evict_impl(arc_mru, 0, mru_target,
 	    ARC_BUFC_METADATA);
 
 	/*
 	 * Same idea for the MFU: get below the meta limit without
 	 * shrinking the MFU under the space allotted to it, which is
 	 * arc_c - arc_p. This target is computed only after the MRU
 	 * eviction above has run.
 	 */
 	mfu_target = MIN((int64_t)(meta_used - arc_meta_limit),
 	    (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
 	    (arc_c - arc_p)));
 
 	mfu_evicted = arc_evict_impl(arc_mfu, 0, mfu_target,
 	    ARC_BUFC_METADATA);
 
 	return (mru_evicted + mfu_evicted);
 }
 
 /*
  * Evict metadata using the strategy selected by zfs_arc_meta_strategy:
  * ARC_STRATEGY_META_ONLY uses arc_evict_meta_only(), any other value
  * uses arc_evict_meta_balanced(). Returns the number of bytes evicted.
  */
 static uint64_t
 arc_evict_meta(uint64_t meta_used)
 {
 	if (zfs_arc_meta_strategy != ARC_STRATEGY_META_ONLY)
 		return (arc_evict_meta_balanced(meta_used));
 
 	return (arc_evict_meta_only(meta_used));
 }
 
 /*
  * Return the type of the oldest buffer in the given arc state
  *
  * This function will select a random sublist of type ARC_BUFC_DATA and
  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
  * is compared, and the type which contains the "older" buffer will be
  * returned.
  */
 static arc_buf_contents_t
 arc_evict_type(arc_state_t *state)
 {
 	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
 	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
 	int data_idx = multilist_get_random_index(data_ml);
 	int meta_idx = multilist_get_random_index(meta_ml);
 	multilist_sublist_t *data_mls, *meta_mls;
 	arc_buf_hdr_t *data_hdr, *meta_hdr;
 	arc_buf_contents_t type;
 
 	/*
 	 * Both sublist locks stay held until the comparison is done, so
 	 * that arc_evict_state() cannot destroy the headers meanwhile.
 	 */
 	data_mls = multilist_sublist_lock(data_ml, data_idx);
 	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
 
 	/*
 	 * Start at each tail and step past any markers (b_spa == 0) that
 	 * arc_evict_state() may have left there.
 	 */
 	data_hdr = multilist_sublist_tail(data_mls);
 	while (data_hdr != NULL && data_hdr->b_spa == 0)
 		data_hdr = multilist_sublist_prev(data_mls, data_hdr);
 
 	meta_hdr = multilist_sublist_tail(meta_mls);
 	while (meta_hdr != NULL && meta_hdr->b_spa == 0)
 		meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr);
 
 	if (meta_hdr == NULL) {
 		/* No metadata candidate (or no candidate at all): data. */
 		type = ARC_BUFC_DATA;
 	} else if (data_hdr == NULL) {
 		/* Only a metadata candidate exists. */
 		type = ARC_BUFC_METADATA;
 	} else {
 		/* The headers can't be on the sublist without an L1 header */
 		ASSERT(HDR_HAS_L1HDR(data_hdr));
 		ASSERT(HDR_HAS_L1HDR(meta_hdr));
 
 		/* The smaller access time wins; a tie goes to metadata. */
 		if (data_hdr->b_l1hdr.b_arc_access <
 		    meta_hdr->b_l1hdr.b_arc_access)
 			type = ARC_BUFC_DATA;
 		else
 			type = ARC_BUFC_METADATA;
 	}
 
 	multilist_sublist_unlock(meta_mls);
 	multilist_sublist_unlock(data_mls);
 
 	return (type);
 }
 
 /*
  * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
  *
  * Eviction runs in phases: arc_evict_meta() first, then the MRU list,
  * then the MFU list, and finally the MRU and MFU ghost lists.  For the
  * MRU and MFU phases the buffer type to evict first is picked with
  * arc_evict_type() (subject to arc_meta_min), and whatever part of the
  * target could not be satisfied from that type is requested from the
  * other type.
  *
  * Returns the sum of the byte counts reported by arc_evict_meta() and by
  * every arc_evict_impl() call made here.
  */
 static uint64_t
 arc_evict(void)
 {
 	uint64_t total_evicted = 0;
 	uint64_t bytes;
 	int64_t target;
 	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 	uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used);
 
 	/*
 	 * If we're over arc_meta_limit, we want to correct that before
 	 * potentially evicting data buffers below.
 	 */
 	total_evicted += arc_evict_meta(ameta);
 
 	/*
 	 * Adjust MRU size
 	 *
 	 * If we're over the target cache size, we want to evict enough
 	 * from the list to get back to our target size. We don't want
 	 * to evict too much from the MRU, such that it drops below
 	 * arc_p. So, if we're over our target cache size more than
 	 * the MRU is over arc_p, we'll evict enough to get back to
 	 * arc_p here, and then evict more from the MFU below.
 	 *
 	 * Both operands are cast to int64_t so that "not over" shows up
 	 * as a negative target rather than a huge unsigned value.
 	 */
 	target = MIN((int64_t)(asize - arc_c),
 	    (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
 	    zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
 
 	/*
 	 * If we're below arc_meta_min, always prefer to evict data.
 	 * Otherwise, try to satisfy the requested number of bytes to
 	 * evict from the type which contains older buffers; in an
 	 * effort to keep newer buffers in the cache regardless of their
 	 * type. If we cannot satisfy the number of bytes from this
 	 * type, spill over into the next type.
 	 */
 	if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
 	    ameta > arc_meta_min) {
 		bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 		total_evicted += bytes;
 
 		/*
 		 * If we couldn't evict our target number of bytes from
 		 * metadata, we try to get the rest from data.
 		 */
 		target -= bytes;
 
 		total_evicted +=
 		    arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
 	} else {
 		bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
 		total_evicted += bytes;
 
 		/*
 		 * If we couldn't evict our target number of bytes from
 		 * data, we try to get the rest from metadata.
 		 */
 		target -= bytes;
 
 		total_evicted +=
 		    arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Re-sum ARC stats after the first round of evictions.
 	 */
 	asize = aggsum_value(&arc_sums.arcstat_size);
 	ameta = aggsum_value(&arc_sums.arcstat_meta_used);
 
 
 	/*
 	 * Adjust MFU size
 	 *
 	 * Now that we've tried to evict enough from the MRU to get its
 	 * size back to arc_p, if we're still above the target cache
 	 * size, we evict the rest from the MFU.
 	 */
 	target = asize - arc_c;
 
 	if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
 	    ameta > arc_meta_min) {
 		bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 		total_evicted += bytes;
 
 		/*
 		 * If we couldn't evict our target number of bytes from
 		 * metadata, we try to get the rest from data.
 		 */
 		target -= bytes;
 
 		total_evicted +=
 		    arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
 	} else {
 		bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
 		total_evicted += bytes;
 
 		/*
 		 * If we couldn't evict our target number of bytes from
 		 * data, we try to get the rest from metadata.
 		 */
 		target -= bytes;
 
 		total_evicted +=
 		    arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Adjust ghost lists
 	 *
 	 * In addition to the above, the ARC also defines target values
 	 * for the ghost lists. The sum of the mru list and mru ghost
 	 * list should never exceed the target size of the cache, and
 	 * the sum of the mru list, mfu list, mru ghost list, and mfu
 	 * ghost list should never exceed twice the target size of the
 	 * cache. The following logic enforces these limits on the ghost
 	 * caches, and evicts from them as needed.
 	 *
 	 * Unlike the MRU/MFU phases above, the ghost lists are always
 	 * asked for data first and metadata second.
 	 */
 	target = zfs_refcount_count(&arc_mru->arcs_size) +
 	    zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
 
 	bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
 	total_evicted += bytes;
 
 	target -= bytes;
 
 	total_evicted +=
 	    arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
 
 	/*
 	 * We assume the sum of the mru list and mfu list is less than
 	 * or equal to arc_c (we enforced this above), which means we
 	 * can use the simpler of the two equations below:
 	 *
 	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
 	 *		    mru ghost + mfu ghost <= arc_c
 	 */
 	target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
 	    zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
 
 	bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
 	total_evicted += bytes;
 
 	target -= bytes;
 
 	total_evicted +=
 	    arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
 
 	return (total_evicted);
 }
 
 /*
  * Flush the MRU, MFU, MRU-ghost and MFU-ghost states (data first, then
  * metadata, for each state in that order) via arc_flush_state().
  *
  * spa:   pool whose buffers should be flushed, or NULL, in which case a
  *        guid of 0 is handed to arc_flush_state().
  * retry: passed through to arc_flush_state(); may only be B_TRUE when
  *        spa is NULL.
  */
 void
 arc_flush(spa_t *spa, boolean_t retry)
 {
 	arc_state_t *const flush_states[] = {
 	    arc_mru, arc_mfu, arc_mru_ghost, arc_mfu_ghost
 	};
 	const arc_buf_contents_t flush_types[] = {
 	    ARC_BUFC_DATA, ARC_BUFC_METADATA
 	};
 	const size_t nstates = sizeof (flush_states) / sizeof (flush_states[0]);
 	const size_t ntypes = sizeof (flush_types) / sizeof (flush_types[0]);
 	uint64_t guid = 0;
 
 	/*
 	 * With retry set there is no good way to tell whether all of one
 	 * particular spa's buffers have left an arc state, so retrying is
 	 * only supported for a flush that is not restricted to a spa.
 	 */
 	ASSERT(!retry || spa == NULL);
 
 	if (spa != NULL)
 		guid = spa_load_guid(spa);
 
 	for (size_t s = 0; s < nstates; s++) {
 		for (size_t t = 0; t < ntypes; t++) {
 			(void) arc_flush_state(flush_states[s], guid,
 			    flush_types[t], retry);
 		}
 	}
 }
 
 /*
  * Lower the ARC target size (arc_c) by to_free bytes, never going below
  * arc_c_min, shrink arc_p along with it, and wake the eviction thread
  * if the ARC's current size exceeds the new target.
  *
  * to_free: number of bytes the caller wants the ARC to give up.
  */
 void
 arc_reduce_target_size(int64_t to_free)
 {
 	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 
 	/*
 	 * All callers want the ARC to actually evict (at least) this much
 	 * memory.  Therefore we reduce from the lower of the current size and
 	 * the target size.  This way, even if arc_c is much higher than
 	 * arc_size (as can be the case after many calls to arc_freed()), we
 	 * will immediately have arc_c < arc_size and therefore the
 	 * arc_evict_zthr will evict.
 	 */
 	uint64_t c = MIN(arc_c, asize);
 
 	if (c > to_free && c - to_free > arc_c_min) {
 		arc_c = c - to_free;
 		/* Shrink arc_p by a 1/2^arc_shrink_shift fraction of itself. */
 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
 		/* Keep arc_p within the new target: fall back to half of it. */
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	} else {
 		/* The reduction would reach or cross the floor; clamp to it. */
 		arc_c = arc_c_min;
 	}
 
 	if (asize > arc_c) {
 		/* See comment in arc_evict_cb_check() on why lock+flag */
 		mutex_enter(&arc_evict_lock);
 		arc_evict_needed = B_TRUE;
 		mutex_exit(&arc_evict_lock);
 		zthr_wakeup(arc_evict_zthr);
 	}
 }
 
 /*
  * Determine if the system is under memory pressure and is asking
  * to reclaim memory. A return value of B_TRUE indicates that the system
  * is under memory pressure and that the arc should adjust accordingly.
  */
 boolean_t
 arc_reclaim_needed(void)
 {
 	return (arc_available_memory() < 0);
 }
 
 /*
  * Ask the kmem layer to reap the caches the ARC allocates from: every
  * distinct zio buffer and zio data buffer cache, the ARC's own header and
  * buffer caches, the btree leaf cache and the ABD caches.  In the kernel,
  * metadata pruning is kicked off first when metadata usage has reached
  * arc_meta_limit and zfs_arc_meta_prune is non-zero.
  */
 void
 arc_kmem_reap_soon(void)
 {
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
 	const size_t		ncaches = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 	kmem_cache_t		*last_buf = NULL;
 	kmem_cache_t		*last_data = NULL;
 	size_t			idx;
 
 #ifdef _KERNEL
 	if ((aggsum_compare(&arc_sums.arcstat_meta_used,
 	    arc_meta_limit) >= 0) && zfs_arc_meta_prune) {
 		/*
 		 * Metadata usage is at or above its limit; prune entries
 		 * so that holds on metadata get released.
 		 */
 		arc_prune_async(zfs_arc_meta_prune);
 	}
 #if defined(_ILP32)
 	/*
 	 * On 32-bit, reclaim unused memory from all kmem caches.
 	 */
 	kmem_reap();
 #endif
 #endif
 
 	/*
 	 * Neighbouring slots may refer to the same cache, so a cache is
 	 * only reaped when it differs from the one in the previous slot.
 	 */
 	for (idx = 0; idx < ncaches; idx++) {
 		kmem_cache_t *buf_kc;
 		kmem_cache_t *data_kc;
 
 #if defined(_ILP32)
 		/* reach upper limit of cache size on 32-bit */
 		if (zio_buf_cache[idx] == NULL)
 			break;
 #endif
 		buf_kc = zio_buf_cache[idx];
 		if (buf_kc != last_buf) {
 			last_buf = buf_kc;
 			kmem_cache_reap_now(buf_kc);
 		}
 
 		data_kc = zio_data_buf_cache[idx];
 		if (data_kc != last_data) {
 			last_data = data_kc;
 			kmem_cache_reap_now(data_kc);
 		}
 	}
 
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
 	kmem_cache_reap_now(zfs_btree_leaf_cache);
 	abd_cache_reap_now();
 }
 
 /*
  * Report whether eviction work is pending; presumably installed as the
  * zthr "check" function paired with arc_evict_cb() -- confirm at the
  * zthr creation site.  Neither argument is used.
  *
  * Returns the current value of arc_evict_needed.  Under ZFS_DEBUG the
  * ARC kstats are refreshed first.
  */
 /* ARGSUSED */
 static boolean_t
 arc_evict_cb_check(void *arg, zthr_t *zthr)
 {
 #ifdef ZFS_DEBUG
 	/*
 	 * This is necessary in order to keep the kstat information
 	 * up to date for tools that display kstat data such as the
 	 * mdb ::arc dcmd and the Linux crash utility.  These tools
 	 * typically do not call kstat's update function, but simply
 	 * dump out stats from the most recent update.  Without
 	 * this call, these commands may show stale stats for the
 	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
 	 * with this call, the data might be out of date if the
 	 * evict thread hasn't been woken recently; but that should
 	 * suffice.  The arc_state_t structures can be queried
 	 * directly if more accurate information is needed.
 	 */
 	if (arc_ksp != NULL)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 #endif
 
 	/*
 	 * We have to rely on arc_wait_for_eviction() to tell us when to
 	 * evict, rather than checking if we are overflowing here, so that we
 	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
 	 * If we have become "not overflowing" since arc_wait_for_eviction()
 	 * checked, we need to wake it up.  We could broadcast the CV here,
 	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
 	 * would need to use a mutex to ensure that this function doesn't
 	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
 	 * the arc_evict_lock).  However, the lock ordering of such a lock
 	 * would necessarily be incorrect with respect to the zthr_lock,
 	 * which is held before this function is called, and is held by
 	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	return (arc_evict_needed);
 }
 
 /*
  * Keep arc_size under arc_c by running arc_evict which evicts data
  * from the ARC.
  *
  * After one arc_evict() pass, arc_evict_needed is recomputed under
  * arc_evict_lock.  When it ends up B_FALSE, every waiter queued on
  * arc_evict_waiters is removed from the list and has its condition
  * variable broadcast.  Neither argument is used.
  */
 /* ARGSUSED */
 static void
 arc_evict_cb(void *arg, zthr_t *zthr)
 {
 	uint64_t evicted = 0;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/* Evict from cache */
 	evicted = arc_evict();
 
 	/*
 	 * If evicted is zero, we couldn't evict anything
 	 * via arc_evict(). This could be due to hash lock
 	 * collisions, but more likely due to the majority of
 	 * arc buffers being unevictable. Therefore, even if
 	 * arc_size is above arc_c, another pass is unlikely to
 	 * be helpful and could potentially cause us to enter an
 	 * infinite loop.  Additionally, zthr_iscancelled() is
 	 * checked here so that if the arc is shutting down, the
 	 * broadcast will wake any remaining arc evict waiters.
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
 	    evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
 	if (!arc_evict_needed) {
 		/*
 		 * We're either no longer overflowing, or we
 		 * can't evict anything more, so we should wake
 		 * the threads blocked in arc_wait_for_eviction()
 		 * sooner.  Removing a waiter from the list is what
 		 * lets its wait loop terminate.
 		 */
 		arc_evict_waiter_t *aw;
 		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
 			cv_broadcast(&aw->aew_cv);
 		}
 		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Decide whether arc_reap_cb() should run, updating the ARC growth
  * throttle (arc_no_grow, arc_warm, arc_growtime) as a side effect.
  * Neither argument is used.
  *
  * Returns B_TRUE when free memory is negative and no kmem reap is
  * currently active; B_FALSE otherwise.
  */
 /* ARGSUSED */
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
 	int64_t free_memory = arc_available_memory();
 	static int reap_cb_check_counter = 0;
 
 	/*
 	 * If a kmem reap is already active, don't schedule more.  We must
 	 * check for this because kmem_cache_reap_soon() won't actually
 	 * block on the cache being reaped (this is to prevent callers from
 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
 	 * on a system with many, many full magazines, can take minutes).
 	 */
 	if (!kmem_cache_reap_active() && free_memory < 0) {
 
 		arc_no_grow = B_TRUE;
 		arc_warm = B_TRUE;
 		/*
 		 * Wait at least arc_grow_retry seconds before
 		 * considering growing.
 		 */
 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 		return (B_TRUE);
 	} else if (free_memory < (int64_t)(arc_c >> arc_no_grow_shift)) {
 		/*
 		 * The threshold is cast to a signed type on purpose:
 		 * arc_c is unsigned, and free_memory can still be negative
 		 * here (a reap being active skips the branch above).  An
 		 * unsigned comparison would turn that negative value into
 		 * a huge positive one and wrongly permit growth.
 		 */
 		arc_no_grow = B_TRUE;
 	} else if (gethrtime() >= arc_growtime) {
 		arc_no_grow = B_FALSE;
 	}
 
 	/*
 	 * Reclaim unused zstd compression and decompression context on
 	 * every 60th call that reaches this point (calls that return
 	 * B_TRUE above are not counted).  This is done here to avoid the
 	 * need for an independent thread.
 	 */
 	if (!((reap_cb_check_counter++) % 60))
 		zfs_zstd_cache_reap_now();
 
 	return (B_FALSE);
 }
 
 /*
  * Keep enough free memory in the system by reaping the ARC's kmem
  * caches.  To make more slabs reapable the target size of the cache
  * (arc_c) may be lowered, which in turn makes arc_evict_cb() free more
  * buffers.  Neither argument is used.
  */
 /* ARGSUSED */
 static void
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	int64_t avail;
 	int64_t shortfall;
 
 	/* Start the asynchronous reap of all of our caches. */
 	arc_kmem_reap_soon();
 
 	/*
 	 * Pause for at least arc_kmem_cache_reap_retry_ms (rounded up to
 	 * whole ticks) between arc_kmem_reap_soon() calls.  Otherwise we
 	 * could spend lots of time reaping caches while sitting near
 	 * arc_c_min.  The pause also gives the asynchronous reap a chance
 	 * to free enough memory that the check below finds no need to call
 	 * arc_reduce_target_size().
 	 */
 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
 
 	/*
 	 * Aim to keep free memory at a fraction of the target size
 	 * (arc_c >> arc_shrink_shift).  When oversubscribed (negative free
 	 * memory) the shortfall is the deficit plus that fraction; when
 	 * free memory is positive but below the fraction, the shortfall is
 	 * just the difference.
 	 */
 	avail = arc_available_memory();
 	shortfall = (arc_c >> arc_shrink_shift) - avail;
 	if (shortfall > 0)
 		arc_reduce_target_size(shortfall);
 
 	spl_fstrans_unmark(cookie);
 }
 
 #ifdef _KERNEL
 /*
  * Determine the amount of memory eligible for eviction contained in the
  * ARC. All clean data reported by the ghost lists can always be safely
  * evicted. Due to arc_c_min, the same does not hold for all clean data
  * contained by the regular mru and mfu lists.
  *
  * In the case of the regular mru and mfu lists, we need to report as
  * much clean data as possible, such that evicting that same reported
  * data will not bring arc_size below arc_c_min. Thus, in certain
  * circumstances, the total amount of clean data in the mru and mfu
  * lists might not actually be evictable.
  *
  * The following two distinct cases are accounted for:
  *
  * 1. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is greater than or equal to arc_c_min.
  *    (i.e. amount of dirty data >= arc_c_min)
  *
  *    This is the easy case; all clean data contained by the mru and mfu
  *    lists is evictable. Evicting all clean data can only drop arc_size
  *    to the amount of dirty data, which is greater than arc_c_min.
  *
  * 2. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is less than arc_c_min.
  *    (i.e. arc_c_min > amount of dirty data)
  *
  *    2.1. arc_size is greater than or equal to arc_c_min.
  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
  *
  *         In this case, not all clean data from the regular mru and mfu
  *         lists is actually evictable; we must leave enough clean data
  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
  *         evictable data from the two lists combined, is exactly the
  *         difference between arc_size and arc_c_min.
  *
  *    2.2. arc_size is less than arc_c_min
  *         (i.e. arc_c_min > arc_size > amount of dirty data)
  *
  *         In this case, none of the data contained in the mru and mfu
  *         lists is evictable, even if it's clean. Since arc_size is
  *         already below arc_c_min, evicting any more would only
  *         increase this negative difference.
  */
 
 #endif /* _KERNEL */
 
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
  * when we are adding new content to the cache.
  *
  * bytes: number of bytes being added; must be positive (asserted).
  * state: ARC state the buffer is coming from.  A ghost state moves the
  *        MRU target (arc_p); arc_anon additionally lets arc_p grow
  *        together with arc_c.
  *
  * Side effects: may update arc_p and arc_c, and may wake arc_reap_zthr.
  */
 static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
 	int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
 	int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
 
 	ASSERT(bytes > 0);
 	/*
 	 * Adapt the target size of the MRU list:
 	 *	- if we just hit in the MRU ghost list, then increase
 	 *	  the target size of the MRU list.
 	 *	- if we just hit in the MFU ghost list, then increase
 	 *	  the target size of the MFU list by decreasing the
 	 *	  target size of the MRU list.
 	 */
 	if (state == arc_mru_ghost) {
 		/*
 		 * NOTE(review): the division is only reached when
 		 * mrug_size < mfug_size, so it relies on mrug_size being
 		 * non-zero while a hit in the MRU ghost list is being
 		 * handled -- confirm that holds for every caller.
 		 */
 		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
 		if (!zfs_arc_p_dampener_disable)
 			mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
 		/* NOTE(review): same non-zero assumption, for mfug_size. */
 		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
 		if (!zfs_arc_p_dampener_disable)
 			mult = MIN(mult, 10);
 
 		/* Never subtract more than arc_p itself, then apply floor. */
 		delta = MIN(bytes * mult, arc_p);
 		arc_p = MAX(arc_p_min, arc_p - delta);
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
 	/*
 	 * Wake reap thread if we do not have any available memory
 	 */
 	if (arc_reclaim_needed()) {
 		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
 	/* Growth is currently throttled. */
 	if (arc_no_grow)
 		return;
 
 	/* Already at the maximum target size. */
 	if (arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * If we're within (2 * maxblocksize) bytes of the target
 	 * cache size, increment the target cache size
 	 */
 	ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
 	if (aggsum_upper_bound(&arc_sums.arcstat_size) >=
 	    arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
 		else if (state == arc_anon)
 			atomic_add_64(&arc_p, (int64_t)bytes);
 		if (arc_p > arc_c)
 			arc_p = arc_c;
 	}
 	ASSERT((int64_t)arc_p >= 0);
 }
 
 /*
  * Classify how far arc_size has grown past arc_c, using the allowance
  * derived from zfs_arc_overflow_shift.
  *
  * Returns ARC_OVF_NONE while the size is less than half an allowance
  * above arc_c, ARC_OVF_SOME for less than one further allowance beyond
  * that, and ARC_OVF_SEVERE otherwise.
  */
 static arc_ovf_level_t
 arc_is_overflowing(void)
 {
 	/* The allowance is never smaller than one maximum-size block. */
 	int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
 	    arc_c >> zfs_arc_overflow_shift);
 
 	/*
 	 * Only the lower bound of the size is consulted, for performance
 	 * reasons.  The goals are that the arc never grows without bound
 	 * and that it can still reach its maximum size; this check
 	 * achieves both.  The most we could run over by is
 	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a
 	 * block in the ARC, which in practice is in the tens of MB and
 	 * low enough to be safe.
 	 */
 	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
 	    arc_c - overflow / 2;
 
 	if (over < 0)
 		return (ARC_OVF_NONE);
 	if (over < overflow)
 		return (ARC_OVF_SOME);
 	return (ARC_OVF_SEVERE);
 }
 
 /*
  * Do the ARC accounting for size bytes on behalf of hdr (see
  * arc_get_data_impl()) and then allocate an ABD of that size.  The
  * second abd_alloc() argument is B_TRUE for metadata headers and
  * B_FALSE for data headers.
  */
 static abd_t *
 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
     boolean_t do_adapt)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_get_data_impl(hdr, size, tag, do_adapt);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		return (abd_alloc(size, B_FALSE));
 	}
 
 	return (abd_alloc(size, B_TRUE));
 }
 
 /*
  * Do the ARC accounting for size bytes on behalf of hdr (always with
  * adaptation enabled) and then allocate a linear buffer: from
  * zio_buf_alloc() for metadata headers, zio_data_buf_alloc() for data.
  */
 static void *
 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	void *data;
 
 	arc_get_data_impl(hdr, size, tag, B_TRUE);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		data = zio_data_buf_alloc(size);
 	} else {
 		data = zio_buf_alloc(size);
 	}
 
 	return (data);
 }
 
 /*
  * Wait for the specified amount of data (in bytes) to be evicted from the
  * ARC, and for there to be sufficient free memory in the system.  Waiting for
  * eviction ensures that the memory used by the ARC decreases.  Waiting for
  * free memory ensures that the system won't run out of free pages, regardless
  * of ARC behavior and settings.  See arc_lowmem_init().
  *
  * What happens depends on arc_is_overflowing():
  *   ARC_OVF_NONE   - return immediately.
  *   ARC_OVF_SOME   - flag eviction as needed and wake the eviction
  *                    thread, but do not wait.
  *   ARC_OVF_SEVERE - queue a waiter on arc_evict_waiters and block until
  *                    it has been removed from that list.
  *
  * amount: number of bytes of eviction progress to wait for; it is added
  *         to the eviction count this waiter is queued behind.
  */
 void
 arc_wait_for_eviction(uint64_t amount)
 {
 	switch (arc_is_overflowing()) {
 	case ARC_OVF_NONE:
 		return;
 	case ARC_OVF_SOME:
 		/*
 		 * This is a bit racy without taking arc_evict_lock, but the
 		 * worst that can happen is we either call zthr_wakeup() extra
 		 * time due to race with other thread here, or the set flag
 		 * get cleared by arc_evict_cb(), which is unlikely due to
 		 * big hysteresis, but also not important since at this level
 		 * of overflow the eviction is purely advisory.  Same time
 		 * taking the global lock here every time without waiting for
 		 * the actual eviction creates a significant lock contention.
 		 */
 		if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		return;
 	case ARC_OVF_SEVERE:
 	default:
 	{
 		/* The waiter lives on this thread's stack while it waits. */
 		arc_evict_waiter_t aw;
 		list_link_init(&aw.aew_node);
 		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
 
 		uint64_t last_count = 0;
 		mutex_enter(&arc_evict_lock);
 		if (!list_is_empty(&arc_evict_waiters)) {
 			/* Queue behind the most recently added waiter. */
 			arc_evict_waiter_t *last =
 			    list_tail(&arc_evict_waiters);
 			last_count = last->aew_count;
 		} else if (!arc_evict_needed) {
 			/* First waiter: make sure eviction gets started. */
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		/*
 		 * Note, the last waiter's count may be less than
 		 * arc_evict_count if we are low on memory in which
 		 * case arc_evict_state_impl() may have deferred
 		 * wakeups (but still incremented arc_evict_count).
 		 */
 		aw.aew_count = MAX(last_count, arc_evict_count) + amount;
 
 		list_insert_tail(&arc_evict_waiters, &aw);
 
 		arc_set_need_free();
 
 		DTRACE_PROBE3(arc__wait__for__eviction,
 		    uint64_t, amount,
 		    uint64_t, arc_evict_count,
 		    uint64_t, aw.aew_count);
 
 		/*
 		 * We will be woken up either when arc_evict_count reaches
 		 * aew_count, or when the ARC is no longer overflowing and
 		 * eviction completes.
 		 * In case of "false" wakeup, we will still be on the list.
 		 */
 		do {
 			cv_wait(&aw.aew_cv, &arc_evict_lock);
 		} while (list_link_active(&aw.aew_node));
 		mutex_exit(&arc_evict_lock);
 
 		cv_destroy(&aw.aew_cv);
 	}
 	}
 }
 
 /*
  * Account for a block of size bytes that is about to be allocated for
  * hdr.  No memory is allocated here; the callers arc_get_data_abd() and
  * arc_get_data_buf() do that after this function returns.  If we are
  * hitting the hard limit for the cache size, we must sleep, waiting for
  * the eviction thread to catch up. If we're past the target size but
  * below the hard limit, we'll only signal the eviction thread and
  * continue on.
  *
  * hdr:      header the space is charged to; its type selects between
  *           ARC_SPACE_META and ARC_SPACE_DATA.
  * size:     number of bytes being added.
  * tag:      holder tag passed to the refcount additions.
  * do_adapt: when true, arc_adapt() is called first.
  */
 static void
 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
     boolean_t do_adapt)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	if (do_adapt)
 		arc_adapt(size, state);
 
 	/*
 	 * If arc_size is currently overflowing, we must be adding data
 	 * faster than we are evicting.  To ensure we don't compound the
 	 * problem by adding more data and forcing arc_size to grow even
 	 * further past its target size, we wait for the eviction thread to
 	 * make some progress.  We also wait for there to be sufficient free
 	 * memory in the system, as measured by arc_free_memory().
 	 *
 	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
 	 * requested size to be evicted.  This should be more than 100%, to
 	 * ensure that progress is also made towards getting arc_size
 	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 */
 	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type == ARC_BUFC_METADATA) {
 		arc_space_consume(size, ARC_SPACE_META);
 	} else {
 		arc_space_consume(size, ARC_SPACE_DATA);
 	}
 
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
 	if (!GHOST_STATE(state)) {
 
 		(void) zfs_refcount_add_many(&state->arcs_size, size, tag);
 
 		/*
 		 * If this is reached via arc_read, the link is
 		 * protected by the hash lock. If reached via
 		 * arc_buf_alloc, the header should not be accessed by
 		 * any other thread. And, if reached via arc_read_done,
 		 * the hash lock will protect it if it's found in the
 		 * hash table; otherwise no other thread should be
 		 * trying to [add|remove]_reference it.
 		 *
 		 * A header that is linked on its state's list also has
 		 * the bytes added to the per-type arcs_esize counter.
 		 */
 		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 			ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 			(void) zfs_refcount_add_many(&state->arcs_esize[type],
 			    size, tag);
 		}
 
 		/*
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
 		if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c &&
 		    hdr->b_l1hdr.b_state == arc_anon &&
 		    (zfs_refcount_count(&arc_anon->arcs_size) +
 		    zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
 			arc_p = MIN(arc_c, arc_p + size);
 	}
 }
 
 /*
  * Undo the ARC accounting for size bytes charged to hdr (see
  * arc_free_data_impl()) and then free the ABD itself.
  */
 static void
 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
 {
 	arc_free_data_impl(hdr, size, tag);
 	abd_free(abd);
 }
 
 /*
  * Undo the ARC accounting for size bytes charged to hdr and then hand
  * the linear buffer back: zio_buf_free() for metadata headers,
  * zio_data_buf_free() for data headers.
  */
 static void
 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_free_data_impl(hdr, size, tag);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		zio_data_buf_free(buf, size);
 		return;
 	}
 
 	zio_buf_free(buf, size);
 }
 
 /*
  * Release the ARC accounting for size bytes of hdr's data: the per-state
  * size (plus the per-type arcs_esize counter when the header is linked
  * on a state list) and the global space accounting for its type.  The
  * memory itself is freed by the callers.
  */
 static void
 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
 	arc_state_t *cur_state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(cur_state != arc_anon && cur_state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&cur_state->arcs_esize[type],
 		    size, tag);
 	}
 	(void) zfs_refcount_remove_many(&cur_state->arcs_size, size, tag);
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	} else {
 		arc_space_return(size, ARC_SPACE_META);
 	}
 }
 
 /*
  * This routine is called whenever a buffer is accessed.  It stamps the
  * header's access time and moves the header between ARC states
  * according to the state it is found in, bumping the matching hit
  * counters.
  *
  * The caller must hold hash_lock (asserted below) and the header must
  * have an L1 header.
  *
  * NOTE(review): an earlier version of this comment said the hash lock
  * is dropped in this function.  There is no mutex_exit() in this body,
  * and arc_buf_access() releases the lock itself after calling it, so
  * the lock appears to still be held on return -- confirm against
  * arc_change_state().
  */
 static void
 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	clock_t now;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * This buffer is not in the cache, and does not
 		 * appear in our "ghost" list.  Add the new buffer
 		 * to the MRU state.
 		 */
 
 		ASSERT0(hdr->b_l1hdr.b_arc_access);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mru, hdr, hash_lock);
 
 	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		now = ddi_get_lbolt();
 
 		/*
 		 * If this buffer is here because of a prefetch, then either:
 		 * - clear the flag if this is a "referencing" read
 		 *   (any subsequent access will bump this into the MFU state).
 		 * or
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
 		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
 				/* link protected by hash lock */
 				ASSERT(multilist_link_active(
 				    &hdr->b_l1hdr.b_arc_node));
 			} else {
 				/*
 				 * The L2ARC per-state stats are taken out
 				 * before the flags change and put back
 				 * afterwards.
 				 */
 				if (HDR_HAS_L2HDR(hdr))
 					l2arc_hdr_arcstats_decrement_state(hdr);
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREFETCH |
 				    ARC_FLAG_PRESCIENT_PREFETCH);
 				atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 				if (HDR_HAS_L2HDR(hdr))
 					l2arc_hdr_arcstats_increment_state(hdr);
 			}
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * This buffer has been "accessed" only once so far,
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
 		if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
 		    ARC_MINTIME)) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
 			hdr->b_l1hdr.b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 			arc_change_state(arc_mfu, hdr, hash_lock);
 		}
 		/* Counted as an MRU hit whether or not it was promoted. */
 		atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 		ARCSTAT_BUMP(arcstat_mru_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been "accessed" recently, but
 		 * was evicted from the cache.  Move it to the
 		 * MFU state, unless it is flagged as a prefetch, in
 		 * which case it goes back to the MRU state.
 		 */
 		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			new_state = arc_mru;
 			if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
 				if (HDR_HAS_L2HDR(hdr))
 					l2arc_hdr_arcstats_decrement_state(hdr);
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREFETCH |
 				    ARC_FLAG_PRESCIENT_PREFETCH);
 				if (HDR_HAS_L2HDR(hdr))
 					l2arc_hdr_arcstats_increment_state(hdr);
 			}
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		arc_change_state(new_state, hdr, hash_lock);
 
 		atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
 		 *
 		 * NOTE: an add_reference() that occurred when we did
 		 * the arc_read() will have kicked this off the list.
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
 
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
 		ARCSTAT_BUMP(arcstat_mfu_hits);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
 		 * This buffer has been accessed more than once but has
 		 * been evicted from the cache.  Move it back to the
 		 * MFU state.
 		 */
 
 		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
 			new_state = arc_mru;
 		}
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		/* The mfu probe fires even when new_state is arc_mru. */
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(new_state, hdr, hash_lock);
 
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mfu, hdr, hash_lock);
 	} else {
 		/* Any other state pointer is treated as fatal. */
 		cmn_err(CE_PANIC, "invalid arc state 0x%p",
 		    hdr->b_l1hdr.b_state);
 	}
 }
 
 /*
  * This routine is called by dbuf_hold() to update the arc_access() state
  * which otherwise would be skipped for entries in the dbuf cache.
  *
  * Nothing is done when the buffer's header is anonymous or empty.
  * Otherwise arc_access() is run under the header's hash lock and the
  * hit statistics are bumped.
  */
 void
 arc_buf_access(arc_buf_t *buf)
 {
 	mutex_enter(&buf->b_evict_lock);
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Avoid taking the hash_lock when possible as an optimization.
 	 * The header must be checked again under the hash_lock in order
 	 * to handle the case where it is concurrently being released.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
 		mutex_exit(&buf->b_evict_lock);
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/* Re-check now that the hash lock is held; count the skip. */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
 		mutex_exit(hash_lock);
 		mutex_exit(&buf->b_evict_lock);
 		ARCSTAT_BUMP(arcstat_access_skip);
 		return;
 	}
 
 	/* Only the hash lock is held from here on. */
 	mutex_exit(&buf->b_evict_lock);
 
 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 	    hdr->b_l1hdr.b_state == arc_mfu);
 
 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	arc_access(hdr, hash_lock);
 	mutex_exit(hash_lock);
 
 	/* The header flags below are read after the lock was released. */
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
 	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 }
 
 /*
  * A generic arc_read_done_func_t which you can use: copies the buffer's
  * contents (arc_buf_size() bytes) into the memory arg points at and then
  * destroys the buffer, using arg as the tag.  A NULL buf is ignored.
  */
 /* ARGSUSED */
 void
 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	if (buf != NULL) {
 		bcopy(buf->b_data, arg, arc_buf_size(buf));
 		arc_buf_destroy(buf, arg);
 	}
 }
 
 /*
  * A ready-made arc_read_done_func_t: stores buf (which may be NULL) through
  * the arc_buf_t ** passed as arg.  When a zio is supplied, a NULL buf is
  * expected to coincide with a non-zero io_error and a non-NULL buf with
  * io_error == 0.
  */
 /* ARGSUSED */
 void
 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **bufp = arg;
 
 	if (buf != NULL) {
 		/* Success: hand the buffer back to the caller. */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		*bufp = buf;
 		ASSERT(buf->b_data != NULL);
 		return;
 	}
 
 	/* Failure: there is no buffer to return. */
 	ASSERT(zio == NULL || zio->io_error != 0);
 	*bufp = NULL;
 }
 
 /*
  * Debug-only consistency check (assertions only) that the sizes,
  * compression and protection recorded in hdr agree with the block pointer
  * bp.
  */
 static void
 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
 {
 	/* Holes and embedded bps: no physical size, no compression. */
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
 		ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
 		return;
 	}
 
 	/* The compression type is only compared for compressed hdrs. */
 	if (HDR_COMPRESSION_ENABLED(hdr)) {
 		ASSERT3U(arc_hdr_get_compress(hdr), ==,
 		    BP_GET_COMPRESS(bp));
 	}
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 	ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
 	ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
 }
 
 /*
  * Completion callback for the zio_read() issued by arc_read(); the hdr that
  * is being filled arrives in zio->io_private.
  *
  * Records the encryption parameters and byteswap state on the hdr, builds an
  * arc_buf_t for every queued callback that wants one, clears
  * ARC_FLAG_IO_IN_PROGRESS, wakes any threads sleeping on the hdr's cv, and
  * finally invokes and frees every arc_callback_t on the hdr's list.  If the
  * read (or a later decompress/decrypt step) failed, the hdr is flagged with
  * ARC_FLAG_IO_ERROR, moved to arc_anon, removed from the hash table and, if
  * nothing references it, destroyed.
  */
 static void
 arc_read_done(zio_t *zio)
 {
 	blkptr_t 	*bp = zio->io_bp;
 	arc_buf_hdr_t	*hdr = zio->io_private;
 	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 	boolean_t	freeable = B_FALSE;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
 		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
 		/* On success this returns with hash_lock held. */
 		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
 
 		ASSERT((found == hdr &&
 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 		    (found == hdr && HDR_L2_READING(hdr)));
 		ASSERT3P(hash_lock, !=, NULL);
 	}
 
 	/*
 	 * For protected blocks, stash the object type, objset, salt, IV and
 	 * MAC in the hdr.  ZIL blocks carry their MAC inside the data
 	 * (decoded from a borrowed copy of the zil_chain_t); all others
 	 * carry it in the bp.
 	 */
 	if (BP_IS_PROTECTED(bp)) {
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 
 		if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 			void *tmpbuf;
 
 			tmpbuf = abd_borrow_buf_copy(zio->io_abd,
 			    sizeof (zil_chain_t));
 			zio_crypt_decode_mac_zil(tmpbuf,
 			    hdr->b_crypt_hdr.b_mac);
 			abd_return_buf(zio->io_abd, tmpbuf,
 			    sizeof (zil_chain_t));
 		} else {
 			zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
 		}
 	}
 
 	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 		if (!HDR_L2_READING(hdr)) {
 			hdr->b_complevel = zio->io_prop.zp_complevel;
 		}
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
 	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);
 
 	if (hash_lock && zio->io_error == 0 &&
 	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
 		 * called arc_access (to prevent any simultaneous readers from
 		 * getting confused).
 		 */
 		arc_access(hdr, hash_lock);
 	}
 
 	/*
 	 * If a read request has a callback (i.e. acb_done is not NULL), then we
 	 * make a buf containing the data according to the parameters which were
 	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
 	 * aren't needlessly decompressing the data multiple times.
 	 */
 	int callback_cnt = 0;
 	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
 		/* No done func, or the caller asked for no buf: skip. */
 		if (!acb->acb_done || acb->acb_nobuf)
 			continue;
 
 		callback_cnt++;
 
 		/*
 		 * Once the zio has failed (either originally or because an
 		 * earlier iteration set io_error below), the remaining
 		 * callbacks are still counted but get no buf.
 		 */
 		if (zio->io_error != 0)
 			continue;
 
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
 		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
 
 		/*
 		 * Assert non-speculative zios didn't fail because an
 		 * encryption key wasn't loaded
 		 */
 		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 		    error != EACCES);
 
 		/*
 		 * If we failed to decrypt, report an error now (as the zio
 		 * layer would have done if it had done the transforms).
 		 */
 		if (error == ECKSUM) {
 			ASSERT(BP_IS_PROTECTED(bp));
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb);
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
 			}
 		}
 
 		if (error != 0) {
 			/*
 			 * Decompression or decryption failed.  Set
 			 * io_error so that when we call acb_done
 			 * (below), we will indicate that the read
 			 * failed. Note that in the unusual case
 			 * where one callback is compressed and another
 			 * uncompressed, we will mark all of them
 			 * as failed, even though the uncompressed
 			 * one can't actually fail.  In this case,
 			 * the hdr will not be anonymous, because
 			 * if there are multiple callbacks, it's
 			 * because multiple threads found the same
 			 * arc buf in the hash table.
 			 */
 			zio->io_error = error;
 		}
 	}
 
 	/*
 	 * If there are multiple callbacks, we must have the hash lock,
 	 * because the only way for multiple threads to find this hdr is
 	 * in the hash table.  This ensures that if there are multiple
 	 * callbacks, the hdr is not anonymous.  If it were anonymous,
 	 * we couldn't use arc_buf_destroy() in the error case below.
 	 */
 	ASSERT(callback_cnt < 2 || hash_lock != NULL);
 
 	/*
 	 * Detach the callback list from the hdr; from here on it is only
 	 * reachable through the local callback_list.
 	 */
 	hdr->b_l1hdr.b_acb = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	if (callback_cnt == 0)
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
 	    callback_list != NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		/*
 		 * Failed read: make the hdr anonymous and undiscoverable,
 		 * and remember whether it can be destroyed once the
 		 * callbacks have run.
 		 */
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/*
 	 * Broadcast before we drop the hash_lock to avoid the possibility
 	 * that the hdr (and hence the cv) might be freed before we get to
 	 * the cv_broadcast().
 	 */
 	cv_broadcast(&hdr->b_l1hdr.b_cv);
 
 	if (hash_lock != NULL) {
 		mutex_exit(hash_lock);
 	} else {
 		/*
 		 * This block was freed while we waited for the read to
 		 * complete.  It has been removed from the hash table and
 		 * moved to the anonymous state (so that it won't show up
 		 * in the cache).
 		 */
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 		freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done != NULL) {
 			if (zio->io_error != 0 && acb->acb_buf != NULL) {
 				/*
 				 * If arc_buf_alloc_impl() fails during
 				 * decompression, the buf will still be
 				 * allocated, and needs to be freed here.
 				 */
 				arc_buf_destroy(acb->acb_buf,
 				    acb->acb_private);
 				acb->acb_buf = NULL;
 			}
 			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
 			    acb->acb_buf, acb->acb_private);
 		}
 
 		/*
 		 * Propagate the result to the placeholder zio that arc_read()
 		 * created under the caller's pio, then let it complete.
 		 */
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		callback_list = acb->acb_next;
 		kmem_free(acb, sizeof (arc_callback_t));
 	}
 
 	if (freeable)
 		arc_hdr_destroy(hdr);
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  *
  * *arc_flags is both input and output: ARC_FLAG_CACHED is set in it when
  * the hdr found in the hash table already holds (or is being filled with)
  * the requested form of the data.
  *
  * Return value:
  *	0	the request was satisfied, queued or issued;
  *	ECKSUM	the block pointer failed zfs_blkptr_verify();
  *	ENOENT	ARC_FLAG_CACHED_ONLY was set and the block is either not
  *		cached or still has its read in progress;
  *	EIO	a cached block failed authentication/decryption while its
  *		buf was being filled (converted from ECKSUM);
  *	other	any other arc_buf_alloc_impl() error on a cache hit, or the
  *		zio_wait() result of a blocking (ARC_FLAG_WAIT) read from
  *		the main pool.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = NULL;
 	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
 	boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
 	int rc = 0;
 
 	ASSERT(!embedded_bp ||
 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * Normally SPL_FSTRANS will already be set since kernel threads which
 	 * expect to call the DMU interfaces will set it when created.  System
 	 * calls are similarly handled by setting/cleaning the bit in the
 	 * registered callback (module/os/.../zfs/zpl_*).
 	 *
 	 * External consumers such as Lustre which call the exported DMU
 	 * interfaces may not have set SPL_FSTRANS.  To avoid a deadlock
 	 * on the hash_lock always set and clear the bit.
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 top:
 	if (!embedded_bp) {
 		/*
 		 * Embedded BP's have no DVA and require no I/O to "read", so
 		 * they skip both the block pointer verification and the hash
 		 * table lookup done here; hdr stays NULL for them and a hdr
 		 * that is never inserted into the hash table is allocated in
 		 * the miss path below.
 		 */
 		if (!zfs_blkptr_verify(spa, bp, zio_flags &
 		    ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_LOG)) {
 			rc = SET_ERROR(ECKSUM);
 			goto out;
 		}
 
 		/* On success hash_lock is returned held. */
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
 	/*
 	 * Determine if we have an L1 cache hit or a cache miss. For simplicity
 	 * we maintain encrypted data separately from compressed / uncompressed
 	 * data. If the user is requesting raw encrypted data and we don't have
 	 * that in the header we will read from disk to guarantee that we can
 	 * get it even if the encryption keys aren't loaded.
 	 */
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		arc_buf_t *buf = NULL;
 		*arc_flags |= ARC_FLAG_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
 				goto out;
 			}
 
 			ASSERT3P(head_zio, !=, NULL);
 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
 			    priority == ZIO_PRIORITY_SYNC_READ) {
 				/*
 				 * This is a sync read that needs to wait for
 				 * an in-flight async read. Request that the
 				 * zio have its priority upgraded.
 				 */
 				zio_change_priority(head_zio, priority);
 				DTRACE_PROBE1(arc__async__upgrade__sync,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
 			}
 			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREDICTIVE_PREFETCH);
 			}
 
 			/*
 			 * Blocking caller: sleep until arc_read_done()
 			 * broadcasts on the hdr's cv, then start over so the
 			 * request is re-evaluated against the new state.
 			 */
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
 			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
 			/*
 			 * Non-blocking caller with a done func: push a new
 			 * callback onto the head of the in-flight read's list
 			 * for arc_read_done() to run.
 			 */
 			if (done) {
 				arc_callback_t *acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				acb->acb_compressed = compressed_read;
 				acb->acb_encrypted = encrypted_read;
 				acb->acb_noauth = noauth_read;
 				acb->acb_nobuf = no_buf;
 				acb->acb_zb = *zb;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT3P(acb->acb_done, !=, NULL);
 				acb->acb_zio_head = head_zio;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb = acb;
 			}
 			mutex_exit(hash_lock);
 			goto out;
 		}
 
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu);
 
 		if (done && !no_buf) {
 			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
 				/*
 				 * This is a demand read which does not have to
 				 * wait for i/o because we did a predictive
 				 * prefetch i/o for it, which has completed.
 				 */
 				DTRACE_PROBE1(
 				    arc__demand__hit__predictive__prefetch,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(
 				    arcstat_demand_hit_predictive_prefetch);
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREDICTIVE_PREFETCH);
 			}
 
 			if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
 				ARCSTAT_BUMP(
 				    arcstat_demand_hit_prescient_prefetch);
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PRESCIENT_PREFETCH);
 			}
 
 			ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
 
 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb, private,
 			    encrypted_read, compressed_read, noauth_read,
 			    B_TRUE, &buf);
 			if (rc == ECKSUM) {
 				/*
 				 * Convert authentication and decryption errors
 				 * to EIO (and generate an ereport if needed)
 				 * before leaving the ARC.
 				 */
 				rc = SET_ERROR(EIO);
 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 					spa_log_error(spa, zb);
 					(void) zfs_ereport_post(
 					    FM_EREPORT_ZFS_AUTHENTICATION,
 					    spa, NULL, zb, NULL, 0);
 				}
 			}
 			/*
 			 * On failure drop the reference and the partially
 			 * built buf; done() below is then called with a NULL
 			 * buf.
 			 */
 			if (rc != 0) {
 				(void) remove_reference(hdr, hash_lock,
 				    private);
 				arc_buf_destroy_impl(buf);
 				buf = NULL;
 			}
 
 			/* assert any errors weren't due to unloaded keys */
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
 			    rc != EACCES);
 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			if (HDR_HAS_L2HDR(hdr))
 				l2arc_hdr_arcstats_decrement_state(hdr);
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 			if (HDR_HAS_L2HDR(hdr))
 				l2arc_hdr_arcstats_increment_state(hdr);
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
 		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, hits);
 
 		/* Cache hit: no I/O was needed, so the zio argument is NULL. */
 		if (done)
 			done(NULL, zb, bp, buf, private);
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 		uint64_t size;
 		abd_t *hdr_abd;
 		int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
 
 		if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 			rc = SET_ERROR(ENOENT);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			goto out;
 		}
 
 		if (hdr == NULL) {
 			/*
 			 * This block is not in the cache or it has
 			 * embedded data.
 			 */
 			arc_buf_hdr_t *exists = NULL;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type,
 			    encrypted_read);
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
 				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
 		} else {
 			/*
 			 * This block is in the ghost cache or encrypted data
 			 * was requested and we didn't have it. If it was
 			 * L2-only (and thus didn't have an L1 hdr),
 			 * we realloc the header to add an L1 hdr.
 			 */
 			if (!HDR_HAS_L1HDR(hdr)) {
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
 
 			if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
 				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 				ASSERT(!HDR_HAS_RABD(hdr));
 				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 				ASSERT0(zfs_refcount_count(
 				    &hdr->b_l1hdr.b_refcnt));
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 			} else if (HDR_IO_IN_PROGRESS(hdr)) {
 				/*
 				 * If this header already had an IO in progress
 				 * and we are performing another IO to fetch
 				 * encrypted data we must wait until the first
 				 * IO completes so as not to confuse
 				 * arc_read_done(). This should be very rare
 				 * and so the performance impact shouldn't
 				 * matter.
 				 */
 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
 
 			/*
 			 * This is a delicate dance that we play here.
 			 * This hdr might be in the ghost list so we access
 			 * it to move it out of the ghost list before we
 			 * initiate the read. If it's a prefetch then
 			 * it won't have a callback so we'll remove the
 			 * reference that arc_buf_alloc_impl() created. We
 			 * do this after we've called arc_access() to
 			 * avoid hitting an assert in remove_reference().
 			 */
 			arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
 			arc_access(hdr, hash_lock);
 			arc_hdr_alloc_abd(hdr, alloc_flags);
 		}
 
 		/*
 		 * Pick the abd the I/O will fill and its size: the raw
 		 * (encrypted) abd for encrypted reads, otherwise b_pabd.
 		 */
 		if (encrypted_read) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			size = HDR_GET_PSIZE(hdr);
 			hdr_abd = hdr->b_crypt_hdr.b_rabd;
 			zio_flags |= ZIO_FLAG_RAW;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			size = arc_hdr_size(hdr);
 			hdr_abd = hdr->b_l1hdr.b_pabd;
 
 			if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			}
 
 			/*
 			 * For authenticated bp's, we do not ask the ZIO layer
 			 * to authenticate them since this will cause the entire
 			 * IO to fail if the key isn't loaded. Instead, we
 			 * defer authentication until arc_buf_fill(), which will
 			 * verify the data when the key is available.
 			 */
 			if (BP_IS_AUTHENTICATED(bp))
 				zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
 		}
 
 		if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			if (HDR_HAS_L2HDR(hdr))
 				l2arc_hdr_arcstats_decrement_state(hdr);
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 			if (HDR_HAS_L2HDR(hdr))
 				l2arc_hdr_arcstats_increment_state(hdr);
 		}
 		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		if (BP_IS_AUTHENTICATED(bp))
 			arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		if (BP_GET_LEVEL(bp) > 0)
 			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
 		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		/*
 		 * First (and so far only) callback record for this read.
 		 * NOTE(review): unlike the in-progress path above, acb_nobuf
 		 * is left at its zeroed default here, so ARC_FLAG_NO_BUF has
 		 * no effect on a cache miss -- confirm this is intentional.
 		 */
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 		acb->acb_compressed = compressed_read;
 		acb->acb_encrypted = encrypted_read;
 		acb->acb_noauth = noauth_read;
 		acb->acb_zb = *zb;
 
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		hdr->b_l1hdr.b_acb = acb;
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 
 		if (HDR_HAS_L2HDR(hdr) &&
 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out L2ARC device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		/*
 		 * We count both async reads and scrub IOs as asynchronous so
 		 * that both can be upgraded in the event of a cache hit while
 		 * the read IO is still in-flight.
 		 */
 		if (priority == ZIO_PRIORITY_ASYNC_READ ||
 		    priority == ZIO_PRIORITY_SCRUB)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 		else
 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
 		/*
 		 * At this point, we have a level 1 cache miss or a blkptr
 		 * with embedded data.  Try again in L2ARC if possible.
 		 */
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
 
 		/*
 		 * Skip ARC stat bump for block pointers with embedded
 		 * data. The data are read from the blkptr itself via
 		 * decode_embedded_bp_compressed().
 		 */
 		if (!embedded_bp) {
 			DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
 			    blkptr_t *, bp, uint64_t, lsize,
 			    zbookmark_phys_t *, zb);
 			ARCSTAT_BUMP(arcstat_misses);
 			ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
 			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
 			    metadata, misses);
 			zfs_racct_read(size, 1);
 		}
 
 		/* Check if the spa even has l2 configured */
 		const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
 		    spa->spa_l2cache.sav_count > 0;
 
 		/*
 		 * vd != NULL here means SCL_L2ARC is held as reader (taken
 		 * above); every branch below either hands it to the L2ARC
 		 * read or drops it with spa_config_exit().
 		 */
 		if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch or l2arc_noprefetch is 0.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
 				abd_t *abd;
 				uint64_t asize;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 				atomic_inc_32(&hdr->b_l2hdr.b_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
 				/*
 				 * When Compressed ARC is disabled, but the
 				 * L2ARC block is compressed, arc_hdr_size()
 				 * will have returned LSIZE rather than PSIZE.
 				 */
 				if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 				    !HDR_COMPRESSION_ENABLED(hdr) &&
 				    HDR_GET_PSIZE(hdr) != 0) {
 					size = HDR_GET_PSIZE(hdr);
 				}
 
 				/*
 				 * If the device's allocation size differs
 				 * from size, read into a temporary abd that
 				 * is recorded in the callback; otherwise read
 				 * straight into the hdr's abd.
 				 */
 				asize = vdev_psize_to_asize(vd, size);
 				if (asize != size) {
 					abd = abd_alloc_for_io(asize,
 					    HDR_ISTYPE_METADATA(hdr));
 					cb->l2rcb_abd = abd;
 				} else {
 					abd = hdr_abd;
 				}
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + asize <= vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
 				ASSERT3U(arc_hdr_get_compress(hdr), !=,
 				    ZIO_COMPRESS_EMPTY);
 				rzio = zio_read_phys(pio, vd, addr,
 				    asize, abd,
 				    ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority,
 				    zio_flags | ZIO_FLAG_DONT_CACHE |
 				    ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				acb->acb_zio_head = rzio;
 
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes,
 				    HDR_GET_PSIZE(hdr));
 
 				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					goto out;
 				}
 
 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					goto out;
 
 				/* l2arc read error; goto zio_read() */
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 
 			/*
 			 * Only a spa with l2 should contribute to l2
 			 * miss stats.  (Including the case of having a
 			 * faulted cache device - that's also a miss.)
 			 */
 			if (spa_has_l2) {
 				/*
 				 * Skip ARC stat bump for block pointers with
 				 * embedded data. The data are read from the
 				 * blkptr itself via
 				 * decode_embedded_bp_compressed().
 				 */
 				if (!embedded_bp) {
 					DTRACE_PROBE1(l2arc__miss,
 					    arc_buf_hdr_t *, hdr);
 					ARCSTAT_BUMP(arcstat_l2_misses);
 				}
 			}
 		}
 
 		/*
 		 * Read from the main pool; arc_read_done() gets the hdr as
 		 * its private argument.
 		 */
 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
 		acb->acb_zio_head = rzio;
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
 			goto out;
 		}
 
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 
 out:
 	/* embedded bps don't actually go to disk */
 	if (!embedded_bp)
 		spa_read_history_add(spa, zb, *arc_flags);
 	spl_fstrans_unmark(cookie);
 	return (rc);
 }
 
 /*
  * Allocate an arc_prune_t for func/private, take a reference on it on
  * behalf of arc_prune_list, and insert it at the head of that list under
  * arc_prune_mtx.  Returns the new entry; it is torn down again by
  * arc_remove_prune_callback().
  */
 arc_prune_t *
 arc_add_prune_callback(arc_prune_func_t *func, void *private)
 {
 	arc_prune_t *entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
 
 	/* Fully initialize the entry before it becomes visible. */
 	entry->p_pfunc = func;
 	entry->p_private = private;
 	list_link_init(&entry->p_node);
 	zfs_refcount_create(&entry->p_refcnt);
 
 	/* Publish it; the list itself is the holder of the reference. */
 	mutex_enter(&arc_prune_mtx);
 	zfs_refcount_add(&entry->p_refcnt, &arc_prune_list);
 	list_insert_head(&arc_prune_list, entry);
 	mutex_exit(&arc_prune_mtx);
 
 	return (entry);
 }
 
 /*
  * Unregister a prune callback: unlink p from arc_prune_list and drop the
  * list's reference under arc_prune_mtx.  If references remain afterwards,
  * wait for the outstanding arc_prune_taskq work to finish, then destroy
  * the refcount and free p.
  */
 void
 arc_remove_prune_callback(arc_prune_t *p)
 {
 	boolean_t still_referenced;
 
 	mutex_enter(&arc_prune_mtx);
 	list_remove(&arc_prune_list, p);
 	still_referenced =
 	    (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) ?
 	    B_TRUE : B_FALSE;
 	mutex_exit(&arc_prune_mtx);
 
 	/* wait for arc_prune_task to finish */
 	if (still_referenced)
 		taskq_wait_outstanding(arc_prune_taskq, 0);
 
 	ASSERT0(zfs_refcount_count(&p->p_refcnt));
 	zfs_refcount_destroy(&p->p_refcnt);
 	kmem_free(p, sizeof (*p));
 }
 
 /*
  * Tell the ARC that the block described by bp has been freed and will
  * never be read again, so any idle cached header for it can be discarded.
  */
 void
 arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t guid = spa_load_guid(spa);
 	kmutex_t *hash_lock;
 	arc_buf_hdr_t *hdr;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	/* Nothing cached for this block: nothing to do. */
 	hdr = buf_hash_find(guid, bp, &hash_lock);
 	if (hdr == NULL)
 		return;
 
 	/*
 	 * The block being freed may still be busy, in which case the hdr
 	 * must be left alone:
 	 *
 	 *  - It may have I/O in flight (e.g. a prefetch); the hdr keeps
 	 *    ARC_FLAG_IO_IN_PROGRESS set until that I/O completes.
 	 *
 	 *  - It may still be referenced, as happens with a dedup-ed block
 	 *    written via dmu_sync().  dmu_sync() writes the block to its
 	 *    final location without the dedup flag, leaving the hdr in the
 	 *    MRU state and discoverable.  When the txg syncs it sees the
 	 *    block was overridden in open context and issues an override
 	 *    I/O; for a dedup block that I/O checks the DDT and, if the
 	 *    block is already present, substitutes the DDT's bp for io_bp.
 	 *    The done callback, dbuf_write_override_done, then finds that
 	 *    io_bp and io_bp_override differ and frees the override bp,
 	 *    which is how we get here while a reference is still held.
 	 *
 	 * Only an L2-only hdr, or one that is neither doing I/O nor
 	 * referenced, is destroyed.
 	 */
 	if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
 	    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
 		arc_change_state(arc_anon, hdr, hash_lock);
 		arc_hdr_destroy(hdr);
 	}
 
 	/* The hash lock is released whether or not the hdr was destroyed. */
 	mutex_exit(hash_lock);
 }
 
 /*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * It would be nice to assert that if its DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	mutex_enter(&buf->b_evict_lock);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * We don't grab the hash lock prior to this check, because if
 	 * the buffer's header is in the arc_anon state, it won't be
 	 * linked into the hash table.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		mutex_exit(&buf->b_evict_lock);
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		ASSERT(!HDR_HAS_L2HDR(hdr));
 		ASSERT(HDR_EMPTY(hdr));
 
 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
 		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
 		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		/*
 		 * If the buf is being overridden then it may already
 		 * have a hdr that is not empty.
 		 */
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/*
 	 * This assignment is only valid as long as the hash_lock is
 	 * held, we must be careful not to reference state or the
 	 * b_state field after dropping the lock.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
 
 	/* this buffer is not on any list */
 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 
 		/*
 		 * We have to recheck this conditional again now that
 		 * we're holding the l2ad_mtx to prevent a race with
 		 * another thread which might be concurrently calling
 		 * l2arc_evict(). In that case, l2arc_evict() might have
 		 * destroyed the header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			arc_hdr_l2hdr_destroy(hdr);
 
 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 	}
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_l1hdr.b_bufcnt > 1) {
 		arc_buf_hdr_t *nhdr;
 		uint64_t spa = hdr->b_spa;
 		uint64_t psize = HDR_GET_PSIZE(hdr);
 		uint64_t lsize = HDR_GET_LSIZE(hdr);
 		boolean_t protected = HDR_PROTECTED(hdr);
 		enum zio_compress compress = arc_hdr_get_compress(hdr);
 		arc_buf_contents_t type = arc_buf_type(hdr);
 		VERIFY3U(hdr->b_type, ==, type);
 
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		(void) remove_reference(hdr, hash_lock, tag);
 
 		if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
 
 		/*
 		 * Pull the data off of this hdr and attach it to
 		 * a new anonymous hdr. Also find the last buffer
 		 * in the hdr's buffer list.
 		 */
 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 		ASSERT3P(lastbuf, !=, NULL);
 
 		/*
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
 		if (arc_buf_is_shared(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			VERIFY(!arc_buf_is_shared(lastbuf));
 
 			/*
 			 * First, sever the block sharing relationship between
 			 * buf and the arc_buf_hdr_t.
 			 */
 			arc_unshare_buf(hdr, buf);
 
 			/*
 			 * Now we need to recreate the hdr's b_pabd. Since we
 			 * have lastbuf handy, we try to share with it, but if
 			 * we can't then we allocate a new b_pabd and copy the
 			 * data from buf into it.
 			 */
 			if (arc_can_share(hdr, lastbuf)) {
 				arc_share_buf(hdr, lastbuf);
 			} else {
 				arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
 				    buf->b_data, psize);
 			}
 			VERIFY3P(lastbuf->b_data, !=, NULL);
 		} else if (HDR_SHARED_DATA(hdr)) {
 			/*
 			 * Uncompressed shared buffers are always at the end
 			 * of the list. Compressed buffers don't have the
 			 * same requirements. This makes it hard to
 			 * simply assert that the lastbuf is shared so
 			 * we rely on the hdr's compression flags to determine
 			 * if we have a compressed, shared buffer.
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 			ASSERT(!ARC_BUF_SHARED(buf));
 		}
 
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 		ASSERT3P(state, !=, arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_size,
 		    arc_buf_size(buf), buf);
 
 		if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			ASSERT3P(state, !=, arc_l2c_only);
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    arc_buf_size(buf), buf);
 		}
 
 		hdr->b_l1hdr.b_bufcnt -= 1;
 		if (ARC_BUF_ENCRYPTED(buf))
 			hdr->b_crypt_hdr.b_ebufcnt -= 1;
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		/* if this is the last uncompressed buf free the checksum */
 		if (!arc_hdr_has_uncompressed_buf(hdr))
 			arc_cksum_free(hdr);
 
 		mutex_exit(hash_lock);
 
 		/*
 		 * Allocate a new hdr. The new hdr will contain a b_pabd
 		 * buffer which will be freed in arc_write().
 		 */
 		nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
 		    compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr));
 		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(nhdr->b_l1hdr.b_bufcnt);
 		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
 		VERIFY3U(nhdr->b_type, ==, type);
 		ASSERT(!HDR_SHARED_DATA(nhdr));
 
 		nhdr->b_l1hdr.b_buf = buf;
 		nhdr->b_l1hdr.b_bufcnt = 1;
 		if (ARC_BUF_ENCRYPTED(buf))
 			nhdr->b_crypt_hdr.b_ebufcnt = 1;
 		nhdr->b_l1hdr.b_mru_hits = 0;
 		nhdr->b_l1hdr.b_mru_ghost_hits = 0;
 		nhdr->b_l1hdr.b_mfu_hits = 0;
 		nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		nhdr->b_l1hdr.b_l2_hits = 0;
 		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
 
 		mutex_exit(&buf->b_evict_lock);
 		(void) zfs_refcount_add_many(&arc_anon->arcs_size,
 		    arc_buf_size(buf), buf);
 	} else {
 		mutex_exit(&buf->b_evict_lock);
 		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
 		/* protected by hash lock, or hdr is on arc_anon */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
 		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		hdr->b_l1hdr.b_l2_hits = 0;
 		arc_change_state(arc_anon, hdr, hash_lock);
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		mutex_exit(hash_lock);
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 }
 
 /*
  * Report whether buf currently has data attached and its hdr is in the
  * arc_anon state.  The check is made while holding buf->b_evict_lock.
  * Returns non-zero if so, zero otherwise.
  */
 int
 arc_released(arc_buf_t *buf)
 {
 	int released = 0;
 
 	mutex_enter(&buf->b_evict_lock);
 	/* b_hdr is only examined when the buf still has data. */
 	if (buf->b_data != NULL &&
 	    buf->b_hdr->b_l1hdr.b_state == arc_anon)
 		released = 1;
 	mutex_exit(&buf->b_evict_lock);
 
 	return (released);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only helper: return the current reference count of buf's hdr,
  * sampled while holding buf->b_evict_lock.
  */
 int
 arc_referenced(arc_buf_t *buf)
 {
 	int nrefs;
 
 	mutex_enter(&buf->b_evict_lock);
 	nrefs = zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
 	mutex_exit(&buf->b_evict_lock);
 
 	return (nrefs);
 }
 #endif
 
 /*
  * Callback handed to zio_write() by arc_write().  It resets any state left
  * over from a previous invocation when the zio is being re-executed, invokes
  * the caller's awcb_ready callback, marks the hdr as having I/O in progress,
  * and then updates the hdr from the block pointer: encryption parameters for
  * protected blocks, physical size, compression algorithm and level.  Unless
  * the zio failed or the block has no physical size, the hdr is finally given
  * its data, either by copying into a newly allocated b_rabd/b_pabd or by
  * sharing buf's data buffer.
  */
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
 	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 
 	/*
 	 * If we're reexecuting this zio because the pool suspended, then
 	 * cleanup any state that was previously set the first time the
 	 * callback was invoked.
 	 */
 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			if (arc_buf_is_shared(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr));
 	ASSERT(!arc_buf_is_shared(buf));
 
 	callback->awcb_ready(zio, buf, callback->awcb_private);
 
 	/*
 	 * The in-progress flag can only already be set if this callback
 	 * has run before for this zio, i.e. on re-execution.
 	 */
 	if (HDR_IO_IN_PROGRESS(hdr))
 		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
 
 	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 
 	/*
 	 * The bp's protection state differs from the hdr's; continue with
 	 * the hdr returned by arc_hdr_realloc_crypt().
 	 */
 	if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
 		hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
 
 	if (BP_IS_PROTECTED(bp)) {
 		/* ZIL blocks are written through zio_rewrite */
 		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 		ASSERT(HDR_PROTECTED(hdr));
 
 		if (BP_SHOULD_BYTESWAP(bp)) {
 			if (BP_GET_LEVEL(bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 
 		/* Record the bp's type, objset, salt, IV and MAC in the hdr. */
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
 	}
 
 	/*
 	 * If this block was written for raw encryption but the zio layer
 	 * ended up only authenticating it, adjust the buffer flags now.
 	 */
 	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
 			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	}
 
 	/* this must be done after the buffer flags are adjusted */
 	arc_cksum_compute(buf);
 
 	/* Holes and embedded blocks are recorded as uncompressed. */
 	enum zio_compress compress;
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		compress = ZIO_COMPRESS_OFF;
 	} else {
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		compress = BP_GET_COMPRESS(bp);
 	}
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = zio->io_prop.zp_complevel;
 
 	/* Skip filling the hdr if the zio failed or psize is zero. */
 	if (zio->io_error != 0 || psize == 0)
 		goto out;
 
 	/*
 	 * Fill the hdr with data. If the buffer is encrypted we have no choice
 	 * but to copy the data into b_rabd. If the hdr is compressed, the data
 	 * we want is available from the zio, otherwise we can take it from
 	 * the buf.
 	 *
 	 * We might be able to share the buf's data with the hdr here. However,
 	 * doing so would cause the ARC to be full of linear ABDs if we write a
 	 * lot of shareable data. As a compromise, we check whether scattered
 	 * ABDs are allowed, and assume that if they are then the user wants
 	 * the ARC to be primarily filled with them regardless of the data being
 	 * written. Therefore, if they're allowed then we allocate one and copy
 	 * the data into it; otherwise, we share the data directly if we can.
 	 */
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT3U(psize, >, 0);
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
 		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 	} else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
 		/*
 		 * Ideally, we would always copy the io_abd into b_pabd, but the
 		 * user may have disabled compressed ARC, thus we must check the
 		 * hdr's compression setting rather than the io_bp's.
 		 */
 		if (BP_IS_ENCRYPTED(bp)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr,
 			    ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
 			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 		    !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
 			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
 		} else {
 			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
 			arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
 			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
 			    arc_buf_size(buf));
 		}
 	} else {
 		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
 
 		arc_share_buf(hdr, buf);
 	}
 
 out:
 	arc_hdr_verify(hdr, bp);
 	/* Hand back the cookie obtained from spl_fstrans_mark() on entry. */
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Forward the zio to the caller's awcb_children_ready callback.
  * arc_write() passes this function to zio_write() only when the caller
  * supplied such a callback, so it is invoked unconditionally.
  */
 static void
 arc_write_children_ready(zio_t *zio)
 {
 	arc_write_callback_t *cb = zio->io_private;
 
 	cb->awcb_children_ready(zio, cb->awcb_buf, cb->awcb_private);
 }
 
 /*
  * Invoked by the SPA once for every physical write issued on behalf of a
  * logical write; the comment in dbuf_write_physdone() has the details.
  * The caller's awcb_physdone callback is optional and is skipped when it
  * was not supplied.
  */
 static void
 arc_write_physdone(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 
 	if (callback->awcb_physdone == NULL)
 		return;
 
 	callback->awcb_physdone(zio, buf, callback->awcb_private);
 }
 
 /*
  * Callback handed to zio_write() by arc_write().  On success the hdr takes
  * its DVA and physical birth from the bp just written, or has
  * buf_discard_identity() applied for holes and embedded blocks.  A hdr that
  * is not HDR_EMPTY afterwards is inserted with buf_hash_insert(), handling
  * the case where an entry for the same block already exists.  Finally the
  * caller's awcb_done callback is invoked, zio->io_abd is freed and the
  * arc_write_callback_t allocated by arc_write() is released.
  */
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 
 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
 		}
 	} else {
 		/* A failed write is expected to leave the hdr empty. */
 		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero or compressed enough to be
 	 * embedded in the BP, no write was performed so there will be no
 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
 	 * (and uncached).
 	 */
 	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT3U(zio->io_error, ==, 0);
 
 		arc_cksum_verify(buf);
 
 		/*
 		 * hash_lock is set by buf_hash_insert() and is released
 		 * with mutex_exit() once the hdr has been processed.
 		 */
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(zfs_refcount_is_zero(
 				    &exists->b_l1hdr.b_refcnt));
 				/*
 				 * Destroy the stale hdr and retry the
 				 * insert, which must now succeed.
 				 */
 				arc_change_state(arc_anon, exists, hash_lock);
 				arc_hdr_destroy(exists);
 				mutex_exit(hash_lock);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		/* if it's not anon, we are doing a scrub */
 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	}
 
 	ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	abd_free(zio->io_abd);
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 /*
  * Build and return the zio_write() zio that writes buf to bp.
  *
  * ready and done must be non-NULL; children_ready and physdone may be NULL.
  * They are stored, together with private and buf, in an arc_write_callback_t
  * that is passed to the zio as its private data and freed by
  * arc_write_done().  If l2arc is set the hdr is flagged ARC_FLAG_L2CACHE.
  *
  * The caller's zio_prop_t is copied so it can be adjusted: an encrypted buf
  * is written with ZIO_FLAG_RAW using the compression and encryption
  * parameters recorded in the hdr, and a compressed (unencrypted) buf is
  * written with ZIO_FLAG_RAW_COMPRESS using the hdr's compression settings.
  *
  * Any b_pabd or b_rabd still attached to the hdr is released before the zio
  * is created.
  */
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
     arc_write_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 	/* Private copy so the caller's properties are never modified. */
 	zio_prop_t localprop = *zp;
 
 	ASSERT3P(ready, !=, NULL);
 	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
 	if (l2arc)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		localprop.zp_encrypt = B_TRUE;
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		/*
 		 * DMU_BSWAP_NUMFUNCS in b_byteswap selects host byte order;
 		 * any other value selects the opposite order.
 		 */
 		localprop.zp_byteorder =
 		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
 		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
 		bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
 		    ZIO_DATA_SALT_LEN);
 		bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
 		    ZIO_DATA_IV_LEN);
 		bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
 		    ZIO_DATA_MAC_LEN);
 		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
 			localprop.zp_nopwrite = B_FALSE;
 			localprop.zp_copies =
 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
 		}
 		zio_flags |= ZIO_FLAG_RAW;
 	} else if (ARC_BUF_COMPRESSED(buf)) {
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 	/* Released with kmem_free() in arc_write_done(). */
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_children_ready = children_ready;
 	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	/*
 	 * The hdr's b_pabd is now stale, free it now. A new data block
 	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		/*
 		 * If the buf is currently sharing the data block with
 		 * the hdr then we need to break that relationship here.
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
 		if (arc_buf_is_shared(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
 	}
 
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	/* Only raw writes keep the hdr's current compression setting. */
 	if (!(zio_flags & ZIO_FLAG_RAW))
 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 
 	ASSERT(!arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 	/*
 	 * The children-ready trampoline is only installed when the caller
 	 * supplied a children_ready callback.
 	 */
 	zio = zio_write(pio, spa, txg, bp,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_physdone, arc_write_done, callback,
 	    priority, zio_flags, zb);
 
 	return (zio);
 }
 
 /*
  * Give back `reserve` bytes previously added to arc_tempreserve by
  * arc_tempreserve_space().
  */
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	/* Adding the negated amount subtracts it from the counter. */
 	atomic_add_64(&arc_tempreserve, (-reserve));
 
 	/* The counter must never drop below zero. */
 	ASSERT(((int64_t)arc_tempreserve) >= 0);
 }
 
 /*
  * Try to reserve `reserve` bytes of ARC space for dirty data in `txg`.
  *
  * Returns 0 after adding `reserve` to arc_tempreserve on success.  Returns
  * ERESTART when the reservation exceeds the ARC target size or when the
  * dirty-data throttle applies, and otherwise passes through any non-zero
  * error from arc_memory_throttle().
  */
 int
 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
 {
 	/* A large enough reservation may raise arc_c, capped at arc_c_max. */
 	if (!arc_no_grow && reserve > arc_c / 4 &&
 	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) {
 		arc_c = MIN(arc_c_max, reserve * 4);
 	}
 
 	/*
 	 * Throttle if the memory footprint calculated for the TXG is
 	 * larger than the target ARC size.
 	 */
 	if (reserve > arc_c) {
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/* arc_loaned_bytes must not have wrapped around. */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 
 	/*
 	 * Loaned bufs are not counted as in-flight dirty data, so that long
 	 * network delays cannot block transactions which are ready to be
 	 * assigned to a txg.
 	 */
 	uint64_t anon_size = MAX((int64_t)(zfs_refcount_count(
 	    &arc_anon->arcs_size) - arc_loaned_bytes), 0);
 
 	/*
 	 * Writes almost always need additional memory allocations to
 	 * compress/encrypt/etc the data, so make sure enough memory is
 	 * available for that.
 	 */
 	int err = arc_memory_throttle(spa, reserve, txg);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Throttle writes once the amount of dirty data in the cache grows
 	 * too large.  The aim is to keep the cache less than half full of
 	 * dirty blocks so that sync times stay bounded.
 	 *
 	 * When one pool is built on top of another, the lower (backing)
 	 * pool must not be throttled while the upper pool is the main
 	 * contributor of dirty data.  To ensure forward progress, this
 	 * pool's own net dirty data is checked as well and throttling only
 	 * happens if it exceeds zfs_arc_pool_dirty_percent of the anonymous
 	 * dirty data in the cache.
 	 *
 	 * Note: two concurrent requests may both be admitted even though
 	 * one of them should have failed.  Not a huge deal.
 	 */
 	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
 	uint64_t spa_dirty_anon = spa_dirty_data(spa);
 	uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
 	boolean_t throttle =
 	    total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
 	    anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
 	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100;
 
 	if (!throttle) {
 		atomic_add_64(&arc_tempreserve, reserve);
 		return (0);
 	}
 
 #ifdef ZFS_DEBUG
 	uint64_t meta_esize = zfs_refcount_count(
 	    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	uint64_t data_esize =
 	    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 	    "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
 	    (u_longlong_t)arc_tempreserve >> 10,
 	    (u_longlong_t)meta_esize >> 10,
 	    (u_longlong_t)data_esize >> 10,
 	    (u_longlong_t)reserve >> 10,
 	    (u_longlong_t)rarc_c >> 10);
 #endif
 	DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
 	return (SET_ERROR(ERESTART));
 }
 
 /*
  * Copy the reference counts of one ARC state into three named kstats:
  * the state's total size and its evictable data and metadata sizes.
  */
 static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
 	uint64_t total = zfs_refcount_count(&state->arcs_size);
 	uint64_t data = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
 	uint64_t metadata =
 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
 
 	size->value.ui64 = total;
 	evict_data->value.ui64 = data;
 	evict_metadata->value.ui64 = metadata;
 }
 
 /*
  * Refresh the arc_stats_t stored in ksp->ks_data.  Writes are rejected
  * with EACCES.  On a read, every field is reloaded from its counter in
  * arc_sums, from the per-state reference counts, or from the
  * arc_*_memory() helpers.  Returns 0 on success.
  */
 static int
 arc_kstat_update(kstat_t *ksp, int rw)
 {
 	arc_stats_t *as = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 /* Load as->field from the same-named wmsum / aggsum counter in arc_sums. */
 #define	KSTAT_SET_WMSUM(field)						\
 	(as->field.value.ui64 = wmsum_value(&arc_sums.field))
 #define	KSTAT_SET_AGGSUM(field)						\
 	(as->field.value.ui64 = aggsum_value(&arc_sums.field))
 
 	KSTAT_SET_WMSUM(arcstat_hits);
 	KSTAT_SET_WMSUM(arcstat_misses);
 	KSTAT_SET_WMSUM(arcstat_demand_data_hits);
 	KSTAT_SET_WMSUM(arcstat_demand_data_misses);
 	KSTAT_SET_WMSUM(arcstat_demand_metadata_hits);
 	KSTAT_SET_WMSUM(arcstat_demand_metadata_misses);
 	KSTAT_SET_WMSUM(arcstat_prefetch_data_hits);
 	KSTAT_SET_WMSUM(arcstat_prefetch_data_misses);
 	KSTAT_SET_WMSUM(arcstat_prefetch_metadata_hits);
 	KSTAT_SET_WMSUM(arcstat_prefetch_metadata_misses);
 	KSTAT_SET_WMSUM(arcstat_mru_hits);
 	KSTAT_SET_WMSUM(arcstat_mru_ghost_hits);
 	KSTAT_SET_WMSUM(arcstat_mfu_hits);
 	KSTAT_SET_WMSUM(arcstat_mfu_ghost_hits);
 	KSTAT_SET_WMSUM(arcstat_deleted);
 	KSTAT_SET_WMSUM(arcstat_mutex_miss);
 	KSTAT_SET_WMSUM(arcstat_access_skip);
 	KSTAT_SET_WMSUM(arcstat_evict_skip);
 	KSTAT_SET_WMSUM(arcstat_evict_not_enough);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_cached);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_eligible);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_eligible_mfu);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_eligible_mru);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_ineligible);
 	KSTAT_SET_WMSUM(arcstat_evict_l2_skip);
 	KSTAT_SET_WMSUM(arcstat_hash_collisions);
 	KSTAT_SET_WMSUM(arcstat_hash_chains);
 	KSTAT_SET_AGGSUM(arcstat_size);
 	KSTAT_SET_WMSUM(arcstat_compressed_size);
 	KSTAT_SET_WMSUM(arcstat_uncompressed_size);
 	KSTAT_SET_WMSUM(arcstat_overhead_size);
 	KSTAT_SET_WMSUM(arcstat_hdr_size);
 	KSTAT_SET_WMSUM(arcstat_data_size);
 	KSTAT_SET_WMSUM(arcstat_metadata_size);
 	KSTAT_SET_WMSUM(arcstat_dbuf_size);
 #if defined(COMPAT_FREEBSD11)
 	/* Legacy aggregate: bonus + dnode + dbuf sizes. */
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
 	    aggsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif
 
 	/* Per-state size and evictable data/metadata counts. */
 	arc_kstat_update_state(arc_anon, &as->arcstat_anon_size,
 	    &as->arcstat_anon_evictable_data,
 	    &as->arcstat_anon_evictable_metadata);
 	arc_kstat_update_state(arc_mru, &as->arcstat_mru_size,
 	    &as->arcstat_mru_evictable_data,
 	    &as->arcstat_mru_evictable_metadata);
 	arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size,
 	    &as->arcstat_mru_ghost_evictable_data,
 	    &as->arcstat_mru_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size,
 	    &as->arcstat_mfu_evictable_data,
 	    &as->arcstat_mfu_evictable_metadata);
 	arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size,
 	    &as->arcstat_mfu_ghost_evictable_data,
 	    &as->arcstat_mfu_ghost_evictable_metadata);
 
 	KSTAT_SET_AGGSUM(arcstat_dnode_size);
 	KSTAT_SET_WMSUM(arcstat_bonus_size);
 	KSTAT_SET_WMSUM(arcstat_l2_hits);
 	KSTAT_SET_WMSUM(arcstat_l2_misses);
 	KSTAT_SET_WMSUM(arcstat_l2_prefetch_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_mru_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_mfu_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_bufc_data_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_bufc_metadata_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_feeds);
 	KSTAT_SET_WMSUM(arcstat_l2_rw_clash);
 	KSTAT_SET_WMSUM(arcstat_l2_read_bytes);
 	KSTAT_SET_WMSUM(arcstat_l2_write_bytes);
 	KSTAT_SET_WMSUM(arcstat_l2_writes_sent);
 	KSTAT_SET_WMSUM(arcstat_l2_writes_done);
 	KSTAT_SET_WMSUM(arcstat_l2_writes_error);
 	KSTAT_SET_WMSUM(arcstat_l2_writes_lock_retry);
 	KSTAT_SET_WMSUM(arcstat_l2_evict_lock_retry);
 	KSTAT_SET_WMSUM(arcstat_l2_evict_reading);
 	KSTAT_SET_WMSUM(arcstat_l2_evict_l1cached);
 	KSTAT_SET_WMSUM(arcstat_l2_free_on_write);
 	KSTAT_SET_WMSUM(arcstat_l2_abort_lowmem);
 	KSTAT_SET_WMSUM(arcstat_l2_cksum_bad);
 	KSTAT_SET_WMSUM(arcstat_l2_io_error);
 	KSTAT_SET_WMSUM(arcstat_l2_lsize);
 	KSTAT_SET_WMSUM(arcstat_l2_psize);
 	KSTAT_SET_AGGSUM(arcstat_l2_hdr_size);
 	KSTAT_SET_WMSUM(arcstat_l2_log_blk_writes);
 	KSTAT_SET_WMSUM(arcstat_l2_log_blk_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_log_blk_count);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_success);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_abort_unsupported);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_abort_io_errors);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_abort_dh_errors);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_abort_cksum_lb_errors);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_abort_lowmem);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_size);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_asize);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_bufs);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_bufs_precached);
 	KSTAT_SET_WMSUM(arcstat_l2_rebuild_log_blks);
 	KSTAT_SET_WMSUM(arcstat_memory_throttle_count);
 	KSTAT_SET_WMSUM(arcstat_memory_direct_count);
 	KSTAT_SET_WMSUM(arcstat_memory_indirect_count);
 
 	/* Memory figures come from helper functions, not from arc_sums. */
 	as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory();
 	as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory();
 	/* Stored through the signed i64 member, unlike the other fields. */
 	as->arcstat_memory_available_bytes.value.i64 = arc_available_memory();
 
 	KSTAT_SET_WMSUM(arcstat_prune);
 	KSTAT_SET_AGGSUM(arcstat_meta_used);
 	KSTAT_SET_WMSUM(arcstat_async_upgrade_sync);
 	KSTAT_SET_WMSUM(arcstat_demand_hit_predictive_prefetch);
 	KSTAT_SET_WMSUM(arcstat_demand_hit_prescient_prefetch);
 	KSTAT_SET_WMSUM(arcstat_raw_size);
 	KSTAT_SET_WMSUM(arcstat_cached_only_in_progress);
 	KSTAT_SET_WMSUM(arcstat_abd_chunk_waste_size);
 
 #undef	KSTAT_SET_AGGSUM
 #undef	KSTAT_SET_WMSUM
 
 	return (0);
 }
 
 /*
  * Map an arc_buf_hdr_t to the index of the sublist it belongs on.
  *
  * The indices returned here *must* be spread evenly over all sublists of
  * the multilist.  The ARC eviction code depends on it: arc_evict_state()
  * assumes buffers are evenly distributed between the sublists when it
  * decides which sublist to evict from and how much to evict from it.
  */
 static unsigned int
 arc_state_multilist_index_func(multilist_t *ml, void *obj)
 {
 	arc_buf_hdr_t *hdr = obj;
 	unsigned int hash;
 
 	/*
 	 * The even distribution comes from hashing b_dva with buf_hash
 	 * below, so as a precaution make sure no empty buffer is ever
 	 * placed on the arc lists.
 	 */
 	ASSERT(!HDR_EMPTY(hdr));
 
 	/*
 	 * The hash of a given arc_buf_hdr_t is assumed to stay constant
 	 * for its whole lifetime (i.e. b_spa, b_dva and b_birth do not
 	 * change).  The sublist index therefore need not be stored on
 	 * insertion; it can simply be recomputed on removal.
 	 *
 	 * The low order bits of the hash are also thought to be evenly
 	 * distributed.  If they were not, a multilist with a power of two
 	 * number of sublists would see uneven sublist usage.  A full 64bit
 	 * division would be a waste of time in this context, so the hash
 	 * is limited to 32 bits first.
 	 */
 	hash = (unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
 	return (hash % multilist_get_num_sublists(ml));
 }
 
 /*
  * Emit a CE_WARN message when do_warn is set and a non-zero tunable differs
  * from the value that is actually in effect.  The message names the tunable
  * (stringified from the macro argument) and prints the effective value
  * with %llu.
  */
 #define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
 	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
 		cmn_err(CE_WARN,				\
 		    "ignoring tunable %s (using %llu instead)",	\
-		    (#tuning), (value));			\
+		    (#tuning), (u_longlong_t)(value));	\
 	}							\
 } while (0)
 
 /*
  * Called during module initialization and periodically thereafter to
  * apply reasonable changes to the exposed performance tunings.  Can also be
  * called explicitly by param_set_arc_*() functions when ARC tunables are
  * updated manually.  Non-zero zfs_* values which differ from the currently set
  * values will be applied.
  *
  * Each range-checked tunable is followed by WARN_IF_TUNING_IGNORED(), which
  * receives "verbose" as its do_warn argument: with verbose set, a tunable
  * that is non-zero yet differs from the value left in effect is reported
  * through cmn_err(CE_WARN).
  */
 void
 arc_tuning_update(boolean_t verbose)
 {
 	uint64_t allmem = arc_all_memory();
 	unsigned long limit;
 
 	/* Valid range: 32M - <arc_c_max> */
 	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
 	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
 	    (zfs_arc_min <= arc_c_max)) {
 		arc_c_min = zfs_arc_min;
 		/* Keep the current target at or above the new minimum. */
 		arc_c = MAX(arc_c, arc_c_min);
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
 
 	/*
 	 * Valid range: 64M - <all physical memory>
 	 * Both ends of the upper checks are strict: the value must be
 	 * below allmem and above arc_c_min.
 	 */
 	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
 	    (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
 	    (zfs_arc_max > arc_c_min)) {
 		arc_c_max = zfs_arc_max;
 		/*
 		 * Pull the current target under the new cap, reset arc_p
 		 * to half of the target, and clamp the dependent limits.
 		 */
 		arc_c = MIN(arc_c, arc_c_max);
 		arc_p = (arc_c >> 1);
 		if (arc_meta_limit > arc_c_max)
 			arc_meta_limit = arc_c_max;
 		if (arc_dnode_size_limit > arc_meta_limit)
 			arc_dnode_size_limit = arc_meta_limit;
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
 
 	/* Valid range: 16M - <arc_c_max> */
 	if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
 	    (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
 	    (zfs_arc_meta_min <= arc_c_max)) {
 		arc_meta_min = zfs_arc_meta_min;
 		/* Raise the dependent limits so they are not below it. */
 		if (arc_meta_limit < arc_meta_min)
 			arc_meta_limit = arc_meta_min;
 		if (arc_dnode_size_limit < arc_meta_min)
 			arc_dnode_size_limit = arc_meta_min;
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
 
 	/*
 	 * Valid range: <arc_meta_min> - <arc_c_max>
 	 * When zfs_arc_meta_limit is zero the candidate is derived from
 	 * zfs_arc_meta_limit_percent (capped at 100) of arc_c_max.
 	 */
 	limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
 	    MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
 	if ((limit != arc_meta_limit) &&
 	    (limit >= arc_meta_min) &&
 	    (limit <= arc_c_max))
 		arc_meta_limit = limit;
 	WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
 
 	/*
 	 * Valid range: <arc_meta_min> - <arc_meta_limit>
 	 * When zfs_arc_dnode_limit is zero the candidate is derived from
 	 * zfs_arc_dnode_limit_percent (capped at 100) of arc_meta_limit.
 	 */
 	limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
 	    MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
 	if ((limit != arc_dnode_size_limit) &&
 	    (limit >= arc_meta_min) &&
 	    (limit <= arc_meta_limit))
 		arc_dnode_size_limit = limit;
 	WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
 	    verbose);
 
 	/*
 	 * The tunables from here down to the prefetch lifetimes are copied
 	 * whenever they are non-zero; this function performs no further
 	 * range validation on them and emits no warning.
 	 */
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_grow_retry)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_shrink_shift) {
 		arc_shrink_shift = zfs_arc_shrink_shift;
 		/* Keep arc_no_grow_shift below the new shrink shift. */
 		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
 	}
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_p_min_shift)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prefetch_ms)
 		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prescient_prefetch_ms) {
 		arc_min_prescient_prefetch_ms =
 		    zfs_arc_min_prescient_prefetch_ms;
 	}
 
 	/*
 	 * Valid range: 0 - 100
 	 * Zero is accepted here, unlike the tunables above where zero
 	 * means "not set".
 	 */
 	if ((zfs_arc_lotsfree_percent >= 0) &&
 	    (zfs_arc_lotsfree_percent <= 100))
 		arc_lotsfree_percent = zfs_arc_lotsfree_percent;
 	WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
 	    verbose);
 
 	/*
 	 * Valid range: 0 - <all physical memory>
 	 * Out-of-range requests are clamped into [0, allmem] rather than
 	 * rejected.
 	 */
 	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
 		arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
 	WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
 }
 
 /*
  * Initialize the per-state bookkeeping of the ARC:
  *  - create the METADATA and DATA multilists of the mru, mru_ghost, mfu,
  *    mfu_ghost and l2c_only states (no multilist is created for arc_anon);
  *  - create the arcs_esize[] and arcs_size refcounts of all six states;
  *  - initialize every arc_sums counter to 0;
  *  - tag each state with its ARC_STATE_* identifier.
  * Everything created here is released again by arc_state_fini().
  */
 static void
 arc_state_init(void)
 {
 	/*
 	 * All multilists link arc_buf_hdr_t objects through
 	 * b_l1hdr.b_arc_node and share arc_state_multilist_index_func.
 	 */
 	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
 	    arc_state_multilist_index_func);
 
 	/* Per-type arcs_esize refcounts, for every state including anon. */
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 
 	/* Per-state arcs_size refcounts. */
 	zfs_refcount_create(&arc_anon->arcs_size);
 	zfs_refcount_create(&arc_mru->arcs_size);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size);
 	zfs_refcount_create(&arc_mfu->arcs_size);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size);
 	zfs_refcount_create(&arc_l2c_only->arcs_size);
 
 	/*
 	 * Statistics counters, all starting at 0.  Most are wmsum counters;
 	 * arcstat_size, arcstat_dnode_size, arcstat_l2_hdr_size and
 	 * arcstat_meta_used are aggsum counters instead.  Keep this list in
 	 * step with the matching *_fini() calls in arc_state_fini().
 	 */
 	wmsum_init(&arc_sums.arcstat_hits, 0);
 	wmsum_init(&arc_sums.arcstat_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_mru_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_deleted, 0);
 	wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
 	wmsum_init(&arc_sums.arcstat_access_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
 	wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
 	wmsum_init(&arc_sums.arcstat_hash_chains, 0);
 	aggsum_init(&arc_sums.arcstat_size, 0);
 	wmsum_init(&arc_sums.arcstat_compressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_overhead_size, 0);
 	wmsum_init(&arc_sums.arcstat_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
 	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
 	wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
 	wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
 	wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
 	wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_psize, 0);
 	aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
 	wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
 	wmsum_init(&arc_sums.arcstat_prune, 0);
 	aggsum_init(&arc_sums.arcstat_meta_used, 0);
 	wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_raw_size, 0);
 	wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
 	wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
 
 	/* Record which state each arc_state_t object represents. */
 	arc_anon->arcs_state = ARC_STATE_ANON;
 	arc_mru->arcs_state = ARC_STATE_MRU;
 	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
 	arc_mfu->arcs_state = ARC_STATE_MFU;
 	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
 	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 }
 
 /*
  * Release the per-state bookkeeping set up by arc_state_init(): destroy the
  * arcs_esize[] and arcs_size refcounts of all six states, destroy the
  * METADATA and DATA multilists of the mru, mru_ghost, mfu, mfu_ghost and
  * l2c_only states, and finalize every arc_sums counter.
  */
 static void
 arc_state_fini(void)
 {
 	/* Per-type arcs_esize refcounts. */
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 
 	/* Per-state arcs_size refcounts. */
 	zfs_refcount_destroy(&arc_anon->arcs_size);
 	zfs_refcount_destroy(&arc_mru->arcs_size);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
 	zfs_refcount_destroy(&arc_mfu->arcs_size);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size);
 
 	/* Multilists: arc_anon has none, so it does not appear here. */
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 
 	/*
 	 * Statistics counters.  Each counter is finalized with the API
 	 * family (wmsum or aggsum) it was initialized with in
 	 * arc_state_init(); keep the two lists in step.
 	 */
 	wmsum_fini(&arc_sums.arcstat_hits);
 	wmsum_fini(&arc_sums.arcstat_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_data_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_mru_hits);
 	wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_deleted);
 	wmsum_fini(&arc_sums.arcstat_mutex_miss);
 	wmsum_fini(&arc_sums.arcstat_access_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_not_enough);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
 	wmsum_fini(&arc_sums.arcstat_hash_collisions);
 	wmsum_fini(&arc_sums.arcstat_hash_chains);
 	aggsum_fini(&arc_sums.arcstat_size);
 	wmsum_fini(&arc_sums.arcstat_compressed_size);
 	wmsum_fini(&arc_sums.arcstat_uncompressed_size);
 	wmsum_fini(&arc_sums.arcstat_overhead_size);
 	wmsum_fini(&arc_sums.arcstat_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
 	aggsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
 	wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_feeds);
 	wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
 	wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_done);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_error);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
 	wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
 	wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
 	wmsum_fini(&arc_sums.arcstat_l2_io_error);
 	wmsum_fini(&arc_sums.arcstat_l2_lsize);
 	wmsum_fini(&arc_sums.arcstat_l2_psize);
 	aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
 	wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
 	wmsum_fini(&arc_sums.arcstat_memory_direct_count);
 	wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
 	wmsum_fini(&arc_sums.arcstat_prune);
 	aggsum_fini(&arc_sums.arcstat_meta_used);
 	wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_raw_size);
 	wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
 	wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
 }
 
 /*
  * Return the current value of arc_c.
  */
 uint64_t
 arc_target_bytes(void)
 {
 	uint64_t target = arc_c;
 
 	return (target);
 }
 
 /*
  * Derive the default arc_c_min and arc_c_max from the amount of memory
  * passed in allmem.
  */
 void
 arc_set_limits(uint64_t allmem)
 {
 	const uint64_t floor_bytes = 2ULL << SPA_MAXBLOCKSHIFT;
 	uint64_t min_bytes = allmem / 32;
 
 	/* The minimum is 1/32 of allmem, but never less than the 32MB floor. */
 	if (min_bytes < floor_bytes)
 		min_bytes = floor_bytes;
 	arc_c_min = min_bytes;
 
 	/* The default maximum is chosen by the platform-specific helper. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
 /*
  * Bring up the ARC.  In order: create the eviction lock and waiter list,
  * set the default prefetch lifetimes, compute the default size limits,
  * apply user tunables, initialize the per-state bookkeeping and buf_init(),
  * create the prune list/mutex/taskq, install the "arcstats" kstat, create
  * the evict and reap zthrs, and finally fill in defaults for
  * zfs_dirty_data_max_max, zfs_dirty_data_max and zfs_wrlog_data_max when
  * they are still zero.
  */
 void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
 	    offsetof(arc_evict_waiter_t, aew_node));
 
 	/* Defaults; arc_tuning_update() below may override them. */
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
 
 #if defined(_KERNEL)
 	arc_lowmem_init();
 #endif
 
 	arc_set_limits(allmem);
 
 #ifndef _KERNEL
 	/*
 	 * In userland, there's only the memory pressure that we artificially
 	 * create (see arc_available_memory()).  Don't let arc_c get too
 	 * small, because it can cause transactions to be larger than
 	 * arc_c, causing arc_tempreserve_space() to fail.
 	 */
 	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
 #endif
 
 	/* Start the target at the minimum, with arc_p at half of it. */
 	arc_c = arc_c_min;
 	arc_p = (arc_c >> 1);
 
 	/*
 	 * Set arc_meta_min to 1ULL << SPA_MAXBLOCKSHIFT, i.e. half of the
 	 * 2ULL << SPA_MAXBLOCKSHIFT floor that is applied to arc_c_min.
 	 */
 	arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
 	/*
 	 * Set arc_meta_limit to a percent of arc_c_max with a floor of
 	 * arc_meta_min, and a ceiling of arc_c_max.
 	 */
 	percent = MIN(zfs_arc_meta_limit_percent, 100);
 	arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
 	/* The dnode limit is a percent (capped at 100) of arc_meta_limit. */
 	percent = MIN(zfs_arc_dnode_limit_percent, 100);
 	arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
 
 	/* Apply user specified tunings */
 	arc_tuning_update(B_TRUE);
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	arc_register_hotplug();
 
 	arc_state_init();
 
 	buf_init();
 
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
 	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
 	    TASKQ_THREADS_CPU_PCT);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	/* A failed kstat_create() is tolerated; the kstat is just absent. */
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	arc_evict_zthr = zthr_create("arc_evict",
 	    arc_evict_cb_check, arc_evict_cb, NULL);
 	arc_reap_zthr = zthr_create_timer("arc_reap",
 	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
 
 	arc_warm = B_FALSE;
 
 	/*
 	 * Calculate maximum amount of dirty data per pool.
 	 *
 	 * If it has been set by a module parameter, take that.
 	 * Otherwise, use a percentage of physical memory defined by
 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
 	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
 	 * On builds without __LP64__ the absolute cap used below is 1G
 	 * rather than 4G.
 	 */
 #ifdef __LP64__
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #else
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #endif
 
 	if (zfs_dirty_data_max == 0) {
 		zfs_dirty_data_max = allmem *
 		    zfs_dirty_data_max_percent / 100;
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
 
 	if (zfs_wrlog_data_max == 0) {
 
 		/*
 		 * dp_wrlog_total is reduced for each txg at the end of
 		 * spa_sync(). However, dp_dirty_total is reduced every time
 		 * a block is written out. Thus under normal operation,
 		 * dp_wrlog_total could grow 2 times as big as
 		 * zfs_dirty_data_max.
 		 */
 		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
 	}
 }
 
 /*
  * Tear down the ARC.  The order of the steps below matters: buffers are
  * flushed first, the zthrs are cancelled before the eviction lock and
  * waiter list are destroyed, buf_fini() runs before arc_state_fini(), and
  * the zthrs themselves are destroyed last.
  */
 void
 arc_fini(void)
 {
 	arc_prune_t *p;
 
 #ifdef _KERNEL
 	arc_lowmem_fini();
 #endif /* _KERNEL */
 
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	/* Let queued prune tasks finish before the taskq goes away. */
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
 	/*
 	 * Drain the prune list: drop the reference held under the
 	 * &arc_prune_list tag, then free each entry.
 	 */
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_head(&arc_prune_list)) != NULL) {
 		list_remove(&arc_prune_list, p);
 		zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
 		zfs_refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
 
 	/* Cancel now; the zthrs are only destroyed at the very end. */
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * Free any buffers that were tagged for destruction.  This needs
 	 * to occur before arc_state_fini() runs and destroys the aggsum
 	 * values which are updated when freeing scatter ABDs.
 	 */
 	l2arc_do_free_on_write();
 
 	/*
 	 * buf_fini() must precede arc_state_fini() because buf_fini() may
 	 * trigger the release of kmem magazines, which can callback to
 	 * arc_space_return() which accesses aggsums freed in arc_state_fini().
 	 */
 	buf_fini();
 	arc_state_fini();
 
 	arc_unregister_hotplug();
 
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
 	 * wakeup() signals after they are destroyed.
 	 */
 	zthr_destroy(arc_evict_zthr);
 	zthr_destroy(arc_reap_zthr);
 
 	ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate for this there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there. It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction. If a compressible buffer is
  * found during scanning and selected for writing to an L2ARC device, we
  * temporarily boost scanning headroom during the next scan cycle to make
  * sure we adapt to compression effects (which might significantly reduce
  * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from tail of these
  * lists as pictured, the l2arc_feed_thread() will search from the list heads
  * for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
  *				scanning, we multiply headroom by this
  *				percentage factor for the next scan cycle,
  *				since more compressed buffers are likely to
  *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  *
  * L2ARC persistence:
  *
  * When writing buffers to L2ARC, we periodically add some metadata to
  * make sure we can pick them up after reboot, thus dramatically reducing
  * the impact that any downtime has on the performance of storage systems
  * with large caches.
  *
  * The implementation works fairly simply by integrating the following two
  * modifications:
  *
  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
  *    which is an additional piece of metadata which describes what's been
  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
  *    time-wise and offset-wise interleaved, but that is an optimization rather
  *    than for correctness. The log block also includes a pointer to the
  *    previous block in its chain.
  *
  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
  *    for our header bookkeeping purposes. This contains a device header,
  *    which contains our top-level reference structures. We update it each
  *    time we write a new log block, so that we're able to locate it in the
  *    L2ARC device. If this write results in an inconsistent device header
  *    (e.g. due to power failure), we detect this by verifying the header's
  *    checksum and simply fail to reconstruct the L2ARC after reboot.
  *
  * Implementation diagram:
  *
  * +=== L2ARC device (not to scale) ======================================+
  * |       ___two newest log block pointers__.__________                  |
  * |      /                                   \dh_start_lbps[1]           |
  * |	 /				       \         \dh_start_lbps[0]|
  * |.___/__.                                    V         V               |
  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
  * ||   hdr|      ^         /^       /^        /         /                |
  * |+------+  ...--\-------/  \-----/--\------/         /                 |
  * |                \--------------/    \--------------/                  |
  * +======================================================================+
  *
  * As can be seen on the diagram, rather than using a simple linked list,
  * we use a pair of linked lists with alternating elements. This is a
  * performance enhancement due to the fact that we only find out the
  * address of the next log block access once the current block has been
  * completely read in. Obviously, this hurts performance, because we'd be
  * keeping the device's I/O queue only 1 operation deep, thus
  * incurring a large amount of I/O round-trip latency. Having two lists
  * allows us to fetch two log blocks ahead of where we are currently
  * rebuilding L2ARC buffers.
  *
  * On-device data structures:
  *
  * L2ARC device header:	l2arc_dev_hdr_phys_t
  * L2ARC log block:	l2arc_log_blk_phys_t
  *
  * L2ARC reconstruction:
  *
  * When writing data, we simply write in the standard rotary fashion,
  * evicting buffers as we go and simply writing new data over them (writing
  * a new log block every now and then). This obviously means that once we
  * loop around the end of the device, we will start cutting into an already
  * committed log block (and its referenced data buffers), like so:
  *
  *    current write head__       __old tail
  *                        \     /
  *                        V    V
  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
  *                         ^    ^^^^^^^^^___________________________________
  *                         |                                                \
  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
  *
  * When importing the pool, we detect this situation and use it to stop
  * our scanning process (see l2arc_rebuild).
  *
  * There is one significant caveat to consider when rebuilding ARC contents
  * from an L2ARC device: what about invalidated buffers? Given the above
  * construction, we cannot update blocks which we've already written to amend
  * them to remove buffers which were invalidated. Thus, during reconstruction,
  * we might be populating the cache with buffers for data that's not on the
  * main pool anymore, or may have been overwritten!
  *
  * As it turns out, this isn't a problem. Every arc_read request includes
  * both the DVA and, crucially, the birth TXG of the BP the caller is
  * looking for. So even if the cache were populated by completely rotten
  * blocks for data that had been long deleted and/or overwritten, we'll
  * never actually return bad data from the cache, since the DVA with the
  * birth TXG uniquely identify a block in space and time - once created,
  * a block is immutable on disk. The worst thing we have done is wasted
  * some time and memory at l2arc rebuild to reconstruct outdated ARC
  * entries that will get dropped from the l2arc as it is being updated
  * with new blocks.
  *
  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
  * hand are not restored. This is done by saving the offset (in bytes)
  * l2arc_evict() has evicted to in the L2ARC device header and taking it
  * into account when restoring buffers.
  */
 
 /*
  * Decide whether the given header may be written to the L2ARC of the pool
  * identified by spa_guid.  Returns B_TRUE if it may, B_FALSE otherwise.
  */
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/* The buffer belongs to a different spa. */
 	if (hdr->b_spa != spa_guid)
 		return (B_FALSE);
 
 	/* The buffer is already cached on the L2ARC. */
 	if (HDR_HAS_L2HDR(hdr))
 		return (B_FALSE);
 
 	/* An I/O is in progress (it may be an incomplete read). */
 	if (HDR_IO_IN_PROGRESS(hdr))
 		return (B_FALSE);
 
 	/* Eligible unless flagged not eligible (zfs property). */
 	return (HDR_L2CACHE(hdr) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Return the number of bytes to write to the given cache device in one
  * pass: l2arc_write_max, plus l2arc_write_boost while the ARC is not yet
  * warm.  Tunables holding unusable values are reset to L2ARC_WRITE_SIZE
  * (with a console note) before the size is computed.
  */
 static uint64_t
 l2arc_write_size(l2arc_dev_t *dev)
 {
 	uint64_t size, dev_size, tsize;
 
 	/*
 	 * Make sure our globals have meaningful values in case the user
 	 * altered them.
 	 */
 	size = l2arc_write_max;
 	if (size == 0) {
 		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
 		    "be greater than zero, resetting it to the default (%d)",
 		    L2ARC_WRITE_SIZE);
 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
 	}
 
 	if (arc_warm == B_FALSE)
 		size += l2arc_write_boost;
 
 	/*
 	 * Make sure the write size does not exceed the size of the cache
 	 * device. This is important in l2arc_evict(), otherwise infinite
 	 * iteration can occur.
 	 */
 	dev_size = dev->l2ad_end - dev->l2ad_start;
 	tsize = size + l2arc_log_blk_overhead(size, dev);
 	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
 		tsize += MAX(64 * 1024 * 1024,
 		    (tsize * l2arc_trim_ahead) / 100);
 
 	if (tsize >= dev_size) {
 		/*
 		 * The 64-bit arguments are cast explicitly so that they
 		 * match the %llu conversions regardless of how uint64_t is
 		 * defined on the platform.
 		 */
 		cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
 		    "plus the overhead of log blocks (persistent L2ARC, "
 		    "%llu bytes) exceeds the size of the cache device "
 		    "(guid %llu), resetting them to the default (%d)",
 		    (unsigned long long)l2arc_log_blk_overhead(size, dev),
 		    (unsigned long long)dev->l2ad_vdev->vdev_guid,
 		    L2ARC_WRITE_SIZE);
 		size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
 
 		if (arc_warm == B_FALSE)
 			size += l2arc_write_boost;
 	}
 
 	return (size);
 }
 
 /*
  * Compute the lbolt value at which the next L2ARC write should happen.
  *
  * began  - lbolt value when the previous write pass started
  * wanted - number of bytes the previous pass aimed to write
  * wrote  - number of bytes the previous pass actually wrote
  */
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	clock_t delay, soonest, lbolt_now;
 
 	/*
 	 * A pass that wrote more than half of what it wanted means the ARC
 	 * lists are busy, so (if l2arc_feed_again is set) come back much
 	 * sooner.  Otherwise the lists are stale and we idle back to the
 	 * regular period.
 	 */
 	if (!l2arc_feed_again || wrote <= (wanted / 2))
 		delay = hz * l2arc_feed_secs;
 	else
 		delay = (hz * l2arc_feed_min_ms) / 1000;
 
 	lbolt_now = ddi_get_lbolt();
 	soonest = MIN(lbolt_now + delay, began + delay);
 
 	/* Never schedule the next write in the past. */
 	return (MAX(lbolt_now, soonest));
 }
 
 /*
  * Select the next cache device to write to, rotating through
  * l2arc_dev_list starting after the device picked last time.  This
  * rotation is how the L2ARC load balances.  Devices whose vdev is dead,
  * or which have l2ad_rebuild or l2ad_trim_all set, are passed over.
  *
  * Returns NULL when no usable device exists.  Otherwise the device is
  * returned with its spa config lock (SCL_L2ARC) held as reader.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *cand = NULL;
 
 	/*
 	 * Lock out the removal of spas (spa_namespace_lock), then removal
 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
 	 * both locks will be dropped and a spa config lock held instead.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* With no cache vdevs at all there is nothing to pick. */
 	if (l2arc_ndev != 0) {
 		l2arc_dev_t *start = NULL;
 
 		cand = l2arc_dev_last;
 		for (;;) {
 			/* Advance, wrapping around at the end of the list. */
 			if (cand != NULL)
 				cand = list_next(l2arc_dev_list, cand);
 			if (cand == NULL)
 				cand = list_head(l2arc_dev_list);
 
 			/* Stop once we have come back to where we began. */
 			if (start == NULL)
 				start = cand;
 			else if (cand == start)
 				break;
 
 			/* Stop at the first usable device. */
 			if (!vdev_is_dead(cand->l2ad_vdev) &&
 			    !cand->l2ad_rebuild && !cand->l2ad_trim_all)
 				break;
 		}
 
 		/* A full lap without a usable device yields NULL. */
 		if (vdev_is_dead(cand->l2ad_vdev) || cand->l2ad_rebuild ||
 		    cand->l2ad_trim_all)
 			cand = NULL;
 
 		l2arc_dev_last = cand;
 	}
 
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Take the config lock so that the chosen device cannot be
 	 * removed while we are writing to it.
 	 */
 	if (cand != NULL)
 		spa_config_enter(cand->l2ad_spa, SCL_L2ARC, cand, RW_READER);
 	mutex_exit(&spa_namespace_lock);
 
 	return (cand);
 }
 
 /*
  * Free buffers that were tagged for destruction.
  */
 static void
 l2arc_do_free_on_write(void)
 {
 	list_t *buflist;
 	l2arc_data_free_t *df, *df_prev;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	buflist = l2arc_free_on_write;
 
 	for (df = list_tail(buflist); df; df = df_prev) {
 		df_prev = list_prev(buflist, df);
 		ASSERT3P(df->l2df_abd, !=, NULL);
 		abd_free(df->l2df_abd);
 		list_remove(buflist, df);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
 
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  *
  * zio->io_private holds the l2arc_write_callback_t for this write; it
  * identifies the device and the write-head marker in the device's buffer
  * list.  On error the written headers and log blocks are dropped from the
  * L2ARC again and the log block pointers in the device header are
  * restored.  The callback structure and the head marker are freed here.
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t	*cb;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	l2arc_dev_t		*dev;
 	l2arc_dev_hdr_phys_t	*l2dhdr;
 	list_t			*buflist;
 	arc_buf_hdr_t		*head, *hdr, *hdr_prev;
 	kmutex_t		*hash_lock;
 	int64_t			bytes_dropped = 0;
 
 	cb = zio->io_private;
 	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
 	/* Check dev before it is dereferenced for the device header. */
 	ASSERT3P(dev, !=, NULL);
 	l2dhdr = dev->l2ad_dev_hdr;
 	head = cb->l2wcb_head;
 	ASSERT3P(head, !=, NULL);
 	buflist = &dev->l2ad_buflist;
 	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	/*
 	 * All writes completed, or an error was hit.
 	 */
 top:
 	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock. We must retry so we
 			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
 
 			/*
 			 * We don't want to rescan the headers we've
 			 * already marked as having been written out, so
 			 * we reinsert the head node so we can pick up
 			 * where we left off.
 			 */
 			list_remove(buflist, head);
 			list_insert_after(buflist, hdr, head);
 
 			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * We wait for the hash lock to become available
 			 * to try and prevent busy waiting, and increase
 			 * the chance we'll be able to acquire the lock
 			 * the next time around.
 			 */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		/*
 		 * We could not have been moved into the arc_l2c_only
 		 * state while in-flight due to our ARC_FLAG_L2_WRITING
 		 * bit being set. Let's just ensure that's being enforced.
 		 */
 		ASSERT(HDR_HAS_L1HDR(hdr));
 
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry: unlink the header from
 			 * the device's buffer list and undo its accounting.
 			 */
 			list_remove(buflist, hdr);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			l2arc_hdr_arcstats_decrement(hdr);
 
 			bytes_dropped +=
 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
 		 * Allow ARC to begin reads and ghost list evictions to
 		 * this L2ARC entry.
 		 */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
 
 	/*
 	 * Free the allocated abd buffers for writing the log blocks.
 	 * If the zio failed reclaim the allocated space and remove the
 	 * pointers to these log blocks from the log block pointer list
 	 * of the L2ARC device.
 	 */
 	while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
 		abd_free(abd_buf->abd);
 		zio_buf_free(abd_buf, sizeof (*abd_buf));
 		if (zio->io_error != 0) {
 			lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
 			/*
 			 * L2BLK_GET_PSIZE returns aligned size for log
 			 * blocks.
 			 */
 			uint64_t asize =
 			    L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
 			bytes_dropped += asize;
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 	list_destroy(&cb->l2wcb_abd_list);
 
 	if (zio->io_error != 0) {
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
 		/*
 		 * Restore the lbps array in the header to its previous state.
 		 * If the list of log block pointers is empty, zero out the
 		 * log block pointers in the device header.
 		 */
 		lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
 		for (int i = 0; i < 2; i++) {
 			if (lb_ptr_buf == NULL) {
 				/*
 				 * If the list is empty zero out the device
 				 * header. Otherwise zero out the second log
 				 * block pointer in the header.
 				 */
 				if (i == 0) {
 					bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
 				} else {
 					bzero(&l2dhdr->dh_start_lbps[i],
 					    sizeof (l2arc_log_blkptr_t));
 				}
 				break;
 			}
 			bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
 			    sizeof (l2arc_log_blkptr_t));
 			lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
 			    lb_ptr_buf);
 		}
 	}
 
 	/* The write-head marker is no longer needed; unlink and free it. */
 	ARCSTAT_BUMP(arcstat_l2_writes_done);
 	list_remove(buflist, head);
 	ASSERT(!HDR_HAS_L1HDR(head));
 	kmem_cache_free(hdr_l2only_cache, head);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/* Give back the space of everything that was dropped above. */
 	ASSERT(dev->l2ad_vdev != NULL);
 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * Undo the transforms (encryption and/or compression) of a block that was
  * read from the L2ARC.  Where a transform is undone, hdr->b_l1hdr.b_pabd
  * and zio->io_abd are replaced by a buffer holding the untransformed data.
  * The caller must hold the hash lock of the header.
  *
  * Returns 0 on success, or the error reported by the failing transform.
  */
 static int
 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
 {
 	int err;
 	spa_t *spa = zio->io_spa;
 	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
 	blkptr_t *bp = zio->io_bp;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/*
 	 * ZIL data is never written to the L2ARC, so its unique MAC
 	 * storage needs no special handling here.
 	 */
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * Decrypt first if the data was encrypted.  The bp, not the hdr,
 	 * has to be consulted because the hdr does not get its encryption
 	 * parameters updated until arc_read_done().
 	 */
 	if (BP_IS_ENCRYPTED(bp)) {
 		abd_t *plain = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    B_TRUE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		err = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
 		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 		    salt, iv, mac, HDR_GET_PSIZE(hdr), plain,
 		    hdr->b_l1hdr.b_pabd, &no_crypt);
 		if (err != 0) {
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 			return (err);
 		}
 
 		if (no_crypt) {
 			/* Nothing was decrypted; discard the spare buffer. */
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 		} else {
 			/* Swap the decrypted data in for b_pabd. */
 			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 			    arc_hdr_size(hdr), hdr);
 			hdr->b_l1hdr.b_pabd = plain;
 			zio->io_abd = plain;
 		}
 	}
 
 	/*
 	 * Decompression is only needed when the L2ARC block was compressed
 	 * but ARC compression is disabled for this header.
 	 */
 	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF ||
 	    HDR_COMPRESSION_ENABLED(hdr))
 		return (0);
 
 	/* Decompress into a new buffer and replace the existing data. */
 	abd_t *dabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
 	void *dbuf = abd_borrow_buf(dabd, arc_hdr_size(hdr));
 
 	err = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 	    hdr->b_l1hdr.b_pabd, dbuf, HDR_GET_PSIZE(hdr),
 	    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 
 	/* The borrowed buffer is handed back whether or not we succeeded. */
 	abd_return_buf_copy(dabd, dbuf, arc_hdr_size(hdr));
 	if (err != 0) {
 		arc_free_data_abd(hdr, dabd, arc_hdr_size(hdr), hdr);
 		return (err);
 	}
 
 	arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr);
 	hdr->b_l1hdr.b_pabd = dabd;
 	zio->io_abd = dabd;
 	zio->io_size = HDR_GET_LSIZE(hdr);
 
 	return (0);
 }
 
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  *
  * zio->io_private holds the l2arc_read_callback_t of the read, which is
  * freed before returning.  If the data is valid the zio is passed on to
  * arc_read_done(); otherwise the error is recorded in the zio and, when
  * nobody is waiting on the zio, the read is reissued to the primary
  * storage.
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	int tfm_error = 0;
 	l2arc_read_callback_t *cb = zio->io_private;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	boolean_t valid_cksum;
 	/* True when the raw (still encrypted) data was requested. */
 	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
 	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
 
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	/* The read is over, so release the SCL_L2ARC config lock. */
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	ASSERT3P(cb, !=, NULL);
 	hdr = cb->l2rcb_hdr;
 	ASSERT3P(hdr, !=, NULL);
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * If the data was read into a temporary buffer,
 	 * move it and free the buffer.
 	 */
 	if (cb->l2rcb_abd != NULL) {
 		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
 		if (zio->io_error == 0) {
 			if (using_rdata) {
 				abd_copy(hdr->b_crypt_hdr.b_rabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			} else {
 				abd_copy(hdr->b_l1hdr.b_pabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			}
 		}
 
 		/*
 		 * The following must be done regardless of whether
 		 * there was an error:
 		 * - free the temporary buffer
 		 * - point zio to the real ARC buffer
 		 * - set zio size accordingly
 		 * These are required because zio is either re-used for
 		 * an I/O of the block in the case of the error
 		 * or the zio is passed to arc_read_done() and it
 		 * needs real data.
 		 */
 		abd_free(cb->l2rcb_abd);
 		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
 
 		if (using_rdata) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			zio->io_abd = zio->io_orig_abd =
 			    hdr->b_crypt_hdr.b_rabd;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
 		}
 	}
 
 	ASSERT3P(zio->io_abd, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_prop.zp_complevel = hdr->b_complevel;
 
 	valid_cksum = arc_cksum_is_equal(hdr, zio);
 
 	/*
 	 * b_rabd will always match the data as it exists on disk if it is
 	 * being used. Therefore if we are reading into b_rabd we do not
 	 * attempt to untransform the data.
 	 */
 	if (valid_cksum && !using_rdata)
 		tfm_error = l2arc_untransform(zio, cb);
 
 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
 	    !HDR_L2_EVICTED(hdr)) {
 		/* Good data: hand the zio over to the regular ARC path. */
 		mutex_exit(hash_lock);
 		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!valid_cksum || tfm_error != 0)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 			/* Read into whichever buffer the L2ARC read used. */
 			void *abd = (using_rdata) ?
 			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio = zio_read(pio, zio->io_spa, zio->io_bp,
 			    abd, zio->io_size, arc_read_done,
 			    hdr, zio->io_priority, cb->l2rcb_flags,
 			    &cb->l2rcb_zb);
 
 			/*
 			 * Original ZIO will be freed, so we need to update
 			 * ARC header with the new ZIO pointer to be used
 			 * by zio_change_priority() in arc_read().
 			 */
 			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
 			    acb != NULL; acb = acb->acb_next)
 				acb->acb_zio_head = zio;
 
 			mutex_exit(hash_lock);
 			zio_nowait(zio);
 		} else {
 			mutex_exit(hash_lock);
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Map a feed pass number (0 .. L2ARC_FEED_TYPES - 1) to the ARC list the
  * L2ARC searches on that pass, and return one locked sublist of it.  The
  * order can have a significant effect on cache performance; currently
  * the metadata lists come first (MFU, then MRU), followed by the data
  * lists in the same order.
  *
  * Returns NULL for a pass number that has no list assigned.
  */
 static multilist_sublist_t *
 l2arc_sublist_lock(int list_num)
 {
 	multilist_t *feed_list;
 	unsigned int sublist;
 
 	ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
 
 	if (list_num == 0)
 		feed_list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 1)
 		feed_list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 2)
 		feed_list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 	else if (list_num == 3)
 		feed_list = &arc_mru->arcs_list[ARC_BUFC_DATA];
 	else
 		return (NULL);
 
 	/*
 	 * Any sublist will do, so pick one at random.  The caller feeds
 	 * only a little bit of data per call (8MB), and later calls will
 	 * end up selecting different sublists.
 	 */
 	sublist = multilist_get_random_index(feed_list);
 	return (multilist_sublist_lock(feed_list, sublist));
 }
 
 /*
  * Return the maximum space, in bytes, that L2ARC metadata log blocks can
  * take up for a write of write_sz bytes to the given device.  Both
  * l2arc_evict and l2arc_write_size add this overhead so that enough
  * headroom is available when writing buffers.  A device with
  * l2ad_log_entries == 0 has no overhead.
  */
 static inline uint64_t
 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 {
 	uint64_t nentries, nblocks, blk_asize;
 
 	if (dev->l2ad_log_entries == 0)
 		return (0);
 
 	/* Worst case: every buffer written is of the minimum block size. */
 	nentries = write_sz >> SPA_MINBLOCKSHIFT;
 
 	/* Log blocks needed to hold that many entries, rounded up. */
 	nblocks = (nentries + dev->l2ad_log_entries - 1) /
 	    dev->l2ad_log_entries;
 
 	blk_asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    sizeof (l2arc_log_blk_phys_t));
 
 	return (blk_asize * nblocks);
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes. This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  *
  * The distance is enlarged internally by the worst-case log block
  * overhead and, when trimming ahead is enabled, by the trim-ahead amount.
  * If the enlarged region does not fit before the end of the device, the
  * function evicts to the end, resets the hands to the device start and
  * runs once more from there.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
 	vdev_t *vd = dev->l2ad_vdev;
 	boolean_t rerun;
 
 	buflist = &dev->l2ad_buflist;
 
 	/*
 	 * We need to add in the worst case scenario of log block overhead.
 	 */
 	distance += l2arc_log_blk_overhead(distance, dev);
 	if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
 		/*
 		 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
 		 * times the write size, whichever is greater.
 		 */
 		distance += MAX(64 * 1024 * 1024,
 		    (distance * l2arc_trim_ahead) / 100);
 	}
 
 top:
 	rerun = B_FALSE;
 	if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
 		/*
 		 * When there is no space to accommodate upcoming writes,
 		 * evict to the end. Then bump the write and evict hands
 		 * to the start and iterate. This iteration does not
 		 * happen indefinitely as we make sure in
 		 * l2arc_write_size() that when the write hand is reset,
 		 * the write size does not exceed the end of the device.
 		 */
 		rerun = B_TRUE;
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 	if (!all) {
 		/*
 		 * This check has to be placed after deciding whether to
 		 * iterate (rerun).
 		 */
 		if (dev->l2ad_first) {
 			/*
 			 * This is the first sweep through the device. There is
 			 * nothing to evict. We have already trimmed the
 			 * whole device.
 			 */
 			goto out;
 		} else {
 			/*
 			 * Trim the space to be evicted.
 			 */
 			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
 			    l2arc_trim_ahead > 0) {
 				/*
 				 * We have to drop the spa_config lock because
 				 * vdev_trim_range() will acquire it.
 				 * l2ad_evict already accounts for the label
 				 * size. To prevent vdev_trim_ranges() from
 				 * adding it again, we subtract it from
 				 * l2ad_evict.
 				 */
 				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 				vdev_trim_simple(vd,
 				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
 				    taddr - dev->l2ad_evict);
 				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
 				    RW_READER);
 			}
 
 			/*
 			 * When rebuilding L2ARC we retrieve the evict hand
 			 * from the header of the device. Of note, l2arc_evict()
 			 * does not actually delete buffers from the cache
 			 * device, but trimming may do so depending on the
 			 * hardware implementation. Thus keeping track of the
 			 * evict hand is useful.
 			 */
 			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
 		}
 	}
 
 retry:
 	mutex_enter(&dev->l2ad_mtx);
 	/*
 	 * We have to account for evicted log blocks. Run vdev_space_update()
 	 * on log blocks whose offset (in bytes) is before the evicted offset
 	 * (in bytes) by searching in the list of pointers to log blocks
 	 * present in the L2ARC device.
 	 */
 	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
 	    lb_ptr_buf = lb_ptr_buf_prev) {
 
 		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		uint64_t asize = L2BLK_GET_PSIZE(
 		    (lb_ptr_buf->lb_ptr)->lbp_prop);
 
 		/*
 		 * We don't worry about log blocks left behind (ie
 		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
 		 * will never write more than l2arc_evict() evicts.
 		 */
 		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
 			break;
 		} else {
 			/* Drop the log block pointer and its accounting. */
 			vdev_space_update(vd, -asize, 0, 0);
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
 			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 
 	/* Walk the device's buffer list from its tail towards its head. */
 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		ASSERT(!HDR_EMPTY(hdr));
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto retry;
 		}
 
 		/*
 		 * A header can't be on this list if it doesn't have L2 header.
 		 */
 		ASSERT(HDR_HAS_L2HDR(hdr));
 
 		/* Ensure this header has finished being written. */
 		ASSERT(!HDR_L2_WRITING(hdr));
 		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
 
 		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		if (!HDR_HAS_L1HDR(hdr)) {
 			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_lsize.
 			 */
 			arc_change_state(arc_anon, hdr, hash_lock);
 			arc_hdr_destroy(hdr);
 		} else {
 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
 
 out:
 	/*
 	 * We need to check if we evict all buffers, otherwise we may iterate
 	 * unnecessarily.
 	 */
 	if (!all && rerun) {
 		/*
 		 * Bump device hand to the device start if it is approaching the
 		 * end. l2arc_evict() has already evicted ahead for this case.
 		 */
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 		goto top;
 	}
 
 	if (!all) {
 		/*
 		 * In case of cache device removal (all) the following
 		 * assertions may be violated without functional consequences
 		 * as the device is about to be removed.
 		 */
 		ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
 		if (!dev->l2ad_first)
 			ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
 	}
 }
 
 /*
  * Handle any abd transforms that might be required for writing to the L2ARC.
  * If successful, this function will always return an abd with the data
  * transformed as it is on disk in a new abd of asize bytes.
  *
  * spa     - pool whose keystore is used to look up the encryption key
  * hdr     - header whose data (b_pabd, or b_rabd if present) is the source
  * asize   - size of the buffer to produce; at least the header's psize
  * abd_out - set to the new abd on success and to NULL on failure
  *
  * Returns 0 on success, otherwise the error from the key lookup or from
  * the encryption.
  */
 static int
 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
     abd_t **abd_out)
 {
 	int ret;
 	void *tmp = NULL;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t size = arc_hdr_size(hdr);
 	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	dsl_crypto_key_t *dck = NULL;
 	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
 	boolean_t no_crypt = B_FALSE;
 
 	/* The caller is expected to call this only when a transform is due. */
 	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) ||
 	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
 	ASSERT3U(psize, <=, asize);
 
 	/*
 	 * If this data simply needs its own buffer, we simply allocate it
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
 	if (HDR_HAS_RABD(hdr) && asize != psize) {
 		/* Copy the raw data and zero the padding up to asize. */
 		ASSERT3U(asize, >=, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
 		if (psize != asize)
 			abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}
 
 	/* No compression or encryption to apply: a padded copy suffices. */
 	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
 	    !HDR_ENCRYPTED(hdr)) {
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
 		if (size != asize)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}
 
 	/* The header's data is held uncompressed: compress it again. */
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
 		cabd = abd_alloc_for_io(asize, ismd);
 		tmp = abd_borrow_buf(cabd, asize);
 
 		psize = zio_compress_data(compress, to_write, tmp, size,
 		    hdr->b_complevel);
 
 		if (psize >= size) {
 			/*
 			 * Compression did not make the data any smaller, so
 			 * mark the header uncompressed and store a padded
 			 * copy of the data instead.
 			 */
 			abd_return_buf(cabd, tmp, asize);
 			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
 			to_write = cabd;
 			abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
 			if (size != asize)
 				abd_zero_off(to_write, size, asize - size);
 			goto encrypt;
 		}
 		ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
 		/* Zero the slack after the compressed data. */
 		if (psize < asize)
 			bzero((char *)tmp + psize, asize - psize);
 		/* From here on use the psize recorded in the header. */
 		psize = HDR_GET_PSIZE(hdr);
 		abd_return_buf_copy(cabd, tmp, asize);
 		to_write = cabd;
 	}
 
 encrypt:
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);
 
 		/*
 		 * If the dataset was disowned before the buffer
 		 * made it to this point, the key to re-encrypt
 		 * it won't be available. In this case we simply
 		 * won't write the buffer to the L2ARC.
 		 */
 		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
 		    FTAG, &dck);
 		if (ret != 0)
 			goto error;
 
 		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
 		    &no_crypt);
 		if (ret != 0)
 			goto error;
 
 		/* Nothing was encrypted, so copy the data over as it is. */
 		if (no_crypt)
 			abd_copy(eabd, to_write, psize);
 
 		if (psize != asize)
 			abd_zero_off(eabd, psize, asize - psize);
 
 		/* assert that the MAC we got here matches the one we saved */
 		ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 
 		/* The intermediate compression buffer is no longer needed. */
 		if (to_write == cabd)
 			abd_free(cabd);
 
 		to_write = eabd;
 	}
 
 out:
 	/* Every successful path must have produced a new buffer. */
 	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
 	*abd_out = to_write;
 	return (0);
 
 error:
 	/* Release whatever was acquired before the failure. */
 	if (dck != NULL)
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 	if (cabd != NULL)
 		abd_free(cabd);
 	if (eabd != NULL)
 		abd_free(eabd);
 
 	*abd_out = NULL;
 	return (ret);
 }
 
 /*
  * Completion callback that cleans up after a block fetch: frees the abd
  * attached to the read callback (if there is one) and then the callback
  * structure itself, which is taken from zio->io_private.
  */
 static void
 l2arc_blk_fetch_done(zio_t *zio)
 {
 	l2arc_read_callback_t *rcb = zio->io_private;
 
 	if (rcb->l2rcb_abd != NULL)
 		abd_free(rcb->l2rcb_abd);
 
 	kmem_free(rcb, sizeof (*rcb));
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  *
  * spa supplies the load guid used for the eligibility check, dev is the
  * L2ARC device being fed, and target_sz bounds the total aligned size of
  * the buffers selected in this call.
  *
  * Returns the number of bytes actually written (which may be smaller than
  * the delta by which the device hand has changed due to alignment and the
  * writing of log blocks). Returns 0 when no buffer was selected.
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t 		*hdr, *hdr_prev, *head;
 	uint64_t 		write_asize, write_psize, write_lsize, headroom;
 	boolean_t		full;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	/* pio stays NULL until the first buffer is actually selected. */
 	pio = NULL;
 	write_lsize = write_asize = write_psize = 0;
 	full = B_FALSE;
 	/*
 	 * Pre-allocate the dummy header that marks the start of this batch
 	 * on the device buflist; it is freed again below if nothing is
 	 * written.
 	 */
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
 		 * Passes 1 and 3 cache MRU metadata and MRU data
 		 * respectively; they are skipped when l2arc_mfuonly is set.
 		 */
 		if (l2arc_mfuonly) {
 			if (pass == 1 || pass == 3)
 				continue;
 		}
 
 		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		/* Logical bytes examined so far in this pass. */
 		uint64_t passed_sz = 0;
 
 		VERIFY3P(mls, !=, NULL);
 
 		/*
 		 * L2ARC fast warmup.
 		 *
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		if (arc_warm == B_FALSE)
 			hdr = multilist_sublist_head(mls);
 		else
 			hdr = multilist_sublist_tail(mls);
 
 		/*
 		 * The scan depth for this pass: target_sz scaled by
 		 * l2arc_headroom, further scaled by l2arc_headroom_boost
 		 * (a percentage) when compressed ARC is enabled.
 		 */
 		headroom = target_sz * l2arc_headroom;
 		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		for (; hdr; hdr = hdr_prev) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;
 
 			/*
 			 * Pick the successor before doing anything else so
 			 * that every `continue' below advances the scan in
 			 * the same direction it was started in.
 			 */
 			if (arc_warm == B_FALSE)
 				hdr_prev = multilist_sublist_next(mls, hdr);
 			else
 				hdr_prev = multilist_sublist_prev(mls, hdr);
 
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
 				continue;
 			}
 
 			/*
 			 * A headroom of 0 (l2arc_headroom == 0) disables the
 			 * scan depth limit.
 			 */
 			passed_sz += HDR_GET_LSIZE(hdr);
 			if (l2arc_headroom != 0 && passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
 			/*
 			 * Sanity-check the header before sizing the write.
 			 * ARC_FLAG_L2_WRITING has not been set yet at this
 			 * point; the same checks are repeated below once it
 			 * has been.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
 
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 			/* asize is psize rounded up to the vdev's allocation. */
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
 			    psize);
 
 			/* Stop everything once this buffer would not fit. */
 			if ((write_asize + asize) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			/*
 			 * We rely on the L1 portion of the header below, so
 			 * it's invalid for this header to have been evicted out
 			 * of the ghost cache, prior to being written out. The
 			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
 			 */
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
 			ASSERT(HDR_HAS_L1HDR(hdr));
 
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 
 			/*
 			 * If this header has b_rabd, we can use this since it
 			 * must always match the data exactly as it exists on
 			 * disk. Otherwise, the L2ARC can normally use the
 			 * hdr's data, but if we're sharing data between the
 			 * hdr and one of its bufs, L2ARC needs its own copy of
 			 * the data so that the ZIO below can't race with the
 			 * buf consumer. To ensure that this copy will be
 			 * available for the lifetime of the ZIO and be cleaned
 			 * up afterwards, we add it to the l2arc_free_on_write
 			 * queue. If we need to apply any transforms to the
 			 * data (compression, encryption) we will also need the
 			 * extra buffer.
 			 */
 			if (HDR_HAS_RABD(hdr) && psize == asize) {
 				to_write = hdr->b_crypt_hdr.b_rabd;
 			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
 			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
 			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
 			    psize == asize) {
 				to_write = hdr->b_l1hdr.b_pabd;
 			} else {
 				int ret;
 				arc_buf_contents_t type = arc_buf_type(hdr);
 
 				ret = l2arc_apply_transforms(spa, hdr, asize,
 				    &to_write);
 				if (ret != 0) {
 					/*
 					 * The transform failed: undo the
 					 * L2_WRITING mark and skip this
 					 * buffer.
 					 */
 					arc_hdr_clear_flags(hdr,
 					    ARC_FLAG_L2_WRITING);
 					mutex_exit(hash_lock);
 					continue;
 				}
 
 				l2arc_free_abd_on_write(to_write, asize, type);
 			}
 
 			/* First selected buffer: set up the batch. */
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
 				mutex_enter(&dev->l2ad_mtx);
 				list_insert_head(&dev->l2ad_buflist, head);
 				mutex_exit(&dev->l2ad_mtx);
 
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				/*
 				 * Create a list to save allocated abd buffers
 				 * for l2arc_log_blk_commit().
 				 */
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 			}
 
 			/*
 			 * Fill in the L2 portion of the header; the buffer
 			 * is placed at the current device hand.
 			 */
 			hdr->b_l2hdr.b_dev = dev;
 			hdr->b_l2hdr.b_hits = 0;
 
 			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
 			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
 
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 
 			/*
 			 * Create the child write under pio; it is only issued
 			 * by the zio_nowait() at the bottom of the loop.
 			 */
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    hdr->b_l2hdr.b_daddr, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			write_lsize += HDR_GET_LSIZE(hdr);
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 
 			/* Account for the buffer and advance the hand. */
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
 			l2arc_hdr_arcstats_increment(hdr);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			mutex_exit(hash_lock);
 
 			/*
 			 * Append buf info to current log and commit if full.
 			 * arcstat_l2_{size,asize} kstats are updated
 			 * internally.
 			 *
 			 * NOTE(review): hdr is handed to
 			 * l2arc_log_blk_insert() after its hash lock has
 			 * been dropped; confirm that ARC_FLAG_L2_WRITING is
 			 * enough to keep the fields it reads stable.
 			 */
 			if (l2arc_log_blk_insert(dev, hdr))
 				l2arc_log_blk_commit(dev, pio, cb);
 
 			zio_nowait(wzio);
 		}
 
 		multilist_sublist_unlock(mls);
 
 		/* The target size was reached; skip the remaining passes. */
 		if (full == B_TRUE)
 			break;
 	}
 
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		ASSERT0(write_lsize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 
 		/*
 		 * Although we did not write any buffers l2ad_evict may
 		 * have advanced.
 		 */
 		if (dev->l2ad_evict != l2dhdr->dh_evict)
 			l2arc_dev_hdr_update(dev);
 
 		return (0);
 	}
 
 	/*
 	 * Once the device is no longer on its first pass the hand must not
 	 * have moved past the evict pointer.
 	 */
 	if (!dev->l2ad_first)
 		ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 
 	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
 
 	/* l2ad_writing is set for the duration of the root zio wait. */
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	/*
 	 * Update the device header after the zio completes as
 	 * l2arc_write_done() may have updated the memory holding the log block
 	 * pointers in the device header.
 	 */
 	l2arc_dev_hdr_update(dev);
 
 	return (write_asize);
 }
 
 /*
  * Reports whether L2ARC headers should stop growing. True when the ARC needs
  * to reclaim memory, or when the upper bound of arcstat_l2_hdr_size exceeds
  * either 3/4 of arc_meta_limit or l2arc_meta_percent percent of the ARC size
  * (arc_c when the ARC is warm, arc_c_max otherwise).
  */
 static boolean_t
 l2arc_hdr_limit_reached(void)
 {
 	int64_t hdr_sz = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
 
 	if (arc_reclaim_needed())
 		return (1);
 
 	if (hdr_sz > arc_meta_limit * 3 / 4)
 		return (1);
 
 	return (hdr_sz >
 	    (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100);
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  */
 /* ARGSUSED */
 static void
 l2arc_feed_thread(void *unused)
 {
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 	fstrans_cookie_t cookie;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	cookie = spl_fstrans_mark();
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_idle(&l2arc_feed_thr_cv,
 		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
 
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 /*
  * Reports whether vd currently has an entry on the L2ARC device list, as
  * determined by l2arc_vdev_get().
  */
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	l2arc_dev_t *found = l2arc_vdev_get(vd);
 
 	return (found != NULL);
 }
 
 /*
  * Looks up the l2arc_dev_t whose l2ad_vdev is vd by walking l2arc_dev_list
  * under l2arc_dev_mtx. Returns NULL when vd is not an L2ARC device.
  */
 l2arc_dev_t *
 l2arc_vdev_get(vdev_t *vd)
 {
 	l2arc_dev_t	*match;
 
 	mutex_enter(&l2arc_dev_mtx);
 	match = list_head(l2arc_dev_list);
 	while (match != NULL && match->l2ad_vdev != vd)
 		match = list_next(l2arc_dev_list, match);
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (match);
 }
 
 /*
  * Register vd as an L2ARC device of spa. The spa is expected to have
  * validated and opened the vdev already, and the vdev must not be
  * registered yet. A new l2arc_dev_t is set up, linked on the global device
  * list, and then checked for rebuild eligibility.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t		*dev;
 	uint64_t		hdr_asize;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/*
 	 * Allocate a zeroed device entry and bind it to its pool and vdev.
 	 */
 	dev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	dev->l2ad_spa = spa;
 	dev->l2ad_vdev = vd;
 
 	/*
 	 * Reserve space after the front labels for the l2arc device header:
 	 * the larger of the header structure and one ashift-sized sector.
 	 */
 	dev->l2ad_dev_hdr_asize =
 	    MAX(sizeof (*dev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
 	hdr_asize = dev->l2ad_dev_hdr_asize;
 	dev->l2ad_start = VDEV_LABEL_START_SIZE + hdr_asize;
 	dev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	ASSERT3U(dev->l2ad_start, <, dev->l2ad_end);
 
 	/* Both the write hand and the evict hand begin at l2ad_start. */
 	dev->l2ad_hand = dev->l2ad_start;
 	dev->l2ad_evict = dev->l2ad_start;
 	dev->l2ad_first = B_TRUE;
 	dev->l2ad_writing = B_FALSE;
 	dev->l2ad_trim_all = B_FALSE;
 	list_link_init(&dev->l2ad_node);
 	dev->l2ad_dev_hdr = kmem_zalloc(hdr_asize, KM_SLEEP);
 
 	mutex_init(&dev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * l2ad_buflist: every ARC buffer that is still valid on the device.
 	 */
 	list_create(&dev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	/*
 	 * l2ad_lbptr_list: pointers to the log blocks still present on the
 	 * device.
 	 */
 	list_create(&dev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
 	    offsetof(l2arc_lb_ptr_buf_t, node));
 
 	zfs_refcount_create(&dev->l2ad_alloc);
 	zfs_refcount_create(&dev->l2ad_lb_asize);
 	zfs_refcount_create(&dev->l2ad_lb_count);
 
 	/* Report the usable region (hand to end) as the vdev's space. */
 	vdev_space_update(vd, 0, 0, dev->l2ad_end - dev->l2ad_hand);
 
 	/*
 	 * Publish the device on the global list.
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, dev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Finally decide whether this vdev qualifies for an L2ARC rebuild.
 	 */
 	l2arc_rebuild_vdev(dev->l2ad_vdev, B_FALSE);
 }
 
 /*
  * Decide what to do with the persistent contents of the L2ARC device backed
  * by vd, which must already be registered. `reopen' is B_TRUE when the
  * device is being onlined again (vdev_reopen) rather than freshly added.
  *
  * If the on-disk device header is usable the device is marked as pending a
  * rebuild; otherwise, on a writeable pool, the device is either scheduled
  * for a whole-device TRIM or given a fresh header.
  */
 void
 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
 {
 	l2arc_dev_t		*dev = l2arc_vdev_get(vd);
 	l2arc_dev_hdr_phys_t	*hdr_buf;
 	uint64_t		hdr_buf_asize;
 	spa_t			*spa;
 
 	ASSERT3P(dev, !=, NULL);
 	spa = dev->l2ad_spa;
 	hdr_buf = dev->l2ad_dev_hdr;
 	hdr_buf_asize = dev->l2ad_dev_hdr_asize;
 
 	/*
 	 * For log blocks to be restorable (persistent L2ARC) the device must
 	 * be able to hold at least the payload of one log block, which
 	 * depends on the number of entries in it. Log blocks are always
 	 * written with 1022 entries, but how many of those are committed or
 	 * restored depends on the device size. The largest payload of one
 	 * log block is therefore 1022 * SPA_MAXBLOCKSIZE = 16GB; on smaller
 	 * devices the number of committed and restored entries per block is
 	 * reduced so that persistence still works. Devices below
 	 * l2arc_rebuild_blocks_min_l2size get no log entries at all.
 	 */
 	if (dev->l2ad_end >= l2arc_rebuild_blocks_min_l2size) {
 		dev->l2ad_log_entries = MIN((dev->l2ad_end -
 		    dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
 		    L2ARC_LOG_BLK_MAX_ENTRIES);
 	} else {
 		dev->l2ad_log_entries = 0;
 	}
 
 	/*
 	 * Read the device header; without a good header and at least one log
 	 * entry per block there is nothing to rebuild from.
 	 */
 	if (!(l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0)) {
 		if (!spa_writeable(spa))
 			return;
 
 		/*
 		 * TRIM the whole device if l2arc_trim_ahead > 0, otherwise
 		 * write a new header. The in-memory header is zeroed first
 		 * to reset dh_start_lbps. When the whole device is TRIMmed
 		 * the new header is written by vdev_trim_l2arc_thread() at
 		 * the end of the TRIM, so that trim_state in the header is
 		 * updated as well. On a later header read, a trim_state
 		 * other than VDEV_TRIM_COMPLETE with l2arc_trim_ahead > 0
 		 * makes us TRIM the whole device again.
 		 */
 		if (l2arc_trim_ahead > 0) {
 			dev->l2ad_trim_all = B_TRUE;
 			return;
 		}
 
 		bzero(hdr_buf, hdr_buf_asize);
 		l2arc_dev_hdr_update(dev);
 		return;
 	}
 
 	/*
 	 * Onlining a cache device that was still present
 	 * (l2arc_vdev_present()): with rebuild enabled, evict all ARC
 	 * buffers and log block pointers and reclaim their space before the
 	 * contents are restored; with rebuild disabled there is nothing
 	 * more to do.
 	 */
 	if (reopen) {
 		if (!l2arc_rebuild_enabled)
 			return;
 
 		l2arc_evict(dev, 0, B_TRUE);
 		/* start a new log block */
 		dev->l2ad_log_ent_idx = 0;
 		dev->l2ad_log_blk_payload_asize = 0;
 		dev->l2ad_log_blk_payload_start = 0;
 	}
 
 	/*
 	 * Only flag the device as pending a rebuild. Rebuilding in line here
 	 * would block pool import; spa_load_impl hands the work to an async
 	 * task which calls l2arc_spa_rebuild_start.
 	 */
 	dev->l2ad_rebuild = B_TRUE;
 }
 
 /*
  * Remove a vdev from the L2ARC: cancel a rebuild that has begun on it,
  * unlink the device from the global list, flush everything cached on it and
  * release all resources owned by its l2arc_dev_t.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	/* The vdev must currently be registered as an L2ARC device. */
 	l2arc_dev_t *dev = l2arc_vdev_get(vd);
 
 	ASSERT3P(dev, !=, NULL);
 
 	/*
 	 * If a rebuild has begun, request cancellation and wait until the
 	 * rebuild flag has been cleared.
 	 */
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	if (dev->l2ad_rebuild_began == B_TRUE) {
 		dev->l2ad_rebuild_cancel = B_TRUE;
 		while (dev->l2ad_rebuild == B_TRUE) {
 			cv_wait(&l2arc_rebuild_thr_cv,
 			    &l2arc_rebuild_thr_lock);
 		}
 	}
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	/*
 	 * Take the device off the global list.
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_remove(l2arc_dev_list, dev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Flush the device: drop all buflist entries and ARC references.
 	 */
 	l2arc_evict(dev, 0, B_TRUE);
 
 	/* Tear down the per-device lists, lock and refcounts. */
 	list_destroy(&dev->l2ad_buflist);
 	ASSERT(list_is_empty(&dev->l2ad_lbptr_list));
 	list_destroy(&dev->l2ad_lbptr_list);
 	mutex_destroy(&dev->l2ad_mtx);
 	zfs_refcount_destroy(&dev->l2ad_alloc);
 	zfs_refcount_destroy(&dev->l2ad_lb_asize);
 	zfs_refcount_destroy(&dev->l2ad_lb_count);
 
 	/* Free the in-memory device header, then the device entry. */
 	kmem_free(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
 	vmem_free(dev, sizeof (l2arc_dev_t));
 }
 
 /*
  * Initialize global L2ARC state: the thread-exit flag and device count, the
  * locks and condition variables, and the device and free-on-write lists.
  */
 void
 l2arc_init(void)
 {
 	l2arc_ndev = 0;
 	l2arc_thread_exit = 0;
 
 	/* Locks. */
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Condition variables paired with the two thread locks. */
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Global list of L2ARC devices. */
 	l2arc_dev_list = &L2ARC_dev_list;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 
 	/* Global list of buffers to free once their write completes. */
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 }
 
 /*
  * Tear down the global L2ARC state created by l2arc_init(): the locks and
  * condition variables, then the device and free-on-write lists.
  */
 void
 l2arc_fini(void)
 {
 	/* Feed thread synchronization. */
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 
 	/* Rebuild thread synchronization. */
 	mutex_destroy(&l2arc_rebuild_thr_lock);
 	cv_destroy(&l2arc_rebuild_thr_cv);
 
 	/* List locks. */
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	/* The lists themselves. */
 	list_destroy(l2arc_dev_list);
 	list_destroy(l2arc_free_on_write);
 }
 
 /*
  * Start the L2ARC feed thread. Nothing is started unless spa_mode_global
  * includes SPA_MODE_WRITE.
  */
 void
 l2arc_start(void)
 {
 	if (spa_mode_global & SPA_MODE_WRITE) {
 		(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0,
 		    &p0, TS_RUN, defclsyspri);
 	}
 }
 
 /*
  * Ask the L2ARC feed thread to exit and wait until it has acknowledged by
  * resetting l2arc_thread_exit to 0. A no-op unless spa_mode_global includes
  * SPA_MODE_WRITE, mirroring l2arc_start().
  */
 void
 l2arc_stop(void)
 {
 	if (spa_mode_global & SPA_MODE_WRITE) {
 		mutex_enter(&l2arc_feed_thr_lock);
 		/* kick thread out of startup */
 		cv_signal(&l2arc_feed_thr_cv);
 		l2arc_thread_exit = 1;
 		while (l2arc_thread_exit != 0) {
 			cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 		}
 		mutex_exit(&l2arc_feed_thr_lock);
 	}
 }
 
 /*
  * Spawn a rebuild thread for every L2ARC device of spa that is pending a
  * rebuild and has not been cancelled. Must be called with
  * spa_namespace_lock held.
  *
  * This is meant to run after pool import, from the spa async thread:
  * starting the threads directly from spa_import() would make them part of
  * the "zpool import" context and delay process exit (and thus pool import).
  */
 void
 l2arc_spa_rebuild_start(spa_t *spa)
 {
 	int idx;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Walk the spa's cache vdevs and start a rebuild thread for each
 	 * eligible l2arc device.
 	 */
 	idx = 0;
 	while (idx < spa->spa_l2cache.sav_count) {
 		l2arc_dev_t *dev;
 
 		dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[idx]);
 		idx++;
 
 		/* Don't attempt a rebuild if the vdev is UNAVAIL */
 		if (dev != NULL) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
 				dev->l2ad_rebuild_began = B_TRUE;
 				(void) thread_create(NULL, 0,
 				    l2arc_dev_rebuild_thread, dev, 0, &p0,
 				    TS_RUN, minclsyspri);
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 		}
 	}
 }
 
 /*
  * Main entry point for L2ARC rebuilding.
  */
 static void
 l2arc_dev_rebuild_thread(void *arg)
 {
 	l2arc_dev_t *dev = arg;
 
 	VERIFY(!dev->l2ad_rebuild_cancel);
 	VERIFY(dev->l2ad_rebuild);
 	(void) l2arc_rebuild(dev);
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	dev->l2ad_rebuild_began = B_FALSE;
 	dev->l2ad_rebuild = B_FALSE;
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	thread_exit();
 }
 
 /*
  * This function implements the actual L2ARC metadata rebuild. It:
  * starts reading the log block chain and restores each block's contents
  * to memory (reconstructing arc_buf_hdr_t's).
  *
  * Operation stops under any of the following conditions:
  *
  * 1) We reach the end of the log block chain.
  * 2) We encounter *any* error condition (cksum errors, io errors)
  * 3) l2arc_hdr_limit_reached() reports memory pressure (ENOMEM).
  * 4) The rebuild is cancelled via l2ad_rebuild_cancel (ECANCELED).
  * 5) The chain is found to loop back over already overwritten space.
  *
  * If l2arc_rebuild_enabled is false only the persistent device state
  * (hand, evict pointer, first-pass flag, TRIM state) is restored.
  *
  * Returns 0 or the error code that stopped the rebuild.
  */
 static int
 l2arc_rebuild(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	spa_t			*spa = vd->vdev_spa;
 	int			err = 0;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	l2arc_log_blk_phys_t	*this_lb, *next_lb;
 	zio_t			*this_io = NULL, *next_io = NULL;
 	l2arc_log_blkptr_t	lbps[2];
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	boolean_t		lock_held;
 
 	/*
 	 * Two log block buffers: one being restored, one receiving the
 	 * prefetch of the next block. They are swapped each iteration.
 	 */
 	this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
 	next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
 
 	/*
 	 * We prevent device removal while issuing reads to the device,
 	 * then during the rebuilding phases we drop this lock again so
 	 * that a spa_unload or device remove can be initiated - this is
 	 * safe, because the spa will signal us to stop before removing
 	 * our device and wait for us to stop.
 	 */
 	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
 	lock_held = B_TRUE;
 
 	/*
 	 * Retrieve the persistent L2ARC device state.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
 	dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
 	    L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
 	    dev->l2ad_start);
 	dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
 	vd->vdev_trim_state = l2dhdr->dh_trim_state;
 
 	/*
 	 * In case the zfs module parameter l2arc_rebuild_enabled is false
 	 * we do not start the rebuild process.
 	 */
 	if (!l2arc_rebuild_enabled)
 		goto out;
 
 	/* Prepare the rebuild process */
 	bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
 
 	/*
 	 * Start the rebuild process. lbps[0] points at the block restored
 	 * in this iteration, lbps[1] at the one prefetched for the next.
 	 */
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
 			break;
 
 		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
 		    this_lb, next_lb, this_io, &next_io)) != 0)
 			goto out;
 
 		/*
 		 * Our memory pressure valve. If the system is running low
 		 * on memory, rather than swamping memory with new ARC buf
 		 * hdrs, we opt not to rebuild the L2ARC. At this point,
 		 * however, we have already set up our L2ARC dev to chain in
 		 * new metadata log blocks, so the user may choose to offline/
 		 * online the L2ARC dev at a later time (or re-import the pool)
 		 * to reconstruct it (when there's less memory pressure).
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
 			cmn_err(CE_NOTE, "System running low on memory, "
 			    "aborting L2ARC rebuild.");
 			err = SET_ERROR(ENOMEM);
 			goto out;
 		}
 
 		/*
 		 * The reads for this iteration have been issued; drop the
 		 * config lock while restoring so removal can proceed.
 		 */
 		spa_config_exit(spa, SCL_L2ARC, vd);
 		lock_held = B_FALSE;
 
 		/*
 		 * Now that we know that the next_lb checks out alright, we
 		 * can start reconstruction from this log block.
 		 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 		 */
 		uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		l2arc_log_blk_restore(dev, this_lb, asize);
 
 		/*
 		 * log block restored, include its pointer in the list of
 		 * pointers to log blocks present in the L2ARC device.
 		 */
 		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
 		    KM_SLEEP);
 		bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
 		    sizeof (l2arc_log_blkptr_t));
 		mutex_enter(&dev->l2ad_mtx);
 		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
 		ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 		ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 		zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 		zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 		mutex_exit(&dev->l2ad_mtx);
 		vdev_space_update(vd, asize, 0, 0);
 
 		/*
 		 * Protection against loops of log blocks:
 		 *
 		 *				       l2ad_hand  l2ad_evict
 		 *                                         V	      V
 		 * l2ad_start |=======================================| l2ad_end
 		 *             -----|||----|||---|||----|||
 		 *                  (3)    (2)   (1)    (0)
 		 *             ---|||---|||----|||---|||
 		 *		  (7)   (6)    (5)   (4)
 		 *
 		 * In this situation the pointer of log block (4) passes
 		 * l2arc_log_blkptr_valid() but the log block should not be
 		 * restored as it is overwritten by the payload of log block
 		 * (0). Only log blocks (0)-(3) should be restored. We check
 		 * whether l2ad_evict lies in between the payload starting
 		 * offset of the next log block (lbps[1].lbp_payload_start)
 		 * and the payload starting offset of the present log block
 		 * (lbps[0].lbp_payload_start). If true and this isn't the
 		 * first pass, we are looping from the beginning and we should
 		 * stop.
 		 */
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev->l2ad_evict) &&
 		    !dev->l2ad_first)
 			goto out;
 
 		cond_resched();
 		/*
 		 * Re-take the config lock for the next round of reads,
 		 * checking for cancellation before every attempt.
 		 */
 		for (;;) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
 				/*
 				 * Acknowledge the cancel: clear the rebuild
 				 * flag and wake the waiter before bailing.
 				 */
 				dev->l2ad_rebuild = B_FALSE;
 				cv_signal(&l2arc_rebuild_thr_cv);
 				mutex_exit(&l2arc_rebuild_thr_lock);
 				err = SET_ERROR(ECANCELED);
 				goto out;
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
 			    RW_READER)) {
 				lock_held = B_TRUE;
 				break;
 			}
 			/*
 			 * L2ARC config lock held by somebody in writer,
 			 * possibly due to them trying to remove us. They'll
 			 * likely to want us to shut down, so after a little
 			 * delay, we check l2ad_rebuild_cancel and retry
 			 * the lock again.
 			 */
 			delay(1);
 		}
 
 		/*
 		 * Continue with the next log block: shift the pointers, swap
 		 * the buffers so the prefetched block becomes the current
 		 * one, and hand its in-flight IO over as this_io.
 		 */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb->lb_prev_lbp;
 		PTR_SWAP(this_lb, next_lb);
 		this_io = next_io;
 		next_io = NULL;
 	}
 
 	/* Loop ended normally: abort a prefetch that was handed over. */
 	if (this_io != NULL)
 		l2arc_log_blk_fetch_abort(this_io);
 out:
 	if (next_io != NULL)
 		l2arc_log_blk_fetch_abort(next_io);
 	vmem_free(this_lb, sizeof (*this_lb));
 	vmem_free(next_lb, sizeof (*next_lb));
 
 	/* Record the outcome. */
 	if (!l2arc_rebuild_enabled) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "disabled");
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_success);
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "successful, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
 		/*
 		 * No error but also nothing restored, meaning the lbps array
 		 * in the device header points to invalid/non-present log
 		 * blocks. Reset the header.
 		 */
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "no valid log blocks");
 		bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
 		l2arc_dev_hdr_update(dev);
 	} else if (err == ECANCELED) {
 		/*
 		 * In case the rebuild was canceled do not log to spa history
 		 * log as the pool may be in the process of being removed.
 		 */
 		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err != 0) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	}
 
 	/* The config lock may or may not be held, depending on the exit. */
 	if (lock_held)
 		spa_config_exit(spa, SCL_L2ARC, vd);
 
 	return (err);
 }
 
 /*
  * Reads the device header of the given L2ARC device from disk into
  * dev->l2ad_dev_hdr, byteswapping it when it was written with the other
  * endianness, and validates it against the device's current state.
  *
  * Returns 0 when the header is usable, the IO error when the read failed,
  * or ENOTSUP when the header is missing, belongs to another pool, vdev or
  * persistent L2ARC version, disagrees with the device geometry, or calls
  * for a whole-device TRIM.
  */
 static int
 l2arc_dev_hdr_read(l2arc_dev_t *dev)
 {
 	l2arc_dev_hdr_phys_t	*hdr_buf = dev->l2ad_dev_hdr;
 	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
 	uint64_t		pool_guid;
 	abd_t			*hdr_abd;
 	int			rc;
 
 	pool_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 
 	/* Wrap the in-memory header so the read lands directly in it. */
 	hdr_abd = abd_get_from_buf(hdr_buf, hdr_asize);
 
 	rc = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, hdr_asize, hdr_abd,
 	    ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_SPECULATIVE, B_FALSE));
 
 	abd_free(hdr_abd);
 
 	if (rc != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
 		    "vdev guid: %llu", rc,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		return (rc);
 	}
 
 	/* A byteswapped magic means the header needs swapping as a whole. */
 	if (hdr_buf->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 		byteswap_uint64_array(hdr_buf, sizeof (*hdr_buf));
 
 	/* Identity: magic, owning pool, owning vdev and format version. */
 	if (hdr_buf->dh_magic != L2ARC_DEV_HDR_MAGIC)
 		goto unsupported;
 	if (hdr_buf->dh_spa_guid != pool_guid)
 		goto unsupported;
 	if (hdr_buf->dh_vdev_guid != dev->l2ad_vdev->vdev_guid)
 		goto unsupported;
 	if (hdr_buf->dh_version != L2ARC_PERSISTENT_VERSION)
 		goto unsupported;
 
 	/* Geometry: log entry count, device end and evict pointer. */
 	if (hdr_buf->dh_log_entries != dev->l2ad_log_entries)
 		goto unsupported;
 	if (hdr_buf->dh_end != dev->l2ad_end)
 		goto unsupported;
 	if (!l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
 	    hdr_buf->dh_evict))
 		goto unsupported;
 
 	/* An unfinished TRIM with trim-ahead enabled also disqualifies. */
 	if (hdr_buf->dh_trim_state != VDEV_TRIM_COMPLETE &&
 	    l2arc_trim_ahead > 0)
 		goto unsupported;
 
 	return (0);
 
 unsupported:
 	/*
 	 * Attempt to rebuild a device containing no actual dev hdr
 	 * or containing a header from some other pool or from another
 	 * version of persistent L2ARC.
 	 */
 	ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Reads L2ARC log blocks from storage and validates their contents.
  *
  * This function implements a simple fetcher to make sure that while
  * we're processing one buffer the L2ARC is already fetching the next
  * one in the chain.
  *
  * The arguments this_lbp and next_lbp point to the current and next log
  * block address in the block chain. Similarly, this_lb and next_lb hold the
  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
  *
  * The `this_io' and `next_io' arguments are used for block fetching.
  * When issuing the first blk IO during rebuild, you should pass NULL for
  * `this_io'. This function will then issue a sync IO to read the block and
  * also issue an async IO to fetch the next block in the block chain. The
  * fetched IO is returned in `next_io'. On subsequent calls to this
  * function, pass the value returned in `next_io' from the previous call
  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
  * Prior to the call, you should initialize your `next_io' pointer to be
  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
  *
  * On success, this function returns 0, otherwise it returns an appropriate
  * error code. On error the fetching IO is aborted and cleared before
  * returning from this function, so the caller has no fetch IO left to clean
  * up. On success a fetch IO returned in `next_io' remains in flight and is
  * owned by the caller. `this_io' is always waited on here.
  */
 static int
 l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io)
 {
 	int		err = 0;
 	zio_cksum_t	cksum;
 	abd_t		*abd = NULL;
 	uint64_t	asize;
 
 	ASSERT(this_lbp != NULL && next_lbp != NULL);
 	ASSERT(this_lb != NULL && next_lb != NULL);
 	ASSERT(next_io != NULL && *next_io == NULL);
 	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
 
 	/*
 	 * Check to see if we have issued the IO for this log block in a
 	 * previous run. If not, this is the first call, so issue it now.
 	 */
 	if (this_io == NULL) {
 		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
 		    this_lb);
 	}
 
 	/*
 	 * Peek to see if we can start issuing the next IO immediately.
 	 */
 	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
 		/*
 		 * Start issuing IO for the next log block early - this
 		 * should help keep the L2ARC device busy while we
 		 * decompress and restore this log block.
 		 */
 		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
 		    next_lb);
 	}
 
 	/* Wait for the IO to read this log block to complete */
 	if ((err = zio_wait(this_io)) != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
 		    "offset: %llu, vdev guid: %llu", err,
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		goto cleanup;
 	}
 
 	/*
 	 * Make sure the buffer checks out.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 * The checksum covers the block as read from disk, i.e. before any
 	 * decompression.
 	 */
 	asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
 	fletcher_4_native(this_lb, asize, NULL, &cksum);
 	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
 		zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
 		    "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid,
 		    (u_longlong_t)dev->l2ad_hand,
 		    (u_longlong_t)dev->l2ad_evict);
 		err = SET_ERROR(ECKSUM);
 		goto cleanup;
 	}
 
 	/* Now we can take our time decoding this buffer */
 	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
 	case ZIO_COMPRESS_OFF:
 		break;
 	case ZIO_COMPRESS_LZ4:
 		/*
 		 * Copy the compressed bytes aside and decompress them back
 		 * into this_lb. Any decompression failure is reported as
 		 * EINVAL.
 		 */
 		abd = abd_alloc_for_io(asize, B_TRUE);
 		abd_copy_from_buf_off(abd, this_lb, 0, asize);
 		if ((err = zio_decompress_data(
 		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
 		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
 			err = SET_ERROR(EINVAL);
 			goto cleanup;
 		}
 		break;
 	default:
 		/* Only uncompressed and LZ4 log blocks are accepted. */
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 	/* Byteswap if needed, then insist on the log block magic. */
 	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 		byteswap_uint64_array(this_lb, sizeof (*this_lb));
 	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 cleanup:
 	/* Abort an in-flight fetch I/O in case of error */
 	if (err != 0 && *next_io != NULL) {
 		l2arc_log_blk_fetch_abort(*next_io);
 		*next_io = NULL;
 	}
 	/* The temporary abd exists only on the LZ4 path. */
 	if (abd != NULL)
 		abd_free(abd);
 	return (err);
 }
 
 /*
  * Replays the payload of one log block into ARC. Each entry is passed to
  * l2arc_hdr_restore(), which creates an ARC hdr carrying only an l2arc hdr,
  * i.e. the buffer comes back in its "evicted to L2ARC" state. The rebuild
  * statistics are updated with the logical and aligned sizes of the
  * restored entries; `lb_asize' is the aligned size of the log block itself
  * and is only used for those statistics.
  */
 static void
 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
     uint64_t lb_asize)
 {
 	const uint64_t	log_entries = dev->l2ad_log_entries;
 	uint64_t	lsize_total = 0;
 	uint64_t	asize_total = 0;
 	int		i;
 
 	/*
 	 * Usually arc_adapt() is called only for data, not headers, but
 	 * since we may allocate significant amount of memory here, let ARC
 	 * grow its arc_c.
 	 */
 	arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
 
 	/*
 	 * Walk the entries from last to first. Restore goes in the reverse
 	 * temporal direction to preserve correct temporal ordering of
 	 * buffers in the l2ad_buflist; l2arc_hdr_restore also does a
 	 * list_insert_tail instead of list_insert_head on the
 	 * l2ad_buflist:
 	 *
 	 *		LIST	l2ad_buflist		LIST
 	 *		HEAD  <------ (time) ------	TAIL
 	 * direction	+-----+-----+-----+-----+-----+    direction
 	 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
 	 * fill		+-----+-----+-----+-----+-----+
 	 *		^				^
 	 *		|				|
 	 *		|				|
 	 *	l2arc_feed_thread		l2arc_rebuild
 	 *	will place new bufs here	restores bufs here
 	 *
 	 * During l2arc_rebuild() the device is not used by
 	 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
 	 */
 	i = log_entries - 1;
 	while (i >= 0) {
 		lsize_total += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
 		asize_total += vdev_psize_to_asize(dev->l2ad_vdev,
 		    L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
 		l2arc_hdr_restore(&lb->lb_entries[i], dev);
 		i--;
 	}
 
 	/*
 	 * Record rebuild stats:
 	 *	lsize_total	Logical size of restored buffers in the L2ARC
 	 *	asize_total	Aligned size of restored buffers in the L2ARC
 	 */
 	ARCSTAT_INCR(arcstat_l2_rebuild_size, lsize_total);
 	ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize_total);
 	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize_total / lb_asize);
 	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
 }
 
 /*
  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
  * into a state indicating that it has been evicted to L2ARC.
  *
  *	le	log entry describing the buffer (DVA, birth, device address,
  *		sizes and property bits)
  *	dev	L2ARC device the entry was read from
  *
  * A fresh L2-only header is always allocated and accounted first. If the
  * hash table already holds a header for the same buffer, the fresh one is
  * destroyed again and, when the existing header has no L2ARC metadata yet,
  * that metadata is attached to the existing header instead.
  */
 static void
 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 {
 	arc_buf_hdr_t		*hdr, *exists;
 	kmutex_t		*hash_lock;
 	arc_buf_contents_t	type = L2BLK_GET_TYPE((le)->le_prop);
 	uint64_t		asize;
 
 	/*
 	 * Do all the allocation before grabbing any locks, this lets us
 	 * sleep if memory is full and we don't have to deal with failed
 	 * allocations.
 	 */
 	hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
 	    dev, le->le_dva, le->le_daddr,
 	    L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
 	    L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
 	    L2BLK_GET_PROTECTED((le)->le_prop),
 	    L2BLK_GET_PREFETCH((le)->le_prop),
 	    L2BLK_GET_STATE((le)->le_prop));
 	/* Space the buffer occupies on the device, rounded for the vdev. */
 	asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    L2BLK_GET_PSIZE((le)->le_prop));
 
 	/*
 	 * vdev_space_update() has to be called before arc_hdr_destroy() to
 	 * avoid underflow since the latter also calls vdev_space_update().
 	 */
 	l2arc_hdr_arcstats_increment(hdr);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* Append to the device's buffer list and charge its allocation. */
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_tail(&dev->l2ad_buflist, hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * buf_hash_insert() hands back the hash lock through `hash_lock';
 	 * it is dropped at the end of this function on both paths.
 	 */
 	exists = buf_hash_insert(hdr, &hash_lock);
 	if (exists) {
 		/* Buffer was already cached, no need to restore it. */
 		arc_hdr_destroy(hdr);
 		/*
 		 * If the buffer is already cached, check whether it has
 		 * L2ARC metadata. If not, enter them and update the flag.
 		 * This is important in case of onlining a cache device, since
 		 * we previously evicted all L2ARC metadata from ARC.
 		 */
 		if (!HDR_HAS_L2HDR(exists)) {
 			arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
 			exists->b_l2hdr.b_dev = dev;
 			exists->b_l2hdr.b_daddr = le->le_daddr;
 			exists->b_l2hdr.b_arcs_state =
 			    L2BLK_GET_STATE((le)->le_prop);
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_tail(&dev->l2ad_buflist, exists);
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(exists), exists);
 			mutex_exit(&dev->l2ad_mtx);
 			l2arc_hdr_arcstats_increment(exists);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 		}
 		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
 	}
 
 	mutex_exit(hash_lock);
 }
 
 /*
  * Starts an asynchronous read of the log block described by `lbp' from
  * `vd' into the caller-supplied buffer `lb'. This is used during log block
  * reconstruction to start reading the next block before the current one
  * has been decoded, keeping the l2arc device busy with read IO.
  *
  * A callback structure is allocated here and an abd is created on top of
  * `lb'; both are attached to the returned root zio, whose done callback is
  * l2arc_blk_fetch_done. The caller must eventually wait on the returned
  * zio, either directly or through l2arc_log_blk_fetch_abort() if the
  * result is no longer wanted.
  */
 static zio_t *
 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
     l2arc_log_blk_phys_t *lb)
 {
 	l2arc_read_callback_t	*rcb;
 	zio_t			*root;
 	zio_t			*rzio;
 	uint32_t		asize;
 
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
 
 	/* Callback state: just the abd that wraps the destination buffer. */
 	rcb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
 	rcb->l2rcb_abd = abd_get_from_buf(lb, asize);
 
 	/* Root zio the caller waits on; the physical read hangs off it. */
 	root = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, rcb,
 	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
 	    ZIO_FLAG_DONT_RETRY);
 
 	/* Checksum verification is off here; the caller checks it itself. */
 	rzio = zio_read_phys(root, vd, lbp->lbp_daddr, asize,
 	    rcb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE);
 	(void) zio_nowait(rzio);
 
 	return (root);
 }
 
 /*
  * Aborts a zio returned from l2arc_log_blk_fetch by waiting for it to
  * finish and discarding the result.
  *
  * NOTE(review): the buffers allocated for the fetch are presumably
  * released by the zio's done callback (l2arc_blk_fetch_done, registered in
  * l2arc_log_blk_fetch) once the wait completes; that callback is not
  * visible here, so confirm before relying on it.
  */
 static void
 l2arc_log_blk_fetch_abort(zio_t *zio)
 {
 	/* The IO status is deliberately ignored; we only need completion. */
 	(void) zio_wait(zio);
 }
 
 /*
  * Refreshes the in-memory device header of an l2arc device from the
  * current device state and synchronously writes it out right after the
  * front vdev labels (offset VDEV_LABEL_START_SIZE). A write failure is
  * only reported through zfs_dbgmsg(); nothing is returned to the caller.
  * The caller must hold SCL_STATE_ALL as reader on the device's spa.
  */
 void
 l2arc_dev_hdr_update(l2arc_dev_t *dev)
 {
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	const uint64_t		l2dhdr_asize = dev->l2ad_dev_hdr_asize;
 	uint64_t		flags = 0;
 	abd_t			*hdr_abd;
 	int			rc;
 
 	VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
 
 	/* Identification of the header format and of its owner. */
 	l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
 	l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
 	l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 	l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
 
 	/* Geometry and write/evict position of the device. */
 	l2dhdr->dh_start = dev->l2ad_start;
 	l2dhdr->dh_end = dev->l2ad_end;
 	l2dhdr->dh_evict = dev->l2ad_evict;
 	l2dhdr->dh_log_entries = dev->l2ad_log_entries;
 
 	/* Totals for the log blocks currently present on the device. */
 	l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
 	l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
 
 	/* TRIM bookkeeping is mirrored from the vdev. */
 	l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
 	l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
 
 	if (dev->l2ad_first)
 		flags |= L2ARC_DEV_HDR_EVICT_FIRST;
 	l2dhdr->dh_flags = flags;
 
 	/* Wrap the header in an abd just for the duration of the write. */
 	hdr_abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
 	rc = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, l2dhdr_asize, hdr_abd, ZIO_CHECKSUM_LABEL,
 	    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
 	abd_free(hdr_abd);
 
 	if (rc == 0)
 		return;
 
 	zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
 	    "vdev guid: %llu", rc,
 	    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 }
 
 /*
  * Commits a log block to the L2ARC device. This routine is invoked from
  * l2arc_write_buffers when the log block fills up.
  * This function allocates some memory to temporarily hold the serialized
  * buffer to be written. This is then released in l2arc_write_done.
  *
  *	dev	device whose in-memory log block (dev->l2ad_log_blk) is full
  *	pio	parent zio the log block write is attached to
  *	cb	write callback state; the abd used for the write is queued
  *		on cb->l2wcb_abd_list
  *
  * Besides issuing the write, this advances dev->l2ad_hand past the log
  * block, rotates the two start log block pointers in the device header so
  * that slot 0 describes the block being written, records a copy of that
  * pointer on dev->l2ad_lbptr_list and resets the in-memory log block state
  * so a new one can be filled.
  */
 static void
 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 {
 	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	uint64_t		psize, asize;
 	zio_t			*wzio;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	uint8_t			*tmpbuf;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 
 	/* Only a completely filled log block may be committed. */
 	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
 
 	/*
 	 * tmpbuf receives the bytes that go to disk (compressed or a plain
 	 * copy); abd_buf->abd initially wraps the in-memory log block so it
 	 * can be fed to the compressor.
 	 */
 	tmpbuf = zio_buf_alloc(sizeof (*lb));
 	abd_buf = zio_buf_alloc(sizeof (*abd_buf));
 	abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
 	lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 	lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
 
 	/* link the buffer into the block chain */
 	lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
 	lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
 
 	/*
 	 * l2arc_log_blk_commit() may be called multiple times during a single
 	 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
 	 * so we can free them in l2arc_write_done() later on.
 	 */
 	list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
 
 	/* try to compress the buffer */
 	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
 	    abd_buf->abd, tmpbuf, sizeof (*lb), 0);
 
 	/* a log block is never entirely zero */
 	ASSERT(psize != 0);
 	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	ASSERT(asize <= sizeof (*lb));
 
 	/*
 	 * Update the start log block pointer in the device header to point
 	 * to the log block we're about to write.
 	 */
 	l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
 	l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
 	l2dhdr->dh_start_lbps[0].lbp_payload_asize =
 	    dev->l2ad_log_blk_payload_asize;
 	l2dhdr->dh_start_lbps[0].lbp_payload_start =
 	    dev->l2ad_log_blk_payload_start;
 	_NOTE(CONSTCOND)
 	L2BLK_SET_LSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
 	L2BLK_SET_PSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
 	L2BLK_SET_CHECKSUM(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 	    ZIO_CHECKSUM_FLETCHER_4);
 	if (asize < sizeof (*lb)) {
 		/*
 		 * compression succeeded: zero the alignment padding that
 		 * follows the compressed bytes, since it is checksummed
 		 * and written too
 		 */
 		bzero(tmpbuf + psize, asize - psize);
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_LZ4);
 	} else {
 		/* compression failed: write the log block uncompressed */
 		bcopy(lb, tmpbuf, sizeof (*lb));
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_OFF);
 	}
 
 	/* checksum what we're about to write */
 	fletcher_4_native(tmpbuf, asize, NULL,
 	    &l2dhdr->dh_start_lbps[0].lbp_cksum);
 
 	/* The abd over the in-memory log block is no longer needed. */
 	abd_free(abd_buf->abd);
 
 	/*
 	 * perform the write itself; the new abd wraps tmpbuf and takes
 	 * ownership of it, and stays on cb->l2wcb_abd_list via abd_buf
 	 */
 	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
 	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
 	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
 	(void) zio_nowait(wzio);
 
 	/* The next write on this device starts right after the log block. */
 	dev->l2ad_hand += asize;
 	/*
 	 * Include the committed log block's pointer in the list of pointers
 	 * to log blocks present in the L2ARC device.
 	 */
 	bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
 	    sizeof (l2arc_log_blkptr_t));
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
 	ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
 	    dev->l2ad_log_blk_payload_asize / asize);
 
 	/* start a new log block */
 	dev->l2ad_log_ent_idx = 0;
 	dev->l2ad_log_blk_payload_asize = 0;
 	dev->l2ad_log_blk_payload_start = 0;
 }
 
 /*
  * Decides whether the log block described by `lbp' can be read from the
  * given L2ARC device. Returns non-zero (true) when the pointer passes all
  * checks described in the function body and zero (false) otherwise.
  */
 boolean_t
 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
 {
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	const uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	/* First byte of the payload and last byte of the log block. */
 	const uint64_t start = lbp->lbp_payload_start;
 	const uint64_t end = lbp->lbp_daddr + asize - 1;
 	boolean_t evicted;
 
 	/*
 	 * A log block is valid if all of the following conditions are true:
 	 * - it fits entirely (including its payload) between l2ad_start and
 	 *   l2ad_end
 	 * - it has a valid size
 	 * - neither the log block itself nor part of its payload was evicted
 	 *   by l2arc_evict():
 	 *
 	 *		l2ad_hand          l2ad_evict
 	 *		|			 |	lbp_daddr
 	 *		|     start		 |	|  end
 	 *		|     |			 |	|  |
 	 *		V     V		         V	V  V
 	 *   l2ad_start ============================================ l2ad_end
 	 *                    --------------------------||||
 	 *				^		 ^
 	 *				|		log block
 	 *				payload
 	 */
 
 	/* Bounds of the usable device area. */
 	if (start < dev->l2ad_start || end > dev->l2ad_end)
 		return (B_FALSE);
 
 	/* Size must be non-zero and no larger than a full log block. */
 	if (asize == 0 || asize > sizeof (l2arc_log_blk_phys_t))
 		return (B_FALSE);
 
 	/*
 	 * The [start, end] span and the [l2ad_hand, l2ad_evict] span must
 	 * not touch: test each span's endpoints against the other span.
 	 */
 	evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand);
 	if (!evicted) {
 		evicted = l2arc_range_check_overlap(start, end,
 		    dev->l2ad_evict);
 	}
 	if (!evicted) {
 		evicted = l2arc_range_check_overlap(dev->l2ad_hand,
 		    dev->l2ad_evict, start);
 	}
 	if (!evicted) {
 		evicted = l2arc_range_check_overlap(dev->l2ad_hand,
 		    dev->l2ad_evict, end);
 	}
 
 	/* The eviction test is waived while l2ad_first is set. */
 	return (!evicted || dev->l2ad_first);
 }
 
 /*
  * Records ARC buffer header `hdr' as the next entry of the log block that
  * is currently being filled for `dev'. The buffer being inserted must be
  * present in L2ARC. Nothing is recorded when the device has no log entries
  * configured (l2ad_log_entries == 0).
  * Returns B_TRUE if the L2ARC log block is full and needs to be committed
  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
  */
 static boolean_t
 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 {
 	l2arc_log_ent_phys_t	*entry;
 
 	if (dev->l2ad_log_entries == 0)
 		return (B_FALSE);
 
 	/* Claim the next free slot of the in-memory log block. */
 	int index = dev->l2ad_log_ent_idx++;
 
 	ASSERT3S(index, <, dev->l2ad_log_entries);
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	entry = &dev->l2ad_log_blk.lb_entries[index];
 	bzero(entry, sizeof (*entry));
 
 	/* Identity and location of the buffer. */
 	entry->le_dva = hdr->b_dva;
 	entry->le_birth = hdr->b_birth;
 	entry->le_daddr = hdr->b_l2hdr.b_daddr;
 	entry->le_complevel = hdr->b_complevel;
 
 	/* The first entry marks where this log block's payload begins. */
 	if (index == 0)
 		dev->l2ad_log_blk_payload_start = entry->le_daddr;
 
 	/* Everything else is packed into the property word. */
 	L2BLK_SET_LSIZE((entry)->le_prop, HDR_GET_LSIZE(hdr));
 	L2BLK_SET_PSIZE((entry)->le_prop, HDR_GET_PSIZE(hdr));
 	L2BLK_SET_COMPRESS((entry)->le_prop, HDR_GET_COMPRESS(hdr));
 	L2BLK_SET_TYPE((entry)->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED((entry)->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH((entry)->le_prop, !!(HDR_PREFETCH(hdr)));
 	L2BLK_SET_STATE((entry)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
 
 	/* Keep a running total of the aligned payload size. */
 	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
 
 	if (dev->l2ad_log_ent_idx == dev->l2ad_log_entries)
 		return (1);
 	return (0);
 }
 
 /*
  * Tells whether the device address `check' lies inside the time-sequential
  * range [bottom, top] (both ends inclusive). Because the L2ARC is a rotary
  * buffer, the range may wrap around the end of the device, so a plain
  * interval test is not enough. Arguments:
  *	bottom -- Lower end of the range to check (written to earlier).
  *	top    -- Upper end of the range to check (written to later).
  *	check  -- The address for which we want to determine if it sits in
  *		  between the top and bottom.
  *
  * Three cases are distinguished:
  *
  *	top == bottom : Just a single address comparison.
  *
  *	bottom < top : Sequentially ordered case:
  *	  <check>--------+-------------------+
  *	                 |  (overlap here?)  |
  *	 L2ARC dev       V                   V
  *	 |---------------<bottom>============<top>--------------|
  *
  *	bottom > top: Looped-around case:
  *	                      <check>--------+------------------+
  *	                                     |  (overlap here?) |
  *	 L2ARC dev                           V                  V
  *	 |===============<top>---------------<bottom>===========|
  *	 ^               ^
  *	 |  (or here?)   |
  *	 +---------------+---------<check>
  */
 boolean_t
 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
 {
 	/* Degenerate range: a single address. */
 	if (bottom == top)
 		return (check == top);
 
 	/* Ordinary range: check must fall between the two ends. */
 	if (bottom < top)
 		return (check >= bottom && check <= top);
 
 	/* Wrapped range: check is in either the low or the high piece. */
 	return (check <= top || check >= bottom);
 }
 
 /*
  * ARC entry points exported through EXPORT_SYMBOL (buffer size/info
  * queries, read/write, and prune callback registration) so that code
  * outside this file can reference them.
  */
 EXPORT_SYMBOL(arc_buf_size);
 EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
 /* BEGIN CSTYLED */
 /*
  * Tunable registrations for the ARC (zfs_arc_* / zfs_*) and the L2ARC
  * (l2arc_*). Each ZFS_MODULE_PARAM entry names a scope, a variable name
  * prefix, the variable name, its type, an access mode and a description
  * string. The ZFS_MODULE_PARAM_CALL form instead names a setter and a
  * getter function in place of the type.
  *
  * The description strings are user-visible text; the NOTE(review)
  * comments below flag descriptions that look inconsistent with the name
  * of the tunable they describe and should be confirmed against the
  * tunable's actual semantics.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long,
 	param_get_long, ZMOD_RW, "Min arc size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long,
 	param_get_long, ZMOD_RW, "Max arc size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
 	param_get_long, ZMOD_RW, "Metadata limit for arc size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
 	param_set_arc_long, param_get_long, ZMOD_RW,
 	"Percent of arc size for arc meta limit");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
 	param_get_long, ZMOD_RW, "Min arc metadata");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
 	"Meta objects to scan for prune");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW,
 	"Limit number of restarts in arc_evict_meta");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
 	"Meta reclaim strategy");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
 	param_get_int, ZMOD_RW, "Seconds before growing arc size");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
 	"Disable arc_p adapt dampener");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
 	param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
 	"Percent of pagecache to reclaim arc to");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
 	param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
 
 /* The only tunable in this list registered with ZMOD_RD. */
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
 	"Target average block size");
 
 /*
  * NOTE(review): the variable is named "..._enabled" while the description
  * says "Disable"; the wording looks inverted -- confirm.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
 	"Disable compressed arc buffers");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
 	param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
 	param_set_arc_int, param_get_int, ZMOD_RW,
 	"Min life of prescient prefetched block in ms");
 
 /* L2ARC feed/write behaviour. */
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW,
 	"Max write bytes per interval");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW,
 	"Extra write bytes during device warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
 	"Number of max device writes to precache");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
 	"Compressed l2arc_headroom multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
 	"TRIM ahead L2ARC write size multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
 	"Seconds between L2ARC writing");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW,
 	"Min feed interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
 	"Skip caching prefetched buffers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
 	"Turbo L2ARC warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
 	"No reads during writes");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW,
 	"Percent of ARC size allowed for L2ARC-only headers");
 
 /* Persistent L2ARC (rebuild from log blocks). */
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
 	"Rebuild the L2ARC when importing a pool");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
 	"Min size in bytes to write rebuild log blocks in L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
 	"Cache only MFU data from ARC into L2ARC");
 
 /*
  * NOTE(review): the tunable is named "..._percent" but the description
  * says "in bytes" -- confirm which unit is correct.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
 	param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
 	param_get_long, ZMOD_RW, "System free memory target size in bytes");
 
 /*
  * NOTE(review): the tunable is named "..._limit" but the description says
  * "Minimum" -- confirm whether this is an upper or a lower bound.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
 	param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
 	param_set_arc_long, param_get_long, ZMOD_RW,
 	"Percent of ARC meta buffers for dnodes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
 	"When full, ARC allocation waits for eviction of this % of alloc size");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
 	"The number of headers to evict per sublist before moving to the next");
 /* END CSTYLED */
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index f4c2910ad7fe..6fd302b8df34 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -1,1322 +1,1322 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/spa_log_spacemap.h>
 #include <sys/vdev_impl.h>
 #include <sys/zap.h>
 
 /*
  * Log Space Maps
  *
  * Log space maps are an optimization in ZFS metadata allocations for pools
  * whose workloads are primarily random-writes. Random-write workloads are also
  * typically random-free, meaning that they are freeing from locations scattered
  * throughout the pool. This means that each TXG we will have to append some
  * FREE records to almost every metaslab. With log space maps, we hold their
  * changes in memory and log them altogether in one pool-wide space map on-disk
  * for persistence. As more blocks are accumulated in the log space maps and
  * more unflushed changes are accounted in memory, we flush a selected group
  * of metaslabs every TXG to relieve memory pressure and potential overheads
  * when loading the pool. Flushing a metaslab to disk relieves memory as we
  * flush any unflushed changes from memory to disk (i.e. the metaslab's space
  * map) and saves import time by making old log space maps obsolete and
  * eventually destroying them. [A log space map is said to be obsolete when all
  * its entries have made it to their corresponding metaslab space maps].
  *
  * == On disk data structures used ==
  *
  * - The pool has a new feature flag and a new entry in the MOS. The feature
  *   is activated when we create the first log space map and remains active
  *   for the lifetime of the pool. The new entry in the MOS Directory [refer
  *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
  *   pairs are of the form <key: txg, value: log space map object for that txg>.
  *   This entry is our on-disk reference of the log space maps that exist in
  *   the pool for each TXG and it is used during import to load all the
  *   metaslab unflushed changes in memory. To see how this structure is first
  *   created and later populated refer to spa_generate_syncing_log_sm(). To see
  *   how it is used during import time refer to spa_ld_log_sm_metadata().
  *
  * - Each vdev has a new entry in its vdev_top_zap (see field
  *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
  *   each metaslab in this vdev. This field is the on-disk counterpart of the
  *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
  *   the metaslab hasn't had its changes flushed. During import, we use this
  *   to ignore any entries in the space map log that are for this metaslab but
  *   from a TXG before msp_unflushed_txg. At that point, we also populate its
  *   in-memory counterpart and from there both fields are updated every time
  *   we flush that metaslab.
  *
  * - A space map is created every TXG and, during that TXG, it is used to log
  *   all incoming changes (the log space map). When created, the log space map
  *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
  *   to the space map ZAP mentioned above. The log space map is closed at the
  *   end of the TXG and will be destroyed when it becomes fully obsolete. We
  *   know when a log space map has become obsolete by looking at the oldest
  *   (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
  *   than the log space map's TXG, then it means that there is no metaslab who
  *   doesn't have the changes from that log and we can therefore destroy it.
  *   [see spa_cleanup_old_sm_logs()].
  *
  * == Important in-memory structures ==
  *
  * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
  *   the pool by their ms_unflushed_txg field. It is primarily used for three
  *   reasons. First of all, it is used during flushing where we try to flush
  *   metaslabs in-order from the oldest-flushed to the most recently flushed
  *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
  *   oldest flushed metaslab to distinguish which log space maps have become
  *   obsolete and which ones are still relevant. Finally it tells us which
  *   metaslabs have unflushed changes in a pool where this feature was just
  *   enabled, as we don't immediately add all of the pool's metaslabs but we
  *   add them over time as they go through metaslab_sync(). The reason that
  *   we do that is to ease these pools into the behavior of the flushing
  *   algorithm (described later on).
  *
  * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
  *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
  *   nodes represent the log space maps in the pool. This in-memory
  *   representation of log space maps in the pool sorts the log space maps by
  *   the TXG that they were created (which is also the TXG of their unflushed
  *   changes). It also contains the following extra information for each
  *   space map:
  *   [1] The number of metaslabs that were last flushed on that TXG. This is
  *       important because if that counter is zero and this is the oldest
  *       log then it means that it is also obsolete.
  *   [2] The number of blocks of that space map. This field is used by the
  *       block heuristic of our flushing algorithm (described later on).
  *       It represents how many blocks of metadata changes ZFS had to write
  *       to disk for that TXG.
  *
  * - The per-spa field spa_log_summary is a list of entries that summarizes
  *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
  *   AVL tree mentioned above. The reason this exists is that our flushing
  *   algorithm (described later) tries to estimate how many metaslabs to flush
  *   in each TXG by iterating over all the log space maps and looking at their
  *   block counts. Summarizing that information means that we don't have to
  *   iterate through each space map, minimizing the runtime overhead of the
  *   flushing algorithm which would be induced in syncing context. In terms of
  *   implementation the log summary is used as a queue:
  *   * we modify or pop entries from its head when we flush metaslabs
  *   * we modify or append entries to its tail when we sync changes.
  *
  * - Each metaslab has two new range trees that hold its unflushed changes,
  *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
  *
  * == Flushing algorithm ==
  *
  * The decision of how many metaslabs to flush on a given TXG is guided by
  * two heuristics:
  *
  * [1] The memory heuristic -
  * We keep track of the memory used by the unflushed trees from all the
  * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
  * stays below a certain threshold which is determined by an arbitrary hard
  * limit and an arbitrary percentage of the system's memory [see
  * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
  * unflushed changes are passing that threshold, we flush metaslabs, which
  * empties their unflushed range trees, reducing the memory used.
  *
  * [2] The block heuristic -
  * We try to keep the total number of blocks in the log space maps in check
  * so the log doesn't grow indefinitely and we don't induce a lot of overhead
  * when loading the pool. At the same time we don't want to flush a lot of
  * metaslabs too often as this would defeat the purpose of the log space map.
  * As a result we set a limit in the amount of blocks that we think it's
  * acceptable for the log space maps to have and try not to cross it.
  * [see sus_blocklimit from spa_unflushed_stats].
  *
  * In order to stay below the block limit every TXG we have to estimate how
  * many metaslabs we need to flush based on the current rate of incoming blocks
  * and our history of log space map blocks. The main idea here is to answer
  * the question of how many metaslabs do we need to flush in order to get rid
  * of at least an X amount of log space map blocks. We can answer this question
  * by iterating backwards from the oldest log space map to the newest one
  * and looking at their metaslab and block counts. At this point the log summary
  * mentioned above comes handy as it reduces the amount of things that we have
  * to iterate (even though it may reduce the preciseness of our estimates due
  * to its aggregation of data). So with that in mind, we project the incoming
  * rate of the current TXG into the future and attempt to approximate how many
  * metaslabs would we need to flush from now in order to avoid exceeding our
  * block limit in different points in the future (granted that we would keep
  * flushing the same number of metaslabs for every TXG). Then we take the
  * maximum number from all these estimates to be on the safe side. For the
  * exact implementation details of the algorithm refer to
  * spa_estimate_metaslabs_to_flush.
  */
 
 /*
  * This is used as the block size for the space maps used for the
  * log space map feature. These space maps benefit from a bigger
  * block size as we expect to be writing a lot of data to them at
  * once.
  *
  * The default is 1 << 17 (131072).
  */
 unsigned long zfs_log_sm_blksz = 1ULL << 17;
 
 /*
  * Percentage of the overall system's memory that ZFS allows to be
  * used for unflushed changes (e.g. the sum of size of all the nodes
  * in the unflushed trees).
  *
  * Note that this value is calculated over 1000000 for finer granularity
  * (thus the _ppm suffix; reads as "parts per million"). As an example,
  * the default of 1000 allows 0.1% of memory to be used.
  *
  * The resulting threshold is computed from physmem * PAGESIZE
  * [see spa_log_exceeds_memlimit()].
  */
 unsigned long zfs_unflushed_max_mem_ppm = 1000;
 
 /*
  * Specific hard-limit in memory that ZFS allows to be used for
  * unflushed changes. The default of 1 << 30 bytes is 1 GiB. This limit
  * applies independently of the ppm-based one; exceeding either of them
  * counts as exceeding the memory limit [see spa_log_exceeds_memlimit()].
  */
 unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
 
 /*
  * The following tunable determines the number of blocks that can be used for
  * the log space maps. It is expressed as a percentage of the total number of
  * metaslabs in the pool (i.e. the default of 400 means that the number of log
  * blocks is capped at 4 times the number of metaslabs).
  *
  * This value exists to tune our flushing algorithm, with higher values
  * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
  * flushing metaslabs more aggressively with the upside of saving overheads
  * when loading the pool. Another factor in this tradeoff is that flushing
  * less often can potentially lead to better utilization of the metaslab space
  * map's block size as we accumulate more changes per flush.
  *
  * This tunable indirectly controls the flush rate (metaslabs flushed per
  * txg), which is why expressing it as a percentage of the number of
  * metaslabs in the pool makes sense here.
  *
  * As a rule of thumb we default this tunable to 400% based on the following:
  *
  * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
  *    it is reasonable to expect that the amount of obsolete entries changes
  *    linearly from txg to txg (e.g. the oldest log should have the most
  *    obsolete entries, and the most recent one the least). With this we could
  *    say that, at any given time, about half of the entries in the whole space
  *    map log are obsolete. Thus for every two entries for a metaslab in the
  *    log space map, only one of them is valid and actually makes it to the
  *    metaslab's space map.
  *    [factor of 2]
  * 2] Each entry in the log space map is guaranteed to be two words while
  *    entries in metaslab space maps are generally single-word.
  *    [an extra factor of 2 - 400% overall]
  * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
  *    account any consolidation of segments from the log space map to the
  *    unflushed range trees nor their history (e.g. a segment being allocated,
  *    then freed, then allocated again means 3 log space map entries but 0
  *    metaslab space map entries). Depending on the workload, we've seen ~1.8
  *    non-obsolete log space map entries per metaslab entry, for a total of
  *    ~600%. Since most of these estimates though are workload dependent, we
  *    default on 400% to be conservative.
  *
  *    Thus we could say that even in the worst
  *    case of [1] and [2], the factor should end up being 4.
  *
  * That said, regardless of the number of metaslabs in the pool we need to
  * provide upper and lower bounds for the log block limit.
  * [see zfs_unflushed_log_block_{min,max} and spa_log_sm_set_blocklimit()]
  */
 unsigned long zfs_unflushed_log_block_pct = 400;
 
 /*
  * If the number of metaslabs is small and our incoming rate is high, we could
  * get into a situation that we are flushing all our metaslabs every TXG. Thus
  * we always allow at least this many log blocks.
  *
  * This is the lower bound applied to the percentage-based limit in
  * spa_log_sm_set_blocklimit(); the upper bound is applied afterwards.
  */
 unsigned long zfs_unflushed_log_block_min = 1000;
 
 /*
  * If the log becomes too big, the import time of the pool can take a hit in
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  *
  * The default is 1 << 18 (262144) blocks. This is the upper bound applied
  * last in spa_log_sm_set_blocklimit().
  */
 unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
  * stability of the flushing algorithm (longer summary) vs its runtime overhead
  * (smaller summary is faster to traverse).
  *
  * The block limit is divided by this value to decide how many blocks a
  * summary row may hold before a new row is started
  * [see summary_entry_is_full()].
  */
 unsigned long zfs_max_logsm_summary_length = 10;
 
 /*
  * Tunable that sets the lower bound on the metaslabs to flush every TXG.
  *
  * Setting this to 0 has no effect since if the pool is idle we won't even be
  * creating log space maps and therefore we won't be flushing. On the other
  * hand if the pool has any incoming workload our block heuristic will start
  * flushing metaslabs anyway.
  *
  * The point of this tunable is to be used in extreme cases where we really
  * want to flush more metaslabs than our adaptable heuristic plans to flush.
  *
  * The value is capped by the number of metaslabs currently tracked in
  * spa_metaslabs_by_flushed [see spa_estimate_metaslabs_to_flush()].
  */
 unsigned long zfs_min_metaslabs_to_flush = 1;
 
 /*
  * Tunable that specifies how far in the past we want to look when trying to
  * estimate the incoming log blocks for the current TXG. It is counted in
  * log space maps walked from the most recent one backwards, not counting
  * the log of the currently syncing TXG.
  *
  * Setting this too high may not only increase runtime but also minimize the
  * effect of the incoming rates from the most recent TXGs as we take the
  * average over all the blocks that we walk
  * [see spa_estimate_incoming_log_blocks].
  */
 unsigned long zfs_max_log_walking = 5;
 
 /*
  * This tunable exists solely for testing purposes. It ensures that the log
  * spacemaps are not flushed and destroyed during export in order for the
  * relevant log spacemap import code paths to be tested (effectively simulating
  * a crash).
  *
  * Disabled (0) by default.
  */
 int zfs_keep_log_spacemaps_at_export = 0;
 
 /*
  * Estimate the number of log blocks coming in per TXG by averaging (rounded
  * up) the block counts of the most recent log space maps. At most
  * zfs_max_log_walking logs are sampled, walking from the newest log towards
  * the oldest. Returns 0 when no log could be sampled.
  */
 static uint64_t
 spa_estimate_incoming_log_blocks(spa_t *spa)
 {
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	uint64_t total_blocks = 0;
 	uint64_t walked = 0;
 	spa_log_sm_t *log = avl_last(&spa->spa_sm_logs_by_txg);
 
 	while (log != NULL && walked < zfs_max_log_walking) {
 		/*
 		 * The log created in this TXG is left out of the sample as
 		 * counting it would make our estimations inaccurate.
 		 */
 		if (log->sls_txg != spa_syncing_txg(spa)) {
 			total_blocks += log->sls_nblocks;
 			walked++;
 		}
 		log = AVL_PREV(&spa->spa_sm_logs_by_txg, log);
 	}
 
 	if (walked == 0)
 		return (0);
 
 	return (DIV_ROUND_UP(total_blocks, walked));
 }
 
 /* Return the pool's current limit on the number of log space map blocks. */
 uint64_t
 spa_log_sm_blocklimit(spa_t *spa)
 {
 	uint64_t limit = spa->spa_unflushed_stats.sus_blocklimit;
 
 	return (limit);
 }
 
 /*
  * Recompute the pool's log block limit: zfs_unflushed_log_block_pct percent
  * of the total number of metaslabs, raised to at least
  * zfs_unflushed_log_block_min and then capped at
  * zfs_unflushed_log_block_max. While the log space map feature is not
  * active nothing is computed and the limit is expected to be zero.
  */
 void
 spa_log_sm_set_blocklimit(spa_t *spa)
 {
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 		uint64_t limit = (spa_total_metaslabs(spa) *
 		    zfs_unflushed_log_block_pct) / 100;
 
 		/* Apply the lower bound first, then the upper bound. */
 		limit = MAX(limit, zfs_unflushed_log_block_min);
 		limit = MIN(limit, zfs_unflushed_log_block_max);
 
 		spa->spa_unflushed_stats.sus_blocklimit = limit;
 	} else {
 		ASSERT0(spa_log_sm_blocklimit(spa));
 	}
 }
 
 /* Return the pool-wide count of log space map blocks. */
 uint64_t
 spa_log_sm_nblocks(spa_t *spa)
 {
 	uint64_t nblocks = spa->spa_unflushed_stats.sus_nblocks;
 
 	return (nblocks);
 }
 
 /*
  * Debug-only consistency check, performed only when ZFS_DEBUG_LOG_SPACEMAP
  * is set in zfs_flags: the metaslab and block totals must agree between the
  * log summary, the per-TXG log space map tree, the tree of flushed
  * metaslabs and the pool-wide block counter.
  */
 static void
 spa_log_summary_verify_counts(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	if (!(zfs_flags & ZFS_DEBUG_LOG_SPACEMAP))
 		return;
 
 	/* Totals as accounted by the summary list. */
 	uint64_t ms_in_summary = 0;
 	uint64_t blk_in_summary = 0;
 	log_summary_entry_t *row = list_head(&spa->spa_log_summary);
 	while (row != NULL) {
 		ms_in_summary += row->lse_mscount;
 		blk_in_summary += row->lse_blkcount;
 		row = list_next(&spa->spa_log_summary, row);
 	}
 
 	/* Totals as accounted by the individual log space maps. */
 	uint64_t ms_in_logs = 0;
 	uint64_t blk_in_logs = 0;
 	spa_log_sm_t *log = avl_first(&spa->spa_sm_logs_by_txg);
 	while (log != NULL) {
 		ms_in_logs += log->sls_mscount;
 		blk_in_logs += log->sls_nblocks;
 		log = AVL_NEXT(&spa->spa_sm_logs_by_txg, log);
 	}
 
 	/* Number of metaslabs tracked by their unflushed TXG. */
 	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
 
 	VERIFY3U(ms_in_logs, ==, ms_in_summary);
 	VERIFY3U(ms_in_logs, ==, ms_in_avl);
 	VERIFY3U(blk_in_logs, ==, blk_in_summary);
 	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
 }
 
 /*
  * Report whether a summary entry has reached its per-row block capacity.
  *
  * The capacity is the pool's block limit divided (rounding up) by the
  * maximum summary length, and is never less than one block. Returns B_TRUE
  * when the entry's block count has reached that capacity.
  */
 static boolean_t
 summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
 {
 	/*
 	 * zfs_max_logsm_summary_length is a tunable; treat a value of 0 as 1
 	 * so that we never divide by zero below.
 	 */
 	uint64_t max_rows = MAX(1, zfs_max_logsm_summary_length);
 	uint64_t blocks_per_row = MAX(1,
 	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa), max_rows));
 	return (blocks_per_row <= e->lse_blkcount);
 }
 
 /*
  * Account in the log summary for a metaslab that was flushed or destroyed
  * (e.g. due to device removal or pool export/destroy).
  *
  * Usually the oldest flushed metaslab is the one being flushed, so the first
  * (and oldest) summary entry is the one updated. That is not guaranteed
  * though: if that metaslab is loading we may flush the second oldest one,
  * which can belong to a later entry, and when called from metaslab_fini()
  * the metaslabs probably won't be ordered by ms_unflushed_txg. The caller
  * therefore passes the txg, which is used to locate the summary entry that
  * accounts for the metaslab.
  */
 void
 spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
 {
 	/*
 	 * Summary data is not tracked for read-only pools, yet this function
 	 * can still be reached through metaslab_fini(). Nothing to do then.
 	 */
 	if (!spa_writeable(spa))
 		return;
 
 	/* Pick the last entry whose starting TXG is not past txg. */
 	log_summary_entry_t *match = NULL;
 	log_summary_entry_t *cur = list_head(&spa->spa_log_summary);
 	while (cur != NULL && cur->lse_start <= txg) {
 		match = cur;
 		cur = list_next(&spa->spa_log_summary, cur);
 	}
 
 	if (match != NULL && match->lse_mscount != 0) {
 		match->lse_mscount--;
 		return;
 	}
 
 	/*
 	 * No summary entry accounts for this metaslab. We must be at the
 	 * teardown of a spa_load() attempt that got an error while reading
 	 * the log space maps.
 	 */
 	VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 }
 
 /*
  * Update the log summary information to reflect the fact that we destroyed
  * old log space maps. Since we can only destroy the oldest log space maps,
  * we decrement the block count of the oldest summary entry and potentially
  * destroy it when that count hits 0.
  *
  * This function is called after a metaslab is flushed and typically that
  * metaslab is the oldest flushed, which means that this function will
  * typically decrement the block count of the first entry of the summary and
  * potentially free it if the block count gets to zero (its metaslab count
  * should be zero too at that point).
  *
  * There are certain scenarios though that don't work exactly like that so we
  * need to account for them:
  *
  * Scenario [1]: It is possible that after we flushed the oldest flushed
  * metaslab and we destroyed the oldest log space map, more recent logs had 0
  * metaslabs pointing to them so we got rid of them too. This can happen due
  * to metaslabs being destroyed through device removal, or because the oldest
  * flushed metaslab was loading but we kept flushing more recently flushed
  * metaslabs due to the memory pressure of unflushed changes. Because of that,
  * we always iterate from the beginning of the summary and if blocks_gone is
  * bigger than the block_count of the current entry we free that entry (we
  * expect its metaslab count to be zero), we decrement blocks_gone and on to
  * the next entry repeating this procedure until blocks_gone gets decremented
  * to 0. Doing this also works for the typical case mentioned above.
  *
  * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
  * the first (and oldest) entry in the summary. If the first few entries of
  * the summary were only accounting metaslabs from a device that was just
  * removed, then the current oldest flushed metaslab could be accounted by an
  * entry somewhere in the middle of the summary. Moreover flushing that
  * metaslab will destroy all the log space maps older than its ms_unflushed_txg
  * because they became obsolete after the removal. Thus, iterating as we did
  * for scenario [1] works out for this case too.
  *
  * Scenario [3]: At times we decide to flush all the metaslabs in the pool
  * in one TXG (either because we are exporting the pool or because our flushing
  * heuristics decided to do so). When that happens all the log space maps get
  * destroyed except the one created for the current TXG which doesn't have
  * any log blocks yet. As log space maps get destroyed with every metaslab that
  * we flush, entries in the summary are also destroyed. This brings a weird
  * corner-case when we flush the last metaslab and the log space map of the
  * current TXG is in the same summary entry with other log space maps that
  * are older. When that happens we are eventually left with this one last
  * summary entry whose blocks are gone (blocks_gone equals the entry's block
  * count) but its metaslab count is non-zero (because it accounts all the
  * metaslabs in the pool as they all got flushed). Under this scenario we can't
  * free this last summary entry as it's referencing all the metaslabs in the
  * pool and its block count will get incremented at the end of this sync (when
  * we close the syncing log space map). Thus we just decrement its current
  * block count and leave it alone. In the case that the pool gets exported,
  * its metaslab count will be decremented over time as we call metaslab_fini()
  * for all the metaslabs in the pool and the entry will be freed at
  * spa_unload_log_sm_metadata().
  */
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
 	/*
 	 * Each iteration re-reads the head of the list rather than advancing
 	 * with list_next(): the only branch that keeps looping is the one
 	 * that removes and frees the current head, the other two break out.
 	 */
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e != NULL; e = list_head(&spa->spa_log_summary)) {
 		if (e->lse_blkcount > blocks_gone) {
 			/*
 			 * Assert that we stopped at an entry that is not
 			 * obsolete.
 			 */
 			ASSERT(e->lse_mscount != 0);
 
 			/* This entry absorbs all the remaining blocks. */
 			e->lse_blkcount -= blocks_gone;
 			blocks_gone = 0;
 			break;
 		} else if (e->lse_mscount == 0) {
 			/* remove obsolete entry */
 			blocks_gone -= e->lse_blkcount;
 			list_remove(&spa->spa_log_summary, e);
 			kmem_free(e, sizeof (log_summary_entry_t));
 		} else {
 			/* Verify that this is scenario [3] mentioned above. */
 			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
 
 			/*
 			 * Assert that this is scenario [3] further by ensuring
 			 * that this is the only entry in the summary.
 			 */
 			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
 			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
 
 			/* Keep the entry but zero out its block count. */
 			blocks_gone = e->lse_blkcount = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Ensure that there is no way we are trying to remove more blocks
 	 * than the # of blocks in the summary.
 	 */
 	ASSERT0(blocks_gone);
 }
 
 /*
  * Decrement the metaslab count of the log space map that was created in the
  * given txg. If no such log exists, verify that we are tearing down a
  * failed load and do nothing.
  */
 void
 spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
 {
 	spa_log_sm_t key = { .sls_txg = txg };
 	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, &key, NULL);
 
 	if (sls != NULL) {
 		ASSERT(sls->sls_mscount > 0);
 		sls->sls_mscount--;
 		return;
 	}
 
 	/*
 	 * No log for that txg: we must be at the teardown of a spa_load()
 	 * attempt that got an error while reading the log space maps.
 	 */
 	VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 }
 
 /*
  * Increment the metaslab count of the most recent log space map, which is
  * asserted to be the one belonging to the currently syncing TXG.
  *
  * NOTE(review): avl_last() is dereferenced without a NULL check, so this
  * presumably relies on the syncing log having been added to the tree
  * already -- confirm against the callers.
  */
 void
 spa_log_sm_increment_current_mscount(spa_t *spa)
 {
 	spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
 	last_sls->sls_mscount++;
 }
 
 /*
  * Add metaslab and block counts for the given txg to the tail of the log
  * summary. A new tail entry starting at txg is appended first when the
  * summary is empty or its current tail entry is already full.
  */
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
     uint64_t nblocks)
 {
 	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 	boolean_t tail_has_room = B_FALSE;
 
 	if (e != NULL && !summary_entry_is_full(spa, e))
 		tail_has_room = B_TRUE;
 
 	if (!tail_has_room) {
 		/* Start a fresh, zeroed row for this txg. */
 		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
 		e->lse_start = txg;
 		list_insert_tail(&spa->spa_log_summary, e);
 	}
 
 	ASSERT3U(e->lse_start, <=, txg);
 	e->lse_blkcount += nblocks;
 	e->lse_mscount += metaslabs_flushed;
 }
 
 /* Account nblocks new log blocks (and no metaslabs) to the syncing TXG. */
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 
 	summary_add_data(spa, syncing_txg, 0, nblocks);
 }
 
 /* Account one flushed metaslab (and no blocks) to the syncing TXG. */
 void
 spa_log_summary_add_flushed_metaslab(spa_t *spa)
 {
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 
 	summary_add_data(spa, syncing_txg, 1, 0);
 }
 
 /*
  * This function attempts to estimate how many metaslabs should
  * we flush to satisfy our block heuristic for the log spacemap
  * for the upcoming TXGs.
  *
  * Specifically, it first tries to estimate the number of incoming
  * blocks in this TXG. Then by projecting that incoming rate to
  * future TXGs and using the log summary, it figures out how many
  * flushes we would need to do for future TXGs individually to
  * stay below our block limit and returns the maximum number of
  * flushes from those estimates.
  *
  * The estimated incoming rate can be zero (e.g. when there are no
  * older logs to sample or zfs_max_log_walking is set to 0). In that
  * case the log is not projected to grow, so once we are within the
  * block limit the projection stops instead of dividing by a zero
  * rate.
  *
  * The returned value is at least zfs_min_metaslabs_to_flush, capped
  * by the number of metaslabs in spa_metaslabs_by_flushed.
  */
 static uint64_t
 spa_estimate_metaslabs_to_flush(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_log_sm_blocklimit(spa) != 0);
 
 	/*
 	 * This variable contains the incoming rate that will be projected
 	 * and used for our flushing estimates in the future.
 	 */
 	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
 
 	/*
 	 * At any point in time this variable tells us how many
 	 * TXGs in the future we are so we can make our estimations.
 	 */
 	uint64_t txgs_in_future = 1;
 
 	/*
 	 * This variable tells us how much room do we have until we hit
 	 * our limit. When it goes negative, it means that we've exceeded
 	 * our limit and we need to flush.
 	 *
 	 * Note that since we start at the first TXG in the future (i.e.
 	 * txgs_in_future starts from 1) we already decrement this
 	 * variable by the incoming rate.
 	 */
 	int64_t available_blocks =
 	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
 	/*
 	 * This variable tells us the total number of flushes needed to
 	 * keep the log size within the limit when we reach txgs_in_future.
 	 */
 	uint64_t total_flushes = 0;
 
 	/* Holds the current maximum of our estimates so far. */
 	uint64_t max_flushes_pertxg =
 	    MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
 	    zfs_min_metaslabs_to_flush);
 
 	/*
 	 * For our estimations we only look as far in the future
 	 * as the summary allows us.
 	 */
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e)) {
 
 		/*
 		 * If there is still room before we exceed our limit
 		 * then keep skipping TXGs accumulating more blocks
 		 * based on the incoming rate until we exceed it.
 		 */
 		if (available_blocks >= 0) {
 			/*
 			 * With no incoming blocks the log is never
 			 * projected to exceed the limit, so there is
 			 * nothing more to estimate (and no rate to
 			 * divide by).
 			 */
 			if (incoming == 0)
 				break;
 
 			uint64_t skip_txgs = (available_blocks / incoming) + 1;
 			available_blocks -= (skip_txgs * incoming);
 			txgs_in_future += skip_txgs;
 			ASSERT3S(available_blocks, >=, -incoming);
 		}
 
 		/*
 		 * At this point we're far enough into the future where
 		 * the limit was just exceeded and we flush metaslabs
 		 * based on the current entry in the summary, updating
 		 * our available_blocks.
 		 */
 		ASSERT3S(available_blocks, <, 0);
 		available_blocks += e->lse_blkcount;
 		total_flushes += e->lse_mscount;
 
 		/*
 		 * Keep the running maximum of the total_flushes that
 		 * we've done so far over the number of TXGs in the
 		 * future that we are. The idea here is to estimate
 		 * the average number of flushes that we should do
 		 * every TXG so that when we are that many TXGs in the
 		 * future we stay under the limit.
 		 */
 		max_flushes_pertxg = MAX(max_flushes_pertxg,
 		    DIV_ROUND_UP(total_flushes, txgs_in_future));
 		ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
 		    max_flushes_pertxg);
 	}
 	return (max_flushes_pertxg);
 }
 
 /* Return the memory accounted to the pool's unflushed changes. */
 uint64_t
 spa_log_sm_memused(spa_t *spa)
 {
 	uint64_t memused = spa->spa_unflushed_stats.sus_memused;
 
 	return (memused);
 }
 
 /*
  * Report whether the memory used by unflushed changes exceeds either the
  * hard limit (zfs_unflushed_max_mem_amt) or the share of system memory
  * allowed by zfs_unflushed_max_mem_ppm (parts per million of
  * physmem * PAGESIZE).
  */
 static boolean_t
 spa_log_exceeds_memlimit(spa_t *spa)
 {
 	uint64_t memused = spa_log_sm_memused(spa);
 
 	/* Hard limit first. */
 	if (memused > zfs_unflushed_max_mem_amt)
 		return (B_TRUE);
 
 	/* Then the limit relative to the system's memory. */
 	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
 	    zfs_unflushed_max_mem_ppm) / 1000000;
 
 	return ((memused > system_mem_allowed) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether a flush of all the logs has been requested, i.e. whether
  * spa_log_flushall_txg is set to a non-zero value.
  */
 boolean_t
 spa_flush_all_logs_requested(spa_t *spa)
 {
 	if (spa->spa_log_flushall_txg == 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 void
 spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	/*
 	 * If we don't have any metaslabs with unflushed changes
 	 * return immediately.
 	 */
 	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
 		return;
 
 	/*
 	 * During SPA export we leave a few empty TXGs to go by [see
 	 * spa_final_dirty_txg() to understand why]. For this specific
 	 * case, it is important to not flush any metaslabs as that
 	 * would dirty this TXG.
 	 *
 	 * That said, during one of these dirty TXGs that is less or
 	 * equal to spa_final_dirty(), spa_unload() will request that
 	 * we try to flush all the metaslabs for that TXG before
 	 * exporting the pool, thus we ensure that we didn't get a
 	 * request of flushing everything before we attempt to return
 	 * immediately.
 	 */
 	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
 	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
 	    !spa_flush_all_logs_requested(spa))
 		return;
 
 	/*
 	 * We need to generate a log space map before flushing because this
 	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
 	 * for this TXG's flushed metaslab count (aka sls_mscount which is
 	 * manipulated in many ways down the metaslab_flush() codepath).
 	 *
 	 * That is not to say that we may generate a log space map when we
 	 * don't need it. If we are flushing metaslabs, that means that we
 	 * were going to write changes to disk anyway, so even if we were
 	 * not flushing, a log space map would have been created anyway in
 	 * metaslab_sync().
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	/*
 	 * This variable tells us how many metaslabs we want to flush based
 	 * on the block-heuristic of our flushing algorithm (see block comment
 	 * of log space map feature). We also decrement this as we flush
 	 * metaslabs and attempt to destroy old log space maps.
 	 */
 	uint64_t want_to_flush;
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
 	} else {
 		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
 	}
 
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
 	    want_to_flush);
 
 	/* Used purely for verification purposes */
 	uint64_t visited = 0;
 
 	/*
 	 * Ideally we would only iterate through spa_metaslabs_by_flushed
 	 * using only one variable (curr). We can't do that because
 	 * metaslab_flush() mutates position of curr in the AVL when
 	 * it flushes that metaslab by moving it to the end of the tree.
 	 * Thus we always keep track of the original next node of the
 	 * current node (curr) in another variable (next).
 	 */
 	metaslab_t *next = NULL;
 	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
 	    curr != NULL; curr = next) {
 		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
 
 		/*
 		 * If this metaslab has been flushed this txg then we've done
 		 * a full circle over the metaslabs.
 		 */
 		if (metaslab_unflushed_txg(curr) == txg)
 			break;
 
 		/*
 		 * If we are done flushing for the block heuristic and the
 		 * unflushed changes don't exceed the memory limit just stop.
 		 */
 		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
 			break;
 
 		mutex_enter(&curr->ms_sync_lock);
 		mutex_enter(&curr->ms_lock);
 		boolean_t flushed = metaslab_flush(curr, tx);
 		mutex_exit(&curr->ms_lock);
 		mutex_exit(&curr->ms_sync_lock);
 
 		/*
 		 * If we failed to flush a metaslab (because it was loading),
 		 * then we are done with the block heuristic as it's not
 		 * possible to destroy any log space maps once you've skipped
 		 * a metaslab. In that case we just set our counter to 0 but
 		 * we continue looping in case there is still memory pressure
 		 * due to unflushed changes. Note that, flushing a metaslab
 		 * that is not the oldest flushed in the pool, will never
 		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
 		 */
 		if (!flushed) {
 			want_to_flush = 0;
 		} else if (want_to_flush > 0) {
 			want_to_flush--;
 		}
 
 		visited++;
 	}
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
 }
 
 /*
  * Close the log space map for this TXG and update the block counts
  * for the log's in-memory structure and the summary.
  */
 void
 spa_sync_close_syncing_log_sm(spa_t *spa)
 {
 	if (spa_syncing_log_sm(spa) == NULL)
 		return;
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
 
 	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
 	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 
 	/*
 	 * Note that we can't assert that sls_mscount is not 0,
 	 * because there is the case where the first metaslab
 	 * in spa_metaslabs_by_flushed is loading and we were
 	 * not able to flush any metaslabs the current TXG.
 	 */
 	ASSERT(sls->sls_nblocks != 0);
 
 	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
 	spa_log_summary_verify_counts(spa);
 
 	space_map_close(spa->spa_syncing_log_sm);
 	spa->spa_syncing_log_sm = NULL;
 
 	/*
 	 * At this point we tried to flush as many metaslabs as we
 	 * can as the pool is getting exported. Reset the "flush all"
 	 * so the last few TXGs before closing the pool can be empty
 	 * (e.g. not dirty).
 	 */
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 		spa->spa_log_flushall_txg = 0;
 	}
 }
 
 void
 spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa_meta_objset(spa);
 
 	uint64_t spacemap_zap;
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 		return;
 	}
 	VERIFY0(error);
 
 	metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
 	uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
 
 	/* Free all log space maps older than the oldest_flushed_txg. */
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls && sls->sls_txg < oldest_flushed_txg;
 	    sls = avl_first(&spa->spa_sm_logs_by_txg)) {
 		ASSERT0(sls->sls_mscount);
 		avl_remove(&spa->spa_sm_logs_by_txg, sls);
 		space_map_free_obj(mos, sls->sls_sm_obj, tx);
 		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
 		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
 }
 
 static spa_log_sm_t *
 spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
 {
 	spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
 	sls->sls_sm_obj = sm_obj;
 	sls->sls_txg = txg;
 	return (sls);
 }
 
 void
 spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (spa_syncing_log_sm(spa) != NULL)
 		return;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	uint64_t spacemap_zap;
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 		error = 0;
 		spacemap_zap = zap_create(mos,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
 		    &spacemap_zap, tx));
 		spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
 	}
 	VERIFY0(error);
 
 	uint64_t sm_obj;
 	ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
 	    ==, ENOENT);
 	sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
 	VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
 	avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
 
 	/*
 	 * We pass UINT64_MAX as the space map's representation size
 	 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
 	 * accept any sorts of segments since there's no real advantage
 	 * to being more restrictive (given that we're already going
 	 * to be using 2-word entries).
 	 */
 	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
 	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 	/*
 	 * If the log space map feature was just enabled, the blocklimit
 	 * has not yet been set.
 	 */
 	if (spa_log_sm_blocklimit(spa) == 0)
 		spa_log_sm_set_blocklimit(spa);
 }
 
 /*
  * Find all the log space maps stored in the space map ZAP and sort
  * them by their TXG in spa_sm_logs_by_txg.
  */
 static int
 spa_ld_log_sm_metadata(spa_t *spa)
 {
 	int error;
 	uint64_t spacemap_zap;
 
 	ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 	error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		/* the space map ZAP doesn't exist yet */
 		return (0);
 	} else if (error != 0) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
 		    error);
 		return (error);
 	}
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
 		spa_log_sm_t *sls =
 		    spa_log_sm_alloc(za.za_first_integer, log_txg);
 		avl_add(&spa->spa_sm_logs_by_txg, sls);
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_cursor_retrieve(spacemap_zap) [error %d]",
 		    error);
 		return (error);
 	}
 
 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	    m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
 		spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
 		spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
 		    &target, NULL);
 
 		/*
 		 * At this point if sls is zero it means that a bug occurred
 		 * in ZFS the last time the pool was open or earlier in the
 		 * import code path. In general, we would have placed a
 		 * VERIFY() here or in this case just let the kernel panic
 		 * with NULL pointer dereference when incrementing sls_mscount,
 		 * but since this is the import code path we can be a bit more
 		 * lenient. Thus, for DEBUG bits we always cause a panic, while
 		 * in production we log the error and just fail the import.
 		 */
 		ASSERT(sls != NULL);
 		if (sls == NULL) {
 			spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
 			    "encountered: could not find log spacemap for "
-			    "TXG %ld [error %d]",
-			    metaslab_unflushed_txg(m), ENOENT);
+			    "TXG %llu [error %d]",
+			    (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
 			return (ENOENT);
 		}
 		sls->sls_mscount++;
 	}
 
 	return (0);
 }
 
 typedef struct spa_ld_log_sm_arg {
 	spa_t *slls_spa;
 	uint64_t slls_txg;
 } spa_ld_log_sm_arg_t;
 
 static int
 spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
 {
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 	uint32_t vdev_id = sme->sme_vdev;
 
 	spa_ld_log_sm_arg_t *slls = arg;
 	spa_t *spa = slls->slls_spa;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/*
 	 * If the vdev has been removed (i.e. it is indirect or a hole)
 	 * skip this entry. The contents of this vdev have already moved
 	 * elsewhere.
 	 */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	ASSERT(!ms->ms_loaded);
 
 	/*
 	 * If we have already flushed entries for this TXG to this
 	 * metaslab's space map, then ignore it. Note that we flush
 	 * before processing any allocations/frees for that TXG, so
 	 * the metaslab's space map only has entries from *before*
 	 * the unflushed TXG.
 	 */
 	if (slls->slls_txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	switch (sme->sme_type) {
 	case SM_ALLOC:
 		range_tree_remove_xor_add_segment(offset, offset + size,
 		    ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
 		break;
 	case SM_FREE:
 		range_tree_remove_xor_add_segment(offset, offset + size,
 		    ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
 		break;
 	default:
 		panic("invalid maptype_t");
 		break;
 	}
 	return (0);
 }
 
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
 	int error = 0;
 
 	/*
 	 * If we are not going to do any writes there is no need
 	 * to read the log space maps.
 	 */
 	if (!spa_writeable(spa))
 		return (0);
 
 	ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
 	ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
 	hrtime_t read_logs_starttime = gethrtime();
 	/* this is a no-op when we don't have space map logs */
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		space_map_t *sm = NULL;
 		error = space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
 			    "space_map_open(obj=%llu) [error %d]",
 			    (u_longlong_t)sls->sls_sm_obj, error);
 			goto out;
 		}
 
 		struct spa_ld_log_sm_arg vla = {
 			.slls_spa = spa,
 			.slls_txg = sls->sls_txg
 		};
 		error = space_map_iterate(sm, space_map_length(sm),
 		    spa_ld_log_sm_cb, &vla);
 		if (error != 0) {
 			space_map_close(sm);
 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
 			    "at space_map_iterate(obj=%llu) [error %d]",
 			    (u_longlong_t)sls->sls_sm_obj, error);
 			goto out;
 		}
 
 		ASSERT0(sls->sls_nblocks);
 		sls->sls_nblocks = space_map_nblocks(sm);
 		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 		summary_add_data(spa, sls->sls_txg,
 		    sls->sls_mscount, sls->sls_nblocks);
 
 		space_map_close(sm);
 	}
 	hrtime_t read_logs_endtime = gethrtime();
 	spa_load_note(spa,
 	    "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
 	    "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
 	    (u_longlong_t)spa_log_sm_nblocks(spa),
 	    (u_longlong_t)zfs_log_sm_blksz,
 	    (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
 
 out:
 	/*
 	 * Now that the metaslabs contain their unflushed changes:
 	 * [1] recalculate their actual allocated space
 	 * [2] recalculate their weights
 	 * [3] sum up the memory usage of their unflushed range trees
 	 * [4] optionally load them, if debug_load is set
 	 *
 	 * Note that even in the case where we get here because of an
 	 * error (e.g. error != 0), we still want to update the fields
 	 * below in order to have a proper teardown in spa_unload().
 	 */
 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	    m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
 		mutex_enter(&m->ms_lock);
 		m->ms_allocated_space = space_map_allocated(m->ms_sm) +
 		    range_tree_space(m->ms_unflushed_allocs) -
 		    range_tree_space(m->ms_unflushed_frees);
 
 		vdev_t *vd = m->ms_group->mg_vd;
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    range_tree_space(m->ms_unflushed_allocs), 0, 0);
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    -range_tree_space(m->ms_unflushed_frees), 0, 0);
 
 		ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
 		metaslab_recalculate_weight_and_sort(m);
 
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(m);
 
 		if (metaslab_debug_load && m->ms_sm != NULL) {
 			VERIFY0(metaslab_load(m));
 			metaslab_set_selected_txg(m, 0);
 		}
 		mutex_exit(&m->ms_lock);
 	}
 
 	return (error);
 }
 
 static int
 spa_ld_unflushed_txgs(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (vd->vdev_top_zap == 0)
 		return (0);
 
 	uint64_t object = 0;
 	int error = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (uint64_t), 1, &object);
 	if (error == ENOENT)
 		return (0);
 	else if (error != 0) {
 		spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
 		    "zap_lookup(vdev_top_zap=%llu) [error %d]",
 		    (u_longlong_t)vd->vdev_top_zap, error);
 		return (error);
 	}
 
 	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *ms = vd->vdev_ms[m];
 		ASSERT(ms != NULL);
 
 		metaslab_unflushed_phys_t entry;
 		uint64_t entry_size = sizeof (entry);
 		uint64_t entry_offset = ms->ms_id * entry_size;
 
 		error = dmu_read(mos, object,
 		    entry_offset, entry_size, &entry, 0);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
 			    "failed at dmu_read(obj=%llu) [error %d]",
 			    (u_longlong_t)object, error);
 			return (error);
 		}
 
 		ms->ms_unflushed_txg = entry.msp_unflushed_txg;
 		if (ms->ms_unflushed_txg != 0) {
 			mutex_enter(&spa->spa_flushed_ms_lock);
 			avl_add(&spa->spa_metaslabs_by_flushed, ms);
 			mutex_exit(&spa->spa_flushed_ms_lock);
 		}
 	}
 	return (0);
 }
 
 /*
  * Read all the log space map entries into their respective
  * metaslab unflushed trees and keep them sorted by TXG in the
  * SPA's metadata. In addition, setup all the metadata for the
  * memory and the block heuristics.
  */
 int
 spa_ld_log_spacemaps(spa_t *spa)
 {
 	int error;
 
 	spa_log_sm_set_blocklimit(spa);
 
 	for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
 		error = spa_ld_unflushed_txgs(vd);
 		if (error != 0)
 			return (error);
 	}
 
 	error = spa_ld_log_sm_metadata(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Note: we don't actually expect anything to change at this point
 	 * but we grab the config lock so we don't fail any assertions
 	 * when using vdev_lookup_top().
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	error = spa_ld_log_sm_data(spa);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	return (error);
 }
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
     "Specific hard-limit in memory that ZFS allows to be used for "
     "unflushed changes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
     "Percentage of the overall system memory that ZFS allows to be "
     "used for unflushed changes (value is calculated over 1000000 for "
     "finer granularity)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
     "Hard limit (upper-bound) in the size of the space map log "
     "in terms of blocks.");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
     "Lower-bound limit for the maximum amount of blocks allowed in "
     "log spacemap (see zfs_unflushed_log_block_max)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
     "Tunable used to determine the number of blocks that can be used for "
     "the spacemap log, expressed as a percentage of the total number of "
     "metaslabs in the pool (e.g. 400 means the number of log blocks is "
     "capped at 4 times the number of metaslabs)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
     "The number of past TXGs that the flushing algorithm of the log "
     "spacemap feature uses to estimate incoming log blocks");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
     "Maximum number of rows allowed in the summary of the spacemap log");
 
 ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
     "Minimum number of metaslabs to flush per dirty TXG");
 
 ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
     "Prevent the log spacemaps from being flushed and destroyed "
     "during pool export/destroy");
 /* END CSTYLED */
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index ad1c4437c098..1ecd2294dba0 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1,2965 +1,2963 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/metaslab.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/fm/util.h>
 #include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
 #include <sys/btree.h>
 #include <sys/zfeature.h>
 #include <sys/qat.h>
 #include <sys/zstd/zstd.h>
 
 /*
  * SPA locking
  *
  * There are three basic locks for managing spa_t structures:
  *
  * spa_namespace_lock (global mutex)
  *
  *	This lock must be acquired to do any of the following:
  *
  *		- Lookup a spa_t by name
  *		- Add or remove a spa_t from the namespace
  *		- Increase spa_refcount from non-zero
  *		- Check if spa_refcount is zero
  *		- Rename a spa_t
  *		- add/remove/attach/detach devices
  *		- Held for the duration of create/destroy/import/export
  *
  *	It does not need to handle recursion.  A create or destroy may
  *	reference objects (files or zvols) in other pools, but by
  *	definition they must have an existing reference, and will never need
  *	to lookup a spa_t by name.
  *
  * spa_refcount (per-spa zfs_refcount_t protected by mutex)
  *
  *	This reference count keep track of any active users of the spa_t.  The
  *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  *	the refcount is never really 'zero' - opening a pool implicitly keeps
  *	some references in the DMU.  Internally we check against spa_minref, but
  *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock[] (per-spa array of rwlocks)
  *
  *	This protects the spa_t from config changes, and must be held in
  *	the following circumstances:
  *
  *		- RW_READER to perform I/O to the spa
  *		- RW_WRITER to change the vdev config
  *
  * The locking order is fairly straightforward:
  *
  *		spa_namespace_lock	->	spa_refcount
  *
  *	The namespace lock must be acquired to increase the refcount from 0
  *	or to check if it is zero.
  *
  *		spa_refcount		->	spa_config_lock[]
  *
  *	There must be at least one valid reference on the spa_t to acquire
  *	the config lock.
  *
  *		spa_namespace_lock	->	spa_config_lock[]
  *
  *	The namespace lock must always be taken before the config lock.
  *
  *
  * The spa_namespace_lock can be acquired directly and is globally visible.
  *
  * The namespace is manipulated using the following functions, all of which
  * require the spa_namespace_lock to be held.
  *
  *	spa_lookup()		Lookup a spa_t by name.
  *
  *	spa_add()		Create a new spa_t in the namespace.
  *
  *	spa_remove()		Remove a spa_t from the namespace.  This also
  *				frees up any memory associated with the spa_t.
  *
  *	spa_next()		Returns the next spa_t in the system, or the
  *				first if NULL is passed.
  *
  *	spa_evict_all()		Shutdown and remove all spa_t structures in
  *				the system.
  *
  *	spa_guid_exists()	Determine whether a pool/device guid exists.
  *
  * The spa_refcount is manipulated using the following functions:
  *
  *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
  *				called with spa_namespace_lock held if the
  *				refcount is currently zero.
  *
  *	spa_close()		Remove a reference from the spa_t.  This will
  *				not free the spa_t or remove it from the
  *				namespace.  No locking is required.
  *
  *	spa_refcount_zero()	Returns true if the refcount is currently
  *				zero.  Must be called with spa_namespace_lock
  *				held.
  *
  * The spa_config_lock[] is an array of rwlocks, ordered as follows:
  * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
  * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
  *
  * To read the configuration, it suffices to hold one of these locks as reader.
  * To modify the configuration, you must hold all locks as writer.  To modify
  * vdev state without altering the vdev tree's topology (e.g. online/offline),
  * you must hold SCL_STATE and SCL_ZIO as writer.
  *
  * We use these distinct config locks to avoid recursive lock entry.
  * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
  * block allocations (SCL_ALLOC), which may require reading space maps
  * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
  *
  * The spa config locks cannot be normal rwlocks because we need the
  * ability to hand off ownership.  For example, SCL_ZIO is acquired
  * by the issuing thread and later released by an interrupt thread.
  * They do, however, obey the usual write-wanted semantics to prevent
  * writer (i.e. system administrator) starvation.
  *
  * The lock acquisition rules are as follows:
  *
  * SCL_CONFIG
  *	Protects changes to the vdev tree topology, such as vdev
  *	add/remove/attach/detach.  Protects the dirty config list
  *	(spa_config_dirty_list) and the set of spares and l2arc devices.
  *
  * SCL_STATE
  *	Protects changes to pool state and vdev state, such as vdev
  *	online/offline/fault/degrade/clear.  Protects the dirty state list
  *	(spa_state_dirty_list) and global pool state (spa_state).
  *
  * SCL_ALLOC
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_alloc() and metaslab_claim().
  *
  * SCL_ZIO
  *	Held by bp-level zios (those which have no io_vd upon entry)
  *	to prevent changes to the vdev tree.  The bp-level zio implicitly
  *	protects all of its vdev child zios, which do not hold SCL_ZIO.
  *
  * SCL_FREE
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_free().  SCL_FREE is distinct from
  *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
  *	blocks in zio_done() while another i/o that holds either
  *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
  *
  * SCL_VDEV
  *	Held as reader to prevent changes to the vdev tree during trivial
  *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
  *	other locks, and lower than all of them, to ensure that it's safe
  *	to acquire regardless of caller context.
  *
  * In addition, the following rules apply:
  *
  * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
  *	The lock ordering is SCL_CONFIG > spa_props_lock.
  *
  * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
  *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
  *	or zio_write_phys() -- the caller must ensure that the config cannot
  *	cannot change in the interim, and that the vdev cannot be reopened.
  *	SCL_STATE as reader suffices for both.
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
  *	spa_vdev_enter()	Acquire the namespace lock and the config lock
  *				for writing.
  *
  *	spa_vdev_exit()		Release the config lock, wait for all I/O
  *				to complete, sync the updated configs to the
  *				cache, and release the namespace lock.
  *
  * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  */
 
 static avl_tree_t spa_namespace_avl;
 kmutex_t spa_namespace_lock;
 static kcondvar_t spa_namespace_cv;
 int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 static kmutex_t spa_spare_lock;
 static avl_tree_t spa_spare_avl;
 static kmutex_t spa_l2cache_lock;
 static avl_tree_t spa_l2cache_avl;
 
 kmem_cache_t *spa_buffer_pool;
 spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
 
 #ifdef ZFS_DEBUG
 /*
  * Everything except dprintf, set_error, spa, and indirect_remap is on
  * by default in debug builds.
  */
 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
     ZFS_DEBUG_INDIRECT_REMAP);
 #else
 int zfs_flags = 0;
 #endif
 
 /*
  * zfs_recover can be set to nonzero to attempt to recover from
  * otherwise-fatal errors, typically caused by on-disk corruption.  When
  * set, calls to zfs_panic_recover() will turn into warning messages.
  * This should only be used as a last resort, as it typically results
  * in leaked space, or worse.
  */
 int zfs_recover = B_FALSE;
 
 /*
  * If destroy encounters an EIO while reading metadata (e.g. indirect
  * blocks), space referenced by the missing metadata can not be freed.
  * Normally this causes the background destroy to become "stalled", as
  * it is unable to make forward progress.  While in this stalled state,
  * all remaining space to free from the error-encountering filesystem is
  * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
  * permanently leak the space from indirect blocks that can not be read,
  * and continue to free everything else that it can.
  *
  * The default, "stalling" behavior is useful if the storage partially
  * fails (i.e. some but not all i/os fail), and then later recovers.  In
  * this case, we will be able to continue pool operations while it is
  * partially failed, and when it recovers, we can continue to free the
  * space, with no leaks.  However, note that this case is actually
  * fairly rare.
  *
  * Typically pools either (a) fail completely (but perhaps temporarily,
  * e.g. a top-level vdev going offline), or (b) have localized,
  * permanent errors (e.g. disk returns the wrong data due to bit flip or
  * firmware bug).  In case (a), this setting does not matter because the
  * pool will be suspended and the sync thread will not be able to make
  * forward progress regardless.  In case (b), because the error is
  * permanent, the best we can do is leak the minimum amount of space,
  * which is what setting this flag will do.  Therefore, it is reasonable
  * for this flag to normally be set, but we chose the more conservative
  * approach of not setting it, so that there is no possibility of
  * leaking space in the "partial temporary" failure case.
  */
 int zfs_free_leak_on_eio = B_FALSE;
 
 /*
  * Expiration time in milliseconds. This value has two meanings. First it is
  * used to determine when the spa_deadman() logic should fire. By default the
  * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
  * Secondly, the value determines if an I/O is considered "hung". Any I/O that
  * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
  * in one of three behaviors controlled by zfs_deadman_failmode.
  */
 unsigned long zfs_deadman_synctime_ms = 600000UL;
 
 /*
  * This value controls the maximum amount of time zio_wait() will block for an
  * outstanding IO.  By default this is 300 seconds at which point the "hung"
  * behavior will be applied as described for zfs_deadman_synctime_ms.
  */
 unsigned long zfs_deadman_ziotime_ms = 300000UL;
 
 /*
  * Check time in milliseconds. This defines the frequency at which we check
  * for hung I/O.
  */
 unsigned long zfs_deadman_checktime_ms = 60000UL;
 
 /*
  * By default the deadman is enabled.
  */
 int zfs_deadman_enabled = 1;
 
 /*
  * Controls the behavior of the deadman when it detects a "hung" I/O.
  * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
  *
  * wait     - Wait for the "hung" I/O (default)
  * continue - Attempt to recover from a "hung" I/O
  * panic    - Panic the system
  */
 char *zfs_deadman_failmode = "wait";
 
 /*
  * The worst case is single-sector max-parity RAID-Z blocks, in which
  * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
  * times the size; so just assume that.  Add to this the fact that
  * we can have up to 3 DVAs per bp, and one more factor of 2 because
  * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
  * the worst case is:
  *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
  */
 int spa_asize_inflation = 24;
 
 /*
  * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
  * the pool to be consumed (bounded by spa_max_slop).  This ensures that we
  * don't run the pool completely out of space, due to unaccounted changes (e.g.
  * to the MOS).  It also limits the worst-case time to allocate space.  If we
  * have less than this amount of free space, most ZPL operations (e.g.  write,
  * create) will return ENOSPC.  The ZIL metaslabs (spa_embedded_log_class) are
  * also part of this 3.2% of space which can't be consumed by normal writes;
  * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded
  * log space.
  *
  * Certain operations (e.g. file removal, most administrative actions) can
  * use half the slop space.  They will only return ENOSPC if less than half
  * the slop space is free.  Typically, once the pool has less than the slop
  * space free, the user will use these operations to free up space in the pool.
  * These are the operations that call dsl_pool_adjustedsize() with the netfree
  * argument set to TRUE.
  *
  * Operations that are almost guaranteed to free up space in the absence of
  * a pool checkpoint can use up to three quarters of the slop space
  * (e.g. zfs destroy).
  *
  * A very restricted set of operations are always permitted, regardless of
  * the amount of free space.  These are the operations that call
  * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
  * increase in the amount of space used, it is possible to run the pool
  * completely out of space, causing it to be permanently read-only.
  *
  * Note that on very small pools, the slop space will be larger than
  * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
  * but we never allow it to be more than half the pool size.
  *
  * Further, on very large pools, the slop space will be smaller than
  * 3.2%, to avoid reserving much more space than we actually need; bounded
  * by spa_max_slop (128GB).
  *
  * See also the comments in zfs_space_check_t.
  */
 /* Slop space is 1/(2^spa_slop_shift) of the pool, before clamping. */
 int spa_slop_shift = 5;
 /* Lower bound on the slop space: 128MB. */
 uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 /* Upper bound on the slop space: 128GB. */
 uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
 /* Number of spa_alloc_t entries spa_add() creates for each pool. */
 int spa_allocators = 4;
 
 
-/*PRINTFLIKE2*/
 /*
  * Record a pool-load failure in the ZFS debug log.  The printf-style
  * message is formatted into a 256-byte buffer (longer text is truncated)
  * and logged together with the pool name and whether the pool's config
  * is currently trusted.
  */
 void
 spa_load_failed(spa_t *spa, const char *fmt, ...)
 {
 	const char *trust = spa->spa_trust_config ? "trusted" : "untrusted";
 	char msg[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
 	    trust, msg);
 }
 
-/*PRINTFLIKE2*/
 /*
  * Record an informational pool-load message in the ZFS debug log.  The
  * printf-style message is formatted into a 256-byte buffer (longer text
  * is truncated) and logged together with the pool name and whether the
  * pool's config is currently trusted.
  */
 void
 spa_load_note(spa_t *spa, const char *fmt, ...)
 {
 	const char *trust = spa->spa_trust_config ? "trusted" : "untrusted";
 	char msg[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
 	    trust, msg);
 }
 
 /*
  * By default dedup and user data indirects land in the special class
  */
 int zfs_ddt_data_is_special = B_TRUE;
 int zfs_user_indirect_is_special = B_TRUE;
 
 /*
  * The percentage of special class final space reserved for metadata only.
  * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
  * let metadata into the class.
  */
 int zfs_special_class_metadata_reserve_pct = 25;
 
 /*
  * ==========================================================================
  * SPA config locking
  * ==========================================================================
  */
 /*
  * Prepare every one of the pool's SCL_LOCKS config locks for use: set up
  * its mutex and condition variable and clear the writer, pending-writer
  * count and hold count.
  */
 static void
 spa_config_lock_init(spa_t *spa)
 {
 	int idx = 0;
 
 	while (idx < SCL_LOCKS) {
 		spa_config_lock_t *lock = &spa->spa_config_lock[idx];
 
 		mutex_init(&lock->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&lock->scl_cv, NULL, CV_DEFAULT, NULL);
 		lock->scl_writer = NULL;
 		lock->scl_write_wanted = 0;
 		lock->scl_count = 0;
 		idx++;
 	}
 }
 
 /*
  * Tear down every one of the pool's SCL_LOCKS config locks.  Each lock is
  * expected to be idle: no writer, no waiting writers and no holders.
  */
 static void
 spa_config_lock_destroy(spa_t *spa)
 {
 	spa_config_lock_t *lock = &spa->spa_config_lock[0];
 
 	for (int left = SCL_LOCKS; left > 0; left--, lock++) {
 		mutex_destroy(&lock->scl_lock);
 		cv_destroy(&lock->scl_cv);
 		ASSERT(lock->scl_writer == NULL);
 		ASSERT(lock->scl_write_wanted == 0);
 		ASSERT(lock->scl_count == 0);
 	}
 }
 
 /*
  * Non-blocking variant of spa_config_enter().  Attempts to take every
  * config lock selected by the 'locks' bitmask, in ascending index order.
  * If one of them is unavailable, the locks taken so far (the selected
  * bits below the failing index) are released again and 0 is returned.
  * Returns 1 once all selected locks are held.
  */
 int
 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 {
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		const int bit = 1 << i;
 		spa_config_lock_t *scl;
 		int busy;
 
 		if ((locks & bit) == 0)
 			continue;
 
 		scl = &spa->spa_config_lock[i];
 		mutex_enter(&scl->scl_lock);
 
 		if (rw == RW_READER) {
 			/* Readers back off for active or waiting writers. */
 			busy = (scl->scl_writer || scl->scl_write_wanted);
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			/* A writer needs the lock with no holders at all. */
 			busy = (scl->scl_count != 0);
 		}
 
 		if (busy) {
 			mutex_exit(&scl->scl_lock);
 			spa_config_exit(spa, locks & (bit - 1), tag);
 			return (0);
 		}
 
 		if (rw != RW_READER)
 			scl->scl_writer = curthread;
 		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 
 	return (1);
 }
 
 /*
  * Acquire every config lock selected by the 'locks' bitmask, in ascending
  * index order, sleeping on the lock's condition variable until it can be
  * taken.  With rw == RW_READER the caller waits while a writer holds or
  * wants the lock; otherwise the caller waits until the hold count drops to
  * zero and then becomes the writer.  'tag' is not used by this function.
  */
 void
 spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	int wlocks_held = 0;
 
 	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
 
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		/*
 		 * Note each lock this thread already holds as writer,
 		 * whether or not it is part of this request.
 		 */
 		if (scl->scl_writer == curthread)
 			wlocks_held |= (1 << i);
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			/* Readers yield to active and to waiting writers. */
 			while (scl->scl_writer || scl->scl_write_wanted) {
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			/*
 			 * scl_write_wanted is raised only for the duration
 			 * of the wait, so new readers hold off meanwhile.
 			 */
 			while (scl->scl_count != 0) {
 				scl->scl_write_wanted++;
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 				scl->scl_write_wanted--;
 			}
 			scl->scl_writer = curthread;
 		}
 		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 	/*
 	 * NOTE(review): compares the mask of write locks held on entry with
 	 * the requested mask; presumably a lock-ordering check -- confirm.
 	 */
 	ASSERT3U(wlocks_held, <=, locks);
 }
 
 /*
  * Drop one hold on every config lock selected by the 'locks' bitmask,
  * walking from the highest index down to 0.  When a lock's hold count
  * reaches zero its writer is cleared and all waiters are woken.
  * 'tag' is not used by this function.
  */
 void
 spa_config_exit(spa_t *spa, int locks, const void *tag)
 {
 	int i = SCL_LOCKS;
 
 	while (i-- > 0) {
 		spa_config_lock_t *lock;
 
 		if ((locks & (1 << i)) == 0)
 			continue;
 
 		lock = &spa->spa_config_lock[i];
 		mutex_enter(&lock->scl_lock);
 		ASSERT(lock->scl_count > 0);
 		lock->scl_count--;
 		if (lock->scl_count == 0) {
 			ASSERT(lock->scl_writer == NULL ||
 			    lock->scl_writer == curthread);
 			/* Harmless when there was no writer to begin with. */
 			lock->scl_writer = NULL;
 			cv_broadcast(&lock->scl_cv);
 		}
 		mutex_exit(&lock->scl_lock);
 	}
 }
 
 /*
  * Report which of the config locks selected by 'locks' are held.  For
  * RW_READER a lock counts as held when it has any holder at all; for
  * RW_WRITER only when the calling thread is its writer.  Returns the
  * bitmask of matching locks.  No lock mutex is taken while checking.
  */
 int
 spa_config_held(spa_t *spa, int locks, krw_t rw)
 {
 	int held_mask = 0;
 
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		const int bit = 1 << i;
 		spa_config_lock_t *lock = &spa->spa_config_lock[i];
 
 		if ((locks & bit) == 0)
 			continue;
 
 		if (rw == RW_READER) {
 			if (lock->scl_count != 0)
 				held_mask |= bit;
 		} else if (rw == RW_WRITER) {
 			if (lock->scl_writer == curthread)
 				held_mask |= bit;
 		}
 	}
 
 	return (held_mask);
 }
 
 /*
  * ==========================================================================
  * SPA namespace functions
  * ==========================================================================
  */
 
 /*
  * Find the spa_t registered under 'name' in the namespace AVL tree.  Only
  * the pool component is used: anything from the first '/', '@' or '#'
  * onward is ignored, so a full dataset, snapshot or bookmark name works
  * too.  The caller must hold spa_namespace_lock.  Returns NULL when no
  * pool matches.
  */
 spa_t *
 spa_lookup(const char *name)
 {
 	static spa_t search;	/* spa_t is large; don't allocate on stack */
 	avl_index_t where;
 	char *sep;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 
 	/* Cut the search key down to the pool name. */
 	sep = strpbrk(search.spa_name, "/@#");
 	if (sep != NULL)
 		*sep = '\0';
 
 	return (avl_find(&spa_namespace_avl, &search, &where));
 }
 
 /*
  * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
  * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
  * looking for potentially hung I/Os.
  *
  * 'arg' is the spa_t being watched.  Nothing is done while the pool is
  * suspended.  Otherwise the call is logged, and the function dispatches
  * itself again on system_delay_taskq to run zfs_deadman_checktime_ms
  * from now.
  */
 void
 spa_deadman(void *arg)
 {
 	spa_t *spa = arg;
 
 	/* Disable the deadman if the pool is suspended. */
 	if (spa_suspended(spa))
 		return;
 
 	/*
 	 * Both values are printed with %llu, so each one is cast to
 	 * u_longlong_t; the elapsed time is otherwise a signed expression.
 	 */
 	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 	    (u_longlong_t)((gethrtime() - spa->spa_sync_starttime) / NANOSEC),
 	    (u_longlong_t)++spa->spa_deadman_calls);
 	if (zfs_deadman_enabled)
 		vdev_deadman(spa->spa_root_vdev, FTAG);
 
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 	    MSEC_TO_TICK(zfs_deadman_checktime_ms));
 }
 
 /*
  * AVL comparator that orders spa_log_sm_t entries by their sls_txg.
  */
 static int
 spa_log_sm_sort_by_txg(const void *va, const void *vb)
 {
 	const spa_log_sm_t *lhs = va;
 	const spa_log_sm_t *rhs = vb;
 	int order = TREE_CMP(lhs->sls_txg, rhs->sls_txg);
 
 	return (order);
 }
 
 /*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
  */
 spa_t *
 spa_add(const char *name, nvlist_t *config, const char *altroot)
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Zeroed allocation: fields not set below start out as 0/NULL. */
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
 	/* Per-pool mutexes; each is destroyed again in spa_remove(). */
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Per-pool condition variables; also destroyed in spa_remove(). */
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One free bplist for each of the TXG_SIZE slots. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_create(&spa->spa_free_bplist[t]);
 
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 	spa->spa_freeze_txg = UINT64_MAX;
 	spa->spa_final_txg = UINT64_MAX;
 	spa->spa_load_max_txg = UINT64_MAX;
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 	spa->spa_trust_config = B_TRUE;
 	spa->spa_hostid = zone_get_hostid(NULL);
 
 	/* Capture the deadman tunables as they are set right now. */
 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 	spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
 	spa_set_deadman_failmode(spa, zfs_deadman_failmode);
 
 	zfs_refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 	spa_stats_init(spa);
 
 	/* Make the pool visible in the namespace tree (lock held above). */
 	avl_add(&spa_namespace_avl, spa);
 
 	/*
 	 * Set the alternate root, if there is one.
 	 */
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
 	/*
 	 * Create spa_allocators allocator entries, each with its own lock
 	 * and its own AVL tree of zio_t.
 	 */
 	spa->spa_alloc_count = spa_allocators;
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
 	avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
 	    sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node));
 	list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t),
 	    offsetof(log_summary_entry_t, lse_node));
 
 	/*
 	 * Every pool starts with the default cachefile
 	 */
 	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
 	    offsetof(spa_config_dirent_t, scd_link));
 
 	/* With an alternate root the entry carries no cachefile path. */
 	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 	list_insert_head(&spa->spa_config_list, dp);
 
 	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 	    KM_SLEEP) == 0);
 
 	/*
 	 * Keep private copies of the caller's config and, when present, of
 	 * its features-for-read list; the caller's nvlists are not retained.
 	 */
 	if (config != NULL) {
 		nvlist_t *features;
 
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) == 0) {
 			VERIFY(nvlist_dup(features, &spa->spa_label_features,
 			    0) == 0);
 		}
 
 		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 	}
 
 	/* No feature list came with the config: start with an empty one. */
 	if (spa->spa_label_features == NULL) {
 		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 	}
 
 	/*
 	 * NOTE(review): min/max trackers start at their extremes, presumably
 	 * to be narrowed as vdevs are opened -- confirm.
 	 */
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
 
 	/* Reset cached value */
 	spa->spa_dedup_dspace = ~0ULL;
 
 	/*
 	 * As a pool is being created, treat all features as disabled by
 	 * setting SPA_FEATURE_DISABLED for all entries in the feature
 	 * refcount cache.
 	 */
 	for (int i = 0; i < SPA_FEATURES; i++) {
 		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
 	}
 
 	list_create(&spa->spa_leaf_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_leaf_node));
 
 	return (spa);
 }
 
 /*
  * Removes a spa_t from the namespace, freeing up any memory used.  Requires
  * spa_namespace_lock.  This is called only after the spa_t has been closed and
  * deactivated.
  */
 void
 spa_remove(spa_t *spa)
 {
 	spa_config_dirent_t *dp;
 
 	/* The pool must be uninitialized, unreferenced and without waiters. */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED);
 	ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
 	ASSERT0(spa->spa_waiters);
 
 	nvlist_free(spa->spa_config_splitting);
 
 	/* Unpublish the pool and wake anyone sleeping on the namespace cv. */
 	avl_remove(&spa_namespace_avl, spa);
 	cv_broadcast(&spa_namespace_cv);
 
 	if (spa->spa_root)
 		spa_strfree(spa->spa_root);
 
 	/* Drain the cachefile list, freeing each entry and its path. */
 	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		list_remove(&spa->spa_config_list, dp);
 		if (dp->scd_path != NULL)
 			spa_strfree(dp->scd_path);
 		kmem_free(dp, sizeof (spa_config_dirent_t));
 	}
 
 	/* Undo the allocator array set up in spa_add(). */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		avl_destroy(&spa->spa_allocs[i].spaa_tree);
 		mutex_destroy(&spa->spa_allocs[i].spaa_lock);
 	}
 	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
 	    sizeof (spa_alloc_t));
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
 	list_destroy(&spa->spa_log_summary);
 	list_destroy(&spa->spa_config_list);
 	list_destroy(&spa->spa_leaf_list);
 
 	nvlist_free(spa->spa_label_features);
 	nvlist_free(spa->spa_load_info);
 	nvlist_free(spa->spa_feat_stats);
 	/* NOTE(review): presumably releases the stored config -- confirm. */
 	spa_config_set(spa, NULL);
 
 	zfs_refcount_destroy(&spa->spa_refcount);
 
 	spa_stats_destroy(spa);
 	spa_config_lock_destroy(spa);
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_destroy(&spa->spa_free_bplist[t]);
 
 	zio_checksum_templates_free(spa);
 
 	/* Condition variables and mutexes created in spa_add(). */
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 	cv_destroy(&spa->spa_activities_cv);
 	cv_destroy(&spa->spa_waiters_cv);
 
 	mutex_destroy(&spa->spa_flushed_ms_lock);
 	mutex_destroy(&spa->spa_async_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
 	mutex_destroy(&spa->spa_evicting_os_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
 	mutex_destroy(&spa->spa_feat_stats_lock);
 	mutex_destroy(&spa->spa_activities_lock);
 
 	/* Finally release the spa_t itself. */
 	kmem_free(spa, sizeof (spa_t));
 }
 
 /*
  * Iterate over the pool namespace: passing NULL yields the first pool,
  * passing a pool yields the one after it.  Returns NULL once there are
  * no more pools.  The caller must hold spa_namespace_lock.
  */
 spa_t *
 spa_next(spa_t *prev)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	if (prev == NULL)
 		return (avl_first(&spa_namespace_avl));
 
 	return (AVL_NEXT(&spa_namespace_avl, prev));
 }
 
 /*
  * ==========================================================================
  * SPA refcount functions
  * ==========================================================================
  */
 
 /*
  * Take a reference on the pool on behalf of 'tag'.  The caller must
  * either hold spa_namespace_lock or the pool must already have at least
  * spa_minref references.
  */
 void
 spa_open_ref(spa_t *spa, void *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_minref <= zfs_refcount_count(&spa->spa_refcount));
 
 	(void) zfs_refcount_add(&spa->spa_refcount, tag);
 }
 
 /*
  * Release the reference held on the pool by 'tag'.  The caller must
  * either hold spa_namespace_lock or the pool must have more than
  * spa_minref references.
  */
 void
 spa_close(spa_t *spa, void *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_minref < zfs_refcount_count(&spa->spa_refcount));
 
 	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Remove a reference to the given spa_t held by a dsl dir that is
  * being asynchronously released.  Async releases occur from a taskq
  * performing eviction of dsl datasets and dirs.  The namespace lock
  * isn't held and the hold by the object being evicted may contribute to
  * spa_minref (e.g. dataset or directory released during pool export),
  * so the asserts in spa_close() do not apply.
  *
  * Apart from skipping those asserts, this drops the reference for 'tag'
  * exactly as spa_close() does.
  */
 void
 spa_async_close(spa_t *spa, void *tag)
 {
 	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Report whether the pool has no references beyond its baseline.  The
  * count is compared with spa_minref, the number of references acquired
  * when opening a pool, rather than with literal zero.  The caller must
  * hold spa_namespace_lock.
  */
 boolean_t
 spa_refcount_zero(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	return (spa->spa_minref == zfs_refcount_count(&spa->spa_refcount));
 }
 
 /*
  * ==========================================================================
  * SPA spare and l2cache tracking
  * ==========================================================================
  */
 
 /*
  * Hot spares and cache devices are tracked using the same code below,
  * for 'auxiliary' devices.
  */
 
 /* One tracked auxiliary device (hot spare or cache device). */
 typedef struct spa_aux {
 	/* vdev guid of the device; the key the comparator sorts on */
 	uint64_t	aux_guid;
 	/* guid of the pool the device is active in, or 0 if none */
 	uint64_t	aux_pool;
 	/* NOTE(review): presumably the AVL tree linkage -- confirm */
 	avl_node_t	aux_avl;
 	/* reference count; the entry is freed when it reaches zero */
 	int		aux_count;
 } spa_aux_t;
 
 /*
  * Comparator for spa_aux_t entries: orders them by aux_guid.
  */
 static inline int
 spa_aux_compare(const void *a, const void *b)
 {
 	const spa_aux_t *lhs = a;
 	const spa_aux_t *rhs = b;
 
 	return (TREE_CMP(lhs->aux_guid, rhs->aux_guid));
 }
 
 /*
  * Account for one more use of vdev 'vd' in the aux tree 'avl'.  An
  * existing entry for the vdev's guid gets its count bumped; otherwise a
  * new zeroed entry with a count of 1 is inserted.
  */
 static void
 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key;
 	spa_aux_t *entry;
 	avl_index_t where;
 
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 	if (entry != NULL) {
 		entry->aux_count++;
 		return;
 	}
 
 	entry = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
 	entry->aux_guid = vd->vdev_guid;
 	entry->aux_count = 1;
 	avl_insert(avl, entry, where);
 }
 
 /*
  * Drop one use of vdev 'vd' from the aux tree 'avl'.  The entry must
  * exist.  When its count reaches zero the entry is removed and freed;
  * otherwise, if the entry was marked active in this vdev's pool, that
  * mark is cleared.
  */
 static void
 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key;
 	spa_aux_t *entry;
 	avl_index_t where;
 
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 
 	ASSERT(entry != NULL);
 
 	entry->aux_count--;
 	if (entry->aux_count == 0) {
 		avl_remove(avl, entry);
 		kmem_free(entry, sizeof (spa_aux_t));
 		return;
 	}
 
 	if (entry->aux_pool == spa_guid(vd->vdev_spa))
 		entry->aux_pool = 0ULL;
 }
 
 /*
  * Look up the aux device with the given guid in 'avl'.  Returns whether
  * an entry exists.  If 'pool' is non-NULL it receives the entry's
  * aux_pool (0 when there is no entry); if 'refcnt' is non-NULL it
  * receives the entry's aux_count (0 when there is no entry).
  */
 static boolean_t
 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
 {
 	spa_aux_t key;
 	spa_aux_t *entry;
 
 	key.aux_guid = guid;
 	entry = avl_find(avl, &key, NULL);
 
 	if (pool != NULL)
 		*pool = (entry != NULL) ? entry->aux_pool : 0ULL;
 
 	if (refcnt != NULL)
 		*refcnt = (entry != NULL) ? entry->aux_count : 0;
 
 	return (entry != NULL);
 }
 
 /*
  * Mark the aux entry for vdev 'vd' as active in the vdev's pool by
  * storing that pool's guid in aux_pool.  The entry must exist and must
  * not already be marked active.
  */
 static void
 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t key;
 	spa_aux_t *entry;
 	avl_index_t where;
 
 	key.aux_guid = vd->vdev_guid;
 	entry = avl_find(avl, &key, &where);
 
 	ASSERT(entry != NULL);
 	ASSERT(entry->aux_pool == 0ULL);
 
 	entry->aux_pool = spa_guid(vd->vdev_spa);
 }
 
 /*
  * Spares are tracked globally due to the following constraints:
  *
  *	- A spare may be part of multiple pools.
  *	- A spare may be added to a pool even if it's actively in use within
  *	  another pool.
  *	- A spare in use in any pool can only be the source of a replacement if
  *	  the target is a spare in the same pool.
  *
  * We keep track of all spares on the system through the use of a reference
  * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
  * spare, then we bump the reference count in the AVL tree.  In addition, we set
  * the 'vdev_isspare' member to indicate that the device is a spare (active or
  * inactive).  When a spare is made active (used to replace a device in the
  * pool), we also keep track of which pool it's been made a part of.
  *
  * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
  * called under the spa_namespace lock as part of vdev reconfiguration.  The
  * separate spare lock exists for the status query path, which does not need to
  * be completely consistent with respect to other vdev configuration changes.
  */
 
 /*
  * Comparator for spare entries; defers to spa_aux_compare().
  */
 static int
 spa_spare_compare(const void *a, const void *b)
 {
 	int order = spa_aux_compare(a, b);
 
 	return (order);
 }
 
 /*
  * Register 'vd' in the global spare tree and flag it as a spare.  The
  * vdev must not already be flagged.  Runs under spa_spare_lock.
  */
 void
 spa_spare_add(vdev_t *vd)
 {
 	avl_tree_t *spares = &spa_spare_avl;
 
 	mutex_enter(&spa_spare_lock);
 	ASSERT(!vd->vdev_isspare);
 	spa_aux_add(vd, spares);
 	vd->vdev_isspare = B_TRUE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Drop 'vd' from the global spare tree and clear its spare flag.  The
  * vdev must currently be flagged as a spare.  Runs under spa_spare_lock.
  */
 void
 spa_spare_remove(vdev_t *vd)
 {
 	avl_tree_t *spares = &spa_spare_avl;
 
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_remove(vd, spares);
 	vd->vdev_isspare = B_FALSE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Check under spa_spare_lock whether a spare with the given guid is
  * tracked.  'pool' and 'refcnt' are optional outputs filled in by
  * spa_aux_exists().
  */
 boolean_t
 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
 {
 	boolean_t tracked;
 
 	mutex_enter(&spa_spare_lock);
 	tracked = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
 	mutex_exit(&spa_spare_lock);
 
 	return (tracked);
 }
 
 /*
  * Mark the spare 'vd' as active in its pool.  The vdev must already be
  * flagged as a spare.  Runs under spa_spare_lock.
  */
 void
 spa_spare_activate(vdev_t *vd)
 {
 	avl_tree_t *spares = &spa_spare_avl;
 
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_activate(vd, spares);
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Level 2 ARC devices are tracked globally for the same reasons as spares.
  * Cache devices currently only support one pool per cache device, and so
  * for these devices the aux reference count is currently unused beyond 1.
  */
 
 /*
  * Comparator for cache-device entries; defers to spa_aux_compare().
  */
 static int
 spa_l2cache_compare(const void *a, const void *b)
 {
 	int order = spa_aux_compare(a, b);
 
 	return (order);
 }
 
 /*
  * Register 'vd' in the global cache-device tree and flag it as an
  * l2cache device.  The vdev must not already be flagged.  Runs under
  * spa_l2cache_lock.
  */
 void
 spa_l2cache_add(vdev_t *vd)
 {
 	avl_tree_t *caches = &spa_l2cache_avl;
 
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(!vd->vdev_isl2cache);
 	spa_aux_add(vd, caches);
 	vd->vdev_isl2cache = B_TRUE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Drop 'vd' from the global cache-device tree and clear its l2cache
  * flag.  The vdev must currently be flagged.  Runs under
  * spa_l2cache_lock.
  */
 void
 spa_l2cache_remove(vdev_t *vd)
 {
 	avl_tree_t *caches = &spa_l2cache_avl;
 
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_remove(vd, caches);
 	vd->vdev_isl2cache = B_FALSE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Check under spa_l2cache_lock whether a cache device with the given
  * guid is tracked.  'pool' is an optional output filled in by
  * spa_aux_exists(); the reference count is not reported.
  */
 boolean_t
 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
 {
 	boolean_t tracked;
 
 	mutex_enter(&spa_l2cache_lock);
 	tracked = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
 	mutex_exit(&spa_l2cache_lock);
 
 	return (tracked);
 }
 
 /*
  * Mark the cache device 'vd' as active in its pool.  The vdev must
  * already be flagged as an l2cache device.  Runs under spa_l2cache_lock.
  */
 void
 spa_l2cache_activate(vdev_t *vd)
 {
 	avl_tree_t *caches = &spa_l2cache_avl;
 
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_activate(vd, caches);
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * ==========================================================================
  * SPA vdev locking
  * ==========================================================================
  */
 
 /*
  * Begin a vdev add/remove operation on the pool.  Takes the pool's
  * spa_vdev_top_lock and then the global spa_namespace_lock, stops
  * autotrim on the pool, and finally takes the config locks for writing
  * through spa_vdev_config_enter().  Returns the next transaction group
  * for the pool.
  */
 uint64_t
 spa_vdev_enter(spa_t *spa)
 {
 	uint64_t txg;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	vdev_autotrim_stop_all(spa);
 
 	txg = spa_vdev_config_enter(spa);
 	return (txg);
 }
 
 /*
  * Variant of spa_vdev_enter() for detaching a vdev.  It additionally
  * takes the guid of the vdev being detached: when the guid is non-zero
  * and resolves to a vdev, any rebuild on that vdev's top-level vdev is
  * stopped and waited for before the config locks are taken.  The
  * rebuild is resumed by spa_vdev_exit(), and canceled if only a single
  * child remains after the detach.
  */
 uint64_t
 spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
 {
 	vdev_t *detaching = NULL;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	vdev_autotrim_stop_all(spa);
 
 	if (guid != 0)
 		detaching = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (detaching != NULL)
 		vdev_rebuild_stop_wait(detaching->vdev_top);
 
 	return (spa_vdev_config_enter(spa));
 }
 
 /*
  * Core of spa_vdev_enter(): takes all config locks (SCL_ALL) as writer
  * and returns the txg following the last synced one.  It is separate so
  * that a vdev operation needing several syncs (e.g. removing a device)
  * can repeat it while keeping spa_namespace_lock held, which the caller
  * must already hold.
  */
 uint64_t
 spa_vdev_config_enter(spa_t *spa)
 {
 	uint64_t next_txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	next_txg = spa_last_synced_txg(spa) + 1;
 	return (next_txg);
 }
 
 /*
  * Used in combination with spa_vdev_config_enter() to allow the syncing
  * of multiple transactions without releasing the spa_namespace_lock.
  *
  * spa   - pool whose SCL_ALL config locks are released here
  * vd    - optional vdev to quiesce and free once the txg has synced
  * txg   - txg returned by spa_vdev_config_enter(); must not be synced yet
  * error - result of the operation; the config generation is bumped and
  *         the txg is waited for only when this is 0
  * tag   - passed to zio_handle_panic_injection() when injection is on
  */
 void
 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	int config_changed = B_FALSE;
 
 	ASSERT(txg > spa_last_synced_txg(spa));
 
 	spa->spa_pending_vdev = NULL;
 
 	/*
 	 * Reassess the DTLs.
 	 */
 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
 
 	/* A successful operation that dirtied the config is a new generation. */
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/*
 	 * Verify the metaslab classes.
 	 */
 	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
 
 	spa_config_exit(spa, SCL_ALL, spa);
 
 	/*
 	 * Panic the system if the specified tag requires it.  This
 	 * is useful for ensuring that configurations are updated
 	 * transactionally.
 	 */
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, tag, 0);
 
 	/*
 	 * Note: this txg_wait_synced() is important because it ensures
 	 * that there won't be more than one config change per txg.
 	 * This allows us to use the txg as the generation number.
 	 */
 	if (error == 0)
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
 		/* Cancel any initialize or trim running on a leaf vdev. */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			mutex_enter(&vd->vdev_initialize_lock);
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
 			    NULL);
 			mutex_exit(&vd->vdev_initialize_lock);
 
 			mutex_enter(&vd->vdev_trim_lock);
 			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
 			mutex_exit(&vd->vdev_trim_lock);
 		}
 
 		/*
 		 * The vdev may be both a leaf and top-level device.
 		 */
 		vdev_autotrim_stop_wait(vd);
 
 		/* The vdev is freed with the state locks held as writer. */
 		spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_STATE_ALL, spa);
 	}
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed)
 		spa_write_cachefile(spa, B_FALSE, B_TRUE);
 }
 
 /*
  * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
  * locking of spa_vdev_enter(), we also want to make sure the transactions
  * have synced to disk, and then update the global configuration cache with
  * the new information.
  *
  * Autotrim and rebuild are restarted first.  'vd', 'txg' and 'error' are
  * handed to spa_vdev_config_exit(); 'error' is returned unchanged.
  */
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	vdev_autotrim_restart(spa);
 	vdev_rebuild_restart(spa);
 
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
 	/* Release in the reverse of spa_vdev_enter()'s acquisition order. */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * Lock the given spa_t for the purpose of changing vdev state.
  */
 void
 spa_vdev_state_enter(spa_t *spa, int oplocks)
 {
 	const int locks = SCL_STATE_ALL | oplocks;
 
 	if (!spa_is_root(spa)) {
 		spa_config_enter(spa, locks, spa, RW_WRITER);
 	} else {
 		/*
 		 * Root pools may need to read from the underlying devfs
 		 * filesystem when opening up a vdev.  Unfortunately, if we
 		 * are holding the SCL_ZIO lock this results in a deadlock
 		 * when we try to issue the read from the root filesystem.
 		 * Instead we "prefetch" the associated vnodes that we need
 		 * prior to opening the underlying devices and cache them,
 		 * so that no I/O is needed when we do the actual open.
 		 *
 		 * The request is therefore split around the (SCL_ZIO - 1)
 		 * mask: the bits inside the mask are taken first, then the
 		 * vdevs are held, then the remaining bits are taken.
 		 */
 		const int after_hold = locks & ~(SCL_ZIO - 1);
 		const int before_hold = locks & ~after_hold;
 
 		spa_config_enter(spa, before_hold, spa, RW_WRITER);
 		vdev_hold(spa->spa_root_vdev);
 		spa_config_enter(spa, after_hold, spa, RW_WRITER);
 	}
 
 	/* Remembered so spa_vdev_state_exit() releases the same set. */
 	spa->spa_vdev_locks = locks;
 }
 
 /*
  * Counterpart of spa_vdev_state_enter(): releases the config locks
  * recorded in spa_vdev_locks.  When 'vd' is non-NULL it is treated as
  * changed: its top-level vdev is marked state-dirty (unless vd is the
  * root vdev), the config generation is bumped, the change is waited on
  * to sync, and the cachefile is rewritten.  Returns 'error' unchanged.
  */
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
 	boolean_t config_changed = B_FALSE;
 	vdev_t *vdev_top;
 
 	/* With no vdev, or the root vdev, work on the whole tree. */
 	if (vd == NULL || vd == spa->spa_root_vdev) {
 		vdev_top = spa->spa_root_vdev;
 	} else {
 		vdev_top = vd->vdev_top;
 	}
 
 	if (vd != NULL || error == 0)
 		vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
 
 	if (vd != NULL) {
 		if (vd != spa->spa_root_vdev)
 			vdev_state_dirty(vdev_top);
 
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/* Drop the hold spa_vdev_state_enter() took for root pools. */
 	if (spa_is_root(spa))
 		vdev_rele(spa->spa_root_vdev);
 
 	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
 	spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
 	/*
 	 * If anything changed, wait for it to sync.  This ensures that,
 	 * from the system administrator's perspective, zpool(8) commands
 	 * are synchronous.  This is important for things like zpool offline:
 	 * when the command completes, you expect no further I/O from ZFS.
 	 */
 	if (vd != NULL)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed) {
 		/* The namespace lock is taken here just for the write. */
 		mutex_enter(&spa_namespace_lock);
 		spa_write_cachefile(spa, B_FALSE, B_TRUE);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous functions
  * ==========================================================================
  */
 
 /*
  * Add 'feature' to the pool's label feature list if it is not already
  * there, and dirty the root vdev's config so the change gets written.
  */
 void
 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
 {
 	if (nvlist_exists(spa->spa_label_features, feature))
 		return;
 
 	fnvlist_add_boolean(spa->spa_label_features, feature);
 
 	/*
 	 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
 	 * dirty the vdev config because lock SCL_CONFIG is not held.
 	 * Thankfully, in this case we don't need to dirty the config
 	 * because it will be written out anyway when we finish
 	 * creating the pool.
 	 */
 	if (tx->tx_txg == TXG_INITIAL)
 		return;
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Remove 'feature' from the pool's label feature list.  The root vdev's
  * config is dirtied only when the removal succeeds.
  */
 void
 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
 {
 	int err = nvlist_remove_all(spa->spa_label_features, feature);
 
 	if (err == 0)
 		vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Find the pool whose guid is pool_guid.  Pools that are uninitialized
  * or have no root vdev are skipped.  When device_guid is non-zero the
  * pool only matches if that device is found in its vdev tree or in the
  * vdev it is currently in the process of adding.  The caller must hold
  * spa_namespace_lock.  Returns NULL when nothing matches.
  */
 spa_t *
 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
 {
 	avl_tree_t *t = &spa_namespace_avl;
 	spa_t *spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
 		if (spa->spa_state == POOL_STATE_UNINITIALIZED ||
 		    spa->spa_root_vdev == NULL)
 			continue;
 
 		if (spa_guid(spa) != pool_guid)
 			continue;
 
 		/* No device requested: the pool guid alone decides. */
 		if (device_guid == 0)
 			return (spa);
 
 		if (vdev_lookup_by_guid(spa->spa_root_vdev,
 		    device_guid) != NULL)
 			return (spa);
 
 		/*
 		 * Check any devices we may be in the process of adding.
 		 */
 		if (spa->spa_pending_vdev != NULL &&
 		    vdev_lookup_by_guid(spa->spa_pending_vdev,
 		    device_guid) != NULL)
 			return (spa);
 	}
 
 	return (NULL);
 }
 
 /*
  * Report whether spa_by_guid() finds a pool for the given pool_guid and
  * (optional, when non-zero) device_guid.
  */
 boolean_t
 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
 {
 	spa_t *match = spa_by_guid(pool_guid, device_guid);
 
 	return (match != NULL);
 }
 
 /*
  * Return a kmem-allocated copy of the NUL-terminated string 's'.  The
  * allocation is exactly strlen(s) + 1 bytes, which is the size
  * spa_strfree() passes back to kmem_free().
  */
 char *
 spa_strdup(const char *s)
 {
 	size_t size = strlen(s) + 1;
 	char *copy = kmem_alloc(size, KM_SLEEP);
 
 	/* The terminating NUL is copied along with the text. */
 	bcopy(s, copy, size);
 
 	return (copy);
 }
 
 /*
  * Free a string by its current length: strlen(s) + 1 bytes are handed
  * back to kmem_free().
  */
 void
 spa_strfree(char *s)
 {
 	size_t size = strlen(s) + 1;
 
 	kmem_free(s, size);
 }
 
 /*
  * Produce a random, non-zero guid that is not already in use.  With a
  * pool given, the guid must not match a device in that pool; with
  * spa == NULL it must not match any existing pool guid.  Candidates are
  * drawn until one passes.
  */
 uint64_t
 spa_generate_guid(spa_t *spa)
 {
 	uint64_t guid;
 
 	for (;;) {
 		(void) random_get_pseudo_bytes((void *)&guid,
 		    sizeof (guid));
 
 		if (guid == 0)
 			continue;
 
 		if (spa != NULL) {
 			if (!spa_guid_exists(spa_guid(spa), guid))
 				break;
 		} else {
 			if (!spa_guid_exists(guid, 0))
 				break;
 		}
 	}
 
 	return (guid);
 }
 
 /*
  * Format a description of block pointer 'bp' into 'buf' (at most 'buflen'
  * bytes) by expanding the SNPRINTF_BLKPTR macro with snprintf and a space
  * character as its first two arguments.  For a non-NULL bp the type,
  * checksum and compression names are looked up first; for a NULL bp the
  * checksum and compress pointers stay NULL.
  */
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
 	/*
 	 * NOTE(review): 'type' is left uninitialized when bp is NULL;
 	 * presumably SNPRINTF_BLKPTR does not read it in that case - confirm
 	 * against the macro definition.
 	 */
 	char type[256];
 	char *checksum = NULL;
 	char *compress = NULL;
 
 	if (bp != NULL) {
 		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
 			/* New-style type: describe it by its byteswap class. */
 			dmu_object_byteswap_t bswap =
 			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			(void) snprintf(type, sizeof (type), "bswap %s %s",
 			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
 			    "metadata" : "data",
 			    dmu_ot_byteswap[bswap].ob_name);
 		} else {
 			/* Otherwise use the name from the dmu_ot[] table. */
 			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
 			    sizeof (type));
 		}
 		/* The checksum name is only looked up for non-embedded bps. */
 		if (!BP_IS_EMBEDDED(bp)) {
 			checksum =
 			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
 		}
 		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 	}
 
 	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
 	    compress);
 }
 
 /*
  * If the pool has no freeze txg yet (spa_freeze_txg == UINT64_MAX), set it
  * to the last synced txg plus TXG_SIZE while holding all config locks as
  * writer, then wait for that txg to sync.  A pool that already has a
  * freeze txg is left untouched.
  */
 void
 spa_freeze(spa_t *spa)
 {
 	uint64_t target = 0;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	if (spa->spa_freeze_txg == UINT64_MAX) {
 		target = spa_last_synced_txg(spa) + TXG_SIZE;
 		spa->spa_freeze_txg = target;
 	}
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Nothing was changed above, so there is nothing to wait for. */
 	if (target == 0)
 		return;
 
 	txg_wait_synced(spa_get_dsl(spa), target);
 }
 
 /*
  * Report a printf-style message through vcmn_err().  The level is CE_WARN
  * when zfs_recover is set and CE_PANIC otherwise.
  */
 void
 zfs_panic_recover(const char *fmt, ...)
 {
 	int level = zfs_recover ? CE_WARN : CE_PANIC;
 	va_list args;
 
 	va_start(args, fmt);
 	vcmn_err(level, fmt, args);
 	va_end(args);
 }
 
 /*
  * Minimal strtoull replacement: converts a run of lowercase hexadecimal
  * digits ([0-9a-f]) at the start of 'str' into a number.  Overflow is not
  * detected.  Conversion stops at the first character that is not such a
  * digit; if 'nptr' is non-NULL it receives a pointer to that character.
  */
 uint64_t
 zfs_strtonum(const char *str, char **nptr)
 {
 	const char *cur = str;
 	uint64_t acc = 0;
 
 	for (;; cur++) {
 		char ch = *cur;
 		int nibble;
 
 		if (ch >= '0' && ch <= '9')
 			nibble = ch - '0';
 		else if (ch >= 'a' && ch <= 'f')
 			nibble = ch - 'a' + 10;
 		else
 			break;	/* NUL or any non-hex character */
 
 		acc = (acc << 4) + nibble;
 	}
 
 	if (nptr != NULL)
 		*nptr = (char *)cur;
 
 	return (acc);
 }
 
 /*
  * Bump the SPA_FEATURE_ALLOCATION_CLASSES refcount in transaction 'tx'.
  * The refcount is bumped once for each special vdev added to the pool, so
  * the feature must already be enabled.
  */
 void
 spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
 {
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
 
 	spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
 }
 
 /*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
  */
 
 /* Return the pool's spa_async_suspended value. */
 boolean_t
 spa_shutting_down(spa_t *spa)
 {
 	boolean_t suspended = spa->spa_async_suspended;
 
 	return (suspended);
 }
 
 /* Return the pool's DSL pool pointer (spa_dsl_pool). */
 dsl_pool_t *
 spa_get_dsl(spa_t *spa)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	return (dp);
 }
 
 /* Return the pool's spa_is_initializing flag. */
 boolean_t
 spa_is_initializing(spa_t *spa)
 {
 	boolean_t initializing = spa->spa_is_initializing;
 
 	return (initializing);
 }
 
 /* Return the pool's spa_indirect_vdevs_loaded flag. */
 boolean_t
 spa_indirect_vdevs_loaded(spa_t *spa)
 {
 	boolean_t loaded = spa->spa_indirect_vdevs_loaded;
 
 	return (loaded);
 }
 
 /*
  * Return a pointer to the root block pointer stored in spa_ubsync.
  */
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
 	blkptr_t *rootbp = &spa->spa_ubsync.ub_rootbp;
 
 	return (rootbp);
 }
 
 /*
  * Copy *bp into the root block pointer of spa_uberblock.  Note that the
  * getter, spa_get_rootblkptr(), reads spa_ubsync rather than
  * spa_uberblock.
  */
 void
 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
 {
 	blkptr_t *dst = &spa->spa_uberblock.ub_rootbp;
 
 	*dst = *bp;
 }
 
 /*
  * Copy the pool's alternate root (spa_root) into 'buf', which holds
  * 'buflen' bytes.  An empty string is stored when the pool has no
  * alternate root.  The copy uses strlcpy(), so for a non-zero buflen the
  * result is always NUL-terminated; a path that does not fit is truncated.
  */
 void
 spa_altroot(spa_t *spa, char *buf, size_t buflen)
 {
 	if (spa->spa_root == NULL)
 		buf[0] = '\0';
 	else
 		(void) strlcpy(buf, spa->spa_root, buflen);
 }
 
 /* Return the pool's spa_sync_pass value. */
 int
 spa_sync_pass(spa_t *spa)
 {
 	int pass = spa->spa_sync_pass;
 
 	return (pass);
 }
 
 /*
  * Return the pool's name.  The pointer refers to storage inside the spa_t,
  * not to a copy.
  */
 char *
 spa_name(spa_t *spa)
 {
 	char *name = spa->spa_name;
 
 	return (name);
 }
 
 /*
  * Return the pool guid.
  *
  * Without a root vdev the guid stashed in spa_config_guid is returned.  In
  * syncing context the root vdev's current guid is returned.  Otherwise the
  * most recently synced guid (spa_last_synced_guid) is returned, falling
  * back to the root vdev's guid while that field is still zero.
  */
 uint64_t
 spa_guid(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	/*
 	 * If we fail to parse the config during spa_load(), we can go through
 	 * the error path (which posts an ereport) and end up here with no root
 	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
 	 * this case.
 	 */
 	if (spa->spa_root_vdev == NULL)
 		return (spa->spa_config_guid);
 
 	/* Syncing context sees the root vdev's guid directly. */
 	if (dp != NULL && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 
 	/* Everyone else sees the most recently synced out guid. */
 	if (spa->spa_last_synced_guid != 0)
 		return (spa->spa_last_synced_guid);
 
 	return (spa->spa_root_vdev->vdev_guid);
 }
 
 /*
  * Return spa_load_guid.  This GUID exists solely as a reference for the
  * purposes of the arc: it is generated at load time and is never written
  * to persistent storage.
  */
 uint64_t
 spa_load_guid(spa_t *spa)
 {
 	uint64_t load_guid = spa->spa_load_guid;
 
 	return (load_guid);
 }
 
 /* Return the txg recorded in spa_ubsync (ub_txg). */
 uint64_t
 spa_last_synced_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_ubsync.ub_txg;
 
 	return (txg);
 }
 
 /* Return the pool's spa_first_txg value. */
 uint64_t
 spa_first_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_first_txg;
 
 	return (txg);
 }
 
 /* Return the pool's spa_syncing_txg value. */
 uint64_t
 spa_syncing_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_syncing_txg;
 
 	return (txg);
 }
 
 /*
  * Return the last txg in which data may still be dirtied, computed as
  * spa_final_txg - TXG_DEFER_SIZE.  The txgs after it are used only to
  * clear out any deferred frees that remain.
  */
 uint64_t
 spa_final_dirty_txg(spa_t *spa)
 {
 	uint64_t final_txg = spa->spa_final_txg;
 
 	return (final_txg - TXG_DEFER_SIZE);
 }
 
 /* Return the pool's spa_state value. */
 pool_state_t
 spa_state(spa_t *spa)
 {
 	pool_state_t state = spa->spa_state;
 
 	return (state);
 }
 
 /* Return the pool's spa_load_state value. */
 spa_load_state_t
 spa_load_state(spa_t *spa)
 {
 	spa_load_state_t state = spa->spa_load_state;
 
 	return (state);
 }
 
 /*
  * Return the pool's spa_freeze_txg value.  spa_freeze() treats UINT64_MAX
  * in this field as "no freeze txg set".
  */
 uint64_t
 spa_freeze_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_freeze_txg;
 
 	return (txg);
 }
 
 /*
  * Return the inflated asize, in bytes, for a logical write of 'lsize'
  * bytes.  This is used by the DMU to calculate the space a logical write
  * will require on disk.
  *
  * A zero-length write needs no space.  Otherwise lsize is raised to at
  * least (1 << spa_max_ashift), the largest physical block size
  * allocatable on this pool, since a smaller write will end up using that
  * whole block anyway.  The result is then multiplied by
  * spa_asize_inflation.
  */
 uint64_t
 spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 {
 	if (lsize == 0)
 		return (0);	/* No inflation needed */
 
 	/*
 	 * Shift a 64-bit one so that the block size is computed in the same
 	 * unsigned type as lsize instead of in a signed int.
 	 */
 	uint64_t max_block = 1ULL << spa->spa_max_ashift;
 
 	return (MAX(lsize, max_block) * spa_asize_inflation);
 }
 
 /*
  * Return the amount of slop space in bytes.  It is typically 1/32 of the
  * pool (3.2%), minus the embedded log space.  On very small pools it may
  * be slightly larger than this; on very large pools it is capped at
  * spa_max_slop.  The embedded log space is not included in spa_dspace, so
  * subtracting it keeps the usable space (per "zfs list") at a constant 97%
  * of the total space regardless of metaslab size (assuming the default
  * spa_slop_shift=5 and a non-tiny pool).
  *
  * See the comment above spa_slop_shift for more details.
  */
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
 	/*
 	 * Make sure spa_dedup_dspace has been set.
 	 */
 	if (spa->spa_dedup_dspace == ~0ULL)
 		spa_update_dspace(spa);
 
 	/*
 	 * spa_get_dspace() includes the space only logically "used" by
 	 * deduplicated data.  Reserving more space because of more
 	 * deduplicated data is not useful, so that part is subtracted.
 	 */
 	uint64_t avail = spa_get_dspace(spa) - spa->spa_dedup_dspace;
 	uint64_t reserve = MIN(avail >> spa_slop_shift, spa_max_slop);
 
 	/*
 	 * Subtract the embedded log space, but no more than half the (3.2%)
 	 * unusable space.  The "no more than half" limit only matters when
 	 * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true
 	 * by default.
 	 */
 	uint64_t elog =
 	    metaslab_class_get_dspace(spa_embedded_log_class(spa));
 	reserve -= MIN(elog, reserve >> 1);
 
 	/*
 	 * Slop space should be at least spa_min_slop, but no more than half
 	 * the entire pool.
 	 */
 	uint64_t minimum = MIN(avail >> 1, spa_min_slop);
 
 	return (MAX(reserve, minimum));
 }
 
 /*
  * Return the cached spa_dspace value, which spa_update_dspace()
  * recomputes.
  */
 uint64_t
 spa_get_dspace(spa_t *spa)
 {
 	uint64_t dspace = spa->spa_dspace;
 
 	return (dspace);
 }
 
 /* Return sci_dspace from the pool's spa_checkpoint_info. */
 uint64_t
 spa_get_checkpoint_space(spa_t *spa)
 {
 	uint64_t dspace = spa->spa_checkpoint_info.sci_dspace;
 
 	return (dspace);
 }
 
 /*
  * Recompute spa_dspace as the normal class's dspace plus the dedup dspace.
  * While a device removal is in progress, the size of the removing vdev is
  * subtracted if that vdev belongs to the normal class.
  */
 void
 spa_update_dspace(spa_t *spa)
 {
 	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
 	    ddt_get_dedup_dspace(spa);
 
 	if (spa->spa_vdev_removal == NULL)
 		return;
 
 	/*
 	 * We can't allocate from the removing device, so subtract its size
 	 * if it was included in dspace (i.e. if this is a normal-class vdev,
 	 * not special/dedup).  This prevents the DMU/DSL from filling up the
 	 * (now smaller) pool while we are in the middle of removing the
 	 * device.
 	 *
 	 * Note that the DMU/DSL doesn't actually know or care how much space
 	 * is allocated (it does its own tracking of how much space has been
 	 * logically used).  So it doesn't matter that the data we are moving
 	 * may be allocated twice (on the old device and the new device).
 	 */
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	vdev_t *vd =
 	    vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
 
 	/*
 	 * If the stars align, we can wind up here after
 	 * vdev_remove_complete() has cleared vd->vdev_mg but before
 	 * spa->spa_vdev_removal gets cleared, so vdev_mg must be checked
 	 * before it is dereferenced.
 	 */
 	if (vd->vdev_mg != NULL &&
 	    vd->vdev_mg->mg_class == spa_normal_class(spa)) {
 		if (spa_deflate(spa))
 			spa->spa_dspace -= vd->vdev_stat.vs_dspace;
 		else
 			spa->spa_dspace -= vd->vdev_stat.vs_space;
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
 /*
  * Return the failure mode that has been set on this pool (spa_failmode).
  * The default behavior is to block all I/Os when a complete failure
  * occurs.
  */
 uint64_t
 spa_get_failmode(spa_t *spa)
 {
 	uint64_t failmode = spa->spa_failmode;
 
 	return (failmode);
 }
 
 /*
  * Report whether the pool is suspended, i.e. spa_suspended holds anything
  * other than ZIO_SUSPEND_NONE.
  */
 boolean_t
 spa_suspended(spa_t *spa)
 {
 	if (spa->spa_suspended == ZIO_SUSPEND_NONE)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /* Return the version recorded in spa_ubsync (ub_version). */
 uint64_t
 spa_version(spa_t *spa)
 {
 	uint64_t version = spa->spa_ubsync.ub_version;
 
 	return (version);
 }
 
 /* Return the pool's spa_deflate flag. */
 boolean_t
 spa_deflate(spa_t *spa)
 {
 	boolean_t deflate = spa->spa_deflate;
 
 	return (deflate);
 }
 
 /* Return the pool's normal metaslab class. */
 metaslab_class_t *
 spa_normal_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_normal_class;
 
 	return (mc);
 }
 
 /* Return the pool's log metaslab class. */
 metaslab_class_t *
 spa_log_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_log_class;
 
 	return (mc);
 }
 
 /* Return the pool's embedded log metaslab class. */
 metaslab_class_t *
 spa_embedded_log_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_embedded_log_class;
 
 	return (mc);
 }
 
 /* Return the pool's special metaslab class. */
 metaslab_class_t *
 spa_special_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_special_class;
 
 	return (mc);
 }
 
 /* Return the pool's dedup metaslab class. */
 metaslab_class_t *
 spa_dedup_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_dedup_class;
 
 	return (mc);
 }
 
 /*
  * Locate an appropriate allocation class for a block of 'size' bytes
  * belonging to an object of type 'objtype' at indirection 'level'.
  * 'special_smallblk' is the largest file block size that may be placed in
  * the special class.  A class "exists" here when its mc_groups is
  * non-zero.
  *
  * The rules are applied in this order:
  *  - DDT objects: dedup class if it exists, else special class if it
  *    exists and zfs_ddt_data_is_special is set, else normal.
  *  - Indirect blocks of file or zvol objects: special class if it exists
  *    and zfs_user_indirect_is_special is set, else normal.
  *  - Metadata, or any other indirect block: special class if it exists,
  *    else normal.
  *  - File blocks no larger than special_smallblk: special class while its
  *    allocated space is below the non-reserved share, else normal.
  *  - Everything else: normal.
  */
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
     uint_t level, uint_t special_smallblk)
 {
 	/*
 	 * ZIL allocations determine their class in zio_alloc_zil().
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
 	boolean_t special_ok = spa->spa_special_class->mc_groups != 0;
 
 	if (DMU_OT_IS_DDT(objtype)) {
 		if (spa->spa_dedup_class->mc_groups != 0)
 			return (spa_dedup_class(spa));
 		if (special_ok && zfs_ddt_data_is_special)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	/* Indirect blocks for user data can land in special if allowed */
 	if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
 		if (special_ok && zfs_user_indirect_is_special)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	if (DMU_OT_IS_METADATA(objtype) || level > 0) {
 		if (special_ok)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	/* Only small file blocks remain eligible for the special class. */
 	if (!DMU_OT_IS_FILE(objtype) || !special_ok ||
 	    size > special_smallblk)
 		return (spa_normal_class(spa));
 
 	/*
 	 * Allow small file blocks in special class in some cases (like
 	 * for the dRAID vdev feature). But always leave a reserve of
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	metaslab_class_t *special = spa_special_class(spa);
 	uint64_t alloc = metaslab_class_get_alloc(special);
 	uint64_t space = metaslab_class_get_space(special);
 	uint64_t limit =
 	    (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100;
 
 	if (alloc < limit)
 		return (special);
 
 	return (spa_normal_class(spa));
 }
 
 /*
  * Insert 'os' at the head of the pool's spa_evicting_os_list while holding
  * spa_evicting_os_lock.
  */
 void
 spa_evicting_os_register(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_insert_head(&spa->spa_evicting_os_list, os);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Remove 'os' from the pool's spa_evicting_os_list and broadcast on
  * spa_evicting_os_cv, all under spa_evicting_os_lock.
  */
 void
 spa_evicting_os_deregister(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_remove(&spa->spa_evicting_os_list, os);
 	/* Wake threads blocked in spa_evicting_os_wait(). */
 	cv_broadcast(&spa->spa_evicting_os_cv);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Block until spa_evicting_os_list is empty, sleeping on
  * spa_evicting_os_cv between checks, and then call
  * dmu_buf_user_evict_wait().
  */
 void
 spa_evicting_os_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	for (;;) {
 		if (list_is_empty(&spa->spa_evicting_os_list))
 			break;
 		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
 	}
 	mutex_exit(&spa->spa_evicting_os_lock);
 
 	dmu_buf_user_evict_wait();
 }
 
 /*
  * Return the maximum replication level for the pool.
  *
  * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to handle BPs
  * with more than one DVA allocated.  Such pools get the smaller of
  * SPA_DVAS_PER_BP and spa_max_replication_override; older pools get 1.
  */
 int
 spa_max_replication(spa_t *spa)
 {
 	if (spa_version(spa) >= SPA_VERSION_DITTO_BLOCKS)
 		return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 
 	return (1);
 }
 
 /* Return the pool's spa_prev_software_version value. */
 int
 spa_prev_software_version(spa_t *spa)
 {
 	int version = spa->spa_prev_software_version;
 
 	return (version);
 }
 
 /*
  * Return the pool's spa_deadman_synctime value, as stored by
  * spa_set_deadman_synctime().
  */
 uint64_t
 spa_deadman_synctime(spa_t *spa)
 {
 	uint64_t synctime = spa->spa_deadman_synctime;
 
 	return (synctime);
 }
 
 /* Return the pool's spa_autotrim setting. */
 spa_autotrim_t
 spa_get_autotrim(spa_t *spa)
 {
 	spa_autotrim_t autotrim = spa->spa_autotrim;
 
 	return (autotrim);
 }
 
 /*
  * Return the pool's spa_deadman_ziotime value, as stored by
  * spa_set_deadman_ziotime().
  */
 uint64_t
 spa_deadman_ziotime(spa_t *spa)
 {
 	uint64_t ziotime = spa->spa_deadman_ziotime;
 
 	return (ziotime);
 }
 
 /* Return the pool's spa_deadman_failmode value. */
 uint64_t
 spa_get_deadman_failmode(spa_t *spa)
 {
 	uint64_t failmode = spa->spa_deadman_failmode;
 
 	return (failmode);
 }
 
 /*
  * Set the pool's deadman failure mode from its textual name: "wait",
  * "continue" or "panic".  Any other string selects
  * ZIO_FAILURE_MODE_WAIT, the same as "wait".
  */
 void
 spa_set_deadman_failmode(spa_t *spa, const char *failmode)
 {
 	uint64_t mode = ZIO_FAILURE_MODE_WAIT;
 
 	if (strcmp(failmode, "continue") == 0)
 		mode = ZIO_FAILURE_MODE_CONTINUE;
 	else if (strcmp(failmode, "panic") == 0)
 		mode = ZIO_FAILURE_MODE_PANIC;
 
 	spa->spa_deadman_failmode = mode;
 }
 
 /*
  * Store 'ns' as spa_deadman_ziotime in every pool returned by spa_next(),
  * under spa_namespace_lock.  Nothing is done while spa_mode_global is
  * still SPA_MODE_UNINIT.
  */
 void
 spa_set_deadman_ziotime(hrtime_t ns)
 {
 	spa_t *spa;
 
 	if (spa_mode_global == SPA_MODE_UNINIT)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		spa->spa_deadman_ziotime = ns;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Store 'ns' as spa_deadman_synctime in every pool returned by spa_next(),
  * under spa_namespace_lock.  Nothing is done while spa_mode_global is
  * still SPA_MODE_UNINIT.
  */
 void
 spa_set_deadman_synctime(hrtime_t ns)
 {
 	spa_t *spa;
 
 	if (spa_mode_global == SPA_MODE_UNINIT)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		spa->spa_deadman_synctime = ns;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Return the deflated size of a single DVA.  The result is the DVA's asize
  * unless the asize is non-zero, the pool has spa_deflate set and the DVA's
  * top-level vdev can be looked up; in that case it is
  * (asize >> SPA_MINBLOCKSHIFT) * vdev_deflate_ratio.  The caller must hold
  * a config lock (asserted via spa_config_held()).
  */
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
 	uint64_t asize = DVA_GET_ASIZE(dva);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (asize == 0 || !spa->spa_deflate)
 		return (asize);
 
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 	if (vd == NULL)
 		return (asize);
 
 	return ((asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
 }
 
 /*
  * Return the sum of dva_get_dsize_sync() over the DVAs of 'bp'.  No config
  * lock is taken here; dva_get_dsize_sync() asserts that one is held.
  */
 uint64_t
 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total = 0;
 	int idx = 0;
 
 	while (idx < BP_GET_NDVAS(bp)) {
 		total += dva_get_dsize_sync(spa, &bp->blk_dva[idx]);
 		idx++;
 	}
 
 	return (total);
 }
 
 /*
  * Return the sum of dva_get_dsize_sync() over the DVAs of 'bp', holding
  * SCL_VDEV as reader for the duration of the walk.
  */
 uint64_t
 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total = 0;
 	int idx = 0;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	while (idx < BP_GET_NDVAS(bp)) {
 		total += dva_get_dsize_sync(spa, &bp->blk_dva[idx]);
 		idx++;
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * Return dp_dirty_total from the pool's DSL pool.  spa_dsl_pool is
  * dereferenced without a NULL check.
  */
 uint64_t
 spa_dirty_data(spa_t *spa)
 {
 	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
 
 	return (dirty);
 }
 
 /*
  * ==========================================================================
  * SPA Import Progress Routines
  * ==========================================================================
  */
 
 /*
  * One entry on spa_import_progress_list, describing a pool import in
  * progress.  Entries are created by spa_import_progress_add(), updated by
  * the spa_import_progress_set_*() functions, printed by
  * spa_import_progress_show() and freed by spa_import_progress_remove() or
  * spa_import_progress_truncate().
  */
 typedef struct spa_import_progress {
 	uint64_t		pool_guid;	/* unique id for updates */
 	char			*pool_name;	/* spa_strdup() copy, may be NULL */
 	spa_load_state_t	spa_load_state;
 	uint64_t		mmp_sec_remaining;	/* MMP activity check */
 	uint64_t		spa_load_max_txg;	/* rewind txg */
 	procfs_list_node_t	smh_node;	/* linkage on the procfs list */
 } spa_import_progress_t;
 
 /* Allocated by spa_import_progress_init(), freed by ..._destroy(). */
 spa_history_list_t *spa_import_progress_list = NULL;
 
 /*
  * Header callback passed to procfs_list_install(): print the column titles
  * that match the rows written by spa_import_progress_show().  Always
  * returns 0.
  */
 static int
 spa_import_progress_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-20s %-14s %-14s %-12s %s\n",
 	    "pool_guid", "load_state", "multihost_secs", "max_txg",
 	    "pool_name");
 
 	return (0);
 }
 
 static int
 spa_import_progress_show(struct seq_file *f, void *data)
 {
 	spa_import_progress_t *sip = (spa_import_progress_t *)data;
 
 	seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
 	    (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
 	    (u_longlong_t)sip->mmp_sec_remaining,
 	    (u_longlong_t)sip->spa_load_max_txg,
 	    (sip->pool_name ? sip->pool_name : "-"));
 
 	return (0);
 }
 
 /*
  * Remove entries from the head of the list (the oldest ones) until no more
  * than 'size' remain, freeing each entry and its pool name.  The list lock
  * is not taken here.
  */
 static void
 spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
 {
 	while (shl->size > size) {
 		spa_import_progress_t *oldest =
 		    list_remove_head(&shl->procfs_list.pl_list);
 
 		if (oldest->pool_name != NULL)
 			spa_strfree(oldest->pool_name);
 		kmem_free(oldest, sizeof (*oldest));
 		shl->size--;
 	}
 
 	IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list));
 }
 
 static void
 spa_import_progress_init(void)
 {
 	spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t),
 	    KM_SLEEP);
 
 	spa_import_progress_list->size = 0;
 
 	spa_import_progress_list->procfs_list.pl_private =
 	    spa_import_progress_list;
 
 	procfs_list_install("zfs",
 	    NULL,
 	    "import_progress",
 	    0644,
 	    &spa_import_progress_list->procfs_list,
 	    spa_import_progress_show,
 	    spa_import_progress_show_header,
 	    NULL,
 	    offsetof(spa_import_progress_t, smh_node));
 }
 
 /*
  * Tear down the global import progress list: uninstall the procfs list,
  * free every remaining entry, destroy the procfs list and free the list
  * header.  spa_import_progress_list itself is not reset to NULL.
  */
 static void
 spa_import_progress_destroy(void)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	procfs_list_uninstall(&shl->procfs_list);
 	/* A target size of 0 frees all remaining entries. */
 	spa_import_progress_truncate(shl, 0);
 	procfs_list_destroy(&shl->procfs_list);
 	kmem_free(shl, sizeof (spa_history_list_t));
 }
 
 /*
  * Record 'load_state' in the newest import progress entry whose guid is
  * 'pool_guid'.  Returns 0 on success or when the list is empty, and ENOENT
  * when the list is non-empty but holds no matching entry.
  */
 int
 spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t load_state)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so that the newest entry is found first. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		sip->spa_load_state = load_state;
 		error = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Record 'load_max_txg' in the newest import progress entry whose guid is
  * 'pool_guid'.  Returns 0 on success or when the list is empty, and ENOENT
  * when the list is non-empty but holds no matching entry.
  */
 int
 spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so that the newest entry is found first. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		sip->spa_load_max_txg = load_max_txg;
 		error = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Record 'mmp_sec_remaining' in the newest import progress entry whose
  * guid is 'pool_guid'.  Returns 0 on success or when the list is empty,
  * and ENOENT when the list is non-empty but holds no matching entry.
  */
 int
 spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so that the newest entry is found first. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		sip->mmp_sec_remaining = mmp_sec_remaining;
 		error = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * A new import is in progress: append an entry for 'spa' to the import
  * progress list.  The entry records the pool guid, the current load state
  * and a copy of the pool name.  The name comes from the pool config's
  * ZPOOL_CONFIG_POOL_NAME when that lookup yields a string, and from
  * spa_name() otherwise.  The remaining fields start out zeroed.
  */
 void
 spa_import_progress_add(spa_t *spa)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *entry;
 	char *name = NULL;
 
 	entry = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
 	entry->pool_guid = spa_guid(spa);
 	entry->spa_load_state = spa_load_state(spa);
 
 	(void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
 	    &name);
 	if (name == NULL)
 		name = spa_name(spa);
 	entry->pool_name = spa_strdup(name);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 	procfs_list_add(&shl->procfs_list, entry);
 	shl->size++;
 	mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
  * Remove and free the newest import progress entry whose guid is
  * 'pool_guid'.  Nothing happens when no entry matches.
  */
 void
 spa_import_progress_remove(uint64_t pool_guid)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so that the newest entry is found first. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		if (sip->pool_name != NULL)
 			spa_strfree(sip->pool_name);
 		list_remove(&shl->procfs_list.pl_list, sip);
 		shl->size--;
 		kmem_free(sip, sizeof (spa_import_progress_t));
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
  * ==========================================================================
  * Initialization and Termination
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_namespace_avl: order two spa_t's by
  * strcmp() of their names and pass the result through TREE_ISIGN().
  */
 static int
 spa_name_compare(const void *a1, const void *a2)
 {
 	const spa_t *lhs = a1;
 	const spa_t *rhs = a2;
 	int cmp = strcmp(lhs->spa_name, rhs->spa_name);
 
 	return (TREE_ISIGN(cmp));
 }
 
 /*
  * Boot-time hook: its only action is to call spa_config_load().
  */
 void
 spa_boot_init(void)
 {
 	spa_config_load();
 }
 
 /*
  * Global SPA initialization.  Sets up the namespace, spare and l2cache
  * locks and AVL trees, records 'mode' in spa_mode_global, and then calls
  * the init routine of each subsystem in the order listed below.
  * spa_fini() is the matching teardown.
  */
 void
 spa_init(spa_mode_t mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
 
 	/* The pool namespace is ordered by pool name. */
 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
 	    offsetof(spa_t, spa_avl));
 
 	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	spa_mode_global = mode;
 
 #ifndef _KERNEL
 	/*
 	 * Userland only: when the "watch" debug string is set and the mode is
 	 * not read-only, install arc_buf_sigsegv as the SIGSEGV handler and,
 	 * if that succeeds, turn on arc_watch.
 	 */
 	if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) {
 		struct sigaction sa;
 
 		sa.sa_flags = SA_SIGINFO;
 		sigemptyset(&sa.sa_mask);
 		sa.sa_sigaction = arc_buf_sigsegv;
 
 		if (sigaction(SIGSEGV, &sa, NULL) == -1) {
 			perror("could not enable watchpoints: "
 			    "sigaction(SIGSEGV, ...) = ");
 		} else {
 			arc_watch = B_TRUE;
 		}
 	}
 #endif
 
 	/* Subsystem initialization; the call order is significant. */
 	fm_init();
 	zfs_refcount_init();
 	unique_init();
 	zfs_btree_init();
 	metaslab_stat_init();
 	ddt_init();
 	zio_init();
 	dmu_init();
 	zil_init();
 	vdev_cache_stat_init();
 	vdev_mirror_stat_init();
 	vdev_raidz_math_init();
 	vdev_file_init();
 	zfs_prop_init();
 	zpool_prop_init();
 	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
 	scan_init();
 	qat_init();
 	spa_import_progress_init();
 }
 
 /*
  * Global SPA teardown, the counterpart of spa_init().  Stops the l2arc,
  * evicts all pools, calls the fini routine of each subsystem, and finally
  * destroys the AVL trees, condition variable and locks created by
  * spa_init().
  */
 void
 spa_fini(void)
 {
 	l2arc_stop();
 
 	/* Pools are evicted before any subsystem is torn down. */
 	spa_evict_all();
 
 	vdev_file_fini();
 	vdev_cache_stat_fini();
 	vdev_mirror_stat_fini();
 	vdev_raidz_math_fini();
 	zil_fini();
 	dmu_fini();
 	zio_fini();
 	ddt_fini();
 	metaslab_stat_fini();
 	zfs_btree_fini();
 	unique_fini();
 	zfs_refcount_fini();
 	fm_fini();
 	scan_fini();
 	qat_fini();
 	spa_import_progress_destroy();
 
 	avl_destroy(&spa_namespace_avl);
 	avl_destroy(&spa_spare_avl);
 	avl_destroy(&spa_l2cache_avl);
 
 	cv_destroy(&spa_namespace_cv);
 	mutex_destroy(&spa_namespace_lock);
 	mutex_destroy(&spa_spare_lock);
 	mutex_destroy(&spa_l2cache_lock);
 }
 
 /*
  * Return whether this pool has a dedicated slog device, i.e. whether its
  * log class has a non-zero mc_groups.  No locking is needed: a wrong
  * answer is harmless because the result is used only for performance and
  * not for correctness.
  */
 boolean_t
 spa_has_slogs(spa_t *spa)
 {
 	if (spa->spa_log_class->mc_groups == 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /* Return the pool's spa_log_state value. */
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
 	spa_log_state_t state = spa->spa_log_state;
 
 	return (state);
 }
 
 /* Store 'state' in the pool's spa_log_state; no locking is done here. */
 void
 spa_set_log_state(spa_t *spa, spa_log_state_t state)
 {
 	spa_log_state_t next = state;
 
 	spa->spa_log_state = next;
 }
 
 /* Return the pool's spa_is_root flag. */
 boolean_t
 spa_is_root(spa_t *spa)
 {
 	boolean_t is_root = spa->spa_is_root;
 
 	return (is_root);
 }
 
 /*
  * A pool is writeable when SPA_MODE_WRITE is set in its mode and
  * spa_trust_config is set.
  */
 boolean_t
 spa_writeable(spa_t *spa)
 {
 	if (!(spa->spa_mode & SPA_MODE_WRITE))
 		return (B_FALSE);
 
 	return (spa->spa_trust_config ? B_TRUE : B_FALSE);
 }
 
 /*
  * Returns true if there is a pending sync task in any of the current
  * syncing txg, the current quiescing txg, or the current open txg.  Both
  * the regular (dp_sync_tasks) and early (dp_early_sync_tasks) task lists
  * are checked.
  */
 boolean_t
 spa_has_pending_synctask(spa_t *spa)
 {
 	if (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks))
 		return (B_TRUE);
 
 	if (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /* Return the pool's spa_mode value. */
 spa_mode_t
 spa_mode(spa_t *spa)
 {
 	spa_mode_t mode = spa->spa_mode;
 
 	return (mode);
 }
 
 /* Return the pool's spa_bootfs value. */
 uint64_t
 spa_bootfs(spa_t *spa)
 {
 	uint64_t bootfs = spa->spa_bootfs;
 
 	return (bootfs);
 }
 
 /* Return the pool's spa_delegation value. */
 uint64_t
 spa_delegation(spa_t *spa)
 {
 	uint64_t delegation = spa->spa_delegation;
 
 	return (delegation);
 }
 
 /* Return the pool's meta objset pointer (spa_meta_objset). */
 objset_t *
 spa_meta_objset(spa_t *spa)
 {
 	objset_t *mos = spa->spa_meta_objset;
 
 	return (mos);
 }
 
 /* Return the pool's spa_dedup_checksum value. */
 enum zio_checksum
 spa_dedup_checksum(spa_t *spa)
 {
 	enum zio_checksum cksum = spa->spa_dedup_checksum;
 
 	return (cksum);
 }
 
 /*
  * Reset pool scan stat per scan pass (or reboot).  None of the fields
  * touched here are stored on disk.  The pass start is set to the current
  * time; the scrub pause time is set to the same value when the scan is a
  * paused scrub and to 0 otherwise.  The remaining per-pass counters are
  * zeroed and vdev_scan_stat_init() is called on the root vdev.
  */
 void
 spa_scan_stat_init(spa_t *spa)
 {
 	spa->spa_scan_pass_start = gethrestime_sec();
 	spa->spa_scan_pass_scrub_pause =
 	    dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan) ?
 	    spa->spa_scan_pass_start : 0;
 
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
 	spa->spa_scan_pass_issued = 0;
 
 	vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
 /*
  * Get scan stats for zpool status reports.  Fills '*ps' from the pool's
  * scan state and returns 0.  Returns ENOENT, leaving '*ps' untouched, when
  * the pool has no DSL pool or scan structure, or when no scan function is
  * recorded (POOL_SCAN_NONE).
  */
 int
 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
 	dsl_scan_t *scn = NULL;
 
 	if (spa->spa_dsl_pool != NULL)
 		scn = spa->spa_dsl_pool->dp_scan;
 
 	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
 		return (SET_ERROR(ENOENT));
 
 	bzero(ps, sizeof (*ps));
 
 	/* Fields copied from the on-disk scan state. */
 	ps->pss_func = scn->scn_phys.scn_func;
 	ps->pss_state = scn->scn_phys.scn_state;
 	ps->pss_start_time = scn->scn_phys.scn_start_time;
 	ps->pss_end_time = scn->scn_phys.scn_end_time;
 	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
 	ps->pss_examined = scn->scn_phys.scn_examined;
 	ps->pss_to_process = scn->scn_phys.scn_to_process;
 	ps->pss_processed = scn->scn_phys.scn_processed;
 	ps->pss_errors = scn->scn_phys.scn_errors;
 
 	/* Per-pass fields, which are not stored on disk. */
 	ps->pss_pass_start = spa->spa_scan_pass_start;
 	ps->pss_pass_exam = spa->spa_scan_pass_exam;
 	ps->pss_pass_issued = spa->spa_scan_pass_issued;
 	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
 	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
 	ps->pss_issued =
 	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
 	return (0);
 }
 
 /*
  * Return SPA_MAXBLOCKSIZE when SPA_FEATURE_LARGE_BLOCKS is enabled on the
  * pool and SPA_OLD_MAXBLOCKSIZE otherwise.
  */
 int
 spa_maxblocksize(spa_t *spa)
 {
 	return (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS) ?
 	    SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
 }
 
 
 /*
  * Returns the txg that the last device removal completed. No indirect
  * mappings have been added since this txg.  Returns -1ULL when the chain
  * of previous indirect vdevs holds no vdev with remapped data.
  */
 uint64_t
 spa_get_last_removal_txg(spa_t *spa)
 {
 	uint64_t txg = -1ULL;
 	uint64_t id;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/*
 	 * sr_prev_indirect_vdev is only modified while holding all the
 	 * config locks, so it is sufficient to hold SCL_VDEV as reader when
 	 * examining it.
 	 */
 	id = spa->spa_removing_phys.sr_prev_indirect_vdev;
 
 	while (id != -1ULL) {
 		vdev_t *vd = vdev_lookup_top(spa, id);
 		vdev_indirect_births_t *vib = vd->vdev_indirect_births;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		/*
 		 * A removal that did not remap any data is skipped; move on
 		 * to the indirect vdev before it.
 		 */
 		if (vdev_indirect_births_count(vib) == 0) {
 			id = vd->vdev_indirect_config.vic_prev_indirect_vdev;
 			continue;
 		}
 
 		txg = vdev_indirect_births_last_entry_txg(vib);
 		break;
 	}
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	IMPLY(txg != -1ULL,
 	    spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
 
 	return (txg);
 }
 
 /*
  * Return DNODE_MAX_SIZE when SPA_FEATURE_LARGE_DNODE is enabled on the
  * pool and DNODE_MIN_SIZE otherwise.
  */
 int
 spa_maxdnodesize(spa_t *spa)
 {
 	return (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE) ?
 	    DNODE_MAX_SIZE : DNODE_MIN_SIZE);
 }
 
 /* Return B_TRUE when spa_multihost is non-zero, B_FALSE otherwise. */
 boolean_t
 spa_multihost(spa_t *spa)
 {
 	if (spa->spa_multihost)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /* Return the pool's spa_hostid value. */
 uint32_t
 spa_get_hostid(spa_t *spa)
 {
 	uint32_t hostid = spa->spa_hostid;
 
 	return (hostid);
 }
 
 /* Return the pool's spa_trust_config flag. */
 boolean_t
 spa_trust_config(spa_t *spa)
 {
 	boolean_t trusted = spa->spa_trust_config;
 
 	return (trusted);
 }
 
 /* Return the pool's spa_missing_tvds_allowed value. */
 uint64_t
 spa_missing_tvds_allowed(spa_t *spa)
 {
 	uint64_t allowed = spa->spa_missing_tvds_allowed;
 
 	return (allowed);
 }
 
 /* Return the pool's spa_syncing_log_sm pointer. */
 space_map_t *
 spa_syncing_log_sm(spa_t *spa)
 {
 	space_map_t *sm = spa->spa_syncing_log_sm;
 
 	return (sm);
 }
 
 /* Store 'missing' in the pool's spa_missing_tvds field. */
 void
 spa_set_missing_tvds(spa_t *spa, uint64_t missing)
 {
 	uint64_t count = missing;
 
 	spa->spa_missing_tvds = count;
 }
 
 /*
  * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
  *
  * "TRANSITIONING" is returned while the pool has no root vdev.
  * "SUSPENDED" is returned for a suspended pool unless its failmode is
  * ZIO_FAILURE_MODE_CONTINUE.  Otherwise the string is derived from the
  * root vdev's state and, for VDEV_STATE_CANT_OPEN, from its aux state.
  * States not listed below yield "UNKNOWN".
  */
 const char *
 spa_state_to_name(spa_t *spa)
 {
 	ASSERT3P(spa, !=, NULL);
 
 	/*
 	 * it is possible for the spa to exist, without root vdev
 	 * as the spa transitions during import/export
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
 	if (rvd == NULL)
 		return ("TRANSITIONING");
 
 	vdev_state_t state = rvd->vdev_state;
 	vdev_aux_t aux = rvd->vdev_stat.vs_aux;
 
 	if (spa_suspended(spa) &&
 	    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 		return ("SUSPENDED");
 
 	switch (state) {
 	case VDEV_STATE_CLOSED:
 	case VDEV_STATE_OFFLINE:
 		return ("OFFLINE");
 	case VDEV_STATE_REMOVED:
 		return ("REMOVED");
 	case VDEV_STATE_CANT_OPEN:
 		/* The aux state picks the name for an unopenable pool. */
 		if (aux == VDEV_AUX_SPLIT_POOL)
 			return ("SPLIT");
 		if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
 			return ("FAULTED");
 		return ("UNAVAIL");
 	case VDEV_STATE_FAULTED:
 		return ("FAULTED");
 	case VDEV_STATE_DEGRADED:
 		return ("DEGRADED");
 	case VDEV_STATE_HEALTHY:
 		return ("ONLINE");
 	default:
 		return ("UNKNOWN");
 	}
 }
 
 /*
  * Return B_TRUE when vdev_is_spacemap_addressable() holds for every child
  * of the root vdev, and B_FALSE as soon as one child fails the check.
  */
 boolean_t
 spa_top_vdevs_spacemap_addressable(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t idx = 0;
 
 	while (idx < root->vdev_children) {
 		if (!vdev_is_spacemap_addressable(root->vdev_child[idx]))
 			return (B_FALSE);
 		idx++;
 	}
 
 	return (B_TRUE);
 }
 
 /* A pool has a checkpoint when spa_checkpoint_txg is non-zero. */
 boolean_t
 spa_has_checkpoint(spa_t *spa)
 {
 	if (spa->spa_checkpoint_txg == 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Report whether the pool is being imported with ZFS_IMPORT_CHECKPOINT set
  * in its import flags while its mode is SPA_MODE_READ.
  */
 boolean_t
 spa_importing_readonly_checkpoint(spa_t *spa)
 {
 	if (!(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT))
 		return (B_FALSE);
 
 	if (spa->spa_mode != SPA_MODE_READ)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Return the uberblock's checkpoint txg plus one when a checkpoint txg is
  * recorded there, and spa_first_txg otherwise.
  */
 uint64_t
 spa_min_claim_txg(spa_t *spa)
 {
 	uint64_t ckpt = spa->spa_uberblock.ub_checkpoint_txg;
 
 	if (ckpt == 0)
 		return (spa->spa_first_txg);
 
 	return (ckpt + 1);
 }
 
 /*
  * If there is a checkpoint, async destroys may consume more space from
  * the pool instead of freeing it. In an attempt to save the pool from
  * getting suspended when it is about to run out of space, we stop
  * processing async destroys.
  *
  * Returns B_TRUE when the pool has a checkpoint and the unreserved space
  * (with ZFS_SPACE_CHECK_EXTRA_RESERVED) does not exceed the bytes used by
  * the root dsl_dir.
  */
 boolean_t
 spa_suspend_async_destroy(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	uint64_t unreserved = dsl_pool_unreserved_space(dp,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED);
 	uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
 	uint64_t avail = 0;
 
 	if (unreserved > used)
 		avail = unreserved - used;
 
 	if (!spa_has_checkpoint(spa))
 		return (B_FALSE);
 
 	return (avail == 0 ? B_TRUE : B_FALSE);
 }
 
 #if defined(_KERNEL)
 
 /*
  * Validate a deadman failmode string and apply it to every pool.
  *
  * The first newline in 'val', if any, is overwritten with a NUL, so the
  * caller's buffer is modified in place.  Returns EINVAL when 'val' is NULL
  * or is not one of "wait", "continue" or "panic".  Otherwise, unless
  * spa_mode_global is still SPA_MODE_UNINIT, the mode is applied to each
  * pool under spa_namespace_lock; 0 is returned in either case.
  */
 int
 param_set_deadman_failmode_common(const char *val)
 {
 	spa_t *spa;
 	char *newline;
 
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	newline = strchr(val, '\n');
 	if (newline != NULL)
 		*newline = '\0';
 
 	if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
 	    strcmp(val, "panic") != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (spa_mode_global == SPA_MODE_UNINIT)
 		return (0);
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		spa_set_deadman_failmode(spa, val);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 #endif
 
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
 EXPORT_SYMBOL(spa_add);
 EXPORT_SYMBOL(spa_remove);
 EXPORT_SYMBOL(spa_next);
 
 /* Refcount functions */
 EXPORT_SYMBOL(spa_open_ref);
 EXPORT_SYMBOL(spa_close);
 EXPORT_SYMBOL(spa_refcount_zero);
 
 /* Pool configuration lock */
 EXPORT_SYMBOL(spa_config_tryenter);
 EXPORT_SYMBOL(spa_config_enter);
 EXPORT_SYMBOL(spa_config_exit);
 EXPORT_SYMBOL(spa_config_held);
 
 /* Pool vdev add/remove lock */
 EXPORT_SYMBOL(spa_vdev_enter);
 EXPORT_SYMBOL(spa_vdev_exit);
 
 /* Pool vdev state change lock */
 EXPORT_SYMBOL(spa_vdev_state_enter);
 EXPORT_SYMBOL(spa_vdev_state_exit);
 
 /* Accessor functions */
 EXPORT_SYMBOL(spa_shutting_down);
 EXPORT_SYMBOL(spa_get_dsl);
 EXPORT_SYMBOL(spa_get_rootblkptr);
 EXPORT_SYMBOL(spa_set_rootblkptr);
 EXPORT_SYMBOL(spa_altroot);
 EXPORT_SYMBOL(spa_sync_pass);
 EXPORT_SYMBOL(spa_name);
 EXPORT_SYMBOL(spa_guid);
 EXPORT_SYMBOL(spa_last_synced_txg);
 EXPORT_SYMBOL(spa_first_txg);
 EXPORT_SYMBOL(spa_syncing_txg);
 EXPORT_SYMBOL(spa_version);
 EXPORT_SYMBOL(spa_state);
 EXPORT_SYMBOL(spa_load_state);
 EXPORT_SYMBOL(spa_freeze_txg);
 EXPORT_SYMBOL(spa_get_dspace);
 EXPORT_SYMBOL(spa_update_dspace);
 EXPORT_SYMBOL(spa_deflate);
 EXPORT_SYMBOL(spa_normal_class);
 EXPORT_SYMBOL(spa_log_class);
 EXPORT_SYMBOL(spa_special_class);
 EXPORT_SYMBOL(spa_preferred_class);
 EXPORT_SYMBOL(spa_max_replication);
 EXPORT_SYMBOL(spa_prev_software_version);
 EXPORT_SYMBOL(spa_get_failmode);
 EXPORT_SYMBOL(spa_suspended);
 EXPORT_SYMBOL(spa_bootfs);
 EXPORT_SYMBOL(spa_delegation);
 EXPORT_SYMBOL(spa_meta_objset);
 EXPORT_SYMBOL(spa_maxblocksize);
 EXPORT_SYMBOL(spa_maxdnodesize);
 
 /* Miscellaneous support routines */
 EXPORT_SYMBOL(spa_guid_exists);
 EXPORT_SYMBOL(spa_strdup);
 EXPORT_SYMBOL(spa_strfree);
 EXPORT_SYMBOL(spa_generate_guid);
 EXPORT_SYMBOL(snprintf_blkptr);
 EXPORT_SYMBOL(spa_freeze);
 EXPORT_SYMBOL(spa_upgrade);
 EXPORT_SYMBOL(spa_evict_all);
 EXPORT_SYMBOL(spa_lookup_by_guid);
 EXPORT_SYMBOL(spa_has_spare);
 EXPORT_SYMBOL(dva_get_dsize_sync);
 EXPORT_SYMBOL(bp_get_dsize_sync);
 EXPORT_SYMBOL(bp_get_dsize);
 EXPORT_SYMBOL(spa_has_slogs);
 EXPORT_SYMBOL(spa_is_root);
 EXPORT_SYMBOL(spa_writeable);
 EXPORT_SYMBOL(spa_mode);
 EXPORT_SYMBOL(spa_namespace_lock);
 EXPORT_SYMBOL(spa_trust_config);
 EXPORT_SYMBOL(spa_missing_tvds_allowed);
 EXPORT_SYMBOL(spa_set_missing_tvds);
 EXPORT_SYMBOL(spa_state_to_name);
 EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
 EXPORT_SYMBOL(spa_min_claim_txg);
 EXPORT_SYMBOL(spa_suspend_async_destroy);
 EXPORT_SYMBOL(spa_has_checkpoint);
 EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
 
 ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
 	"Set additional debugging flags");
 
 ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
 	"Set to attempt to recover from fatal errors");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
 	"Set to ignore IO errors during free and permanently leak the space");
 
 ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW,
 	"Dead I/O check interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW,
 	"Enable deadman timer");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW,
 	"SPA size estimate multiplication factor");
 
 ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
 	"Place DDT data into the special class");
 
 ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
 	"Place user data indirect blocks into the special class");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
 	param_set_deadman_failmode, param_get_charp, ZMOD_RW,
 	"Failmode for deadman timer");
 
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
 	param_set_deadman_synctime, param_get_ulong, ZMOD_RW,
 	"Pool sync expiration time in milliseconds");
 
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
 	param_set_deadman_ziotime, param_get_ulong, ZMOD_RW,
 	"IO expiration time in milliseconds");
 
 ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW,
 	"Small file blocks in special vdevs depends on this much "
 	"free space available");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_int, ZMOD_RW, "Reserved free space in pool");
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4e316d8135ee..47a475135302 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1,5426 +1,5426 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright [2021] Hewlett Packard Enterprise Development LP
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
 /*
  * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
  * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
  * part of the spa_embedded_log_class.  The metaslab with the most free space
  * in each vdev is selected for this purpose when the pool is opened (or a
  * vdev is added).  See vdev_metaslab_init().
  *
  * Log blocks can be allocated from the following locations.  Each one is tried
  * in order until the allocation succeeds:
  * 1. dedicated log vdevs, aka "slog" (spa_log_class)
  * 2. embedded slog metaslabs (spa_embedded_log_class)
  * 3. other metaslabs in normal vdevs (spa_normal_class)
  *
  * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
  * than this number of metaslabs in the vdev.  This ensures that we don't set
  * aside an unreasonable amount of space for the ZIL.  If set to less than
  * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
  * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
  */
 int zfs_embedded_slog_min_ms = 64;
 
 /* default target for number of metaslabs per top-level vdev */
 int zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
 int zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
 int zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
 int zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
 int zfs_vdev_max_ms_shift = 34;
 
 /*
  * NOTE(review): no consumer of this flag is visible in this part of the
  * file; presumably a non-zero value makes vdev validation be skipped --
  * confirm against its users.
  */
 int vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 int zfs_vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * Rate limit slow IO (delay) events to this many per second.
  */
 unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
 unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver.  This makes it possible to work
  * around a resilver that is started upon import when there are pool errors.
  */
 int zfs_scan_ignore_errors = 0;
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 int zfs_vdev_standard_sm_blksz = (1 << 17);
 
 /*
  * Tunable parameter for debugging or performance analysis. Setting this
  * will cause pool corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 int zfs_nocacheflush = 0;
 
 /*
  * Tunables initialized to the ASHIFT_MAX and ASHIFT_MIN constants.
  * NOTE(review): presumably these bound the ashift that is selected
  * automatically for a vdev -- confirm against their users.
  */
 uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
 uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
-/*PRINTFLIKE2*/
 /*
  * Format a printf-style message and log it through zfs_dbgmsg(), prefixed
  * with the vdev's type and either its path or, when no path is set, its
  * id and guid.  The formatted text is limited to a 256-byte stack buffer.
  */
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
 	char msg[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	/* Without a path, identify the vdev by id and guid instead. */
 	if (vd->vdev_path == NULL) {
 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vd->vdev_guid, msg);
 	} else {
 		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 		    vd->vdev_path, msg);
 	}
 }
 
 /*
  * Log one line describing vd through zfs_dbgmsg() and then recurse into
  * its children, indenting each level by two more columns.
  *
  * Hole and missing vdevs are reported by id and type only and are not
  * descended into.  Every other vdev is reported with its id, type, an
  * optional " (log)" marker, guid, path ("N/A" when unset) and state name;
  * a state without a name is printed numerically as "<state N>".
  */
 void
 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 {
 	char state[20];
 
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 		    (u_longlong_t)vd->vdev_id,
 		    vd->vdev_ops->vdev_op_type);
 		return;
 	}
 
 	switch (vd->vdev_state) {
 	case VDEV_STATE_UNKNOWN:
 		(void) snprintf(state, sizeof (state), "unknown");
 		break;
 	case VDEV_STATE_CLOSED:
 		(void) snprintf(state, sizeof (state), "closed");
 		break;
 	case VDEV_STATE_OFFLINE:
 		(void) snprintf(state, sizeof (state), "offline");
 		break;
 	case VDEV_STATE_REMOVED:
 		(void) snprintf(state, sizeof (state), "removed");
 		break;
 	case VDEV_STATE_CANT_OPEN:
 		(void) snprintf(state, sizeof (state), "can't open");
 		break;
 	case VDEV_STATE_FAULTED:
 		(void) snprintf(state, sizeof (state), "faulted");
 		break;
 	case VDEV_STATE_DEGRADED:
 		(void) snprintf(state, sizeof (state), "degraded");
 		break;
 	case VDEV_STATE_HEALTHY:
 		(void) snprintf(state, sizeof (state), "healthy");
 		break;
 	default:
 		(void) snprintf(state, sizeof (state), "<state %u>",
 		    (uint_t)vd->vdev_state);
 		break;
 	}
 
 	/*
 	 * Print the id with %llu and a u_longlong_t cast, exactly as the
 	 * hole/missing branch above does, so that the conversion specifier
 	 * and the argument type agree and the id is not truncated.
 	 */
 	zfs_dbgmsg("%*svdev %llu: %s%s, guid: %llu, path: %s, %s", indent,
 	    "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 	    vd->vdev_islog ? " (log)" : "",
 	    (u_longlong_t)vd->vdev_guid,
 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
 /*
  * Virtual device management.
  */
 
 /*
  * NULL-terminated table of the known vdev ops vectors.  vdev_getops()
  * scans it in order and matches entries by their vdev_op_type string.
  */
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_draid_ops,
 	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 	&vdev_disk_ops,
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	&vdev_indirect_ops,
 	NULL	/* terminator */
 };
 
 /*
  * Look up the ops vector whose vdev_op_type string equals 'type'.
  * Returns NULL when no entry of vdev_ops_table matches.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	for (vdev_ops_t **slot = vdev_ops_table; *slot != NULL; slot++) {
 		vdev_ops_t *candidate = *slot;
 
 		if (strcmp(candidate->vdev_op_type, type) == 0)
 			return (candidate);
 	}
 
 	/* Reached the NULL terminator without a match. */
 	return (NULL);
 }
 
 /*
  * Pick the metaslab group of 'vd' that serves metaslab class 'mc'.
  *
  * The vdev's log group (vdev_log_mg) is returned only when 'mc' is the
  * pool's embedded log class and the vdev actually has a log group.  In
  * every other case, including dedicated slog devices, the primary group
  * (vdev_mg) is returned.
  */
 metaslab_group_t *
 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 {
 	if (mc == spa_embedded_log_class(vd->vdev_spa)) {
 		if (vd->vdev_log_mg != NULL)
 			return (vd->vdev_log_mg);
 	}
 
 	return (vd->vdev_mg);
 }
 
 /*
  * Default translation: the physical range is a copy of the logical range.
  * 'vd' and 'remain_rs' are not used.
  */
 /* ARGSUSED */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	const uint64_t start = logical_rs->rs_start;
 	const uint64_t end = logical_rs->rs_end;
 
 	physical_rs->rs_start = start;
 	physical_rs->rs_end = end;
 }
 
 /*
  * Map an allocation bias string to its enumerated value.  The string
  * comes from either the per-vdev zap or zpool(8).  Any string other than
  * the log, special and dedup names yields VDEV_BIAS_NONE.
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
 {
 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 		return (VDEV_BIAS_LOG);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 		return (VDEV_BIAS_SPECIAL);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 		return (VDEV_BIAS_DEDUP);
 
 	return (VDEV_BIAS_NONE);
 }
 
 /*
  * Default asize function, used by everything other than RAID-Z: start
  * from psize rounded up to the top-level vdev's ashift and take the
  * maximum of that and the asize each child reports for the same psize.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		uint64_t child_asize =
 		    vdev_psize_to_asize(vd->vdev_child[i], psize);
 
 		if (child_asize > largest)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Default min_asize operation: report the value cached in vdev_min_asize.
  */
 uint64_t
 vdev_default_min_asize(vdev_t *vd)
 {
 	uint64_t cached = vd->vdev_min_asize;
 
 	return (cached);
 }
 
 /*
  * Get the minimum allocatable size.  The allocatable size is the vdev's
  * asize aligned to the metaslab size, which allows devices of different
  * physical size to be replaced or attached as long as they can satisfy
  * the same number of allocations.
  *
  * Three cases, in order:
  *  - no parent (inactive spare or cache): the vdev's own asize;
  *  - top-level vdev: its asize aligned with P2ALIGN to 1 << vdev_ms_shift;
  *  - anything else: whatever the parent's vdev_op_min_asize reports.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	if (vd != vd->vdev_top)
 		return (parent->vdev_ops->vdev_op_min_asize(parent));
 
 	return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 }
 
 /*
  * Recompute and cache vdev_min_asize for 'vd', then do the same for
  * every vdev below it.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int i = 0;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	while (i < vd->vdev_children) {
 		vdev_set_min_asize(vd->vdev_child[i]);
 		i++;
 	}
 }
 
 /*
  * Get the minimal allocation size for the top-level vdev: the result of
  * the vdev type's vdev_op_min_alloc operation when it provides one,
  * otherwise 1 << vdev_ashift.
  */
 uint64_t
 vdev_get_min_alloc(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_min_alloc == NULL)
 		return (1ULL << vd->vdev_ashift);
 
 	return (vd->vdev_ops->vdev_op_min_alloc(vd));
 }
 
 /*
  * Get the parity level for a top-level vdev: the result of the vdev
  * type's vdev_op_nparity operation, or 0 when the type provides none.
  */
 uint64_t
 vdev_get_nparity(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_nparity == NULL)
 		return (0);
 
 	return (vd->vdev_ops->vdev_op_nparity(vd));
 }
 
 /*
  * Get the number of data disks for a top-level vdev: the result of the
  * vdev type's vdev_op_ndisks operation, or 1 when the type provides none.
  */
 uint64_t
 vdev_get_ndisks(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_ndisks == NULL)
 		return (1);
 
 	return (vd->vdev_ops->vdev_op_ndisks(vd));
 }
 
 /*
  * Return the child of the root vdev at index 'vdev', or NULL when the
  * index is not below the root's child count.  Asserts that some config
  * lock is held as reader.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= root->vdev_children)
 		return (NULL);
 
 	ASSERT(root->vdev_child[vdev] != NULL);
 	return (root->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the tree rooted at 'vd' for a vdev whose guid
  * equals 'guid'.  Returns the first match, or NULL when there is none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[i], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the tree rooted at 'vd'.  A vdev whose ops
  * vector is marked vdev_op_leaf counts as one and is not descended into.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int leaves = 0;
 
 		for (int i = 0; i < vd->vdev_children; i++)
 			leaves += vdev_count_leaves_impl(vd->vdev_child[i]);
 
 		return (leaves);
 	}
 
 	return (1);
 }
 
 /*
  * Count the leaf vdevs of the pool.  The whole tree walk runs with
  * SCL_VDEV held as reader.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	int leaves;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	leaves = vdev_count_leaves_impl(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (leaves);
 }
 
 /*
  * Attach 'cvd' to 'pvd' as the child at index cvd->vdev_id.
  *
  * The caller must hold all config locks as writer and 'cvd' must not have
  * a parent yet.  With a NULL 'pvd' only cvd->vdev_parent is (re)set to
  * NULL.  Otherwise the parent's child array is reallocated to hold at
  * least id + 1 slots, cvd's vdev_top is derived from the parent, cvd's
  * guid sum is added to every ancestor, and a leaf 'cvd' is put on the
  * pool's leaf list.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * Zero the new array: when 'id' skips past the old child count, the
 	 * slots in between are neither copied from the old array nor
 	 * assigned below.  They must read as NULL, because a NULL slot is
 	 * what the ASSERT above, vdev_remove_child() and
 	 * vdev_compact_children() treat as "empty".
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		bcopy(pvd->vdev_child, newchild, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	/* A parent without a vdev_top is the root, so cvd is top-level. */
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 		cvd->vdev_spa->spa_leaf_list_gen++;
 	}
 }
 
 /*
  * Detach 'cvd' from its parent 'pvd'.
  *
  * The child's slot is cleared (the array is not compacted), and the
  * array itself is freed once every slot is empty.  A leaf 'cvd' is taken
  * off the pool's leaf list, and cvd's guid sum is subtracted from every
  * ancestor.  Nothing happens when 'pvd' is NULL.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	uint_t slot = cvd->vdev_id;
 	int first_used = 0;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(slot < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[slot] == cvd);
 
 	pvd->vdev_child[slot] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Find the first slot that is still occupied, if there is one. */
 	while (first_used < pvd->vdev_children &&
 	    !pvd->vdev_child[first_used])
 		first_used++;
 
 	/* No occupied slot left: release the now-empty child array. */
 	if (first_used == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, first_used * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		spa_t *spa = cvd->vdev_spa;
 
 		list_remove(&spa->spa_leaf_list, cvd);
 		spa->spa_leaf_list_gen++;
 	}
 
 	/* Subtract the child's guid sum from every ancestor. */
 	for (vdev_t *anc = pvd; anc != NULL; anc = anc->vdev_parent)
 		anc->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes (NULL slots) in the child array of 'pvd'.
  *
  * The surviving children are packed, in their existing order, into a
  * freshly allocated array and renumbered so that vdev_id equals the new
  * index.  When no child survives, the array pointer becomes NULL and the
  * count 0.  The caller must hold all config locks as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	int old_count = pvd->vdev_children;
 	int live = 0;
 	vdev_t **packed = NULL;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (old_count == 0)
 		return;
 
 	/* First pass: count the children that are still present. */
 	for (int i = 0; i < old_count; i++) {
 		if (pvd->vdev_child[i] != NULL)
 			live++;
 	}
 
 	/* Second pass: pack and renumber them. */
 	if (live > 0) {
 		int next = 0;
 
 		packed = kmem_zalloc(live * sizeof (vdev_t *), KM_SLEEP);
 
 		for (int i = 0; i < old_count; i++) {
 			vdev_t *child = pvd->vdev_child[i];
 
 			if (child == NULL)
 				continue;
 
 			packed[next] = child;
 			child->vdev_id = next;
 			next++;
 		}
 	}
 
 	kmem_free(pvd->vdev_child, old_count * sizeof (vdev_t *));
 	pvd->vdev_child = packed;
 	pvd->vdev_children = live;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * The structure is zero-allocated, given its identity (spa, id, guid, ops)
  * and left in VDEV_STATE_CLOSED.  Its locks, condition variables, range
  * trees, event rate limiters, list links and txg lists are initialized,
  * followed by the vdev queue and the vdev cache.
  *
  * If the pool has no root vdev yet, the new vdev becomes the root (ops is
  * asserted to be vdev_root_ops) and a new spa_load_guid is generated.
  * If 'guid' is 0 and ops is not vdev_hole_ops, a guid is generated here.
  *
  * Returns the new vdev; the allocation uses KM_SLEEP.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	vic = &vd->vdev_indirect_config;
 
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_guid(NULL);
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just this vdev's guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	/* UINT64_MAX is the initial "unset" value; vdev_alloc() asserts it. */
 	vic->vic_prev_indirect_vdev = UINT64_MAX;
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
 	    0, 0);
 
 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
 	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_checksum_rl,
 	    &zfs_checksum_events_per_second, 1);
 
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
 
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Synchronization state for vdev initialization. */
 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Synchronization state for trim and autotrim. */
 	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Synchronization state for rebuild. */
 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One (initially empty) range tree per DTL type. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
 		    0);
 	}
 
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list, spa,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 	vdev_cache_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * The vdev is described by the config nvlist 'nv'.  On success the new
  * vdev has been added to 'parent' as child 'id', is stored in *vdp, and 0
  * is returned.  On failure *vdp is not written and the return value is
  * EINVAL (missing or unknown type, missing or mismatched id/guid, or a
  * non-root vdev allocated before the root), ENOTSUP (pool version or
  * feature does not allow the request) or the error returned by the vdev
  * type's vdev_op_init operation.
  *
  * The caller must hold all config locks as writer.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	char *type;
 	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	char *tmp = NULL;
 	int rc;
 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 	/* Top-level means: the parent exists and itself has no parent. */
 	boolean_t top_level = (parent && !parent->vdev_parent);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		char *bias;
 
 		/*
 		 * If creating a top-level vdev, check for allocation
 		 * classes input.
 		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
 
 			/* spa_vdev_add() expects feature to be enabled */
 			if (spa->spa_load_state != SPA_LOAD_CREATE &&
 			    !spa_feature_is_enabled(spa,
 			    SPA_FEATURE_ALLOCATION_CLASSES)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
 
 		/* spa_vdev_add() expects feature to be enabled */
 		if (ops == &vdev_draid_ops &&
 		    spa->spa_load_state != SPA_LOAD_CREATE &&
 		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	/*
 	 * Initialize the vdev specific data.  This is done before calling
 	 * vdev_alloc_common() since it may fail and this simplifies the
 	 * error reporting and cleanup code paths.
 	 */
 	void *tsd = NULL;
 	if (ops->vdev_op_init != NULL) {
 		rc = ops->vdev_op_init(spa, nv, &tsd);
 		if (rc != 0) {
 			return (rc);
 		}
 	}
 
 	/* No return statement follows this point until the final one. */
 	vd = vdev_alloc_common(spa, id, guid, ops);
 	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
 
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;
 
 	/*
 	 * For each string property below, the lookup stores the nvlist's
 	 * string pointer in the vdev field, which is then replaced by a
 	 * private copy made with spa_strdup().
 	 */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 		vd->vdev_path = spa_strdup(vd->vdev_path);
 
 	/*
 	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
 	 * fault on a vdev and want it to persist across imports (like with
 	 * zpool offline -f).
 	 */
 	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
 	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_faulted = 1;
 		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 	}
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 	    &vd->vdev_physpath) == 0)
 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 	    &vd->vdev_enc_sysfs_path) == 0)
 		vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 		vd->vdev_fru = spa_strdup(vd->vdev_fru);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	vic = &vd->vdev_indirect_config;
 
 	/*
 	 * Indirect-vdev state.  Each field is asserted to still hold the
 	 * value vdev_alloc_common() left it with, and is overwritten only
 	 * when the config carries the corresponding entry.
 	 */
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
 	ASSERT0(vic->vic_births_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 	    &vic->vic_births_object);
 	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 	    &vic->vic_prev_indirect_vdev);
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (top_level &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    &vd->vdev_top_zap);
 	} else {
 		ASSERT0(vd->vdev_top_zap);
 	}
 
 	/* Only checks that the alloctype is valid for a top-level vdev. */
 	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		/* Note: metaslab_group_create() is now deferred */
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv,
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 	} else {
 		ASSERT0(vd->vdev_leaf_zap);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 		    &vd->vdev_rebuild_txg);
 
 		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
 			vdev_defer_resilver(vd);
 
 		/*
 		 * In general, when importing a pool we want to ignore the
 		 * persistent fault state, as the diagnosis made on another
 		 * system may not be valid in the current context.  The only
 		 * exception is if we forced a vdev to a persistently faulted
 		 * state with 'zpool offline -f'.  The persistent fault will
 		 * remain across imports until cleared.
 		 *
 		 * Local vdevs will remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
 		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			/*
 			 * The fault is kept only when the aux state is
 			 * "external"; otherwise vdev_faulted is cleared.
 			 * vdev_degraded is left as read in both cases.
 			 */
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				char *aux;
 
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 				else
 					vd->vdev_faulted = 0ULL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Close and free 'vd' together with every vdev below it.
  *
  * The vdev is closed, its children are freed recursively, its allocation
  * state (metaslab groups) is discarded, and it is removed from its
  * parent's child array.  After that the strings, range trees, space maps,
  * locks, condition variables and rate limiters set up by
  * vdev_alloc_common() and vdev_alloc() are released and the structure
  * itself is freed.  If 'vd' is the pool's root vdev, spa_root_vdev is
  * reset to NULL.
  *
  * The initialize, trim, autotrim and rebuild threads are asserted to be
  * gone already.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
 	 * queue exists here, that implies the vdev is being removed while
 	 * the scan is still running.
 	 */
 	if (vd->vdev_scan_io_queue != NULL) {
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
 		vd->vdev_scan_io_queue = NULL;
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.  Each child's vdev_free() detaches it through
 	 * vdev_remove_child(), which clears the child's slot and releases
 	 * the child array once the last slot is empty; hence the assertion
 	 * that vdev_child is NULL afterwards.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	if (vd->vdev_ops->vdev_op_fini != NULL)
 		vd->vdev_ops->vdev_op_fini(vd);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 	ASSERT(!list_link_active(&vd->vdev_leaf_node));
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 	vdev_cache_fini(vd);
 
 	/* Release the strings that vdev_alloc() copied with spa_strdup(). */
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 
 	if (vd->vdev_enc_sysfs_path)
 		spa_strfree(vd->vdev_enc_sysfs_path);
 
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* Empty and destroy the DTL range trees under the DTL lock. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* The indirect births and mapping are either both set or both NULL. */
 	EQUIV(vd->vdev_indirect_births != NULL,
 	    vd->vdev_indirect_mapping != NULL);
 	if (vd->vdev_indirect_births != NULL) {
 		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 		vdev_indirect_births_close(vd->vdev_indirect_births);
 	}
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 	}
 	range_tree_destroy(vd->vdev_obsolete_segments);
 	rw_destroy(&vd->vdev_indirect_rwlock);
 	mutex_destroy(&vd->vdev_obsolete_lock);
 
 	/* Destroy the locks and cvs created by vdev_alloc_common(). */
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
 	mutex_destroy(&vd->vdev_initialize_lock);
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
 
 	mutex_destroy(&vd->vdev_trim_lock);
 	mutex_destroy(&vd->vdev_autotrim_lock);
 	mutex_destroy(&vd->vdev_trim_io_lock);
 	cv_destroy(&vd->vdev_trim_cv);
 	cv_destroy(&vd->vdev_autotrim_cv);
 	cv_destroy(&vd->vdev_trim_io_cv);
 
 	mutex_destroy(&vd->vdev_rebuild_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
 	/* Freeing the root leaves the pool without a root vdev. */
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * Each piece of state handled here is copied into tvd and then reset in
  * svd (zeroed, set to NULL, or set to its "none" value), so that svd no
  * longer refers to it once this function returns.
  *
  *	svd - source vdev that is giving up its top-level state
  *	tvd - target vdev; asserted to already be its own vdev_top
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	/* Metaslab array bookkeeping and the top-level ZAP object. */
 	tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 	svd->vdev_top_zap = 0;
 
 	/*
 	 * If tvd already has metaslab groups attached, they must be the
 	 * very same groups that svd holds.
 	 */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	if (tvd->vdev_log_mg)
 		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_log_mg = svd->vdev_log_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_log_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Point the transferred groups back at their new owning vdev. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 	if (tvd->vdev_log_mg != NULL)
 		tvd->vdev_log_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
 
 	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
 	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
 
 	/* Space accounting follows the metaslabs. */
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/*
 	 * State which may be set on a top-level vdev that's in the
 	 * process of being removed.
 	 *
 	 * tvd is asserted to carry none of this state before it is
 	 * overwritten with svd's copy.
 	 */
 	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
 	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
 	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
 	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
 	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
 	ASSERT0(tvd->vdev_removing);
 	ASSERT0(tvd->vdev_rebuilding);
 	tvd->vdev_removing = svd->vdev_removing;
 	tvd->vdev_rebuilding = svd->vdev_rebuilding;
 	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
 	tvd->vdev_indirect_config = svd->vdev_indirect_config;
 	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
 	tvd->vdev_indirect_births = svd->vdev_indirect_births;
 	/* The obsolete-segment trees are exchanged rather than copied. */
 	range_tree_swap(&svd->vdev_obsolete_segments,
 	    &tvd->vdev_obsolete_segments);
 	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
 	svd->vdev_indirect_config.vic_mapping_object = 0;
 	svd->vdev_indirect_config.vic_births_object = 0;
 	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
 	svd->vdev_indirect_mapping = NULL;
 	svd->vdev_indirect_births = NULL;
 	svd->vdev_obsolete_sm = NULL;
 	svd->vdev_removing = 0;
 	svd->vdev_rebuilding = 0;
 
 	/*
 	 * For every txg slot, drain svd's metaslab and DTL txg lists into
 	 * tvd's, and if svd itself is on the spa's vdev txg list replace
 	 * it there with tvd.
 	 */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Carry config-dirty and state-dirty list membership over to tvd. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 
 	/* Presumably hands svd's scan I/O queue to tvd -- see dsl_scan. */
 	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 /*
  * Recursively set vdev_top to tvd for vd and every vdev beneath it.
  * A NULL vd is tolerated and ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		int idx = 0;
 
 		vd->vdev_top = tvd;
 
 		/* Walk the children depth-first. */
 		while (idx < vd->vdev_children) {
 			vdev_top_update(tvd, vd->vdev_child[idx]);
 			idx++;
 		}
 	}
 }
 
 /*
  * Add a mirror/replacing vdev above an existing vdev.  There is no need to
  * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
  *
  *	cvd - existing vdev that will become a child of the new vdev
  *	ops - vdev ops vector for the new parent
  *
  * The caller must hold all SCL_ALL config locks as writer (asserted).
  * Returns the newly allocated parent vdev, which takes cvd's former place
  * (and vdev_id) under cvd's old parent.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *pvd = cvd->vdev_parent;
 	vdev_t *mvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	/*
 	 * Seed the new parent's sizes, ashift values, state and creation
 	 * txg from the child it is being placed above.
 	 */
 	mvd->vdev_asize = cvd->vdev_asize;
 	mvd->vdev_min_asize = cvd->vdev_min_asize;
 	mvd->vdev_max_asize = cvd->vdev_max_asize;
 	mvd->vdev_psize = cvd->vdev_psize;
 	mvd->vdev_ashift = cvd->vdev_ashift;
 	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
 
 	/*
 	 * Splice mvd into the tree where cvd used to be, then hang cvd
 	 * beneath mvd.  cvd's id becomes mvd's current child count.
 	 */
 	vdev_remove_child(pvd, cvd);
 	vdev_add_child(pvd, mvd);
 	cvd->vdev_id = mvd->vdev_children;
 	vdev_add_child(mvd, cvd);
 	/* Refresh vdev_top throughout the subtree under cvd's top vdev. */
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* If mvd ended up top-level, it takes over cvd's top-level state. */
 	if (mvd == mvd->vdev_top)
 		vdev_top_transfer(cvd, mvd);
 
 	return (mvd);
 }
 
 /*
  * Remove a 1-way mirror/replacing vdev from the tree.
  *
  *	cvd - the sole child of the mirror/replacing/spare vdev to remove
  *
  * cvd is re-attached to its grandparent in the removed parent's slot and
  * the parent (mvd) is freed.  The caller must hold all SCL_ALL config
  * locks as writer (asserted).
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *mvd = cvd->vdev_parent;
 	vdev_t *pvd = mvd->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(mvd->vdev_children == 1);
 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 	/* The child inherits the ashift values of the parent being removed. */
 	cvd->vdev_ashift = mvd->vdev_ashift;
 	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
 
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 */
 	if (mvd->vdev_top == mvd) {
 		/*
 		 * Shift vdev_guid and vdev_guid_sum by the same delta so
 		 * that cvd ends up with mvd's guid; the guid cvd had before
 		 * is remembered in vdev_orig_guid.
 		 */
 		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += guid_delta;
 		cvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If pool not set for autoexpand, we need to also preserve
 		 * mvd's asize to prevent automatic expansion of cvd.
 		 * Otherwise if we are adjusting the mirror by attaching and
 		 * detaching children of non-uniform sizes, the mirror could
 		 * autoexpand, unexpectedly requiring larger devices to
 		 * re-establish the mirror.
 		 */
 		if (!cvd->vdev_spa->spa_autoexpand)
 			cvd->vdev_asize = mvd->vdev_asize;
 	}
 	/* Take over mvd's slot under the grandparent. */
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(pvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* If cvd is now top-level, it takes over mvd's top-level state. */
 	if (cvd == cvd->vdev_top)
 		vdev_top_transfer(mvd, cvd);
 
 	ASSERT(mvd->vdev_children == 0);
 	vdev_free(mvd);
 }
 
 /*
  * Create the metaslab group(s) for vd if it does not have one yet.
  *
  * The metaslab class is chosen from the vdev's allocation bias; a log
  * vdev with no bias recorded is given VDEV_BIAS_LOG first.  Non-log vdevs
  * additionally get a group in the embedded log class.  For a top-level,
  * non-aux vdev in the normal class with a known ashift, the pool-wide
  * ashift bounds and minimum allocation size are widened to cover it.
  */
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mc;
 
 	/*
 	 * metaslab_group_create was delayed until allocation bias was
 	 * available; if the group already exists there is nothing to do.
 	 */
 	if (vd->vdev_mg != NULL)
 		return;
 
 	if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = VDEV_BIAS_LOG;
 
 	ASSERT3U(vd->vdev_islog, ==,
 	    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
 
 	/* Map the allocation bias onto a metaslab class. */
 	if (vd->vdev_alloc_bias == VDEV_BIAS_LOG)
 		mc = spa_log_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
 		mc = spa_special_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		mc = spa_dedup_class(spa);
 	else
 		mc = spa_normal_class(spa);
 
 	vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count);
 
 	if (!vd->vdev_islog) {
 		vd->vdev_log_mg = metaslab_group_create(
 		    spa_embedded_log_class(spa), vd, 1);
 	}
 
 	/*
 	 * The spa ashift min/max only apply for the normal metaslab
 	 * class. Class destination is late binding so ashift boundary
 	 * setting had to wait until now.
 	 */
 	if (vd->vdev_top != vd || vd->vdev_ashift == 0 ||
 	    mc != spa_normal_class(spa) || vd->vdev_aux != NULL)
 		return;
 
 	if (vd->vdev_ashift > spa->spa_max_ashift)
 		spa->spa_max_ashift = vd->vdev_ashift;
 	if (vd->vdev_ashift < spa->spa_min_ashift)
 		spa->spa_min_ashift = vd->vdev_ashift;
 
 	uint64_t min_alloc = vdev_get_min_alloc(vd);
 	if (min_alloc < spa->spa_min_alloc)
 		spa->spa_min_alloc = min_alloc;
 }
 
 /*
  * Allocate and initialize the metaslabs of a vdev, growing the metaslab
  * array if the vdev already has some (expansion).
  *
  *	vd  - vdev whose metaslabs are set up
  *	txg - 0 when the on-disk metaslab array should be consulted for
  *	      each metaslab's object number; otherwise the txg passed on
  *	      to metaslab_init() (SCL_ALLOC must then be held as writer)
  *
  * Returns 0 on success, or the error from dmu_read() / metaslab_init().
  * A vdev with vdev_ms_shift == 0 is left untouched and 0 is returned.
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 	boolean_t expanding = (oldc != 0);
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	ASSERT(oldc <= newc);
 
 	/* Zero-filled, so slots not yet initialized below read as NULL. */
 	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	/* Keep the existing metaslab pointers and release the old array. */
 	if (expanding) {
 		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
 	/*
 	 * Initialize only the new metaslabs [oldc, newc).  On an early
 	 * error return the array and count are already installed, with
 	 * the remaining slots still NULL.
 	 */
 	for (uint64_t m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
 			error = dmu_read(spa->spa_meta_objset,
 			    vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "unable to read the metaslab "
 				    "array [error=%d]", error);
 				return (error);
 			}
 		}
 
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
 			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Find the emptiest metaslab on the vdev and mark it for use for
 	 * embedded slog by moving it from the regular to the log metaslab
 	 * group.
 	 *
 	 * This is only done for normal-class vdevs with more than
 	 * zfs_embedded_slog_min_ms metaslabs whose log group is still empty.
 	 */
 	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
 	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
 	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
 		uint64_t slog_msid = 0;
 		uint64_t smallest = UINT64_MAX;
 
 		/*
 		 * Note, we only search the new metaslabs, because the old
 		 * (pre-existing) ones may be active (e.g. have non-empty
 		 * range_tree's), and we don't move them to the new
 		 * metaslab_t.
 		 */
 		for (uint64_t m = oldc; m < newc; m++) {
 			uint64_t alloc =
 			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
 			if (alloc < smallest) {
 				slog_msid = m;
 				smallest = alloc;
 			}
 		}
 		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
 		/*
 		 * The metaslab was marked as dirty at the end of
 		 * metaslab_init(). Remove it from the dirty list so that we
 		 * can uninitialize and reinitialize it to the new class.
 		 */
 		if (txg != 0) {
 			(void) txg_list_remove_this(&vd->vdev_ms_list,
 			    slog_ms, txg);
 		}
 		/* Preserve the space map object across the re-init. */
 		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
 		metaslab_fini(slog_ms);
 		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
 		    &vd->vdev_ms[slog_msid]));
 	}
 
 	/*
 	 * With txg == 0 the SCL_ALLOC lock is taken here just around group
 	 * activation; with a non-zero txg the assertion at the top of the
 	 * function requires it to be held already.
 	 */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is being removed we don't activate
 	 * the metaslabs since we want to ensure that no new
 	 * allocations are performed on this device.
 	 */
 	if (!expanding && !vd->vdev_removing) {
 		metaslab_group_activate(vd->vdev_mg);
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 	}
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	/*
 	 * Regardless whether this vdev was just added or it is being
 	 * expanded, the metaslab count has changed. Recalculate the
 	 * block limit.
 	 */
 	spa_log_sm_set_blocklimit(spa);
 
 	return (0);
 }
 
 /*
  * Tear down a vdev's metaslabs: close the checkpoint space map, passivate
  * the metaslab group(s), destroy every metaslab and free the metaslab
  * array.  Written so that a second call on the same vdev is a no-op.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	if (vd->vdev_checkpoint_sm != NULL) {
 		ASSERT(spa_feature_is_active(vd->vdev_spa,
 		    SPA_FEATURE_POOL_CHECKPOINT));
 		space_map_close(vd->vdev_checkpoint_sm);
 		/*
 		 * Even though we close the space map, we need to set its
 		 * pointer to NULL. The reason is that vdev_metaslab_fini()
 		 * may be called multiple times for certain operations
 		 * (i.e. when destroying a pool) so we need to ensure that
 		 * this clause never executes twice. This logic is similar
 		 * to the one used for the vdev_ms clause below.
 		 */
 		vd->vdev_checkpoint_sm = NULL;
 	}
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *mg = vd->vdev_mg;
 
 		metaslab_group_passivate(mg);
 		if (vd->vdev_log_mg != NULL) {
 			ASSERT(!vd->vdev_islog);
 			metaslab_group_passivate(vd->vdev_log_mg);
 		}
 
 		/* Slots may be NULL, so each one is checked before fini. */
 		uint64_t count = vd->vdev_ms_count;
 		for (uint64_t m = 0; m < count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 			if (msp != NULL)
 				metaslab_fini(msp);
 		}
 		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 		vd->vdev_ms_count = 0;
 
 		/* With all metaslabs gone the group histograms must be empty. */
 		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 			ASSERT0(mg->mg_histogram[i]);
 			if (vd->vdev_log_mg != NULL)
 				ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
 		}
 	}
 	ASSERT0(vd->vdev_ms_count);
 	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
 }
 
 /*
  * Per-probe bookkeeping shared by all zios of one vdev probe.  Allocated
  * in vdev_probe() and freed in vdev_probe_done() when the probe's null
  * zio completes.
  */
 typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;	/* set once a probe read succeeds */
 	boolean_t	vps_writeable;	/* set once a probe write succeeds */
 	int		vps_flags;	/* zio flags used for the probe I/Os */
 } vdev_probe_stats_t;
 
 /*
  * Completion callback shared by every zio of a vdev probe.  It is
  * dispatched on zio->io_type:
  *
  *	READ  - record readability; on success in a writeable pool, issue a
  *		write of the same buffer to the same offset, otherwise free
  *		the buffer.
  *	WRITE - record writeability and free the buffer.
  *	NULL  - the probe's parent zio: fold the results into the vdev's
  *		cant_read/cant_write flags, set the final error, clear
  *		vdev_probe_zio, fail inaccessible parent zios with ENXIO and
  *		free the probe stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			/* The abd is handed on to the write; freed there. */
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 		zio_link_t *zl;
 
 		/* Only ever set these flags here, never clear them. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		/*
 		 * The probe passes if the vdev is readable and either
 		 * writeable or the pool itself is not writeable.
 		 */
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			vdev_dbgmsg(vd, "failed probe");
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, NULL, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 		}
 
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		/* Fail every waiting parent that can no longer use the vdev. */
 		zl = NULL;
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  *	vd  - leaf vdev to probe (asserted)
  *	zio - zio that wants the probe, or NULL for a caller-driven probe
  *
  * When zio is NULL, the unissued probe zio is returned and the caller is
  * responsible for issuing/waiting on it.  When zio is non-NULL, zio is
  * made a parent of the (new or already running) probe, the probe is
  * issued here if this call created it, and NULL is returned.  NULL is
  * also returned, with nothing done, if zio is itself a probe I/O.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		/* vps doubles as the "this call created the probe" marker. */
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
 		    ZIO_FLAG_TRYHARD;
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
 		/*
 		 * We can't change the vdev state in this context, so we
 		 * kick off an async task to do it on our behalf.
 		 */
 		if (zio != NULL) {
 			vd->vdev_probe_wanted = B_TRUE;
 			spa_async_request(spa, SPA_ASYNC_PROBE);
 		}
 	}
 
 	/* Attach the requester while still holding the probe lock. */
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/* A probe was already in flight; we only joined it as a parent. */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* Issue one pad-region read per label, skipping label 0. */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
 		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * void * callback wrapper: load the vdev passed as arg and stash the
  * result of vdev_load() in its vdev_load_error field.
  */
 static void
 vdev_load_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_load_error = vdev_load(child);
 }
 
 /*
  * Taskq callback: open the vdev passed as arg.  vdev_open_thread is set
  * to the current thread for the duration of the open and cleared
  * afterwards; the open result is stored in vdev_open_error.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 	int open_err;
 
 	child->vdev_open_thread = curthread;
 	open_err = vdev_open(child);
 	child->vdev_open_error = open_err;
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Returns B_TRUE if vd, or any vdev beneath it, is backed by a zvol
  * according to zvol_is_zvol().  The zvol check is only compiled into
  * kernel builds; elsewhere only the children are consulted.
  */
 static boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 #ifdef _KERNEL
 	if (zvol_is_zvol(vd->vdev_path))
 		found = B_TRUE;
 #endif
 
 	/* Stop descending as soon as one match has been found. */
 	for (int c = 0; !found && c < vd->vdev_children; c++) {
 		if (vdev_uses_zvols(vd->vdev_child[c]))
 			found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Default child filter for vdev_open_children(): every child should be
  * opened, so this always returns B_TRUE regardless of the vdev passed.
  */
 static boolean_t
 vdev_default_open_children_func(vdev_t *vd)
 {
 	(void) vd;
 	return (B_TRUE);
 }
 
 /*
  * Open the requested child vdevs.  If any of the leaf vdevs are using
  * a ZFS volume then do the opens in a single thread.  This avoids a
  * deadlock when the current thread is holding the spa_namespace_lock.
  *
  *	vd        - parent vdev whose children are opened
  *	open_func - filter; children for which it returns B_FALSE are
  *		    neither opened nor counted towards vdev_nonrot.  It is
  *		    called twice per child, so it is assumed to be a pure
  *		    predicate.
  *
  * Each child's result is left in its vdev_open_error.  On return
  * vd->vdev_nonrot is B_TRUE only if every selected child has vdev_nonrot
  * set once its open has completed.
  */
 static void
 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	int children = vd->vdev_children;
 
 	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 	vd->vdev_nonrot = B_TRUE;
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (open_func(cvd) == B_FALSE)
 			continue;
 
 		if (tq == NULL || vdev_uses_zvols(vd)) {
 			cvd->vdev_open_error = vdev_open(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_open_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	/*
 	 * Dispatched opens run asynchronously, so a child's vdev_nonrot is
 	 * not stable until the taskq has drained.  Wait first, and only
 	 * then fold the children's values into the parent; doing it inside
 	 * the dispatch loop would race with the opens still in flight.
 	 */
 	if (tq != NULL)
 		taskq_wait(tq);
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (open_func(cvd) == B_FALSE)
 			continue;
 
 		vd->vdev_nonrot &= cvd->vdev_nonrot;
 	}
 
 	if (tq != NULL)
 		taskq_destroy(tq);
 }
 
 /*
  * Open all child vdevs.
  *
  * Thin wrapper around vdev_open_children_impl() using the default filter,
  * which accepts every child.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	vdev_open_children_impl(vd, vdev_default_open_children_func);
 }
 
 /*
  * Conditionally open a subset of child vdevs.
  *
  * Only children for which open_func returns something other than B_FALSE
  * are opened; see vdev_open_children_impl().
  */
 void
 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	vdev_open_children_impl(vd, open_func);
 }
 
 /*
  * Compute the raidz-deflation ratio.  Note, we hard-code
  * in 128k (1 << 17) because it is the "typical" blocksize.
  * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
  * otherwise it would inconsistently account for existing bp's.
  *
  * Only top-level, non-hole vdevs with a known (non-zero) ashift get a
  * ratio; every other vdev is left untouched.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
 	if (vd != vd->vdev_top || vd->vdev_ishole || vd->vdev_ashift == 0)
 		return;
 
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 }
 
 /*
  * Maximize performance by inflating the configured ashift for top level
  * vdevs to be as close to the physical ashift as possible while maintaining
  * administrator defined limits and ensuring it doesn't go below the
  * logical ashift.
  */
 static void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	ASSERT(vd == vd->vdev_top);
 
 	if (vd->vdev_ashift >= vd->vdev_physical_ashift) {
 		/*
 		 * If the logical and physical ashifts are the same, then
 		 * we ensure that the top-level vdev's ashift is not smaller
 		 * than our minimum ashift value. For the unusual case
 		 * where logical ashift > physical ashift, we can't cap
 		 * the calculated ashift based on max ashift as that
 		 * would cause failures.
 		 * We still check if we need to increase it to match
 		 * the min ashift.
 		 */
 		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
 		    vd->vdev_ashift);
 		return;
 	}
 
 	/*
 	 * The current ashift is below the physical ashift: raise it towards
 	 * the physical value, bounded by the auto-ashift tunables.
 	 */
 	vd->vdev_ashift = MIN(
 	    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
 	    MAX(zfs_vdev_min_auto_ashift,
 	    vd->vdev_physical_ashift));
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * Calls the vdev's type-specific open routine, derives and validates the
  * device sizes and ashift, sets the resulting vdev state, and for leaf
  * vdevs probes the device before declaring it usable.
  *
  * Must be called either from the thread recorded in vdev_open_thread or
  * with all SCL_STATE_ALL config locks held as writer (asserted).
  *
  * Returns 0 on success (also for hole and missing vdevs).  On failure the
  * vdev state is updated and one of the following is returned: ENXIO
  * (faulted, offline, or reported size larger than max size), EOVERFLOW
  * (device too small), EINVAL (allocatable size shrank below the minimum),
  * EDOM (unusable ashift), or the error from the type-specific open,
  * device injection, or the probe.
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 	/*
 	 * Physical volume size should never be larger than its max size, unless
 	 * the disk has shrunk while we were reading it or the device is buggy
 	 * or damaged: either way it's not safe for use, bail out of the open.
 	 */
 	if (osize > max_osize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_OPEN_FAILED);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
 			    vd->vdev_stat.vs_aux);
 		} else {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    vd->vdev_stat.vs_aux);
 		}
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	/* Any child that is not healthy degrades this vdev. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round the reported sizes down to a multiple of the label size. */
 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
 	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
 
 	if (vd->vdev_children == 0) {
 		/*
 		 * No children: the reported size is the physical size, and
 		 * the allocatable size excludes the front and back label
 		 * areas.
 		 */
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		/*
 		 * Has children: the reported size is used as the
 		 * allocatable size directly and there is no physical size.
 		 * The minimum-size check is skipped for a vdev without a
 		 * parent.
 		 */
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	/*
 	 * If the vdev was expanded, record this so that we can re-create the
 	 * uberblock rings in labels {2,3}, during the next sync.
 	 */
 	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
 		vd->vdev_copy_uberblocks = B_TRUE;
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk too much.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * We can always set the logical/physical ashift members since
 	 * their values are only used to calculate the vdev_ashift when
 	 * the device is first added to the config. These values should
 	 * not be used for anything else since they may change whenever
 	 * the device is reopened and we don't store them in the label.
 	 */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift,
 	    vd->vdev_logical_ashift);
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For compatibility, a different ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 
 		/*
 		 * If the vdev_ashift was not overridden at creation time,
 		 * then set it to the logical ashift and optimize the ashift.
 		 */
 		if (vd->vdev_ashift == 0) {
 			vd->vdev_ashift = vd->vdev_logical_ashift;
 
 			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
 				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_ASHIFT_TOO_BIG);
 				return (SET_ERROR(EDOM));
 			}
 
 			if (vd->vdev_top == vd) {
 				vdev_ashift_optimize(vd);
 			}
 		}
 		/* A non-zero ashift must lie within [ASHIFT_MIN, ASHIFT_MAX]. */
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_ASHIFT);
 			return (SET_ERROR(EDOM));
 		}
 	} else {
 		/*
 		 * Make sure the alignment required hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			(void) zfs_ereport_post(
 			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
 			    spa, vd, NULL, NULL, 0);
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EDOM));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy we update asize if either:
 	 * The asize has increased, due to a device expansion caused by dynamic
 	 * LUN growth or vdev replacement, and automatic expansion is enabled;
 	 * making the additional space available.
 	 *
 	 * The asize has decreased, due to a device shrink usually caused by a
 	 * vdev replace with a smaller device. This ensures that calculations
 	 * based on max_asize and asize e.g. esize are always valid. It's safe
 	 * to do this as we've already validated that asize is greater than
 	 * vdev_min_asize.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    ((asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
 	    (asize < vd->vdev_asize)))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the minimum allocation size.
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
 		if (min_alloc < spa->spa_min_alloc)
 			spa->spa_min_alloc = min_alloc;
 	}
 
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
 	 * this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
 		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
 
 	return (0);
 }
 
 /*
  * Taskq callback: validate the vdev passed as arg.  vdev_validate_thread
  * is set to the current thread while vdev_validate() runs and cleared
  * afterwards; the result is stored in vdev_validate_error.
  */
 static void
 vdev_validate_child(void *arg)
 {
 	vdev_t *child = arg;
 	int validate_err;
 
 	child->vdev_validate_thread = curthread;
 	validate_err = vdev_validate(child);
 	child->vdev_validate_error = validate_err;
 	child->vdev_validate_thread = NULL;
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  */
 int
 vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	taskq_t *tq = NULL;
 	nvlist_t *label;
 	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
 	nvlist_t *nvl;
 	uint64_t txg;
 	int children = vd->vdev_children;
 
 	if (vdev_validate_skip)
 		return (0);
 
 	if (children > 0) {
 		tq = taskq_create("vdev_validate", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	for (uint64_t c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			vdev_validate_child(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 	for (int c = 0; c < children; c++) {
 		int error = vd->vdev_child[c]->vdev_validate_error;
 
 		if (error != 0)
 			return (SET_ERROR(EBADF));
 	}
 
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return (0);
 
 	/*
 	 * If we are performing an extreme rewind, we allow for a label that
 	 * was modified at a point after the current txg.
 	 * If config lock is not held do not check for the txg. spa_sync could
 	 * be updating the vdev's label before updating spa_last_synced_txg.
 	 */
 	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
 	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
 		txg = UINT64_MAX;
 	else
 		txg = spa_last_synced_txg(spa);
 
 	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
 		    "txg %llu", (u_longlong_t)txg);
 		return (0);
 	}
 
 	/*
 	 * Determine if this vdev has been split off into another
 	 * pool.  If so, then refuse to open it.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_SPLIT_POOL);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (0);
 	}
 
 	/*
 	 * If config is not trusted then ignore the spa guid check. This is
 	 * necessary because if the machine crashed during a re-guid the new
 	 * guid might have been written to all of the vdev labels, but not the
 	 * cached config. The check will be performed again once we have the
 	 * trusted config from the MOS.
 	 */
 	if (spa->spa_trust_config && guid != spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
 		    "match config (%llu != %llu)", (u_longlong_t)guid,
 		    (u_longlong_t)spa_guid(spa));
 		return (0);
 	}
 
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 	    &aux_guid) != 0)
 		aux_guid = 0;
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_GUID);
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
 	    != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_TOP_GUID);
 		return (0);
 	}
 
 	/*
 	 * If this vdev just became a top-level vdev because its sibling was
 	 * detached, it will have adopted the parent's vdev guid -- but the
 	 * label may or may not be on disk yet. Fortunately, either version
 	 * of the label will have the same top guid, so if we're a top-level
 	 * vdev, we can safely compare to that instead.
 	 * However, if the config comes from a cachefile that failed to update
 	 * after the detach, a top-level vdev will appear as a non top-level
 	 * vdev in the config. Also relax the constraints if we perform an
 	 * extreme rewind.
 	 *
 	 * If we split this vdev off instead, then we also check the
 	 * original pool's guid. We don't want to consider the vdev
 	 * corrupt if it is partway through a split operation.
 	 */
 	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
 		boolean_t mismatch = B_FALSE;
 		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
 			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
 				mismatch = B_TRUE;
 		} else {
 			if (vd->vdev_guid != top_guid &&
 			    vd->vdev_top->vdev_guid != guid)
 				mismatch = B_TRUE;
 		}
 
 		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			vdev_dbgmsg(vd, "vdev_validate: config guid "
 			    "doesn't match label guid");
 			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
 			    (u_longlong_t)vd->vdev_guid,
 			    (u_longlong_t)vd->vdev_top->vdev_guid);
 			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
 			    "aux_guid %llu", (u_longlong_t)guid,
 			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_STATE);
 		return (0);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * If this is a verbatim import, no need to check the
 	 * state of the pool.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 	    spa_load_state(spa) == SPA_LOAD_OPEN &&
 	    state != POOL_STATE_ACTIVE) {
 		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
 		    "for spa %s", (u_longlong_t)state, spa->spa_name);
 		return (SET_ERROR(EBADF));
 	}
 
 	/*
 	 * If we were able to open and validate a vdev that was
 	 * previously marked permanently unavailable, clear that state
 	 * now.
 	 */
 	if (vd->vdev_not_present)
 		vd->vdev_not_present = 0;
 
 	return (0);
 }
 
 /*
  * Copy the device path of svd onto dvd when svd has one.  An existing
  * destination path is replaced only if it differs from the source path;
  * a missing destination path is simply set.  Each change is reported
  * through zfs_dbgmsg().
  */
 static void
 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
 {
 	/* Without a source path there is nothing to copy. */
 	if (svd->vdev_path == NULL)
 		return;
 
 	/* The destination has no path yet: adopt a copy of the source's. */
 	if (dvd->vdev_path == NULL) {
 		dvd->vdev_path = spa_strdup(svd->vdev_path);
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
 		    (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
 		return;
 	}
 
 	/* Both sides have a path; leave the destination alone if equal. */
 	if (strcmp(svd->vdev_path, dvd->vdev_path) == 0)
 		return;
 
 	/* Log before freeing, since the message prints the old path. */
 	zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
 	    "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
 	    dvd->vdev_path, svd->vdev_path);
 	spa_strfree(dvd->vdev_path);
 	dvd->vdev_path = spa_strdup(svd->vdev_path);
 }
 
 /*
  * Recursively copy vdev paths from one vdev to another. Source and destination
  * vdev trees must have same geometry otherwise return error. Intended to copy
  * paths from userland config into MOS config.
  */
 int
 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
 {
 	uint64_t c;
 	int err;
 
 	/*
 	 * Nothing to copy, and nothing to complain about, when the source
 	 * is a missing vdev, both sides are holes, or the destination is an
 	 * indirect vdev.
 	 */
 	if (svd->vdev_ops == &vdev_missing_ops)
 		return (0);
 	if (svd->vdev_ishole && dvd->vdev_ishole)
 		return (0);
 	if (dvd->vdev_ops == &vdev_indirect_ops)
 		return (0);
 
 	/* From here on both nodes must agree on type, guid and fan-out. */
 	if (svd->vdev_ops != dvd->vdev_ops) {
 		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
 		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_guid != dvd->vdev_guid) {
 		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
 		    "%llu)", (u_longlong_t)svd->vdev_guid,
 		    (u_longlong_t)dvd->vdev_guid);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_children != dvd->vdev_children) {
 		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
 		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
 		    (u_longlong_t)dvd->vdev_children);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Descend first; the first failing child aborts the whole copy. */
 	for (c = 0; c < svd->vdev_children; c++) {
 		err = vdev_copy_path_strict(svd->vdev_child[c],
 		    dvd->vdev_child[c]);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Paths are copied for leaf vdevs only. */
 	if (svd->vdev_ops->vdev_op_leaf)
 		vdev_copy_path_impl(svd, dvd);
 
 	return (0);
 }
 
 /*
  * Walk the destination subtree rooted at dvd and, for every concrete leaf,
  * look up a vdev with the same guid underneath the source top-level vdev
  * stvd.  When one is found and its ops match the leaf's, its path is copied
  * onto the leaf via vdev_copy_path_impl().
  */
 static void
 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
 {
 	vdev_t *match;
 	uint64_t c;
 
 	ASSERT(stvd->vdev_top == stvd);
 	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
 
 	for (c = 0; c < dvd->vdev_children; c++)
 		vdev_copy_path_search(stvd, dvd->vdev_child[c]);
 
 	if (dvd->vdev_ops->vdev_op_leaf && vdev_is_concrete(dvd)) {
 		/*
 		 * The idea here is that while a vdev can shift positions
 		 * within a top vdev (when replacing, attaching mirror, etc.)
 		 * it cannot step outside of it.
 		 */
 		match = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
 
 		if (match != NULL && match->vdev_ops == dvd->vdev_ops) {
 			ASSERT(match->vdev_ops->vdev_op_leaf);
 			vdev_copy_path_impl(match, dvd);
 		}
 	}
 }
 
 /*
  * Recursively copy vdev paths from one root vdev to another. Source and
  * destination vdev trees may differ in geometry. For each destination leaf
  * vdev, search a vdev with the same guid and top vdev id in the source.
  * Intended to copy paths from userland config into MOS config.
  */
 void
 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
 {
 	uint64_t ntop;
 	uint64_t t = 0;
 
 	ASSERT(srvd->vdev_ops == &vdev_root_ops);
 	ASSERT(drvd->vdev_ops == &vdev_root_ops);
 
 	/* Only child slots that exist in both root vdevs are visited. */
 	ntop = MIN(srvd->vdev_children, drvd->vdev_children);
 
 	while (t < ntop) {
 		vdev_copy_path_search(srvd->vdev_child[t],
 		    drvd->vdev_child[t]);
 		t++;
 	}
 }
 
 /*
  * Close a virtual device.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	vdev_t *pvd;
 	spa_t *spa __maybe_unused;
 
 	/*
 	 * Validate the pointer before anything reads through it; asserting
 	 * after the first dereference could never catch a NULL vdev.
 	 */
 	ASSERT(vd != NULL);
 
 	pvd = vd->vdev_parent;
 	spa = vd->vdev_spa;
 
 	/*
 	 * The caller must either be the thread recorded in vdev_open_thread
 	 * or hold all of SCL_STATE_ALL as writer.
 	 */
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * If our parent is reopening, then we are as well, unless we are
 	 * going offline.
 	 */
 	if (pvd != NULL && pvd->vdev_reopening)
 		vd->vdev_reopening = !vd->vdev_offline;
 
 	vd->vdev_ops->vdev_op_close(vd);
 
 	vdev_cache_purge(vd);
 
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
 	 * it's still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	/* An offlined vdev is left OFFLINE rather than CLOSED. */
 	if (vd->vdev_offline)
 		vd->vdev_state = VDEV_STATE_OFFLINE;
 	else
 		vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively call the vdev_op_hold callback of every leaf vdev at or below
  * vd that provides one.  Asserts spa_is_root() for the vdev's pool and does
  * nothing while the pool state is POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int child;
 
 	ASSERT(spa_is_root(spa));
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		/* Children are visited before the vdev itself. */
 		for (child = 0; child < vd->vdev_children; child++)
 			vdev_hold(vd->vdev_child[child]);
 
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    vd->vdev_ops->vdev_op_hold != NULL) {
 			vd->vdev_ops->vdev_op_hold(vd);
 		}
 	}
 }
 
 /*
  * Recursively call the vdev_op_rele callback of every leaf vdev at or below
  * vd that provides one.  Asserts spa_is_root() for the vdev's pool.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	int child = 0;
 
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	/* Children are visited before the vdev itself. */
 	while (child < vd->vdev_children) {
 		vdev_rele(vd->vdev_child[child]);
 		child++;
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	/* The rele callback is optional. */
 	if (vd->vdev_ops->vdev_op_rele != NULL)
 		vd->vdev_ops->vdev_op_rele(vd);
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* set the reopening flag unless we're taking the vdev offline */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Call vdev_validate() here to make sure we have the same device.
 	 * Otherwise, a device with an invalid label could be successfully
 	 * opened in response to vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd);
 	} else {
 		/* Aux vdevs go through their own validation routine. */
 		(void) vdev_validate_aux(vd);
 
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache) {
 			/*
 			 * In case the vdev is present we should evict all ARC
 			 * buffers and pointers to log blocks and reclaim their
 			 * space before restoring its contents to L2ARC.
 			 */
 			if (!l2arc_vdev_present(vd)) {
 				l2arc_add_vdev(spa, vd);
 			} else {
 				l2arc_rebuild_vdev(vd, B_TRUE);
 			}
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	/*
 	 * Reassess parent vdev's health.
 	 */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open vd for creation, load its DTLs and initialize its labels for txg.
  * Labels are initialized with VDEV_LABEL_REPLACE when isreplacing is set and
  * with VDEV_LABEL_CREATE otherwise.  On any failure the vdev is closed again
  * and a non-zero error is returned; ENXIO is used when the open itself
  * reported no error but the vdev did not end up VDEV_STATE_HEALTHY.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	/*
 	 * Normally, partial opens (e.g. of a mirror) are allowed.
 	 * For a create, however, we want to fail the request if
 	 * there are any components we can't open.
 	 */
 	int error = vdev_open(vd);
 
 	if (error != 0 || vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		if (error == 0)
 			error = SET_ERROR(ENXIO);
 		return (error);
 	}
 
 	/*
 	 * Recursively load DTLs and initialize all labels.
 	 */
 	error = vdev_dtl_load(vd);
 	if (error == 0) {
 		error = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 
 	if (error != 0) {
 		vdev_close(vd);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Choose the metaslab shift (log2 of the metaslab size) for vd from its
  * allocatable size and the zfs_vdev_*_ms_* tunables, and store it in
  * vd->vdev_ms_shift.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
 	uint64_t default_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t shift = zfs_vdev_default_ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
 	 *
 	 * The default values used below are a good balance between memory
 	 * usage (larger metaslab size means more memory needed for loaded
 	 * metaslabs; more metaslabs means more memory needed for the
 	 * metaslab_t structs), metaslab load time (larger metaslabs take
 	 * longer to load), and metaslab sync time (more metaslabs means
 	 * more time spent syncing all of them).
 	 *
 	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
 	 * The range of the dimensions are as follows:
 	 *
 	 *	2^29 <= ms_size  <= 2^34
 	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslab sizes of
 	 * at least 512MB (2^29) to minimize fragmentation effects when
 	 * testing with smaller devices.  However, the count constraint
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
 	 * size of 16GB.  However, we will cap the total count to 2^17
 	 * metaslabs to keep our memory footprint in check and let the
 	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying the above constraints is summarized
 	 * below.
 	 *
 	 *   vdev size       metaslab count
 	 *  --------------|-----------------
 	 *      < 8GB        ~16
 	 *  8GB   - 100GB   one per 512MB
 	 *  100GB - 3TB     ~200
 	 *  3TB   - 2PB     one per 16GB
 	 *      > 2PB       ~131,072
 	 *  --------------------------------
 	 *
 	 *  Finally, note that all of the above calculate the initial
 	 *  number of metaslabs. Expanding a top-level vdev will result
 	 *  in additional metaslabs being allocated making it possible
 	 *  to exceed the zfs_vdev_ms_count_limit.
 	 */
 
 	/*
 	 * Start from the default shift and only recompute it when the
 	 * resulting count would fall outside [min count, default count].
 	 */
 	if (default_count < zfs_vdev_min_ms_count) {
 		shift = highbit64(asize / zfs_vdev_min_ms_count);
 	} else if (default_count > zfs_vdev_default_ms_count) {
 		shift = highbit64(asize / zfs_vdev_default_ms_count);
 	}
 
 	/* Clamp the shift to [SPA_MAXBLOCKSHIFT, zfs_vdev_max_ms_shift]. */
 	if (shift < SPA_MAXBLOCKSHIFT) {
 		shift = SPA_MAXBLOCKSHIFT;
 	} else if (shift > zfs_vdev_max_ms_shift) {
 		shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
 		if ((asize >> shift) > zfs_vdev_ms_count_limit)
 			shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = shift;
 	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Mark the top-level vdev vd dirty for txg by adding it to the pool's
  * spa_vdev_txg_list.  When flags contains VDD_METASLAB or VDD_DTL, arg is
  * additionally added to the vdev's metaslab or DTL txg list respectively.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd == vd->vdev_top);
 	/* indirect vdevs don't have metaslabs or dtls */
 	ASSERT(vdev_is_concrete(vd) || flags == 0);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if (flags & VDD_METASLAB) {
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	}
 
 	if (flags & VDD_DTL) {
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 	}
 
 	/* The vdev itself is always queued, whatever the flags say. */
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Call vdev_dirty() on the top-level vdev of every leaf at or below vd,
  * passing the leaf itself as the argument.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	int child = 0;
 
 	/* Children are handled before the vdev itself. */
 	while (child < vd->vdev_children) {
 		vdev_dirty_leaves(vd->vdev_child[child], flags, txg);
 		child++;
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vdev_dirty(vd->vdev_top, flags, vd, txg);
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *dtl = vd->vdev_dtl[t];
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	mutex_enter(&vd->vdev_dtl_lock);
 
 	/* A range the tree already contains is not added a second time. */
 	if (range_tree_contains(dtl, txg, size)) {
 		mutex_exit(&vd->vdev_dtl_lock);
 		return;
 	}
 
 	range_tree_add(dtl, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Return whether the vdev's DTL of type t contains the range described by
  * txg and size, as reported by range_tree_contains().  An empty DTL never
  * contains anything.
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *dtl = vd->vdev_dtl[t];
 	boolean_t found;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	/*
 	 * While we are loading the pool, the DTLs have not been loaded yet.
 	 * This isn't a problem but it can result in devices being tried
 	 * which are known to not have the data.  In which case, the import
 	 * is relying on the checksum to ensure that we get the right data.
 	 * Note that while importing we are only reading the MOS, which is
 	 * always checksummed.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	found = range_tree_is_empty(dtl) ?
 	    B_FALSE : range_tree_contains(dtl, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (found);
 }
 
 /*
  * Return whether the vdev's DTL of type t is empty, as reported by
  * range_tree_is_empty().  The check is made under vdev_dtl_lock.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	range_tree_t *dtl = vd->vdev_dtl[t];
 	boolean_t is_empty;
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	is_empty = range_tree_is_empty(dtl);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (is_empty);
 }
 
 /*
  * Check if the txg falls within the range which must be
  * resilvered.  DVAs outside this range can always be skipped.
  */
 boolean_t
 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	/*
 	 * TXG_UNKNOWN is set by sequential resilver; such blocks are always
 	 * reported as needing a resilver.  Otherwise consult DTL_PARTIAL
 	 * for the block's birth txg.  dva and psize are not used here.
 	 */
 	if (phys_birth != TXG_UNKNOWN)
 		return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
 
 	return (B_TRUE);
 }
 
 /*
  * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  */
 boolean_t
 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	/*
 	 * Only non-leaf vdevs that supply a vdev_op_need_resilver callback
 	 * get to decide; everything else answers B_TRUE.
 	 */
 	if (vd->vdev_ops->vdev_op_need_resilver != NULL &&
 	    !vd->vdev_ops->vdev_op_leaf) {
 		return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
 		    phys_birth));
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns the txg immediately below the lowest txg in the DTL_MISSING range
  * (i.e. the range's minimum minus one).
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	uint64_t lowest;
 
 	/* Caller holds the DTL lock; vd is a leaf with a non-empty DTL. */
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	lowest = range_tree_min(vd->vdev_dtl[DTL_MISSING]);
 
 	/* Note that the value handed back is one less than the minimum. */
 	return (lowest - 1);
 }
 
 /*
  * Returns the highest txg in the DTL.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	uint64_t highest;
 
 	/* Caller holds the DTL lock; vd is a leaf with a non-empty DTL. */
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	highest = range_tree_max(vd->vdev_dtl[DTL_MISSING]);
 
 	return (highest);
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
 {
 	/* Only leaf vdevs are examined here. */
 	ASSERT0(vd->vdev_children);
 
 	/* Vdevs in a state below VDEV_STATE_DEGRADED are never excised. */
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_FALSE);
 
 	/* Leave the DTL alone while vdev_resilver_deferred is set. */
 	if (vd->vdev_resilver_deferred)
 		return (B_FALSE);
 
 	/*
 	 * An empty DTL_MISSING is always eligible.  This check must also
 	 * come before any use of vdev_dtl_min()/vdev_dtl_max() below, both
 	 * of which assert that DTL_MISSING is non-empty.
 	 */
 	if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
 
 	if (rebuild_done) {
 		/* Rebuild state is kept on the top-level vdev. */
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 		/* Rebuild not initiated by attach */
 		if (vd->vdev_rebuild_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a rebuild completes without error then all missing data
 		 * up to the rebuild max txg has been reconstructed and the DTL
 		 * is eligible for excision.
 		 */
 		if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
 		    vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
 			ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
 			ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
 			ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
 			return (B_TRUE);
 		}
 	} else {
 		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 		/* scnp is referenced only from the assertions below. */
 		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
 
 		/* Resilver not initiated by attach */
 		if (vd->vdev_resilver_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a resilver is initiated the scan will assign the
 		 * scn_max_txg value to the highest txg value that exists
 		 * in all DTLs. If this device's max DTL is not part of this
 		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
 		 * then it is not eligible for excision.
 		 */
 		if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
 			ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
 			ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
 			ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
 			return (B_TRUE);
 		}
 	}
 
 	/* The DTL extends beyond what the scan or rebuild covered. */
 	return (B_FALSE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion. If txg == 0 no
  * write operations will be issued to the pool.
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	/*
 	 * Reassess the children first: an interior vdev's DTLs are derived
 	 * from its children's DTLs further down.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done, rebuild_done);
 
 	/* The root vdev, non-concrete vdevs and aux vdevs are skipped. */
 	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
 		return;
 
 	/*
 	 * Leaf case: possibly excise the scanned range from DTL_MISSING,
 	 * then rebuild DTL_PARTIAL and DTL_OUTAGE from it.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		boolean_t check_excise = B_FALSE;
 		boolean_t wasempty = B_TRUE;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If requested, pretend the scan or rebuild completed cleanly.
 		 */
 		if (zfs_scan_ignore_errors) {
 			if (scn != NULL)
 				scn->scn_phys.scn_errors = 0;
 			if (vr != NULL)
 				vr->vr_rebuild_phys.vrp_errors = 0;
 		}
 
 		/*
 		 * Remember whether DTL_MISSING started out non-empty so the
 		 * "now empty" message below is only logged on a transition.
 		 */
 		if (scrub_txg != 0 &&
 		    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 			wasempty = B_FALSE;
 			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
 			    "dtl:%llu/%llu errors:%llu",
 			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
 			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
 			    (u_longlong_t)vdev_dtl_min(vd),
 			    (u_longlong_t)vdev_dtl_max(vd),
 			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
 		}
 
 		/*
 		 * If we've completed a scrub/resilver or a rebuild cleanly
 		 * then determine if this vdev should remove any DTLs. We
 		 * only want to excise regions on vdevs that were available
 		 * during the entire duration of this scan.
 		 */
 		if (rebuild_done &&
 		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
 			check_excise = B_TRUE;
 		} else {
 			if (spa->spa_scrub_started ||
 			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
 				check_excise = B_TRUE;
 			}
 		}
 
 		if (scrub_txg && check_excise &&
 		    vdev_dtl_should_excise(vd, rebuild_done)) {
 			/*
 			 * We completed a scrub, resilver or rebuild up to
 			 * scrub_txg.  If we did it without rebooting, then
 			 * the scrub dtl will be valid, so excise the old
 			 * region and fold in the scrub dtl.  Otherwise,
 			 * leave the dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 
 			if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 				zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
 				    (u_longlong_t)vdev_dtl_min(vd),
 				    (u_longlong_t)vdev_dtl_max(vd));
 			} else if (!wasempty) {
 				zfs_dbgmsg("DTL_MISSING is now empty");
 			}
 		}
 		/* For a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
 		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
 		/*
 		 * DTL_OUTAGE covers everything when the vdev is unreadable;
 		 * otherwise it too is a copy of DTL_MISSING.
 		 */
 		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
 			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
 			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 
 		/*
 		 * If the vdev was resilvering or rebuilding and no longer
 		 * has any DTLs then reset the appropriate flag and dirty
 		 * the top level so that we persist the change.
 		 */
 		if (txg != 0 &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
 			if (vd->vdev_rebuild_txg != 0) {
 				vd->vdev_rebuild_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			} else if (vd->vdev_resilver_txg != 0) {
 				vd->vdev_resilver_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			}
 		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/* With txg == 0 nothing is queued for writing. */
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
 	}
 
 	/*
 	 * Interior case: regenerate each DTL from the children's DTLs by
 	 * counting, per txg, how many children contain it and keeping the
 	 * txgs that reach minref.  Each child's lock is taken while its map
 	 * is read, nested inside the parent's lock.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		/* account for child's outage in parent's missing map */
 		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 		if (t == DTL_SCRUB)
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
 		else if (vdev_get_nparity(vd) != 0)
 			minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			mutex_enter(&cvd->vdev_dtl_lock);
 			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
 		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
 		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Load the on-disk DTL of every leaf at or below vd that has a DTL object
  * into that leaf's in-core DTL_MISSING tree.  Returns 0 on success or the
  * first error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *loaded;
 	int error = 0;
 
 	/*
 	 * Anything that is not a leaf with a DTL object just recurses into
 	 * its children, stopping at the first failure.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			error = vdev_dtl_load(vd->vdev_child[c]);
 			if (error != 0)
 				break;
 		}
 
 		return (error);
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	error = space_map_open(&vd->vdev_dtl_sm, mos,
 	    vd->vdev_dtl_object, 0, -1ULL, 0);
 	if (error)
 		return (error);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	/*
 	 * Read the space map into a scratch tree first; it is merged into
 	 * DTL_MISSING under the DTL lock only if the load succeeded.
 	 */
 	loaded = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	error = space_map_load(vd->vdev_dtl_sm, loaded, SM_ALLOC);
 	if (error == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		range_tree_walk(loaded, range_tree_add,
 		    vd->vdev_dtl[DTL_MISSING]);
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	range_tree_vacate(loaded, NULL, NULL);
 	range_tree_destroy(loaded);
 
 	return (error);
 }
 
 /*
  * Record the vdev's allocation bias as a string under
  * VDEV_TOP_ZAP_ALLOCATION_BIAS in its top-level ZAP.  For the special and
  * dedup biases, spa_activate_allocation_classes() is called as well.
  */
 static void
 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 	const char *bias_name;
 
 	ASSERT(alloc_bias != VDEV_BIAS_NONE);
 
 	/* Map the bias to its string form; unknown values map to NULL. */
 	switch (alloc_bias) {
 	case VDEV_BIAS_LOG:
 		bias_name = VDEV_ALLOC_BIAS_LOG;
 		break;
 	case VDEV_BIAS_SPECIAL:
 		bias_name = VDEV_ALLOC_BIAS_SPECIAL;
 		break;
 	case VDEV_BIAS_DEDUP:
 		bias_name = VDEV_ALLOC_BIAS_DEDUP;
 		break;
 	default:
 		bias_name = NULL;
 		break;
 	}
 
 	ASSERT(bias_name != NULL);
 	/* The stored value includes the terminating NUL. */
 	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
 	    1, strlen(bias_name) + 1, bias_name, tx));
 
 	if (alloc_bias == VDEV_BIAS_SPECIAL ||
 	    alloc_bias == VDEV_BIAS_DEDUP)
 		spa_activate_allocation_classes(spa, tx);
 }
 
 /*
  * Destroy the ZAP object zapobj in the MOS and remove its entry from the
  * pool's spa_all_vdev_zaps.  Both steps are VERIFY'd to succeed.
  */
 void
 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 
 	VERIFY0(zap_destroy(mos, zapobj, tx));
 	VERIFY0(zap_remove_int(mos, spa->spa_all_vdev_zaps, zapobj, tx));
 }
 
 /*
  * Create a new ZAP object in the MOS, add it to the pool's
  * spa_all_vdev_zaps, and return its object number.
  */
 uint64_t
 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zapobj;
 
 	zapobj = zap_create(mos, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 	ASSERT(zapobj != 0);
 
 	VERIFY0(zap_add_int(mos, spa->spa_all_vdev_zaps, zapobj, tx));
 
 	return (zapobj);
 }
 
 /*
  * Walk the vdev tree at vd and create any leaf or top-level ZAP that does
  * not exist yet.  Hole, missing and root vdevs, and vdevs whose top-level
  * vdev is being removed, get no ZAPs but their children are still visited.
  */
 void
 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	/*
 	 * The ops checks come first so that vdev_top is only dereferenced
 	 * for vdevs that passed them.
 	 */
 	boolean_t eligible = (vd->vdev_ops != &vdev_hole_ops &&
 	    vd->vdev_ops != &vdev_missing_ops &&
 	    vd->vdev_ops != &vdev_root_ops &&
 	    !vd->vdev_top->vdev_removing);
 
 	if (eligible) {
 		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0)
 			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
 
 		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
 			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
 			/* A fresh top-level ZAP also records the bias. */
 			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
 				vdev_zap_allocation_data(vd, tx);
 		}
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_construct_zaps(vd->vdev_child[c], tx);
 }
 
 /*
  * Write the in-core DTL_MISSING tree of leaf vdev vd to its DTL space map
  * in the given txg.
  *
  * If the leaf is detached or its top-level vdev is being removed, the
  * space map is freed instead (and, for detached leaves or log top-levels,
  * the leaf ZAP is destroyed).  Otherwise a space map object is allocated
  * if none is open yet, the map is truncated and rewritten from a snapshot
  * of DTL_MISSING, and the top-level vdev's config is dirtied when the
  * space map object number has changed.
  */
 static void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *rtsync;
 	dmu_tx_t *tx;
 	/* Object number on entry, compared against the one after the write. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * We only destroy the leaf ZAP for detached leaves or for
 		 * removed log devices. Removed data devices handle leaf ZAP
 		 * cleanup later, once cancellation is no longer possible.
 		 */
 		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
 		    vd->vdev_top->vdev_islog)) {
 			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
 			vd->vdev_leaf_zap = 0;
 		}
 
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* No space map open yet: allocate an object and open it. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 
 	/*
 	 * Copy DTL_MISSING into a private tree under the lock so that the
 	 * space map write below runs without holding vdev_dtl_lock.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* Replace the on-disk contents wholesale with the snapshot. */
 	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
 		    (u_longlong_t)object,
 		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Report whether taking this vdev out of service (offline, detach or
  * remove) would cost the pool data.
  *
  * The answer is found by simulation: the vdev is temporarily flagged as
  * unreadable, the DTLs of its top-level vdev are reassessed, and the
  * DTL_OUTAGE tree is inspected.  The original flag is then restored and
  * the DTLs are reassessed once more.  The root vdev and top-level vdevs
  * are always reported as required.
  *
  * The caller must hold SCL_STATE_ALL as writer.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t outage;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == spa->spa_root_vdev)
 		return (B_TRUE);
 	if (vd == top)
 		return (B_TRUE);
 
 	/* Pretend the device is unreadable and see what the top-level loses. */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE, B_FALSE);
 	outage = !vdev_dtl_empty(top, DTL_OUTAGE);
 
 	/* Undo the experiment before looking at the result. */
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE, B_FALSE);
 
 	if (outage)
 		return (B_TRUE);
 
 	/* Fault injection may still declare the device required. */
 	if (zio_injection_enabled) {
 		return (!!zio_handle_device_injection(vd, NULL,
 		    SET_ERROR(ECHILD)));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A leaf needs resilvering when its DTL_MISSING tree is non-empty and the
  * leaf is writeable; an interior vdev needs it when any of its children
  * does, in which case the children's ranges are merged.
  *
  * minp and maxp are optional and independent of each other: either may be
  * NULL.  When B_TRUE is returned, each non-NULL pointer receives the
  * corresponding bound (from vdev_dtl_min()/vdev_dtl_max()).  When B_FALSE
  * is returned neither is written.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			/* cmin/cmax are only valid when the child says yes. */
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	if (needed) {
 		/*
 		 * Guard each out-parameter on its own so that a caller
 		 * interested in only one bound cannot make us write
 		 * through a NULL pointer.
 		 */
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Look up the checkpoint space map object recorded in the vdev's top-level
  * ZAP.  Returns 0 with *sm_obj set to the object number, or with *sm_obj
  * set to zero when the vdev has no top-level ZAP or the ZAP holds no such
  * entry.  Any other zap_lookup() failure is handed back to the caller.
  */
 int
 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	int err;
 
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* No ZAP, or no entry in it: there is no checkpoint space map. */
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Load the persistent state of a vdev subtree.
  *
  * Children are loaded first.  When vd is the root vdev this happens through
  * a taskq, except for children for which vdev_uses_zvols() is true, which
  * are loaded inline.  The first non-zero child vdev_load_error is returned.
  * After that the function, as applicable to this vdev:
  *   - sets the deflate ratio;
  *   - for a top-level vdev with a top ZAP, reads the allocation bias and
  *     the rebuild state;
  *   - for a concrete top-level vdev, creates the metaslab group,
  *     initializes the metaslabs and opens the checkpoint space map;
  *   - for a leaf, loads the DTL;
  *   - opens the obsolete space map if one is recorded.
  *
  * Returns 0 on success or the error of the first step that failed.
  */
 int
 vdev_load(vdev_t *vd)
 {
 	int children = vd->vdev_children;
 	int error = 0;
 	taskq_t *tq = NULL;
 
 	/*
 	 * It's only worthwhile to use the taskq for the root vdev, because the
 	 * slow part is metaslab_init, and that only happens for top-level
 	 * vdevs.
 	 */
 	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
 		tq = taskq_create("vdev_load", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		/* Without a taskq, or for zvol-backed children, load inline. */
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			cvd->vdev_load_error = vdev_load(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_load_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	/* Wait for every dispatched child load before inspecting results. */
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 
 	/* Report the first child failure, in child order. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		/*
 		 * NOTE(review): this local shadows the function-level
 		 * 'error' declared above.
 		 */
 		int error = vd->vdev_child[c]->vdev_load_error;
 
 		if (error != 0)
 			return (error);
 	}
 
 	vdev_set_deflate_ratio(vd);
 
 	/*
 	 * On spa_load path, grab the allocation bias from our zap
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		char bias_str[64];
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
 		    bias_str);
 		if (error == 0) {
 			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
 			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
 		} else if (error != ENOENT) {
 			/* A missing entry is fine; anything else is fatal. */
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
-			    "failed [error=%d]", vd->vdev_top_zap, error);
+			    "failed [error=%d]",
+			    (u_longlong_t)vd->vdev_top_zap, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Load any rebuild state from the top-level vdev zap.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		error = vdev_rebuild_load(vd);
 		/* ENOTSUP from vdev_rebuild_load() is tolerated. */
 		if (error && error != ENOTSUP) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
 			    "failed [error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
 		vdev_metaslab_group_create(vd);
 
 		/* A zero ashift or asize means the config is unusable. */
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
 		}
 
 		error = vdev_metaslab_init(vd, 0);
 		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (error);
 		}
 
 		uint64_t checkpoint_sm_obj;
 		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
 		if (error == 0 && checkpoint_sm_obj != 0) {
 			objset_t *mos = spa_meta_objset(vd->vdev_spa);
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
 			/*
 			 * NOTE(review): unlike the other failure paths in
 			 * this function, a failure here (and in the else
 			 * branch below) does not call vdev_set_state().
 			 */
 			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
 			    vd->vdev_ashift);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
 				    (u_longlong_t)checkpoint_sm_obj, error);
 				return (error);
 			}
 			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 			/*
 			 * Since the checkpoint_sm contains free entries
 			 * exclusively we can use space_map_allocated() to
 			 * indicate the cumulative checkpointed space that
 			 * has been freed.
 			 */
 			vd->vdev_stat.vs_checkpoint_space =
 			    -space_map_allocated(vd->vdev_checkpoint_sm);
 			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
 			    vd->vdev_stat.vs_checkpoint_space;
 		} else if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
 			    "checkpoint space map object from vdev ZAP "
 			    "[error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
 		    "[error=%d]", error);
 		return (error);
 	}
 
 	/*
 	 * Open the obsolete space map when vdev_obsolete_sm_object() reports
 	 * a non-zero object for this vdev.
 	 */
 	uint64_t obsolete_sm_object;
 	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
 	if (error == 0 && obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		ASSERT(vd->vdev_asize != 0);
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
 		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
 		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
 			    "obsolete spacemap (obj %llu) [error=%d]",
 			    (u_longlong_t)obsolete_sm_object, error);
 			return (error);
 		}
 	} else if (error != 0) {
 		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
 		    "space map object from vdev ZAP [error=%d]", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Validate an auxiliary vdev (hot spare or l2cache device).  The only job
  * of this routine is to set the vdev state: once the device is readable we
  * read its label and check that it carries a supported version, the guid
  * we expect, and a pool state entry.
  *
  * Returns 0 when the vdev is not readable or the label passes the checks,
  * and -1 (after marking the vdev CANT_OPEN / CORRUPT_DATA) otherwise.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *config;
 	uint64_t label_guid, label_version, pool_state;
 	int sane;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	config = vdev_label_read_config(vd, -1ULL);
 	if (config == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/* Each check only runs if every earlier one succeeded. */
 	sane = nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &label_version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(label_version) &&
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
 	    &label_guid) == 0 &&
 	    label_guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &pool_state) == 0;
 
 	if (!sane) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(config);
 		return (-1);
 	}
 
 	/*
 	 * The pool state itself is deliberately not examined here.  Whether
 	 * the device is in use by another pool is worked out on demand.
 	 */
 	nvlist_free(config);
 	return (0);
 }
 
 /*
  * Free the object referenced by the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS
  * entry of the vdev's top-level ZAP and remove that entry.  Does nothing
  * when the vdev has no top-level ZAP or the entry does not exist.
  */
 static void
 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos = spa_meta_objset(vd->vdev_spa);
 	uint64_t txgs_obj = 0;
 	int rc;
 
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	rc = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &txgs_obj);
 
 	/* A missing entry simply means there is nothing to destroy. */
 	if (rc != ENOENT) {
 		VERIFY0(rc);
 
 		VERIFY0(dmu_object_free(mos, txgs_obj, tx));
 		VERIFY0(zap_remove(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
 	}
 }
 
 /*
  * Release every space map object listed in this vdev's metaslab array,
  * then the array object itself and the metaslab flush data, and finally
  * clear vdev_ms_array.  A vdev without a metaslab array is left alone.
  */
 void
 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos;
 	uint64_t nslots;
 	size_t nbytes;
 	uint64_t *objs;
 
 	if (vd->vdev_ms_array == 0)
 		return;
 
 	mos = vd->vdev_spa->spa_meta_objset;
 	nslots = vd->vdev_asize >> vd->vdev_ms_shift;
 	nbytes = nslots * sizeof (uint64_t);
 
 	/* Pull the whole array of object numbers in with a single read. */
 	objs = kmem_alloc(nbytes, KM_SLEEP);
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, nbytes, objs, 0));
 
 	/* A zero slot has no space map object to free. */
 	for (uint64_t slot = 0; slot < nslots; slot++) {
 		if (objs[slot] != 0)
 			space_map_free_obj(mos, objs[slot], tx);
 	}
 	kmem_free(objs, nbytes);
 
 	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
 	vdev_destroy_ms_flush_data(vd, tx);
 	vd->vdev_ms_array = 0;
 }
 
 /*
  * Destroy the on-disk metadata of a log top-level vdev: its space maps
  * and, if it has one, its top-level ZAP.  All of it happens in a single
  * transaction assigned to txg, which must be the currently syncing txg.
  */
 static void
 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	dmu_tx_t *tx;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	vdev_destroy_spacemaps(vd, tx);
 
 	/* Unlink and forget the top-level ZAP when there is one. */
 	if (vd->vdev_top_zap != 0) {
 		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
 		vd->vdev_top_zap = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish syncing a vdev for the given txg: run metaslab_sync_done() on
  * every metaslab queued on the TXG_CLEAN(txg) list and, if that list was
  * not empty to begin with, reassess the vdev's metaslab group(s).
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	metaslab_t *ms;
 	/* Sample this before the list is drained below. */
 	boolean_t had_work =
 	    !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(vdev_is_concrete(vd));
 
 	for (;;) {
 		ms = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg));
 		if (ms == NULL)
 			break;
 		metaslab_sync_done(ms, txg);
 	}
 
 	if (!had_work)
 		return;
 
 	metaslab_sync_reassess(vd->vdev_mg);
 	if (vd->vdev_log_mg != NULL)
 		metaslab_sync_reassess(vd->vdev_log_mg);
 }
 
 /*
  * Sync a vdev's dirty state for the given txg, which must be the txg that
  * is currently syncing: write out obsolete segments, allocate the metaslab
  * array object if it does not exist yet, sync every dirty metaslab and
  * every dirty leaf DTL, and tear down an empty log device that is being
  * removed.  The vdev is then queued on spa_vdev_txg_list under
  * TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
 
 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 
 		vdev_indirect_sync_obsolete(vd, tx);
 
 		/*
 		 * If the vdev is indirect, it can't have dirty
 		 * metaslabs or DTLs.
 		 */
 		if (vd->vdev_ops == &vdev_indirect_ops) {
 			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
 			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * Allocate the metaslab array object when it is still missing, the
 	 * metaslab shift is already set, and the vdev is not being removed.
 	 * The config is dirtied so the new object number gets recorded.
 	 */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
 	    !vd->vdev_removing) {
 		ASSERT(vd == vd->vdev_top);
 		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 	}
 
 	/*
 	 * Sync each dirty metaslab, then requeue it under TXG_CLEAN(txg),
 	 * the list that vdev_sync_done() drains.
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	/* Sync the DTL of every vdev queued on the dirty DTL list. */
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
 
 	/*
 	 * If this is an empty log device being removed, destroy the
 	 * metadata associated with it.
 	 */
 	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove_empty_log(vd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
 
 /*
  * Ask the vdev type's vdev_op_asize callback for the asize that
  * corresponds to psize on this vdev.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  *
  * 'guid' selects the vdev and 'aux' records why it is being faulted;
  * VDEV_AUX_EXTERNAL_PERSIST requests a fault that persists across reboots.
  * Only leaf vdevs may be faulted.  The return value comes from
  * spa_vdev_state_exit(), which is handed ENODEV when no vdev matches the
  * guid, ENOTSUP when the vdev is not a leaf, and 0 otherwise.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 
 	/*
 	 * If user did a 'zpool offline -f' then make the fault persist across
 	 * reboots.
 	 */
 	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
 		/*
 		 * There are two kinds of forced faults: temporary and
 		 * persistent.  Temporary faults go away at pool import, while
 		 * persistent faults stay set.  Both types of faults can be
 		 * cleared with a zpool clear.
 		 *
 		 * We tell if a vdev is persistently faulted by looking at the
 		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
 		 * import then it's a persistent fault.  Otherwise, it's
 		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
 		 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
 		 * tells vdev_config_generate() (which gets run later) to set
 		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
 		 */
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_tmpoffline = B_FALSE;
 		aux = VDEV_AUX_EXTERNAL;
 	} else {
 		vd->vdev_tmpoffline = B_TRUE;
 	}
 
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
 	 * were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * If this device has the only valid copy of the data, then
 	 * back off and simply mark the vdev as degraded instead.
 	 * Log top-levels and auxiliary vdevs are exempt from this check.
 	 */
 	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
 		/*
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
 		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Flag the leaf vdev identified by guid as degraded.  Degraded is only a
  * signal to the user that something is wrong; I/O to the vdev carries on
  * as before.
  *
  * ENODEV is reported for an unknown guid and ENOTSUP for a non-leaf vdev.
  * A vdev that is already faulted or degraded is left untouched.
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *leaf;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	leaf = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (leaf == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!leaf->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Already faulted or degraded: there is nothing to add. */
 	if (leaf->vdev_faulted || leaf->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	/* Remember the request even if the state cannot change right now. */
 	leaf->vdev_degraded = 1ULL;
 	if (!vdev_is_dead(leaf))
 		vdev_set_state(leaf, B_FALSE, VDEV_STATE_DEGRADED, aux);
 
 	return (spa_vdev_state_exit(spa, leaf, 0));
 }
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * 'guid' selects the vdev, which must be a leaf.  'flags' is a mask of
  * ZFS_ONLINE_* values.  If 'newstate' is not NULL it receives the vdev state
  * observed after the reopen.
  *
  * The return value comes from spa_vdev_state_exit(), which is handed ENODEV
  * for an unknown guid, ENOTSUP for a non-leaf vdev or when expansion is
  * requested (or autoexpand is on) for an auxiliary vdev, and 0 otherwise.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 	boolean_t wasoffline;
 	vdev_state_t oldstate;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Capture the previous state for the event decision at the end. */
 	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
 	oldstate = vd->vdev_state;
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		/* Flag this vdev and its ancestors below the root. */
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
 			    spa->spa_autoexpand);
 		vd->vdev_expansion_time = gethrestime_sec();
 	}
 
 	vdev_reopen(tvd);
 	/* These flags only apply to the reopen just performed. */
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux) {
 			/*
 			 * Use SET_ERROR() like every other error return
 			 * in this function.
 			 */
 			return (spa_vdev_state_exit(spa, vd,
 			    SET_ERROR(ENOTSUP)));
 		}
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	/* Restart initializing if necessary */
 	mutex_enter(&vd->vdev_initialize_lock);
 	if (vdev_writeable(vd) &&
 	    vd->vdev_initialize_thread == NULL &&
 	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
 		(void) vdev_initialize(vd);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/*
 	 * Restart trimming if necessary. We do not restart trimming for cache
 	 * devices here. This is triggered by l2arc_rebuild_vdev()
 	 * asynchronously for the whole device or in l2arc_evict() as it evicts
 	 * space for upcoming writes.
 	 */
 	mutex_enter(&vd->vdev_trim_lock);
 	if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
 	    vd->vdev_trim_thread == NULL &&
 	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
 		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
 		    vd->vdev_trim_secure);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/*
 	 * Post an online event if the vdev had been offline, or if its state
 	 * rose from below DEGRADED to DEGRADED or better.
 	 */
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the leaf vdev identified by guid offline.  vdev_offline() calls this
  * with spa_vdev_top_lock held.
  *
  * The request fails with ENODEV if the guid is unknown, ENOTSUP for
  * non-leaf vdevs and dRAID spares, EBUSY if the device holds the only valid
  * copy of some data or if offlining it would make its (non-log) top-level
  * vdev unusable, and ZFS_ERR_CHECKPOINT_EXISTS if a log device still has
  * checkpointed data.  An error from spa_reset_logs() is passed through.
  *
  * When the operation succeeds, vdev_tmpoffline is set according to
  * ZFS_OFFLINE_TEMPORARY in 'flags'.
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Use SET_ERROR() here too, like the neighbouring error returns. */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/* The value compared against below after the state lock is retaken. */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = spa_reset_logs(spa);
 
 			/*
 			 * If the log device was successfully reset but has
 			 * checkpointed data, do not offline it.
 			 */
 			if (error == 0 &&
 			    tvd->vdev_checkpoint_sm != NULL) {
 				ASSERT3U(space_map_allocated(
 				    tvd->vdev_checkpoint_sm), !=, 0);
 				error = ZFS_ERR_CHECKPOINT_EXISTS;
 			}
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				/* No error, but the config moved: retry. */
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev identified by guid.  This is vdev_offline_locked() run
  * with spa_vdev_top_lock held; its result is returned unchanged.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  *
  * The caller must hold SCL_STATE_ALL as writer.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* NULL is shorthand for the whole pool, i.e. the root vdev. */
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 	vd->vdev_stat.vs_slow_ios = 0;
 
 	/* Children are cleared before the rest of this vdev is handled. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * It makes no sense to "clear" an indirect vdev.
 	 */
 	if (!vdev_is_concrete(vd))
 		return;
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 		/*
 		 * When reopening in response to a clear event, it may be due to
 		 * a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 		vd->vdev_stat.vs_aux = 0;
 
 		/* Reopen the root itself, or else this vdev's top-level. */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* If a resilver isn't required, check if vdevs can be culled */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	/* Clear recent error events cache (i.e. duplicate events tracking) */
 	zfs_ereport_clear(spa, vd);
 }
 
 /*
  * A vdev is "dead" when its state is below VDEV_STATE_DEGRADED, or when it
  * is a hole or a missing device.  Counting holes and missing devices as
  * dead spares the various code paths from checking for them explicitly:
  * dead devices are skipped before any I/O is issued to them.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	boolean_t dead = (vd->vdev_state < VDEV_STATE_DEGRADED);
 
 	if (!dead) {
 		dead = (vd->vdev_ops == &vdev_hole_ops ||
 		    vd->vdev_ops == &vdev_missing_ops);
 	}
 
 	return (dead);
 }
 
 /*
  * A vdev is readable when it is not dead and not flagged vdev_cant_read.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	return (!(vdev_is_dead(vd) || vd->vdev_cant_read));
 }
 
 /*
  * A vdev is writeable when it is not dead, not flagged vdev_cant_write,
  * and concrete.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	boolean_t blocked = (vdev_is_dead(vd) || vd->vdev_cant_write);
 
 	return (!blocked && vdev_is_concrete(vd));
 }
 
 /*
  * Report whether allocations may be made from this vdev: its state must
  * be acceptable (see below), it must not be flagged vdev_cant_write, it
  * must be concrete, and its metaslab group must be initialized.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * The state is copied into a local because two separate questions
 	 * are asked about it; it changes atomically, but it could change
 	 * between the two reads.
 	 */
 	uint64_t snapshot = vd->vdev_state;
 
 	/*
 	 * VDEV_STATE_CLOSED is accepted on purpose: the vdev may be in the
 	 * middle of a reopen.  If the reopen fails, that is caught later
 	 * when the proper locks are held.
 	 */
 	boolean_t state_ok = (snapshot >= VDEV_STATE_DEGRADED ||
 	    snapshot == VDEV_STATE_CLOSED);
 
 	return (state_ok && !vd->vdev_cant_write && vdev_is_concrete(vd) &&
 	    vd->vdev_mg->mg_initialized);
 }
 
 /*
  * Decide whether the given zio may be issued to its vdev.  Dead vdevs and
  * vdevs with a pending removal request are never accessible.  Otherwise
  * reads depend on vdev_cant_read, writes on vdev_cant_write, and every
  * other I/O type is allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Add a child's per-type op and byte counters (cvs) into the parent's
  * totals (vs), and record the child's vdev_removing flag in
  * cvs->vs_scan_removing.
  *
  * dRAID spares are skipped entirely: their I/Os are already counted by
  * the physical leaves, so including them would double count.
  */
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
 	if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 		for (int type = 0; type < VS_ZIO_TYPES; type++) {
 			vs->vs_ops[type] += cvs->vs_ops[type];
 			vs->vs_bytes[type] += cvs->vs_bytes[type];
 		}
 
 		cvs->vs_scan_removing = cvd->vdev_removing;
 	}
 }
 
 /*
  * Fold a child's extended statistics (cvsx) into the parent's (vsx): the
  * per-I/O-type disk and total histograms, and for each queueable priority
  * the queue, individual and aggregate histograms plus the active and
  * pending queue counts.
  */
 static void
 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 {
 	int type, prio, bucket;
 
 	/* Histograms indexed by zio type. */
 	for (type = 0; type < ZIO_TYPES; type++) {
 		for (bucket = 0;
 		    bucket < ARRAY_SIZE(vsx->vsx_disk_histo[0]); bucket++) {
 			vsx->vsx_disk_histo[type][bucket] +=
 			    cvsx->vsx_disk_histo[type][bucket];
 		}
 
 		for (bucket = 0;
 		    bucket < ARRAY_SIZE(vsx->vsx_total_histo[0]); bucket++) {
 			vsx->vsx_total_histo[type][bucket] +=
 			    cvsx->vsx_total_histo[type][bucket];
 		}
 	}
 
 	/* Counters and histograms indexed by queueable priority. */
 	for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		for (bucket = 0;
 		    bucket < ARRAY_SIZE(vsx->vsx_queue_histo[0]); bucket++) {
 			vsx->vsx_queue_histo[prio][bucket] +=
 			    cvsx->vsx_queue_histo[prio][bucket];
 		}
 
 		vsx->vsx_active_queue[prio] += cvsx->vsx_active_queue[prio];
 		vsx->vsx_pend_queue[prio] += cvsx->vsx_pend_queue[prio];
 
 		for (bucket = 0;
 		    bucket < ARRAY_SIZE(vsx->vsx_ind_histo[0]); bucket++) {
 			vsx->vsx_ind_histo[prio][bucket] +=
 			    cvsx->vsx_ind_histo[prio][bucket];
 		}
 
 		for (bucket = 0;
 		    bucket < ARRAY_SIZE(vsx->vsx_agg_histo[0]); bucket++) {
 			vsx->vsx_agg_histo[prio][bucket] +=
 			    cvsx->vsx_agg_histo[prio][bucket];
 		}
 	}
 }
 
 /*
  * Report whether every offset on this vdev can be described by a space
  * map entry.  With SPA_FEATURE_SPACEMAP_V2 active this is always the
  * case.  Otherwise the vdev's asize must fit in the range covered by
  * SM_OFFSET_BITS offset bits scaled by the vdev's ashift.
  */
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
 	uint64_t offset_bits;
 
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
 		return (B_TRUE);
 
 	/*
 	 * Without double-word space map entries, 47 bits of an entry hold
 	 * the offset (see SM_OFFSET_BITS in space_map.h).  Combined with
 	 * the ashift, that gives the highest address an entry can describe
 	 * on this device.
 	 */
 	offset_bits = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	/* Only evaluate the shift when it cannot overflow. */
 	if (offset_bits < 63)
 		return (vd->vdev_asize < (1ULL << offset_bits));
 
 	return (B_TRUE);
 }
 
 /*
  * Collect basic and extended statistics for a vdev.
  *
  * For an interior (non-leaf) vdev, the op and byte counters in vs and all
  * of vsx are zeroed and then rebuilt by recursing into every child and
  * summing the child's numbers.  For a leaf, vsx receives a copy of the
  * vdev's extended stats with the per-class active and pending queue counts
  * refreshed from the vdev queue; vs is not touched.  Either vs or vsx may
  * be NULL.
  */
 static void
 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	int q;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/*
 		 * Leaf: only the ZIO queue stats are copied in here.  The
 		 * remaining leaf stats are updated in vdev_stat_update().
 		 */
 		if (vsx == NULL)
 			return;
 
 		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
 		for (q = 0; q < ARRAY_SIZE(vd->vdev_queue.vq_class); q++) {
 			vsx->vsx_active_queue[q] =
 			    vd->vdev_queue.vq_class[q].vqc_active;
 			vsx->vsx_pend_queue[q] = avl_numnodes(
 			    &vd->vdev_queue.vq_class[q].vqc_queued_tree);
 		}
 		return;
 	}
 
 	/* Interior vdev: start from zero and sum over the children. */
 	if (vs != NULL) {
 		memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
 		memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
 	}
 	if (vsx != NULL)
 		memset(vsx, 0, sizeof (*vsx));
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 		vdev_stat_t *child_vs = &child->vdev_stat;
 		vdev_stat_ex_t *child_vsx = &child->vdev_stat_ex;
 
 		/* Refresh the child's own stats before adding them in. */
 		vdev_get_stats_ex_impl(child, child_vs, child_vsx);
 		if (vs != NULL)
 			vdev_get_child_stat(child, vs, child_vs);
 		if (vsx != NULL)
 			vdev_get_child_stat_ex(child, vsx, child_vsx);
 	}
 }
 
 /*
  * Fill in the basic (vs) and extended (vsx) statistics for a vdev while
  * holding its vdev_stat_lock.  Either pointer may be NULL, in which case
  * that set of statistics is not produced.
  *
  * When vs is supplied it starts out as a copy of the vdev's vdev_stat and
  * is then adjusted: the timestamp becomes the time elapsed since the
  * stored timestamp, and state, sizes, ashift values and (for leaves)
  * initialize/TRIM progress are filled in from the vdev.  The aggregated
  * I/O counters and the extended stats come from vdev_get_stats_ex_impl().
  */
 void
 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	vdev_t *tvd = vd->vdev_top;
 	mutex_enter(&vd->vdev_stat_lock);
 	if (vs) {
 		memcpy(vs, &vd->vdev_stat, sizeof (*vs));
 		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 		vs->vs_state = vd->vdev_state;
 		vs->vs_rsize = vdev_get_min_asize(vd);
 
 		if (vd->vdev_ops->vdev_op_leaf) {
 			/* Leaves also account for the label areas. */
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
 			/*
 			 * Report initializing progress. Since we don't
 			 * have the initializing locks held, this is only
 			 * an estimate (although a fairly accurate one).
 			 */
 			vs->vs_initialize_bytes_done =
 			    vd->vdev_initialize_bytes_done;
 			vs->vs_initialize_bytes_est =
 			    vd->vdev_initialize_bytes_est;
 			vs->vs_initialize_state = vd->vdev_initialize_state;
 			vs->vs_initialize_action_time =
 			    vd->vdev_initialize_action_time;
 
 			/*
 			 * Report manual TRIM progress. Since we don't have
 			 * the manual TRIM locks held, this is only an
 			 * estimate (although fairly accurate one).
 			 */
 			vs->vs_trim_notsup = !vd->vdev_has_trim;
 			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
 			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
 			vs->vs_trim_state = vd->vdev_trim_state;
 			vs->vs_trim_action_time = vd->vdev_trim_action_time;
 
 			/* Set when there is a deferred resilver. */
 			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
 		}
 
 		/*
 		 * Report expandable space on top-level, non-auxiliary devices
 		 * only. The expandable space is reported in terms of metaslab
 		 * sized units since that determines how much space the pool
 		 * can expand.
 		 */
 		if (vd->vdev_aux == NULL && tvd != NULL) {
 			vs->vs_esize = P2ALIGN(
 			    vd->vdev_max_asize - vd->vdev_asize,
 			    1ULL << tvd->vdev_ms_shift);
 		}
 
 		/* Prefer the top-level vdev's ashift when there is one. */
 		vs->vs_configured_ashift = tvd != NULL
 		    ? tvd->vdev_ashift : vd->vdev_ashift;
 		vs->vs_logical_ashift = vd->vdev_logical_ashift;
 		vs->vs_physical_ashift = vd->vdev_physical_ashift;
 
 		/*
 		 * Report fragmentation and rebuild progress for top-level,
 		 * non-auxiliary, concrete devices.
 		 */
 		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 		    vdev_is_concrete(vd)) {
 			/*
 			 * The vdev fragmentation rating doesn't take into
 			 * account the embedded slog metaslab (vdev_log_mg).
 			 * Since it's only one metaslab, it would have a tiny
 			 * impact on the overall fragmentation.
 			 */
 			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 			    vd->vdev_mg->mg_fragmentation : 0;
 		}
 	}
 
 	vdev_get_stats_ex_impl(vd, vs, vsx);
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Convenience wrapper around vdev_get_stats_ex() for callers that want
  * only the basic vdev_stat_t; no extended stats are requested.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	/*
 	 * Plain call: a void function must not return an expression in
 	 * ISO C.
 	 */
 	vdev_get_stats_ex(vd, vs, NULL);
 }
 
 /*
  * Zero the space accounting counters (vs_space, vs_dspace and vs_alloc)
  * of this vdev while holding its vdev_stat_lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	vdev_stat_t *vs = &vd->vdev_stat;

 	mutex_enter(&vd->vdev_stat_lock);
 	vs->vs_alloc = 0;
 	vs->vs_dspace = 0;
 	vs->vs_space = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset vs_scan_processed to zero on vd and on every vdev beneath it.
  * Children are visited first; each vdev is updated under its own
  * vdev_stat_lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	int child = 0;

 	while (child < vd->vdev_children) {
 		vdev_scan_stat_init(vd->vdev_child[child]);
 		child++;
 	}

 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Account a completed zio against the stats of the vdev it was issued to
  * (or the root vdev when zio->io_vd is NULL).
  *
  * zio   - the i/o being accounted
  * psize - byte count added to the byte and "processed" counters
  *
  * On success (io_error == 0) the scan/rebuild/self-heal counters and, for
  * leaf vdevs with a queueable priority, the ops/bytes counters and the
  * histograms are updated under vd->vdev_stat_lock.  On failure nothing is
  * counted here; instead, for qualifying writes, the txg is recorded in
  * the vdev's dirty time logs (DTLs).
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		/* I/O flagged as bypass is not counted at all. */
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			/*
 			 * Repair is the result of a resilver issued by the
 			 * scan thread (spa_sync).
 			 */
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/*
 				 * The pool-wide counter is bumped for leaf
 				 * vdevs only; the per-vdev counter is bumped
 				 * for every vdev.
 				 */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			/*
 			 * Repair is the result of a rebuild issued by the
 			 * rebuild thread (vdev_rebuild_thread).  To avoid
 			 * double counting repaired bytes the virtual dRAID
 			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
 				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
 				if (vd->vdev_ops->vdev_op_leaf &&
 				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
 				}
 				vs->vs_rebuild_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		/*
 		 * The bytes/ops/histograms are recorded at the leaf level and
 		 * aggregated into the higher level vdevs in vdev_get_stats().
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
 			zio_type_t vs_type = type;
 			zio_priority_t priority = zio->io_priority;
 
 			/*
 			 * TRIM ops and bytes are reported to user space as
 			 * ZIO_TYPE_IOCTL.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
 				vs_type = ZIO_TYPE_IOCTL;
 
 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
 			 * reporting use the priority to categorize the IO.
 			 * Only the following are reported to user space:
 			 *
 			 *   ZIO_PRIORITY_SYNC_READ,
 			 *   ZIO_PRIORITY_SYNC_WRITE,
 			 *   ZIO_PRIORITY_ASYNC_READ,
 			 *   ZIO_PRIORITY_ASYNC_WRITE,
 			 *   ZIO_PRIORITY_SCRUB,
 			 *   ZIO_PRIORITY_TRIM.
 			 */
 			if (priority == ZIO_PRIORITY_REBUILD) {
 				priority = ((type == ZIO_TYPE_WRITE) ?
 				    ZIO_PRIORITY_ASYNC_WRITE :
 				    ZIO_PRIORITY_SCRUB);
 			} else if (priority == ZIO_PRIORITY_INITIALIZING) {
 				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
 				priority = ZIO_PRIORITY_ASYNC_WRITE;
 			} else if (priority == ZIO_PRIORITY_REMOVAL) {
 				priority = ((type == ZIO_TYPE_WRITE) ?
 				    ZIO_PRIORITY_ASYNC_WRITE :
 				    ZIO_PRIORITY_ASYNC_READ);
 			}
 
 			vs->vs_ops[vs_type]++;
 			vs->vs_bytes[vs_type] += psize;
 
 			/*
 			 * Request size histograms, indexed by the remapped
 			 * priority: one for delegated i/o, one for the rest.
 			 */
 			if (flags & ZIO_FLAG_DELEGATED) {
 				vsx->vsx_agg_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			} else {
 				vsx->vsx_ind_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			}
 
 			/*
 			 * Latency histograms are only updated when both
 			 * io_delta and io_delay are non-zero.  Note that the
 			 * disk and total histograms are indexed by the
 			 * original zio type, not by vs_type.
 			 */
 			if (zio->io_delta && zio->io_delay) {
 				vsx->vsx_queue_histo[priority]
 				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
 				vsx->vsx_disk_histo[type]
 				    [L_HISTO(zio->io_delay)]++;
 				vsx->vsx_total_histo[type]
 				    [L_HISTO(zio->io_delta)]++;
 			}
 		}
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* From here on the i/o failed (io_error != 0). */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent log writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Already known to be missing: nothing new to log. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the txg partial on the leaf and each ancestor. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Convert a byte count into deflated space for this vdev.  The count must
  * be a multiple of SPA_MINBLOCKSIZE; it is shifted down by
  * SPA_MINBLOCKSHIFT and then scaled by the vdev's deflate ratio.
  */
 int64_t
 vdev_deflated_space(vdev_t *vd, int64_t space)
 {
 	int64_t shifted;

 	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);

 	shifted = space >> SPA_MINBLOCKSHIFT;

 	return (shifted * vd->vdev_deflate_ratio);
 }
 
 /*
  * Apply space accounting deltas to the in-core stats of a top-level vdev
  * and, unless it is a log device or has no metaslab group, to the root
  * vdev's stats as well.  Each vdev's stats are changed under its own
  * vdev_stat_lock.  Note that defer_delta is not used by this function.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	int64_t deflated_delta;

 	ASSERT(vd == vd->vdev_top);

 	/*
 	 * The deflated delta is the space delta with the psize-to-asize
 	 * (ie. RAID-Z) space-expansion factor inverted.  It has to be
 	 * computed per top-level vdev: the root vdev's psize-to-asize is
 	 * simply the max of its children's, which is not accurate enough.
 	 */
 	deflated_delta = vdev_deflated_space(vd, space_delta);

 	mutex_enter(&vd->vdev_stat_lock);
 	/* a negative delta must not take vs_alloc below zero */
 	if (alloc_delta < 0) {
 		ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
 	}

 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_dspace += deflated_delta;
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	mutex_exit(&vd->vdev_stat_lock);

 	/* the log class is the only one left out of the root space stats */
 	if (vd->vdev_mg == NULL || vd->vdev_islog)
 		return;

 	ASSERT(!vd->vdev_isl2cache);
 	mutex_enter(&root->vdev_stat_lock);
 	root->vdev_stat.vs_space += space_delta;
 	root->vdev_stat.vs_dspace += deflated_delta;
 	root->vdev_stat.vs_alloc += alloc_delta;
 	mutex_exit(&root->vdev_stat_lock);
 	/* Note: metaslab_class_space_update moved to metaslab_space_update */
 }
 
 /*
  * Mark a top-level vdev's config as dirty by placing it on the spa's
  * config dirty list, so that it is written out the next time the vdev
  * configuration is synced.  Passing the root vdev dirties each of its
  * children.  Aux vdevs (l2cache and spare devices) never go on the list:
  * their entry in the aux config is regenerated in place and the aux sync
  * flag is set instead.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;

 	ASSERT(spa_writeable(spa));

 	/*
 	 * Aux vdev: update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;

 		/* Locate this vdev's slot in the aux vdev array. */
 		c = 0;
 		while (c < sav->sav_count && sav->sav_vdevs[c] != vd)
 			c++;

 		if (c == sav->sav_count) {
 			/*
 			 * Not found: we're being removed, so there is
 			 * nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}

 		sav->sav_sync = B_TRUE;

 		/* The array is under either the l2cache or the spares key. */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}

 		ASSERT(c < naux);

 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

 		return;
 	}

 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

 	if (vd == rvd) {
 		for (c = 0; c < rvd->vdev_children; c++)
 			vdev_config_dirty(rvd->vdev_child[c]);
 		return;
 	}

 	ASSERT(vd == vd->vdev_top);

 	/* Already on the list, or not a concrete vdev: nothing to add. */
 	if (list_link_active(&vd->vdev_config_dirty_node) ||
 	    !vdev_is_concrete(vd))
 		return;

 	list_insert_head(&spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the spa's config dirty list.  The vdev is expected to
  * be on that list.  The caller must hold SCL_CONFIG as writer, or be in
  * sync context holding it as reader.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;

 	ASSERT(spa_config_held(pool, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));

 	list_remove(&pool->spa_config_dirty_list, vd);
 }
 
 /*
  * Mark a top-level vdev's state as dirty by placing it on the spa's state
  * dirty list, so that the next pass of spa_sync() can convert this into
  * vdev_config_dirty().  State changes are kept apart from larger config
  * changes because they require much less locking and are often needed
  * for administrative actions.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;

 	ASSERT(spa_writeable(pool));
 	ASSERT(vd == vd->vdev_top);

 	/*
 	 * The state list is protected by the SCL_STATE lock.  The caller
 	 * must either hold SCL_STATE as writer, or must be the sync thread
 	 * (which holds SCL_STATE as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));

 	/* Already on the list, or not a concrete vdev: nothing to add. */
 	if (list_link_active(&vd->vdev_state_dirty_node) ||
 	    !vdev_is_concrete(vd))
 		return;

 	list_insert_head(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the spa's state dirty list.  The vdev is expected to
  * be on that list.  The caller must hold SCL_STATE as writer, or be in
  * sync context holding it as reader.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;

 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));

 	list_remove(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Recompute the state of vd from its children and then do the same for
  * each ancestor in turn.  For a vdev with children, the concrete children
  * are tallied as faulted or degraded and the counts are handed to the
  * vdev's vdev_op_state_change method.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int nfaulted = 0;
 	int ndegraded = 0;
 	int ncorrupt = 0;

 	if (vd->vdev_children > 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];

 			/*
 			 * Holes and indirect vdevs play no part in the
 			 * decision.
 			 */
 			if (!vdev_is_concrete(cvd))
 				continue;

 			if (vdev_readable(cvd) &&
 			    (vdev_writeable(cvd) || !spa_writeable(spa))) {
 				/* Usable, but possibly not fully healthy. */
 				if (cvd->vdev_state <= VDEV_STATE_DEGRADED)
 					ndegraded++;
 			} else if (cvd->vdev_islog && vd == rvd) {
 				/*
 				 * Root special: an unusable top-level log
 				 * device only makes the root vdev look
 				 * degraded.
 				 */
 				ndegraded++;
 			} else {
 				nfaulted++;
 			}

 			if (cvd->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
 				ncorrupt++;
 		}

 		vd->vdev_ops->vdev_op_state_change(vd, nfaulted, ndegraded);

 		/*
 		 * Root special: if a top-level vdev cannot be opened due to
 		 * corrupted metadata, report the root vdev's aux state as
 		 * 'corrupt' rather than 'insufficient replicas'.
 		 */
 		if (ncorrupt && vd == rvd &&
 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}

 	if (vd->vdev_parent != NULL)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  *	vd	vdev whose state is being set
  *	isopen	when true, the change is not propagated to the parent
  *	state	the new state
  *	aux	auxiliary state, stored in vd->vdev_stat.vs_aux
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	/* No state transition: only vs_aux is refreshed. */
 	if (state == vd->vdev_state) {
 		/*
 		 * Since vdev_offline() code path is already in an offline
 		 * state we can miss a statechange event to OFFLINE. Check
 		 * the previous state to catch this condition.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (state == VDEV_STATE_OFFLINE) &&
 		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
 			/* post an offline state change */
 			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
 		}
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the state we are leaving; used for the notifications below. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Pick the ereport class that matches the aux state. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			case VDEV_AUX_BAD_ASHIFT:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
 			    save_state);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/*
 	 * Notify ZED of any significant state-change on a leaf vdev.
 	 *
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* preserve original state from a vdev_reopen() */
 		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
 		    (vd->vdev_prevstate != vd->vdev_state) &&
 		    (save_state <= VDEV_STATE_CLOSED))
 			save_state = vd->vdev_prevstate;
 
 		/* filter out state change due to initial vdev_open */
 		if (save_state > VDEV_STATE_CLOSED)
 			zfs_post_state_change(spa, vd, save_state);
 	}
 
 	/* Outside of an open, let the ancestors recompute their state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Return B_TRUE when every child of this non-leaf vdev is in
  * VDEV_STATE_OFFLINE (which is trivially the case with no children),
  * and B_FALSE as soon as one child in any other state is found.
  */
 boolean_t
 vdev_children_are_offline(vdev_t *vd)
 {
 	uint64_t c = 0;

 	ASSERT(!vd->vdev_ops->vdev_op_leaf);

 	while (c < vd->vdev_children) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_OFFLINE)
 			return (B_FALSE);
 		c++;
 	}

 	return (B_TRUE);
 }
 
 /*
  * Check whether the vdev tree rooted at vd can support a root pool.
  * Partial configurations are not supported: the result is B_FALSE if vd
  * or any vdev beneath it is a non-leaf of type VDEV_TYPE_MISSING.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 	int c = 0;

 	if (!vd->vdev_ops->vdev_op_leaf &&
 	    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0)
 		return (B_FALSE);

 	while (c < vd->vdev_children) {
 		if (!vdev_is_bootable(vd->vdev_child[c]))
 			return (B_FALSE);
 		c++;
 	}

 	return (B_TRUE);
 }
 
 /*
  * A vdev is concrete unless its ops vector is one of the indirect, hole,
  * missing or root vdev ops.
  */
 boolean_t
 vdev_is_concrete(vdev_t *vd)
 {
 	vdev_ops_t *ops = vd->vdev_ops;

 	if (ops == &vdev_indirect_ops)
 		return (B_FALSE);
 	if (ops == &vdev_hole_ops)
 		return (B_FALSE);
 	if (ops == &vdev_missing_ops)
 		return (B_FALSE);
 	if (ops == &vdev_root_ops)
 		return (B_FALSE);

 	return (B_TRUE);
 }
 
 /*
  * Determine whether a log device has valid content.  A vdev that was
  * removed or faulted in the MOS config is known to have had its log
  * content written to the pool already, so the answer is B_TRUE only when
  * vd, or some vdev beneath it, is a leaf that is neither faulted nor
  * removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	int c = 0;

 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (!vd->vdev_faulted && !vd->vdev_removed)
 			return (B_TRUE);
 	}

 	while (c < vd->vdev_children) {
 		if (vdev_log_state_valid(vd->vdev_child[c]))
 			return (B_TRUE);
 		c++;
 	}

 	return (B_FALSE);
 }
 
 /*
  * Expand a top-level vdev if possible.  The deflate ratio is always
  * refreshed.  If the vdev's asize now covers more metaslabs than
  * vdev_ms_count, the metaslab group and metaslabs are (re)initialized for
  * txg and the vdev's config is marked dirty.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vdev_is_concrete(vd));

 	vdev_set_deflate_ratio(vd);

 	/* No additional metaslabs fit, or not concrete: nothing to grow. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count ||
 	    !vdev_is_concrete(vd))
 		return;

 	vdev_metaslab_group_create(vd);
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev off from its parent.  vd is removed from the parent's
  * child array and the array is compacted.  If exactly one child is left,
  * that child takes the parent's place and is flagged as splitting.
  * State is then propagated upward starting from the parent's first
  * child.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	vdev_t *first;

 	vdev_remove_child(parent, vd);
 	vdev_compact_children(parent);

 	first = parent->vdev_child[0];
 	if (parent->vdev_children == 1) {
 		vdev_remove_parent(first);
 		first->vdev_splitting = B_TRUE;
 	}

 	vdev_propagate_state(first);
 }
 
 /*
  * Walk the vdev tree below vd, children first.  On each leaf that has
  * active I/Os, log a debug message and invoke zio_deadman() on the first
  * I/O in the active tree if it has been outstanding for longer than
  * spa_deadman_synctime().  The tag is passed through to zio_deadman().
  */
 void
 vdev_deadman(vdev_t *vd, char *tag)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_deadman(vd->vdev_child[c], tag);

 	/* Only leaf vdevs are examined from here on. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;

 	vdev_queue_t *vq = &vd->vdev_queue;

 	mutex_enter(&vq->vq_lock);
 	if (avl_numnodes(&vq->vq_active_tree) > 0) {
 		spa_t *spa = vd->vdev_spa;
 		zio_t *first;
 		uint64_t elapsed;

 		zfs_dbgmsg("slow vdev: %s has %lu active IOs",
 		    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));

 		/*
 		 * Check the first I/O in the active tree against the
 		 * deadman time limit.
 		 */
 		first = avl_first(&vq->vq_active_tree);
 		elapsed = gethrtime() - first->io_timestamp;
 		if (elapsed > spa_deadman_synctime(spa))
 			zio_deadman(first, tag);
 	}
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * Flag a leaf vdev, and the spa it belongs to, as having a deferred
  * resilver.
  */
 void
 vdev_defer_resilver(vdev_t *vd)
 {
 	ASSERT(vd->vdev_ops->vdev_op_leaf);

 	vd->vdev_spa->spa_resilver_deferred = B_TRUE;
 	vd->vdev_resilver_deferred = B_TRUE;
 }
 
 /*
  * Clear the resilver deferred flag on all leaf vdevs under vd.  Returns
  * B_TRUE if there are devices that need to be resilvered and are
  * available to accept resilver I/Os.  When called on the root vdev with
  * the resilver_defer feature active, the feature refcount is also
  * decremented, the config is dirtied and the spa-wide flag is cleared.
  */
 boolean_t
 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t child_needs = B_FALSE;

 	/* Every child is visited; remember whether any of them needs it. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vdev_clear_resilver_deferred(vd->vdev_child[c], tx))
 			child_needs = B_TRUE;
 	}

 	if (vd == spa->spa_root_vdev &&
 	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
 		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 		vdev_config_dirty(vd);
 		spa->spa_resilver_deferred = B_FALSE;
 		return (child_needs);
 	}

 	/* Only concrete, non-aux leaves carry the per-vdev flag. */
 	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
 	    !vd->vdev_ops->vdev_op_leaf)
 		return (child_needs);

 	vd->vdev_resilver_deferred = B_FALSE;

 	if (vdev_is_dead(vd) || vd->vdev_offline)
 		return (B_FALSE);

 	return (vdev_resilver_needed(vd, NULL, NULL) ? B_TRUE : B_FALSE);
 }
 
 /*
  * A range segment is empty when its start and end are equal.
  */
 boolean_t
 vdev_xlate_is_empty(range_seg64_t *rs)
 {
 	if (rs->rs_start != rs->rs_end)
 		return (B_FALSE);

 	return (B_TRUE);
 }
 
 /*
  * Translate a logical range to the first contiguous physical range for
  * the specified vdev_t.  The function is initially called with a leaf
  * vdev and recurses through each parent until it reaches the top-level
  * vdev, where the physical range is initialized.  As the recursion
  * unwinds, each parent's vdev specific translation function does the
  * real conversion for its child.
  */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	if (vd == vd->vdev_top) {
 		/*
 		 * Top-level vdev reached: the physical range starts out
 		 * equal to the logical range and the remaining range starts
 		 * out empty.  From here the recursion unwinds.
 		 */
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;

 		remain_rs->rs_start = logical_rs->rs_start;
 		remain_rs->rs_end = logical_rs->rs_start;

 		return;
 	}

 	/*
 	 * Not at the top yet: walk up the vdev tree first.
 	 */
 	vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs);

 	vdev_t *parent = vd->vdev_parent;
 	ASSERT3P(parent, !=, NULL);
 	ASSERT3P(parent->vdev_ops->vdev_op_xlate, !=, NULL);

 	/*
 	 * On the way back down, let the parent's translate function turn
 	 * the range computed so far into this vdev's physical range and
 	 * any remaining component.
 	 */
 	range_seg64_t xlated = { 0 };
 	parent->vdev_ops->vdev_op_xlate(vd, physical_rs, &xlated, remain_rs);

 	physical_rs->rs_start = xlated.rs_start;
 	physical_rs->rs_end = xlated.rs_end;
 }
 
 /*
  * Translate logical_rs for vd piece by piece.  Each pass translates the
  * current range with vdev_xlate(), calls func(arg, ...) on the resulting
  * physical range if it is non-empty, and continues with the remaining
  * range until nothing is left.
  */
 void
 vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg)
 {
 	range_seg64_t cur_rs = *logical_rs;
 	range_seg64_t phys_rs;
 	range_seg64_t rest_rs;

 	for (; !vdev_xlate_is_empty(&cur_rs); cur_rs = rest_rs) {
 		vdev_xlate(vd, &cur_rs, &phys_rs, &rest_rs);

 		/*
 		 * With raidz and dRAID the logical range may not live on
 		 * this leaf vdev at all, in which case the physical range
 		 * is empty and the callback is skipped.
 		 */
 		if (vdev_xlate_is_empty(&phys_rs))
 			continue;

 		func(arg, &phys_rs);
 	}
 }
 
 /*
  * Look at the vdev tree rooted at vdev and determine whether any devices
  * are currently being replaced.
  */
 boolean_t
 vdev_replace_in_progress(vdev_t *vdev)
 {
 	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);

 	if (vdev->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);

 	/*
 	 * A 'spare' vdev means a replace is in progress, unless it has
 	 * exactly two children and the second one, the hot spare, has
 	 * finished being resilvered.
 	 */
 	if (vdev->vdev_ops == &vdev_spare_ops) {
 		if (vdev->vdev_children > 2)
 			return (B_TRUE);
 		if (!vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))
 			return (B_TRUE);
 	}

 	int i = 0;
 	while (i < vdev->vdev_children) {
 		if (vdev_replace_in_progress(vdev->vdev_child[i]))
 			return (B_TRUE);
 		i++;
 	}

 	return (B_FALSE);
 }
 
 /*
  * Symbol exports for the administrative vdev entry points.
  * NOTE(review): presumably consumed by other kernel modules -- confirm.
  */
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 
 /*
  * Module parameter declarations.  Each entry gives a scope, a name
  * prefix, the parameter name, its type and permissions (or setter and
  * getter callbacks for the _CALL form), and a description string.
  */
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
 	"Target number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
 	"Default limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
 	"Minimum number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
 	"Practical upper limit of total metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
 	"Rate limit slow IO (delay) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
 	"Rate limit checksum events to this many checksum errors per second "
 	"(do not set below zed threshold).");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
 	"Ignore errors during resilver/scrub");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 	"Bypass vdev_validate()");
 
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
 	"Disable cache flushes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
 	"Minimum number of metaslabs required to dedicate one for log blocks");
 
 /* The ashift limits go through custom setter callbacks. */
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
 	param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
 	"Minimum ashift used when creating new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
 	param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
 	"Maximum ashift used when optimizing for logical -> physical sector "
 	"size on new top-level vdevs");
 /* END CSTYLED */
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 25d76970e99a..138b7dac5956 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -1,666 +1,666 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
 #include <sys/zio.h>
 #include <sys/debug.h>
 #include <sys/zfs_debug.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/simd.h>
 
 /*
  * Placeholder entry named "original".  Only .name and .is_supported are
  * initialized, so every other member (including the gen/rec method slots)
  * is zero.  vdev_raidz_math_generate() and vdev_raidz_math_reconstruct()
  * return RAIDZ_ORIGINAL_IMPL when the selected method slot is NULL.
  */
 static const raidz_impl_ops_t vdev_raidz_original_impl = {
 	.name = "original",
 	.is_supported = raidz_will_scalar_work,
 };
 
 /*
  * RAIDZ parity ops that contain the fastest method for each operation.
  * Kernel builds fill the gen[]/rec[] slots one by one from the benchmark
  * results (benchmark_raidz_impl()); user-space builds copy the last
  * supported implementation over this object (benchmark_raidz()).
  */
 static raidz_impl_ops_t vdev_raidz_fastest_impl = {
 	.name = "fastest"
 };
 
 /*
  * All compiled in implementations.
  *
  * benchmark_raidz() walks this table in order and copies the entries whose
  * is_supported() callback succeeds into raidz_supp_impl[].
  * NOTE(review): IMPL_ORIGINAL (0) and IMPL_SCALAR (1) are also compared
  * against raidz_supp_impl[] indexes, so the first two entries presumably
  * must stay "original" and "scalar" -- confirm before reordering.
  */
 const raidz_impl_ops_t *raidz_all_maths[] = {
 	&vdev_raidz_original_impl,
 	&vdev_raidz_scalar_impl,
 #if defined(__x86_64) && defined(HAVE_SSE2)	/* only x86_64 for now */
 	&vdev_raidz_sse2_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_SSSE3)	/* only x86_64 for now */
 	&vdev_raidz_ssse3_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX2)	/* only x86_64 for now */
 	&vdev_raidz_avx2_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512F)	/* only x86_64 for now */
 	&vdev_raidz_avx512f_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)	/* only x86_64 for now */
 	&vdev_raidz_avx512bw_impl,
 #endif
 #if defined(__aarch64__) && !defined(__FreeBSD__)
 	&vdev_raidz_aarch64_neon_impl,
 	&vdev_raidz_aarch64_neonx2_impl,
 #endif
 #if defined(__powerpc__) && defined(__altivec__)
 	&vdev_raidz_powerpc_altivec_impl,
 #endif
 };
 
 /*
  * Indicate that benchmark has been completed.  Set to B_TRUE as the last
  * step of vdev_raidz_math_init().
  */
 static boolean_t raidz_math_initialized = B_FALSE;
 
 /*
  * Select raidz implementation.  FASTEST and CYCLE are sentinel values at
  * the top of the uint32_t range; any other value is used as an index into
  * raidz_supp_impl[].
  */
 #define	IMPL_FASTEST	(UINT32_MAX)
 #define	IMPL_CYCLE	(UINT32_MAX - 1)
 #define	IMPL_ORIGINAL	(0)
 #define	IMPL_SCALAR	(1)
 
 /* Read a selector through a volatile-qualified lvalue */
 #define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))
 
 /*
  * zfs_vdev_raidz_impl is the selector consulted by
  * vdev_raidz_math_get_ops(); it stays at IMPL_SCALAR until
  * vdev_raidz_math_init() swaps in user_sel_impl.  user_sel_impl records a
  * choice made through vdev_raidz_impl_set() before initialization.
  */
 static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
 static uint32_t user_sel_impl = IMPL_FASTEST;
 
 /*
  * Hold all supported implementations: the first raidz_supp_impl_cnt slots
  * of raidz_supp_impl[] are filled by benchmark_raidz().
  */
 static size_t raidz_supp_impl_cnt = 0;
 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
 
 #if defined(_KERNEL)
 /*
  * kstats values for supported implementations
  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
  *
  * One slot per supported implementation plus one extra: the slot at index
  * raidz_supp_impl_cnt is the "fastest" record, whose gen[]/rec[] entries
  * hold raidz_supp_impl[] indexes rather than throughput values.
  */
 static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
 
 /* kstat for benchmarked implementations; NULL while not installed */
 static kstat_t *raidz_math_kstat = NULL;
 #endif
 
 /*
  * Return the RAIDZ operations to use for raidz_map parity calculations.
  *
  * If kfpu_allowed() reports that the FPU may not be used in the current
  * context, the scalar implementation is returned unconditionally.
  * Otherwise the current value of zfs_vdev_raidz_impl decides: the
  * "fastest" ops, the next supported implementation in round-robin order
  * ("cycle"), the original or scalar ops, or a direct index into
  * raidz_supp_impl[].
  */
 const raidz_impl_ops_t *
 vdev_raidz_math_get_ops(void)
 {
 	/* Round-robin position used by IMPL_CYCLE; persists across calls */
 	static size_t cycle_impl_idx = 0;
 	raidz_impl_ops_t *ops = NULL;
 	uint32_t impl;
 
 	if (!kfpu_allowed())
 		return (&vdev_raidz_scalar_impl);
 
 	impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
 
 	if (impl == IMPL_FASTEST) {
 		ASSERT(raidz_math_initialized);
 		ops = &vdev_raidz_fastest_impl;
 	} else if (impl == IMPL_CYCLE) {
 		/* Cycle through all supported implementations */
 		ASSERT(raidz_math_initialized);
 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
 		cycle_impl_idx++;
 		ops = raidz_supp_impl[cycle_impl_idx % raidz_supp_impl_cnt];
 	} else if (impl == IMPL_ORIGINAL) {
 		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
 	} else if (impl == IMPL_SCALAR) {
 		ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
 	} else {
 		/* Anything else is an index into raidz_supp_impl[] */
 		ASSERT3U(impl, <, raidz_supp_impl_cnt);
 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
 		if (impl < ARRAY_SIZE(raidz_all_maths))
 			ops = raidz_supp_impl[impl];
 	}
 
 	ASSERT3P(ops, !=, NULL);
 
 	return (ops);
 }
 
 /*
  * Select parity generation method for raidz_map
  *
  * The generator is taken from rm->rm_ops->gen[] according to
  * raidz_parity(rm): 1 selects RAIDZ_GEN_P, 2 selects RAIDZ_GEN_PQ and
  * 3 selects RAIDZ_GEN_PQR.  Any other value is reported through
  * cmn_err(CE_PANIC).
  *
  * Returns RAIDZ_ORIGINAL_IMPL when the selected slot is NULL, otherwise
  * runs the generator on rr and returns 0.
  */
 int
 vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
 {
 	raidz_gen_f gen_parity = NULL;
 
 	switch (raidz_parity(rm)) {
 		case 1:
 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
 			break;
 		case 2:
 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
 			break;
 		case 3:
 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
 			break;
 		default:
 			gen_parity = NULL;
-			cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
-			    raidz_parity(rm));
+			cmn_err(CE_PANIC, "invalid RAID-Z configuration %llu",
+			    (u_longlong_t)raidz_parity(rm));
 			break;
 	}
 
 	/* if method is NULL execute the original implementation */
 	if (gen_parity == NULL)
 		return (RAIDZ_ORIGINAL_IMPL);
 
 	gen_parity(rr);
 
 	return (0);
 }
 
 /*
  * Pick the reconstruction method for a single-parity (P) map.
  * Only one bad data column with valid P can be handled; every other
  * combination yields NULL.
  */
 static raidz_rec_f
 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
     const int nbaddata)
 {
 	if (nbaddata != 1 || !parity_valid[CODE_P])
 		return ((raidz_rec_f) NULL);
 
 	return (rm->rm_ops->rec[RAIDZ_REC_P]);
 }
 
 /*
  * Pick the reconstruction method for a double-parity (PQ) map.
  * One bad data column uses P if it is valid, else Q; two bad data columns
  * need both P and Q.  NULL is returned when no method fits.
  */
 static raidz_rec_f
 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
     const int nbaddata)
 {
 	switch (nbaddata) {
 	case 1:
 		if (parity_valid[CODE_P])
 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
 		if (parity_valid[CODE_Q])
 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
 		break;
 	case 2:
 		if (parity_valid[CODE_P] && parity_valid[CODE_Q])
 			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
 		break;
 	default:
 		break;
 	}
 
 	return ((raidz_rec_f) NULL);
 }
 
 /*
  * Pick the reconstruction method for a triple-parity (PQR) map.
  * Parity columns are tried in P, Q, R preference order for one bad data
  * column, in PQ, PR, QR order for two, and all three are required for
  * three.  NULL is returned when no method fits.
  */
 static raidz_rec_f
 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
     const int nbaddata)
 {
 	switch (nbaddata) {
 	case 1:
 		if (parity_valid[CODE_P])
 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
 		if (parity_valid[CODE_Q])
 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
 		if (parity_valid[CODE_R])
 			return (rm->rm_ops->rec[RAIDZ_REC_R]);
 		break;
 	case 2:
 		if (parity_valid[CODE_P] && parity_valid[CODE_Q])
 			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
 		if (parity_valid[CODE_P] && parity_valid[CODE_R])
 			return (rm->rm_ops->rec[RAIDZ_REC_PR]);
 		if (parity_valid[CODE_Q] && parity_valid[CODE_R])
 			return (rm->rm_ops->rec[RAIDZ_REC_QR]);
 		break;
 	case 3:
 		if (parity_valid[CODE_P] && parity_valid[CODE_Q] &&
 		    parity_valid[CODE_R])
 			return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
 		break;
 	default:
 		break;
 	}
 
 	return ((raidz_rec_f) NULL);
 }
 
 /*
  * Select data reconstruction method for raidz_map
  * @rm           - Map whose parity level and rm_ops pick the method
  * @rr           - Row handed to the selected method
  * @parity_valid - Parity validity flag
  * @dt           - Failed data index array
  * @nbaddata     - Number of failed data columns
  *
  * Returns RAIDZ_ORIGINAL_IMPL when no method was selected, otherwise the
  * value returned by the selected method.  A parity level other than
  * PARITY_P, PARITY_PQ or PARITY_PQR is reported through cmn_err(CE_PANIC).
  */
 int
 vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
     const int *parity_valid, const int *dt, const int nbaddata)
 {
 	raidz_rec_f rec_fn = NULL;
 
 	switch (raidz_parity(rm)) {
 	case PARITY_P:
 		rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
 		break;
 	case PARITY_PQ:
 		rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
 		break;
 	case PARITY_PQR:
 		rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
 		break;
 	default:
-		cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
-		    raidz_parity(rm));
+		cmn_err(CE_PANIC, "invalid RAID-Z configuration %llu",
+		    (u_longlong_t)raidz_parity(rm));
 		break;
 	}
 
 	/* NULL means the selector found no usable method for this case */
 	if (rec_fn == NULL)
 		return (RAIDZ_ORIGINAL_IMPL);
 	else
 		return (rec_fn(rr, dt));
 }
 
 /*
  * Labels for the parity generation and data reconstruction methods.  The
  * kstat code prints them as column headers, in this order, above the
  * per-method gen[]/rec[] values.
  */
 const char *raidz_gen_name[] = {
 	"gen_p", "gen_pq", "gen_pqr"
 };
 const char *raidz_rec_name[] = {
 	"rec_p", "rec_q", "rec_r",
 	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
 };
 
 #if defined(_KERNEL)
 
 /*
  * Minimum buffer size asserted by the kstat formatters.
  * NOTE(review): the rows written below are 17 + 10*16 + 1 characters
  * ("%-17s", ten "%-16s"/"%-16llu" columns, newline), which is more than
  * this value -- confirm whether the constant should be raised.
  */
 #define	RAIDZ_KSTAT_LINE_LEN	(17 + 10*12 + 1)
 
 /*
  * Write the kstat header row into buf: an "implementation" column followed
  * by one column per entry of raidz_gen_name[] and raidz_rec_name[].
  *
  * snprintf() returns the length the output would have had, so each result
  * is checked before it is added to the offset.  When the buffer is too
  * small the (NUL-terminated) row is left truncated instead of letting the
  * offset run past the end of buf.
  *
  * @buf   Destination buffer
  * @size  Size of buf in bytes
  *
  * Always returns 0.
  */
 static int
 raidz_math_kstat_headers(char *buf, size_t size)
 {
 	const size_t ngen = ARRAY_SIZE(raidz_gen_name);
 	const size_t ncol = ngen + ARRAY_SIZE(raidz_rec_name);
 	size_t off, i;
 	int n;
 
 	ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
 
 	n = snprintf(buf, size, "%-17s", "implementation");
 	if (n < 0 || (size_t)n >= size)
 		return (0);
 	off = (size_t)n;
 
 	/* generation columns first, then reconstruction columns */
 	for (i = 0; i < ncol; i++) {
 		const char *label = (i < ngen) ?
 		    raidz_gen_name[i] : raidz_rec_name[i - ngen];
 
 		n = snprintf(buf + off, size - off, "%-16s", label);
 		if (n < 0 || (size_t)n >= size - off)
 			return (0);
 		off += (size_t)n;
 	}
 
 	/* off < size here, so at least the terminator still fits */
 	(void) snprintf(buf + off, size - off, "\n");
 
 	return (0);
 }
 
 /*
  * Write one kstat data row into buf.
  *
  * data points at an element of raidz_impl_kstats[].  The element at index
  * raidz_supp_impl_cnt is the "fastest" record: its gen[]/rec[] values are
  * indexes into raidz_supp_impl[] and are printed as implementation names.
  * Every other element is printed as the implementation name followed by
  * its numeric gen[]/rec[] values.
  *
  * snprintf() returns the length the output would have had, so each result
  * is checked before it is added to the offset.  When the buffer is too
  * small the (NUL-terminated) row is left truncated instead of letting the
  * offset run past the end of buf.
  *
  * @buf   Destination buffer
  * @size  Size of buf in bytes
  * @data  Row to print, as produced by raidz_math_kstat_addr()
  *
  * Always returns 0.
  */
 static int
 raidz_math_kstat_data(char *buf, size_t size, void *data)
 {
 	raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
 	raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data;
 	const int is_fastest = (cstat == fstat);
 	const size_t ngen = ARRAY_SIZE(raidz_gen_name);
 	const size_t ncol = ngen + ARRAY_SIZE(raidz_rec_name);
 	const char *row_name;
 	size_t off, i;
 	int n;
 
 	ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
 
 	if (is_fastest)
 		row_name = "fastest";
 	else
 		row_name = raidz_supp_impl[cstat - raidz_impl_kstats]->name;
 
 	n = snprintf(buf, size, "%-17s", row_name);
 	if (n < 0 || (size_t)n >= size)
 		return (0);
 	off = (size_t)n;
 
 	/* generation columns first, then reconstruction columns */
 	for (i = 0; i < ncol; i++) {
 		if (is_fastest) {
 			/* stored value is an index into raidz_supp_impl[] */
 			int id = (i < ngen) ?
 			    fstat->gen[i] : fstat->rec[i - ngen];
 
 			n = snprintf(buf + off, size - off, "%-16s",
 			    raidz_supp_impl[id]->name);
 		} else if (i < ngen) {
 			n = snprintf(buf + off, size - off, "%-16llu",
 			    (u_longlong_t)cstat->gen[i]);
 		} else {
 			n = snprintf(buf + off, size - off, "%-16llu",
 			    (u_longlong_t)cstat->rec[i - ngen]);
 		}
 
 		if (n < 0 || (size_t)n >= size - off)
 			return (0);
 		off += (size_t)n;
 	}
 
 	/* off < size here, so at least the terminator still fits */
 	(void) snprintf(buf + off, size - off, "\n");
 
 	return (0);
 }
 
 /*
  * Map kstat row number n to its record in raidz_impl_kstats[] and remember
  * it in ksp->ks_private.  Rows up to and including raidz_supp_impl_cnt are
  * valid (the last one is the "fastest" record); anything beyond yields
  * NULL.
  */
 static void *
 raidz_math_kstat_addr(kstat_t *ksp, loff_t n)
 {
 	void *row = NULL;
 
 	if (n <= raidz_supp_impl_cnt)
 		row = (void *) &raidz_impl_kstats[n];
 
 	ksp->ks_private = row;
 
 	return (row);
 }
 
 /*
  * Benchmark geometry: BENCH_D_COLS data columns plus PARITY_PQR parity
  * columns, a BENCH_ZIO_SIZE byte zio, and at least BENCH_NS nanoseconds of
  * measured work per implementation and method.
  */
 #define	BENCH_D_COLS	(8ULL)
 #define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
 #define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
 #define	BENCH_NS	MSEC2NSEC(1)			/* 1ms */
 
 /*
  * Signature shared by the benchmark drivers: rm is the map to operate on,
  * fn is the index of the gen/rec method being measured.
  */
 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
 
 /*
  * benchmark_fn driver for parity generation: runs
  * vdev_raidz_generate_parity() on rm.  fn is accepted only to match the
  * benchmark_fn signature and is not used.
  */
 static void
 benchmark_gen_impl(raidz_map_t *rm, const int fn)
 {
 	vdev_raidz_generate_parity(rm);
 	(void) fn;
 }
 
 /*
  * benchmark_fn driver for data reconstruction: looks up the three target
  * columns for reconstruction method fn and passes them to
  * vdev_raidz_reconstruct().
  */
 static void
 benchmark_rec_impl(raidz_map_t *rm, const int fn)
 {
 	/* One row per rec method, in raidz_rec_name[] order */
 	static const int bad_cols[7][3] = {
 		/* rec_p:   bad QR & D[0] */
 		{1, 2, 3},
 		/* rec_q:   bad PR & D[0] */
 		{0, 2, 3},
 		/* rec_r:   bad PQ & D[0] */
 		{0, 1, 3},
 		/* rec_pq:  bad R  & D[0][1] */
 		{2, 3, 4},
 		/* rec_pr:  bad Q  & D[0][1] */
 		{1, 3, 4},
 		/* rec_qr:  bad P  & D[0][1] */
 		{0, 3, 4},
 		/* rec_pqr: bad    & D[0][1][2] */
 		{3, 4, 5}
 	};
 	const int *targets = bad_cols[fn];
 
 	vdev_raidz_reconstruct(rm, targets, 3);
 }
 
 /*
  * Measure method fn of every supported implementation.
  *
  * For each entry of raidz_supp_impl[] the map's rm_ops pointer is switched
  * to that implementation and bench_fn is called in batches of five until
  * at least BENCH_NS nanoseconds have elapsed.  The resulting per-column
  * throughput is stored in raidz_impl_kstats[], and the best result updates
  * both the "fastest" kstat record and the matching method slot of
  * vdev_raidz_fastest_impl.
  */
 static void
 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 {
 	raidz_impl_kstat_t *fastest = &raidz_impl_kstats[raidz_supp_impl_cnt];
 	const int is_gen = (bench_fn == benchmark_gen_impl);
 	uint64_t top_speed = 0;
 
 	for (int k = 0; k < raidz_supp_impl_cnt; k++) {
 		raidz_impl_ops_t *candidate = raidz_supp_impl[k];
 		uint64_t iterations = 0;
 		uint64_t rate;
 		hrtime_t began, elapsed;
 
 		/* set an implementation to benchmark */
 		bench_rm->rm_ops = candidate;
 
 		began = gethrtime();
 		do {
 			for (int rep = 0; rep < 5; rep++) {
 				bench_fn(bench_rm, fn);
 				iterations++;
 			}
 
 			elapsed = gethrtime() - began;
 		} while (elapsed < BENCH_NS);
 
 		rate = iterations * BENCH_ZIO_SIZE * NANOSEC;
 		rate /= (elapsed * BENCH_COLS);
 
 		if (is_gen)
 			raidz_impl_kstats[k].gen[fn] = rate;
 		else
 			raidz_impl_kstats[k].rec[fn] = rate;
 
 		/* Only a strictly better result replaces the current best */
 		if (rate <= top_speed)
 			continue;
 
 		top_speed = rate;
 
 		if (is_gen) {
 			fastest->gen[fn] = k;
 			vdev_raidz_fastest_impl.gen[fn] =
 			    candidate->gen[fn];
 		} else {
 			fastest->rec[fn] = k;
 			vdev_raidz_fastest_impl.rec[fn] =
 			    candidate->rec[fn];
 		}
 	}
 }
 #endif
 
 /*
  * Initialize and benchmark all supported implementations.
  *
  * First every entry of raidz_all_maths[] gets its optional init() hook
  * called, and the entries whose is_supported() succeeds are collected, in
  * order, into raidz_supp_impl[].  In kernel builds each generation and
  * reconstruction method is then benchmarked on a fake zio; in user space
  * the benchmark is skipped and the last supported implementation is
  * copied into vdev_raidz_fastest_impl.
  */
 static void
 benchmark_raidz(void)
 {
 	raidz_impl_ops_t *curr_impl;
 	int i, c;
 
 	/* Move supported impl into raidz_supp_impl */
 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
 
 		/* init() is optional */
 		if (curr_impl->init)
 			curr_impl->init();
 
 		if (curr_impl->is_supported())
 			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
 	}
 	membar_producer();		/* complete raidz_supp_impl[] init */
 	raidz_supp_impl_cnt = c;	/* number of supported impl */
 
 #if defined(_KERNEL)
 	zio_t *bench_zio = NULL;
 	raidz_map_t *bench_rm = NULL;
 	uint64_t bench_parity;
 
 	/* Fake a zio and run the benchmark on a warmed up buffer */
 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
 	bench_zio->io_offset = 0;
 	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
 	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
 	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
 
 	/* Benchmark parity generation methods */
 	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
 		/* generation method fn is measured with fn + 1 parity columns */
 		bench_parity = fn + 1;
 		/* New raidz_map is needed for each generate_p/q/r */
 		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 		    BENCH_D_COLS + bench_parity, bench_parity);
 
 		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
 
 		vdev_raidz_map_free(bench_rm);
 	}
 
 	/* Benchmark data reconstruction methods */
 	/* A single triple-parity map is shared by all reconstruction runs */
 	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 	    BENCH_COLS, PARITY_PQR);
 
 	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
 
 	vdev_raidz_map_free(bench_rm);
 
 	/* cleanup the bench zio */
 	abd_free(bench_zio->io_abd);
 	kmem_free(bench_zio, sizeof (zio_t));
 #else
 	/*
 	 * Skip the benchmark in user space to avoid impacting libzpool
 	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
 	 * is assumed to be the fastest and used by default.
 	 */
 	memcpy(&vdev_raidz_fastest_impl,
 	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
 	    sizeof (vdev_raidz_fastest_impl));
 	/* the memcpy above also copied the source name; restore ours */
 	strcpy(vdev_raidz_fastest_impl.name, "fastest");
 #endif /* _KERNEL */
 }
 
 /*
  * Module initialization: benchmark the implementations, install the
  * "vdev_raidz_bench" kstat (kernel builds only), then activate the
  * selection stored in user_sel_impl and mark the module initialized.
  */
 void
 vdev_raidz_math_init(void)
 {
 	/* Determine the fastest available implementation. */
 	benchmark_raidz();
 
 #if defined(_KERNEL)
 	/* Install kstats for all implementations */
 	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	/* a failed kstat_create() is tolerated; the kstat is just absent */
 	if (raidz_math_kstat != NULL) {
 		raidz_math_kstat->ks_data = NULL;
 		raidz_math_kstat->ks_ndata = UINT32_MAX;
 		kstat_set_raw_ops(raidz_math_kstat,
 		    raidz_math_kstat_headers,
 		    raidz_math_kstat_data,
 		    raidz_math_kstat_addr);
 		kstat_install(raidz_math_kstat);
 	}
 #endif
 
 	/* Finish initialization */
 	/* apply the selection saved by vdev_raidz_impl_set() before init */
 	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
 	raidz_math_initialized = B_TRUE;
 }
 
 void
 vdev_raidz_math_fini(void)
 {
 	raidz_impl_ops_t const *curr_impl;
 
 #if defined(_KERNEL)
 	if (raidz_math_kstat != NULL) {
 		kstat_delete(raidz_math_kstat);
 		raidz_math_kstat = NULL;
 	}
 #endif
 
 	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = raidz_all_maths[i];
 		if (curr_impl->fini)
 			curr_impl->fini();
 	}
 }
 
 /*
  * Selector names that vdev_raidz_impl_set() accepts even before
  * vdev_raidz_math_init() has run, with the selector value each maps to.
  * The names are string literals, hence the const-qualified pointer.
  */
 static const struct {
 	const char *name;
 	uint32_t sel;
 } math_impl_opts[] = {
 	{ "cycle",	IMPL_CYCLE },
 	{ "fastest",	IMPL_FASTEST },
 	{ "original",	IMPL_ORIGINAL },
 	{ "scalar",	IMPL_SCALAR }
 };
 
 /*
  * Function sets desired raidz implementation.
  *
  * If we are called before init(), user preference will be saved in
  * user_sel_impl, and applied in later init() call. This occurs when module
  * parameter is specified on module load. Otherwise, directly update
  * zfs_vdev_raidz_impl.
  *
  * @val		Name of raidz implementation to use.  Trailing whitespace
  *		is ignored.
  *
  * Returns 0 on success.  Returns -EINVAL when val is empty, is
  * RAIDZ_IMPL_NAME_MAX characters or longer, or names neither a mandatory
  * option nor (once initialized) a supported implementation.
  */
 int
 vdev_raidz_impl_set(const char *val)
 {
 	int err = -EINVAL;
 	char req_name[RAIDZ_IMPL_NAME_MAX];
 	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
 	size_t i;
 
 	/* sanitize input */
 	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
 	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
 		return (err);
 
 	(void) strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
 	/*
 	 * Strip trailing whitespace.  The cast keeps the argument within
 	 * the range the <ctype.h> functions are defined for, even where
 	 * plain char is signed.
 	 */
 	while (i > 0 && isspace((unsigned char)req_name[i-1]))
 		i--;
 	req_name[i] = '\0';
 
 	/* Check mandatory options */
 	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
 		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
 			impl = math_impl_opts[i].sel;
 			err = 0;
 			break;
 		}
 	}
 
 	/* check all supported impl if init() was already called */
 	if (err != 0 && raidz_math_initialized) {
 		/* check all supported implementations */
 		for (i = 0; i < raidz_supp_impl_cnt; i++) {
 			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
 				/* selector is the raidz_supp_impl[] index */
 				impl = i;
 				err = 0;
 				break;
 			}
 		}
 	}
 
 	if (err == 0) {
 		if (raidz_math_initialized)
 			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
 		else
 			atomic_swap_32(&user_sel_impl, impl);
 	}
 
 	return (err);
 }
 
 #if defined(_KERNEL) && defined(__linux__)
 
 /*
  * Module parameter "set" handler: forwards the string to
  * vdev_raidz_impl_set() and returns its result.  kp is not used.
  */
 static int
 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
 {
 	int rc = vdev_raidz_impl_set(val);
 
 	return (rc);
 }
 
 /*
  * Module parameter "get" handler: writes a space separated list of
  * selectable names into buffer, with the currently active one enclosed in
  * square brackets, and returns the number of characters written.  kp is
  * not used.
  */
 static int
 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
 {
 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
 	char *out = buffer;
 	int i;
 
 	ASSERT(raidz_math_initialized);
 
 	/*
 	 * list mandatory options; the last two table entries are left to
 	 * the loop over supported implementations below
 	 */
 	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
 		if (impl == math_impl_opts[i].sel)
 			out += sprintf(out, "[%s] ", math_impl_opts[i].name);
 		else
 			out += sprintf(out, "%s ", math_impl_opts[i].name);
 	}
 
 	/* list all supported implementations */
 	for (i = 0; i < raidz_supp_impl_cnt; i++) {
 		if (i == impl)
 			out += sprintf(out, "[%s] ", raidz_supp_impl[i]->name);
 		else
 			out += sprintf(out, "%s ", raidz_supp_impl[i]->name);
 	}
 
 	return ((int)(out - buffer));
 }
 
 /*
  * Register zfs_vdev_raidz_impl as a module parameter (mode 0644) backed by
  * the custom set/get handlers above instead of a plain variable.
  */
 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
     zfs_vdev_raidz_impl_get, NULL, 0644);
 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
 #endif