diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 8db1aef76518..eb81dfe07068 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1,1885 +1,1882 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright (c) 2016, 2017 Intel Corporation. * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. */ /* * Functions to convert between a list of vdevs and an nvlist representing the * configuration. Each entry in the list can be one of: * * Device vdevs * disk=(path=..., devid=...) * file=(path=...) * * Group vdevs * raidz[1|2]=(...) * mirror=(...) * * Hot spares * * While the underlying implementation supports it, group vdevs cannot contain * other group vdevs. All userland verification of devices is contained within * this file. If successful, the nvlist returned can be passed directly to the * kernel; we've done as much verification as possible in userland. * * Hot spares are a special case, and passed down as an array of disk vdevs, at * the same level as the root of the vdev tree. * * The only function exported by this file is 'make_root_vdev'. The * function performs several passes: * * 1. Construct the vdev specification. Performs syntax validation and * makes sure each device is valid. * 2. Check for devices in use. Using libblkid to make sure that no * devices are also in use. Some can be overridden using the 'force' * flag, others cannot. * 3. Check for replication errors if the 'force' flag is not specified. * Validates that the replication level is consistent across the * entire pool. * 4. Call libzfs to label any whole disks with an EFI label. */ #include <assert.h> #include <ctype.h> #include <errno.h> #include <fcntl.h> #include <libintl.h> #include <libnvpair.h> #include <libzutil.h> #include <limits.h> #include <sys/spa.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include "zpool_util.h" #include <sys/zfs_context.h> #include <sys/stat.h> /* * For any given vdev specification, we can have multiple errors. The * vdev_error() function keeps track of whether we have seen an error yet, and * prints out a header if it's the first error we've seen. */ boolean_t error_seen; boolean_t is_force; void vdev_error(const char *fmt, ...) { va_list ap; if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); if (!is_force) (void) fprintf(stderr, gettext("use '-f' to override " "the following errors:\n")); else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); error_seen = B_TRUE; } va_start(ap, fmt); (void) vfprintf(stderr, fmt, ap); va_end(ap); } /* * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap.
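* Hot spares are the exception: a device that is already labeled as a hot * spare may be shared between pools, which the POOL_STATE_SPARE case below * allows when 'isspare' is set.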
*/ int check_file_generic(const char *file, boolean_t force, boolean_t isspare) { char *name; int fd; int ret = 0; pool_state_t state; boolean_t inuse; if ((fd = open(file, O_RDONLY)) < 0) return (0); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { const char *desc; switch (state) { case POOL_STATE_ACTIVE: desc = gettext("active"); break; case POOL_STATE_EXPORTED: desc = gettext("exported"); break; case POOL_STATE_POTENTIALLY_ACTIVE: desc = gettext("potentially active"); break; default: desc = gettext("unknown"); break; } /* * Allow hot spares to be shared between pools. */ if (state == POOL_STATE_SPARE && isspare) { free(name); (void) close(fd); return (0); } if (state == POOL_STATE_ACTIVE || state == POOL_STATE_SPARE || !force) { switch (state) { case POOL_STATE_SPARE: vdev_error(gettext("%s is reserved as a hot " "spare for pool %s\n"), file, name); break; default: vdev_error(gettext("%s is part of %s pool " "'%s'\n"), file, desc, name); break; } ret = -1; } free(name); } (void) close(fd); return (ret); } /* * This may be a shorthand device path or it could be total gibberish. * Check to see if it is a known device available in zfs_vdev_paths. * As part of this check, see if we've been given an entire disk * (minus the slice number). */ static int is_shorthand_path(const char *arg, char *path, size_t path_size, struct stat64 *statbuf, boolean_t *wholedisk) { int error; error = zfs_resolve_shortname(arg, path, path_size); if (error == 0) { *wholedisk = zfs_dev_is_whole_disk(path); if (*wholedisk || (stat64(path, statbuf) == 0)) return (0); } strlcpy(path, arg, path_size); memset(statbuf, 0, sizeof (*statbuf)); *wholedisk = B_FALSE; return (error); } /* * Determine if the given path is a hot spare within the given configuration. * If no configuration is given we rely solely on the label. */ static boolean_t is_spare(nvlist_t *config, const char *path) { int fd; pool_state_t state; char *name = NULL; nvlist_t *label; uint64_t guid, spareguid; nvlist_t *nvroot; nvlist_t **spares; uint_t i, nspares; boolean_t inuse; if (zpool_is_draid_spare(path)) return (B_TRUE); if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || !inuse || state != POOL_STATE_SPARE || zpool_read_label(fd, &label, NULL) != 0) { free(name); (void) close(fd); return (B_FALSE); } free(name); (void) close(fd); if (config == NULL) { nvlist_free(label); return (B_TRUE); } verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { verify(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &spareguid) == 0); if (spareguid == guid) return (B_TRUE); } } return (B_FALSE); } /* * Create a leaf vdev. Determine if this is a file or a device. If it's a * device, fill in the device id to make a complete nvlist. Valid forms for a * leaf vdev are: * * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx * draid* Virtual dRAID spare */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; nvlist_t *vdev = NULL; const char *type = NULL; boolean_t wholedisk = B_FALSE; uint64_t ashift = 0; int err; /* * Determine what type of vdev this is, and put the full path into * 'path'. 
We detect whether this is a device or file afterwards by * checking the st_mode of the file. */ if (arg[0] == '/') { /* * Complete device or file path. Exact type is determined by * examining the file descriptor afterwards. Symbolic links * are resolved to their real paths to determine whole disk * and S_ISBLK/S_ISREG type checks. However, we are careful * to store the given path as ZPOOL_CONFIG_PATH to ensure we * can leverage udev's persistent device labels. */ if (realpath(arg, path) == NULL) { (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), arg); return (NULL); } wholedisk = zfs_dev_is_whole_disk(path); if (!wholedisk && (stat64(path, &statbuf) != 0)) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); } else if (zpool_is_draid_spare(arg)) { if (!is_primary) { (void) fprintf(stderr, gettext("cannot open '%s': dRAID spares can only " "be used to replace primary vdevs\n"), arg); return (NULL); } wholedisk = B_TRUE; strlcpy(path, arg, sizeof (path)); type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); if (err != 0) { /* * If we got ENOENT, then the user gave us * gibberish, so try to direct them with a * reasonable error message. Otherwise, * regurgitate strerror() since it's the best we * can do. */ if (err == ENOENT) { (void) fprintf(stderr, gettext("cannot open '%s': no such " "device in %s\n"), arg, DISK_ROOT); (void) fprintf(stderr, gettext("must be a full path or " "shorthand device name\n")); return (NULL); } else { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } } } if (type == NULL) { /* * Determine whether this is a device or a file. */ if (wholedisk || S_ISBLK(statbuf.st_mode)) { type = VDEV_TYPE_DISK; } else if (S_ISREG(statbuf.st_mode)) { type = VDEV_TYPE_FILE; } else { fprintf(stderr, gettext("cannot use '%s': must " "be a block device or regular file\n"), path); return (NULL); } } /* * Finally, we have the complete device or file, and we know that it is * acceptable to use. Construct the nvlist to describe this vdev. All * vdevs have a 'path' element, and devices also have a 'devid' element. */ verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); /* * Override defaults if custom properties are provided. */ if (props != NULL) { char *value = NULL; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { (void) fprintf(stderr, gettext("ashift must be a number.\n")); return (NULL); } if (ashift != 0 && (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { (void) fprintf(stderr, gettext("invalid 'ashift=%" PRIu64 "' " "property: only values between %" PRId32 " " "and %" PRId32 " are allowed.\n"), ashift, ASHIFT_MIN, ASHIFT_MAX); return (NULL); } } } /* * If the device is known to incorrectly report its physical sector * size, explicitly provide the known correct value.
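* For example, a drive with 4096-byte physical sectors that misreports * 512-byte sectors gets ashift = highbit64(4096) - 1 = 12 from the database * lookup below.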
*/ if (ashift == 0) { int sector_size; if (check_sector_size_database(path, &sector_size) == B_TRUE) ashift = highbit64(sector_size) - 1; } if (ashift > 0) (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); return (vdev); } /* * Go through and verify the replication level of the pool is consistent. * Performs the following checks: * * For the new spec, verifies that devices in mirrors and raidz are the * same size. * * If the current configuration already has inconsistent replication * levels, ignore any other potential problems in the new spec. * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. * * If there is no current spec (create), make sure the new spec has at least * one general purpose vdev. */ typedef struct replication_level { char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; } replication_level_t; #define ZPOOL_FUZZ (16 * 1024 * 1024) /* * N.B. For the purposes of comparing replication levels dRAID can be * considered functionally equivalent to raidz. */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; return (B_TRUE); } return (B_FALSE); } /* * Comparison for determining if dRAID and raidz were passed in either order. */ static boolean_t is_raidz_draid(replication_level_t *a, replication_level_t *b) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && (strcmp(b->zprl_type, "raidz") == 0 || strcmp(b->zprl_type, "draid") == 0)) { return (B_TRUE); } return (B_FALSE); } /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. */ static replication_level_t * get_replication(nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; nvlist_t **child; uint_t c, children; nvlist_t *nv; char *type; replication_level_t lastrep = {0}; replication_level_t rep; replication_level_t *ret; replication_level_t *raidz, *mirror; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; nv = top[t]; /* * For separate logs we ignore the top level vdev replication * constraints. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; /* * Ignore holes introduced by removing aux devices, along * with indirect vdevs introduced by previously removed * vdevs. */ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { int64_t vdev_size; /* * This is a mirror or RAID-Z vdev. Go through and make * sure the contents are all the same (files vs. disks), * keeping track of the number of elements in the * process. * * We also check that the size of each vdev (if it can * be determined) is the same.
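* For example, 'mirror sda sdb' yields {type = mirror, children = 2, * parity = 0} and 'raidz1 sda sdb sdc' yields {type = raidz, children = 3, * parity = 1}.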
*/ rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } /* * The 'dontreport' variable indicates that we've * already reported an error for this spec, so don't * bother doing it again. */ type = NULL; dontreport = 0; vdev_size = -1LL; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *path; struct stat64 statbuf; int64_t size = -1LL; char *childtype; int fd, err; rep.zprl_children++; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); /* * If this is a replacing or spare vdev, then * get the real first child of the vdev: do this * in a loop because replacing and spare vdevs * can be nested. */ while (strcmp(childtype, VDEV_TYPE_REPLACING) == 0 || strcmp(childtype, VDEV_TYPE_SPARE) == 0) { nvlist_t **rchild; uint_t rchildren; verify(nvlist_lookup_nvlist_array(cnv, ZPOOL_CONFIG_CHILDREN, &rchild, &rchildren) == 0); assert(rchildren == 2); cnv = rchild[0]; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); } verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &path) == 0); /* * If we have a raidz/mirror that combines disks * with files, report it as an error. */ if (!dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s contains both " "files and devices\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } /* * According to stat(2), the value of 'st_size' * is undefined for block devices and character * devices. But there is no effective way to * determine the real size in userland. * * Instead, we'll take advantage of an * implementation detail of spec_size(). If the * device is currently open, then we (should) * return a valid size. * * If we still don't get a valid size (indicated * by a size of 0 or MAXOFFSET_T), then ignore * this device altogether. */ if ((fd = open(path, O_RDONLY)) >= 0) { err = fstat64_blk(fd, &statbuf); (void) close(fd); } else { err = stat64(path, &statbuf); } if (err != 0 || statbuf.st_size == 0 || statbuf.st_size == MAXOFFSET_T) continue; size = statbuf.st_size; /* * Also make sure that devices and * slices have a consistent size. If * they differ by a significant amount * (~16MB) then report an error. */ if (!dontreport && (vdev_size != -1LL && (llabs(size - vdev_size) > ZPOOL_FUZZ))) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "%s contains devices of " "different sizes\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } type = childtype; vdev_size = size; } } /* * At this point, we have the replication of the last toplevel * vdev in 'rep'. Compare it to 'lastrep' to see if it is * different. */ if (lastrep.zprl_type != NULL) { if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { /* * Accepted raidz and mirror when they can * handle the same number of disk failures. */ if (raidz->zprl_parity != mirror->zprl_children - 1) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: " "%s and %s vdevs with " "different redundancy, " "%llu vs. 
%llu (%llu-way) " "are present\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t) raidz->zprl_parity, (u_longlong_t) mirror->zprl_children - 1, (u_longlong_t) mirror->zprl_children); else return (NULL); } } else if (is_raidz_draid(&lastrep, &rep)) { /* * Accepted raidz and draid when they can * handle the same number of disk failures. */ if (lastrep.zprl_parity != rep.zprl_parity) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s and %s vdevs " "with different " "redundancy, %llu vs. " "%llu are present\n"), lastrep.zprl_type, rep.zprl_type, (u_longlong_t) lastrep.zprl_parity, (u_longlong_t) rep.zprl_parity); else return (NULL); } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %s and %s vdevs are " "present\n"), lastrep.zprl_type, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu and %llu device parity " "%s vdevs are present\n"), (u_longlong_t) lastrep.zprl_parity, (u_longlong_t)rep.zprl_parity, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_children != rep.zprl_children) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu-way and %llu-way %s " "vdevs are present\n"), (u_longlong_t) lastrep.zprl_children, (u_longlong_t) rep.zprl_children, rep.zprl_type); else return (NULL); } } lastrep = rep; } if (ret != NULL) *ret = rep; return (ret); } /* * Check the replication level of the vdev spec against the current pool. Calls * get_replication() to make sure the new spec is self-consistent. If the pool * has a consistent replication level, then we ignore any errors. Otherwise, * report any difference between the two. */ static int check_replication(nvlist_t *config, nvlist_t *newroot) { nvlist_t **child; uint_t children; replication_level_t *current = NULL, *new; replication_level_t *raidz, *mirror; int ret; /* * If we have a current pool configuration, check to see if it's * self-consistent. If not, simply return success. */ if (config != NULL) { nvlist_t *nvroot; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if ((current = get_replication(nvroot, B_FALSE)) == NULL) return (0); } /* * for spares there may be no children, and therefore no * replication level to check */ if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) || (children == 0)) { free(current); return (0); } /* * If all we have is logs then there's no replication level to check. */ if (num_logs(newroot) == children) { free(current); return (0); } /* * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ if ((new = get_replication(newroot, B_TRUE)) == NULL) { free(current); return (-1); } /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { if (is_raidz_mirror(current, new, &raidz, &mirror) || is_raidz_mirror(new, current, &raidz, &mirror)) { if (raidz->zprl_parity != mirror->zprl_children - 1) { vdev_error(gettext( "mismatched replication level: pool and " "new vdev with different redundancy, %s " "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t)raidz->zprl_parity, (u_longlong_t)mirror->zprl_children - 1, (u_longlong_t)mirror->zprl_children); ret = -1; } } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " "and new vdev is %s\n"), current->zprl_type, new->zprl_type); ret = -1; } else if (current->zprl_parity != new->zprl_parity) { vdev_error(gettext( "mismatched replication level: pool uses %llu " "device parity and new vdev uses %llu\n"), (u_longlong_t)current->zprl_parity, (u_longlong_t)new->zprl_parity); ret = -1; } else if (current->zprl_children != new->zprl_children) { vdev_error(gettext( "mismatched replication level: pool uses %llu-way " "%s and new vdev uses %llu-way %s\n"), (u_longlong_t)current->zprl_children, current->zprl_type, (u_longlong_t)new->zprl_children, new->zprl_type); ret = -1; } } free(new); if (current != NULL) free(current); return (ret); } static int zero_label(char *path) { const int size = 4096; char buf[size]; int err, fd; if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (-1); } memset(buf, 0, size); err = write(fd, buf, size); (void) fdatasync(fd); (void) close(fd); if (err == -1) { (void) fprintf(stderr, gettext("cannot zero first %d bytes " "of '%s': %s\n"), size, path, strerror(errno)); return (-1); } if (err != size) { (void) fprintf(stderr, gettext("could only zero %d/%d bytes " "of '%s'\n"), err, size, path); return (-1); } return (0); } /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this * device in order to provide a devid. Now that we have labelled the disk and * know that slice 0 is valid, we can construct the devid now. * * If the disk was already labeled with an EFI label, we will have gotten the * devid already (because we were able to open the whole disk). Otherwise, we * need to get the devid after we label the disk. */ static int make_disks(zpool_handle_t *zhp, nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *type, *path; char devpath[MAXPATHLEN]; char udevpath[MAXPATHLEN]; uint64_t wholedisk; struct stat64 statbuf; int is_exclusive = 0; int fd; int ret; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (strcmp(type, VDEV_TYPE_DISK) != 0) return (0); /* * We have a disk device. If this is a whole disk write * out the efi partition table, otherwise write zero's to * the first 4k of the partition. This is to ensure that * libblkid will not misidentify the partition due to a * magic value left by the previous filesystem. */ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); if (!wholedisk) { /* * Update device id string for mpath nodes (Linux only) */ if (is_mpath_whole_disk(path)) update_vdev_config_dev_strs(nv); if (!is_spare(NULL, path)) (void) zero_label(path); return (0); } if (realpath(path, devpath) == NULL) { ret = errno; (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), path); return (ret); } /* * Remove any previously existing symlink from a udev path to * the device before labeling the disk. This ensures that * only newly created links are used. 
Otherwise there is a * window between when udev deletes and recreates the link * during which access attempts will fail with ENOENT. */ strlcpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); fd = open(devpath, O_RDWR|O_EXCL); if (fd == -1) { if (errno == EBUSY) is_exclusive = 1; #ifdef __FreeBSD__ if (errno == EPERM) is_exclusive = 1; #endif } else { (void) close(fd); } /* * If the partition exists, contains a valid spare label, * and is opened exclusively there is no need to partition * it. Hot spares have already been partitioned and are * held open exclusively by the kernel as a safety measure. * * If the provided path is for a /dev/disk/ device its * symbolic link will be removed, partition table created, * and then block until udev creates the new link. */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { ret = lstat64(udevpath, &statbuf); if (ret == 0 && S_ISLNK(statbuf.st_mode)) (void) unlink(udevpath); } /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. */ if (zpool_label_disk(g_zfs, zhp, devnode) == -1) return (-1); /* * Wait for udev to signal the device is available * by the provided path. */ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); if (ret) { (void) fprintf(stderr, gettext("missing link: %s was " "partitioned but %s is missing\n"), devnode, udevpath); return (ret); } ret = zero_label(udevpath); if (ret) return (ret); } /* * Update the path to refer to the partition. The presence of * the 'whole_disk' field indicates to the CLI that we should * chop off the partition number when displaying the device in * future output. */ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); /* * Update device id strings for whole disks (Linux only) */ update_vdev_config_dev_strs(nv); return (0); } for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); return (0); } /* * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. */ static boolean_t is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, boolean_t replacing, boolean_t isspare) { nvlist_t **child; uint_t c, children; char *type, *path; int ret = 0; char buf[MAXPATHLEN]; uint64_t wholedisk = B_FALSE; boolean_t anyinuse = B_FALSE; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); /* * As a generic check, we look to see if this is a replace of a * hot spare within the same pool. If so, we allow it * regardless of what libblkid or zpool_in_use() says. 
*/ if (replacing) { (void) strlcpy(buf, path, sizeof (buf)); if (wholedisk) { ret = zfs_append_partition(buf, sizeof (buf)); if (ret == -1) return (-1); } if (is_spare(config, buf)) return (B_FALSE); } if (strcmp(type, VDEV_TYPE_DISK) == 0) ret = check_device(path, force, isspare, wholedisk); else if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); return (ret != 0); } for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_TRUE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; return (anyinuse); } /* * Returns the parity level extracted from a raidz or draid type. * If the parity cannot be determined zero is returned. */ static int get_parity(const char *type) { long parity = 0; const char *p; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, no suffixes allowed */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || *end != '\0' || parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { return (0); } } } else if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { p = type + strlen(VDEV_TYPE_DRAID); if (*p == '\0' || *p == ':') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, allowed suffixes: '\0' or ':' */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || parity < 1 || parity > VDEV_DRAID_MAXPARITY || (*end != '\0' && *end != ':')) { return (0); } } } return ((int)parity); } /* * Assign the minimum and maximum number of devices allowed for * the specified type. On error NULL is returned, otherwise the * type prefix is returned (raidz, mirror, etc). */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { int nparity; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { nparity = get_parity(type); if (nparity == 0) return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { return (VDEV_TYPE_RAIDZ); } else { return (VDEV_TYPE_DRAID); } } if (maxdev != NULL) *maxdev = INT_MAX; if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; return (VDEV_TYPE_MIRROR); } if (strcmp(type, "spare") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_SPARE); } if (strcmp(type, "log") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_LOG); } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { if (mindev != NULL) *mindev = 1; return (type); } if (strcmp(type, "cache") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_L2CACHE); } return (NULL); } /* * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. 
The expected format is: * * draid[<parity>][:<data>d][:<children>c][:<spares>s] * * The intent is to be able to generate a good configuration when no * additional information is provided. The only mandatory component * of the 'type' is the 'draid' prefix. If a value is not provided * then reasonable defaults are used. The optional components may * appear in any order but the d/s/c suffix is required. * * Valid inputs: * - data: number of data devices per group (1-255) * - parity: number of parity blocks per group (1-3) * - spares: number of distributed spares (0-100) * - children: total number of devices (1-255) * * Examples: * - zpool create tank draid * - zpool create tank draid2:8d:51c:2s */ static int draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) { uint64_t nparity = 1; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; long value; if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) return (EINVAL); nparity = (uint64_t)get_parity(type); - if (nparity == 0) + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); return (EINVAL); + } char *p = (char *)type; while ((p = strchr(p, ':')) != NULL) { char *end; p = p + 1; errno = 0; if (!isdigit(p[0])) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:<number><c|d|s>] not '%s'\n"), type); return (EINVAL); } /* Expected non-zero value with c/d/s suffix */ value = strtol(p, &end, 10); char suffix = tolower(*end); if (errno != 0 || (suffix != 'c' && suffix != 'd' && suffix != 's')) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:<number><c|d|s>] not '%s'\n"), type); return (EINVAL); } if (suffix == 'c') { if ((uint64_t)value != children) { fprintf(stderr, gettext("invalid number of dRAID children; " "%llu required but %llu provided\n"), (u_longlong_t)value, (u_longlong_t)children); return (EINVAL); } } else if (suffix == 'd') { ndata = (uint64_t)value; } else if (suffix == 's') { nspares = (uint64_t)value; } else { verify(0); /* Unreachable */ } } /* * When a specific number of data disks is not provided limit a * redundancy group to 8 data disks. This value was selected to * provide a reasonable tradeoff between capacity and performance. */ if (ndata == UINT64_MAX) { if (children > nspares + nparity) { ndata = MIN(children - nspares - nparity, 8); } else { fprintf(stderr, gettext("requested number of " "distributed spares %llu and parity level %llu\n" "leaves no disks available for data\n"), (u_longlong_t)nspares, (u_longlong_t)nparity); return (EINVAL); } } /* Verify the maximum allowed group size is never exceeded. */ if (ndata == 0 || (ndata + nparity > children - nspares)) { fprintf(stderr, gettext("requested number of dRAID data " "disks per group %llu is too high,\nat most %llu disks " "are available for data\n"), (u_longlong_t)ndata, (u_longlong_t)(children - nspares - nparity)); return (EINVAL); } - if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { - fprintf(stderr, - gettext("invalid dRAID parity level %llu; must be " - "between 1 and %d\n"), (u_longlong_t)nparity, - VDEV_DRAID_MAXPARITY); - return (EINVAL); - } - /* * Verify the requested number of spares can be satisfied. * An arbitrary limit of 100 distributed spares is applied.
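* For example, 'draid2:2s' on 12 children is satisfiable: ndata defaults to * MIN(12 - 2 - 2, 8) = 8, leaving 12 - (8 + 2) = 2 disks for the requested * spares.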
*/ if (nspares > 100 || nspares > (children - (ndata + nparity))) { fprintf(stderr, gettext("invalid number of dRAID spares %llu; additional " "disks would be required\n"), (u_longlong_t)nspares); return (EINVAL); } /* Verify the requested number children is sufficient. */ if (children < (ndata + nparity + nspares)) { fprintf(stderr, gettext("%llu disks were provided, but at " "least %llu disks are required for this config\n"), (u_longlong_t)children, (u_longlong_t)(ndata + nparity + nspares)); } if (children > VDEV_DRAID_MAX_CHILDREN) { fprintf(stderr, gettext("%llu disks were provided, but " "dRAID only supports up to %u disks"), (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); } /* * Calculate the minimum number of groups required to fill a slice. * This is the LCM of the stripe width (ndata + nparity) and the * number of data drives (children - nspares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); return (0); } /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths * because the program is just going to exit anyway. */ static nvlist_t * construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type, *fulltype; boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; toplevels = 0; spares = NULL; l2cache = NULL; nspares = 0; nlogs = 0; nl2cache = 0; is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { fulltype = argv[0]; nv = NULL; /* * If it's a mirror, raidz, or draid the subsequent arguments * are its leaves -- until we encounter the next mirror, * raidz or draid. */ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'spare' can be " "specified only once\n")); goto spec_out; } is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } if (strcmp(type, VDEV_TYPE_LOG) == 0) { if (seen_logs) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'log' can be " "specified only once\n")); goto spec_out; } seen_logs = B_TRUE; is_log = B_TRUE; is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* * A log is not a real grouping device. * We just set is_log and continue. 
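* For example, in 'zpool create tank sda log mirror sdb sdc' the 'log' * keyword only marks the mirror that follows it; 'sda' remains a normal * top-level vdev.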
*/ continue; } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { if (l2cache != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'cache' can be " "specified only once\n")); goto spec_out; } is_log = is_special = B_FALSE; is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: unsupported '%s' " "device: %s\n"), is_log ? "log" : "special", type); goto spec_out; } nlogs++; } for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], !(is_log || is_special || is_dedup || is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); goto spec_out; } child[children - 1] = nv; } if (children < mindev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s requires at least %d " "devices\n"), argv[0], mindev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } if (children > maxdev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s supports no more than " "%d devices\n"), argv[0], maxdev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } argc -= c; argv += c; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { spares = child; nspares = children; continue; } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { l2cache = child; nl2cache = children; continue; } else { /* create a top-level vdev with children */ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, fulltype, children) != 0) { for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, children) == 0); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); } } else { /* * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. 
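* A bare argument such as 'sda' or '/tmp/vdev-file' that is not preceded by * a grouping keyword becomes its own top-level vdev here.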
*/ if ((nv = make_leaf_vdev(props, argv[0], !(is_log || is_special || is_dedup || is_spare))) == NULL) goto spec_out; verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } argc--; argv++; } toplevels++; top = realloc(top, toplevels * sizeof (nvlist_t *)); if (top == NULL) zpool_no_memory(); top[toplevels - 1] = nv; } if (toplevels == 0 && nspares == 0 && nl2cache == 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto spec_out; } if (seen_logs && nlogs == 0) { (void) fprintf(stderr, gettext("invalid vdev specification: " "log requires at least 1 device\n")); goto spec_out; } /* * Finally, create nvroot and add all top-level vdevs to it. */ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)top, toplevels) == 0); if (nspares != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t **)spares, nspares) == 0); if (nl2cache != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t **)l2cache, nl2cache) == 0); spec_out: for (t = 0; t < toplevels; t++) nvlist_free(top[t]); for (t = 0; t < nspares; t++) nvlist_free(spares[t]); for (t = 0; t < nl2cache; t++) nvlist_free(l2cache[t]); free(spares); free(l2cache); free(top); return (nvroot); } nvlist_t * split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv) { nvlist_t *newroot = NULL, **child; uint_t c, children; if (argc > 0) { if ((newroot = construct_spec(props, argc, argv)) == NULL) { (void) fprintf(stderr, gettext("Unable to build a " "pool from the specified devices\n")); return (NULL); } if (!flags.dryrun && make_disks(zhp, newroot) != 0) { nvlist_free(newroot); return (NULL); } /* avoid any tricks in the spec */ verify(nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); for (c = 0; c < children; c++) { char *path; const char *type; int min, max; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_PATH, &path) == 0); if ((type = is_grouping(path, &min, &max)) != NULL) { (void) fprintf(stderr, gettext("Cannot use " "'%s' as a device for splitting\n"), type); nvlist_free(newroot); return (NULL); } } } if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } static int num_normal_vdevs(nvlist_t *nvroot) { nvlist_t **top; uint_t t, toplevels, normal = 0; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t log = B_FALSE; (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); if (log) continue; if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; normal++; } return (normal); } /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that * they are not currently in use by any other known consumer. 
The 'poolconfig' * parameter is the current configuration of the pool when adding devices to an * existing pool, and is used to perform additional checks, such as changing the * replication level of the pool. It can be 'NULL' to indicate that this is a * new pool. The 'force' flag controls whether devices should be forcefully * added, even if they appear in use. */ nvlist_t * make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv) { nvlist_t *newroot; nvlist_t *poolconfig = NULL; is_force = force; /* * Construct the vdev specification. If this is successful, we know * that we have a valid specification, and that all devices can be * opened. */ if ((newroot = construct_spec(props, argc, argv)) == NULL) return (NULL); if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { nvlist_free(newroot); return (NULL); } /* * Validate each device to make sure that it's not shared with another * subsystem. We do this even if 'force' is set, because there are some * uses (such as a dedicated dump device) that even '-f' cannot * override. */ if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { nvlist_free(newroot); return (NULL); } /* * Check the replication level of the given vdevs and report any errors * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. */ if (check_rep && check_replication(poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } /* * On pool create the new vdev spec must have one normal vdev. */ if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { vdev_error(gettext("at least one general top-level vdev must " "be specified\n")); nvlist_free(newroot); return (NULL); } /* * Run through the vdev specification and label any whole disks found. */ if (!dryrun && make_disks(zhp, newroot) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 7afcc1231348..6bd35713ff18 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -1,1202 +1,1200 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/txg.h> #include <sys/dmu_objset.h> #include <sys/dmu_traverse.h> #include <sys/dmu_redact.h> #include <sys/bqueue.h> #include <sys/objlist.h> #include <sys/dmu_tx.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #include <sys/zap.h> #include <sys/zfs_znode.h> #endif /* * This controls the number of entries in the buffer the redaction_list_update * synctask uses to buffer writes to the redaction list. */ static const int redact_sync_bufsize = 1024; /* * Controls how often to update the redaction list when creating a redaction * list.
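* (Updating less often presumably means a resumed redaction must revisit * more of the dataset; updating more often costs more synctasks.)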
*/ static const uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* 1s */ /* * This tunable controls the length of the queues that zfs redact worker threads * use to communicate. If the dmu_redact_snap thread is blocking on these * queues, this variable may need to be increased. If there is a significant * slowdown at the start of a redact operation as these threads consume all the * available IO resources, or the queues are consuming too much memory, this * variable may need to be decreased. */ static const int zfs_redact_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs redact. The * fill fraction controls the frequency with which threads have to be * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these * should be tuned down. If the queues empty before the signalled thread can * catch up, then these should be tuned up. */ static const uint64_t zfs_redact_queue_ff = 20; struct redact_record { bqueue_node_t ln; boolean_t eos_marker; /* Marks the end of the stream */ uint64_t start_object; uint64_t start_blkid; uint64_t end_object; uint64_t end_blkid; uint8_t indblkshift; uint32_t datablksz; }; struct redact_thread_arg { bqueue_t q; objset_t *os; /* Objset to traverse */ dsl_dataset_t *ds; /* Dataset to traverse */ struct redact_record *current_record; int error_code; boolean_t cancel; zbookmark_phys_t resume; objlist_t *deleted_objs; uint64_t *num_blocks_visited; uint64_t ignore_object; /* ignore further callbacks on this */ uint64_t txg; /* txg to traverse since */ }; /* * The redaction node is a wrapper around the redaction record that is used * by the redaction merging thread to sort the records and determine overlaps. * * It contains two nodes; one sorts the records by their start_zb, and the other * sorts the records by their end_zb. */ struct redact_node { avl_node_t avl_node_start; avl_node_t avl_node_end; struct redact_record *record; struct redact_thread_arg *rt_arg; uint32_t thread_num; }; struct merge_data { list_t md_redact_block_pending; redact_block_phys_t md_coalesce_block; uint64_t md_last_time; redact_block_phys_t md_furthest[TXG_SIZE]; /* Lists of struct redact_block_list_node. */ list_t md_blocks[TXG_SIZE]; boolean_t md_synctask_txg[TXG_SIZE]; uint64_t md_latest_synctask_txg; redaction_list_t *md_redaction_list; }; /* * A wrapper around struct redact_block so it can be stored in a list_t. */ struct redact_block_list_node { redact_block_phys_t block; list_node_t node; }; /* * We've found a new redaction candidate. In order to improve performance, we * coalesce these blocks when they're adjacent to each other. This function * handles that. If the new candidate block range is immediately after the * range we're building, coalesce it into the range we're building. Otherwise, * put the record we're building on the queue, and update the build pointer to * point to the new record. 
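* For example, (object 5, blkids 10-20) followed by (object 5, blkids 21-30) * coalesces into (object 5, blkids 10-30); a record ending at blkid * UINT64_MAX also merges with one starting at blkid 0 of the next object.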
*/ static void record_merge_enqueue(bqueue_t *q, struct redact_record **build, struct redact_record *new) { if (new->eos_marker) { if (*build != NULL) bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } if (*build == NULL) { *build = new; return; } struct redact_record *curbuild = *build; if ((curbuild->end_object == new->start_object && curbuild->end_blkid + 1 == new->start_blkid && curbuild->end_blkid != UINT64_MAX) || (curbuild->end_object + 1 == new->start_object && curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { curbuild->end_object = new->end_object; curbuild->end_blkid = new->end_blkid; kmem_free(new, sizeof (*new)); } else { bqueue_enqueue(q, curbuild, sizeof (*curbuild)); *build = new; } } #ifdef _KERNEL struct objnode { avl_node_t node; uint64_t obj; }; static int objnode_compare(const void *o1, const void *o2) { const struct objnode *obj1 = o1; const struct objnode *obj2 = o2; if (obj1->obj < obj2->obj) return (-1); if (obj1->obj > obj2->obj) return (1); return (0); } static objlist_t * zfs_get_deleteq(objset_t *os) { objlist_t *deleteq_objlist = objlist_create(); uint64_t deleteq_obj; zap_cursor_t zc; zap_attribute_t za; dmu_object_info_t doi; ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); /* * In order to insert objects into the objlist, they must be in sorted * order. We don't know what order we'll get them out of the ZAP in, so * we insert them into and remove them from an avl_tree_t to sort them. */ avl_tree_t at; avl_create(&at, objnode_compare, sizeof (struct objnode), offsetof(struct objnode, node)); for (zap_cursor_init(&zc, os, deleteq_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); obj->obj = za.za_first_integer; avl_add(&at, obj); } zap_cursor_fini(&zc); struct objnode *next, *found = avl_first(&at); while (found != NULL) { next = AVL_NEXT(&at, found); objlist_insert(deleteq_objlist, found->obj); found = next; } void *cookie = NULL; while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) kmem_free(found, sizeof (*found)); avl_destroy(&at); return (deleteq_objlist); } #endif /* * This is the callback function to traverse_dataset for the redaction threads * for dmu_redact_snap. This thread is responsible for creating redaction * records for all the data that is modified by the snapshots we're redacting * with respect to. Redaction records represent ranges of data that have been * modified by one of the redaction snapshots, and are stored in the * redact_record struct. We need to create redaction records for three * cases: * * First, if there's a normal write, we need to create a redaction record for * that block. * * Second, if there's a hole, we need to create a redaction record that covers * the whole range of the hole. If the hole is in the meta-dnode, it must cover * every block in all of the objects in the hole. * * Third, if there is a deleted object, we need to create a redaction record for * all of the blocks in that object. 
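* A deleted object is represented compactly as a single record covering * start_blkid 0 through end_blkid UINT64_MAX.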
*/ static int redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { (void) spa, (void) zilog; struct redact_thread_arg *rta = arg; struct redact_record *record; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= rta->resume.zb_object); if (rta->cancel) return (SET_ERROR(EINTR)); if (rta->ignore_object == zb->zb_object) return (0); /* * If we're visiting a dnode, we need to handle the case where the * object has been deleted. */ if (zb->zb_level == ZB_DNODE_LEVEL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); if (zb->zb_object == 0) return (0); /* * If the object has been deleted, redact all of the blocks in * it. */ if (dnp->dn_type == DMU_OT_NONE || objlist_exists(rta->deleted_objs, zb->zb_object)) { rta->ignore_object = zb->zb_object; record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; record->start_blkid = 0; record->end_blkid = UINT64_MAX; record_merge_enqueue(&rta->q, &rta->current_record, record); } return (0); } else if (zb->zb_level < 0) { return (0); } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) { /* * If this is an indirect block, but not a hole, it doesn't * provide any useful information for redaction, so ignore it. */ return (0); } /* * At this point, there are two options left for the type of block we're * looking at. Either this is a hole (which could be in the dnode or * the meta-dnode), or it's a level 0 block of some sort. If it's a * hole, we create a redaction record that covers the whole range. If * the hole is in a dnode, we need to redact all the blocks in that * hole. If the hole is in the meta-dnode, we instead need to redact * all blocks in every object covered by that hole. If it's a level 0 * block, we only need to redact that single block. 
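* For a hole in the meta-dnode the block range is converted to an object * range below: each meta-dnode block of dn_datablkszsec * SPA_MINBLOCKSIZE * bytes covers (block size / sizeof (dnode_phys_t)) objects, e.g. 32 objects * for a 16K block (16384 / 512).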
*/ record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; if (BP_IS_HOLE(bp)) { record->start_blkid = zb->zb_blkid * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); record->end_blkid = ((zb->zb_blkid + 1) * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1; if (zb->zb_object == DMU_META_DNODE_OBJECT) { record->start_object = record->start_blkid * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t)); record->start_blkid = 0; record->end_object = ((record->end_blkid + 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t))) - 1; record->end_blkid = UINT64_MAX; } } else if (zb->zb_level != 0 || zb->zb_object == DMU_META_DNODE_OBJECT) { kmem_free(record, sizeof (*record)); return (0); } else { record->start_blkid = record->end_blkid = zb->zb_blkid; } record->indblkshift = dnp->dn_indblkshift; record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; record_merge_enqueue(&rta->q, &rta->current_record, record); return (0); } static __attribute__((noreturn)) void redact_traverse_thread(void *arg) { struct redact_thread_arg *rt_arg = arg; int err; struct redact_record *data; #ifdef _KERNEL if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS) rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os); else rt_arg->deleted_objs = objlist_create(); #else rt_arg->deleted_objs = objlist_create(); #endif err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, redact_cb, rt_arg); if (err != EINTR) rt_arg->error_code = err; objlist_destroy(rt_arg->deleted_objs); data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data); thread_exit(); } static inline void create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object, uint64_t blkid) { zb->zb_object = object; zb->zb_level = 0; zb->zb_blkid = blkid; } /* * This is a utility function that can do the comparison for the start or ends * of the ranges in a redact_record. */ static int redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1, uint64_t obj2, uint64_t off2, uint32_t dbss2) { zbookmark_phys_t z1, z2; create_zbookmark_from_obj_off(&z1, obj1, off1); create_zbookmark_from_obj_off(&z2, obj2, off2); return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0, dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2)); } /* * Compare two redaction records by their range's start location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). */ static int redact_node_compare_start(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *rr1 = rn1->record; const struct redact_record *rr2 = rn2->record; if (rr1->eos_marker) return (1); if (rr2->eos_marker) return (-1); int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid, rr1->datablksz, rr2->start_object, rr2->start_blkid, rr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Compare two redaction records by their range's end location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). 
*/ static int redact_node_compare_end(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *srr1 = rn1->record; const struct redact_record *srr2 = rn2->record; if (srr1->eos_marker) return (1); if (srr2->eos_marker) return (-1); int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, srr1->datablksz, srr2->end_object, srr2->end_blkid, srr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Utility function that compares two redaction records to determine if any part * of the "from" record is before any part of the "to" record. Also causes End * of Stream redaction records to compare after all others, so that the * redaction merging logic can stay simple. */ static boolean_t redact_record_before(const struct redact_record *from, const struct redact_record *to) { if (from->eos_marker == B_TRUE) return (B_FALSE); else if (to->eos_marker == B_TRUE) return (B_TRUE); return (redact_range_compare(from->start_object, from->start_blkid, from->datablksz, to->end_object, to->end_blkid, to->datablksz) <= 0); } /* * Pop a new redaction record off the queue, check that the records are in the * right order, and free the old data. */ static struct redact_record * get_next_redact_record(bqueue_t *bq, struct redact_record *prev) { struct redact_record *next = bqueue_dequeue(bq); ASSERT(redact_record_before(prev, next)); kmem_free(prev, sizeof (*prev)); return (next); } /* * Remove the given redaction node from both trees, pull a new redaction record * off the queue, free the old redaction record, update the redaction node, and * reinsert the node into the trees. */ static int update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, struct redact_node *redact_node) { avl_remove(start_tree, redact_node); avl_remove(end_tree, redact_node); redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, redact_node->record); avl_add(end_tree, redact_node); avl_add(start_tree, redact_node); return (redact_node->rt_arg->error_code); } /* * Synctask for updating redaction lists. We first take this txg's list of * redacted blocks and append those to the redaction list. We then update the * redaction list's bonus buffer. We store the furthest blocks we visited and * the list of snapshots that we're redacting with respect to. We need these so * that redacted sends and receives can be correctly resumed. 
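 *
 * Illustrative sketch (not part of this patch): the buffered-append
 * pattern the synctask uses, with hypothetical ex_* helpers standing in
 * for the list and dmu_write() machinery:
 */
#if 0	/* exposition only; all ex_* names are made up */
#define	EX_BUFSZ	64
static void
ex_append_all(ex_list_t *src, ex_store_t *store)
{
    uint64_t buf[EX_BUFSZ];
    int n = 0;
    uint64_t val;

    while (ex_list_pop(src, &val)) {
        buf[n++] = val;
        if (n == EX_BUFSZ) {    /* flush a full chunk */
            ex_store_append(store, buf, n);
            n = 0;
        }
    }
    if (n > 0)                  /* flush the partial tail */
        ex_store_append(store, buf, n);
}
#endif
/*
 * The synctask itself: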
 */
static void
redaction_list_update_sync(void *arg, dmu_tx_t *tx)
{
    struct merge_data *md = arg;
    uint64_t txg = dmu_tx_get_txg(tx);
    list_t *list = &md->md_blocks[txg & TXG_MASK];
    redact_block_phys_t *furthest_visited =
        &md->md_furthest[txg & TXG_MASK];
    objset_t *mos = tx->tx_pool->dp_meta_objset;
    redaction_list_t *rl = md->md_redaction_list;
    int bufsize = redact_sync_bufsize;
    redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf),
        KM_SLEEP);
    int index = 0;

    dmu_buf_will_dirty(rl->rl_dbuf, tx);

    for (struct redact_block_list_node *rbln = list_remove_head(list);
        rbln != NULL; rbln = list_remove_head(list)) {
        ASSERT3U(rbln->block.rbp_object, <=,
            furthest_visited->rbp_object);
        ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object ||
            rbln->block.rbp_blkid <= furthest_visited->rbp_blkid);
        buf[index] = rbln->block;
        index++;
        if (index == bufsize) {
            dmu_write(mos, rl->rl_object,
                rl->rl_phys->rlp_num_entries * sizeof (*buf),
                bufsize * sizeof (*buf), buf, tx);
            rl->rl_phys->rlp_num_entries += bufsize;
            index = 0;
        }
        kmem_free(rbln, sizeof (*rbln));
    }
    if (index > 0) {
        dmu_write(mos, rl->rl_object,
            rl->rl_phys->rlp_num_entries * sizeof (*buf),
            index * sizeof (*buf), buf, tx);
        rl->rl_phys->rlp_num_entries += index;
    }
    kmem_free(buf, bufsize * sizeof (*buf));

    md->md_synctask_txg[txg & TXG_MASK] = B_FALSE;
    rl->rl_phys->rlp_last_object = furthest_visited->rbp_object;
    rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid;
}

static void
commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
    uint64_t blkid)
{
    dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir);
    dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node));
    VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
    uint64_t txg = dmu_tx_get_txg(tx);
    if (!md->md_synctask_txg[txg & TXG_MASK]) {
        dsl_sync_task_nowait(dmu_tx_pool(tx),
            redaction_list_update_sync, md, tx);
        md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
        md->md_latest_synctask_txg = txg;
    }
    md->md_furthest[txg & TXG_MASK].rbp_object = object;
    md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid;
    list_move_tail(&md->md_blocks[txg & TXG_MASK],
        &md->md_redact_block_pending);
    dmu_tx_commit(tx);
    md->md_last_time = gethrtime();
}

/*
 * We want to store the list of blocks that we're redacting in the bookmark's
 * redaction list.  However, this list is stored in the MOS, which means it
 * can only be written to in syncing context.  To get around this, we create
 * a synctask that will write to the MOS for us.  We tell it what to write by
 * keeping a linked list for each transaction group: every time we decide to
 * redact a block, we append it to the list for the transaction group that is
 * currently in open context.  We also update some progress information that
 * the synctask will store to enable resumable redacted sends.
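 *
 * Illustrative sketch (not part of this patch): the run-coalescing that
 * update_redaction_list() performs, reduced to a hypothetical helper.
 * A run of adjacent blkids in one object is extended in place; anything
 * else forces the current run to be emitted and a new one started.
 */
#if 0	/* exposition only; ex_run and ex_extend are made-up names */
struct ex_run {
    uint64_t obj;
    uint64_t blkid;
    uint64_t count;     /* 0 means "no run yet" */
};
static boolean_t
ex_extend(struct ex_run *run, uint64_t obj, uint64_t blkid, uint64_t max)
{
    if (run->count != 0 && run->obj == obj &&
        run->blkid + run->count == blkid && run->count < max) {
        run->count++;
        return (B_TRUE);    /* absorbed into current run */
    }
    return (B_FALSE);       /* caller emits *run, then starts anew */
}
#endif
/*
 * The real thing, operating on redact_block_phys_t: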
 */
static void
update_redaction_list(struct merge_data *md, objset_t *os, uint64_t object,
    uint64_t blkid, uint64_t endblkid, uint32_t blksz)
{
    boolean_t enqueue = B_FALSE;
    redact_block_phys_t cur = {0};
    uint64_t count = endblkid - blkid + 1;
    while (count > REDACT_BLOCK_MAX_COUNT) {
        update_redaction_list(md, os, object, blkid,
            blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz);
        blkid += REDACT_BLOCK_MAX_COUNT;
        count -= REDACT_BLOCK_MAX_COUNT;
    }
    redact_block_phys_t *coalesce = &md->md_coalesce_block;
    boolean_t new;
    if (coalesce->rbp_size_count == 0) {
        new = B_TRUE;
        enqueue = B_FALSE;
    } else {
        uint64_t old_count = redact_block_get_count(coalesce);
        if (coalesce->rbp_object == object &&
            coalesce->rbp_blkid + old_count == blkid &&
            old_count + count <= REDACT_BLOCK_MAX_COUNT) {
            ASSERT3U(redact_block_get_size(coalesce), ==, blksz);
            redact_block_set_count(coalesce, old_count + count);
            new = B_FALSE;
            enqueue = B_FALSE;
        } else {
            new = B_TRUE;
            enqueue = B_TRUE;
        }
    }

    if (new) {
        cur = *coalesce;
        coalesce->rbp_blkid = blkid;
        coalesce->rbp_object = object;
        redact_block_set_count(coalesce, count);
        redact_block_set_size(coalesce, blksz);
    }

    if (enqueue && redact_block_get_size(&cur) != 0) {
        struct redact_block_list_node *rbln =
            kmem_alloc(sizeof (struct redact_block_list_node),
            KM_SLEEP);
        rbln->block = cur;
        list_insert_tail(&md->md_redact_block_pending, rbln);
    }

    if (gethrtime() >
        md->md_last_time + redaction_list_update_interval_ns) {
        commit_rl_updates(os, md, object, blkid);
    }
}

/*
 * This thread merges all the redaction records provided by the worker
 * threads, and determines which blocks are redacted by all the snapshots.
 * The algorithm for doing so is similar to performing a merge in mergesort
 * with n sub-lists instead of 2, with some added complexity because the
 * entries are ranges, not just single blocks.  This algorithm relies on the
 * queues being sorted, which is ensured by the fact that traverse_dataset
 * traverses the dataset in a consistent order.  We pull one entry off the
 * front of the queue of each dataset traversal thread.  Then we repeat the
 * following: each record represents a range of blocks modified by one of the
 * redaction snapshots, and each block in that range may need to be redacted
 * in the send stream.  Find the record with the latest start of its range,
 * and the record with the earliest end of its range.  If the last start is
 * before the first end, then we know that the blocks in the range
 * [last_start, first_end] are covered by all of the ranges at the front of
 * the queues, which means every thread redacts that whole range.  For
 * example, let's say the ranges on each queue look like this:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11
 * Thread 1 |    [====================]
 * Thread 2 |       [========]
 * Thread 3 |             [=================]
 *
 * Thread 3 has the last start (5), and Thread 2 has the first end (6).  All
 * three threads modified the range [5,6], so that data should not be sent
 * over the wire.  After we've determined whether or not to redact anything,
 * we take the record with the first end.  We discard that record, and pull a
 * new one off the front of the queue it came from.  In the above example, we
 * would discard Thread 2's record, and pull a new one.  Let's say the next
 * record we pulled from Thread 2 covered range [10,11].
 * The new layout would look like this:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11
 * Thread 1 |    [====================]
 * Thread 2 |                            [==]
 * Thread 3 |             [=================]
 *
 * When we compare the last start (10, from Thread 2) and the first end (9,
 * from Thread 1), we see that the last start is greater than the first end.
 * Therefore, we do not redact anything from these records.  We'll iterate by
 * replacing the record from Thread 1.
 *
 * We iterate by replacing the record with the lowest end because we know
 * that the record with the lowest end has helped us as much as it can.  All
 * the ranges before it that we will ever redact have been redacted.  In
 * addition, by replacing the one with the lowest end, we guarantee we catch
 * all ranges that need to be redacted.  For example, if in the case above we
 * had replaced the record from Thread 1 instead, we might have ended up with
 * the following:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11 12
 * Thread 1 |                                [==]
 * Thread 2 |       [========]
 * Thread 3 |             [=================]
 *
 * If the next record from Thread 2 had been [8,10], for example, we should
 * have redacted part of that range, but because we updated Thread 1's
 * record, we missed it.
 *
 * We implement this algorithm by using two trees.  The first sorts the
 * redaction records by their start_zb, and the second sorts them by their
 * end_zb.  We use these to find the record with the last start and the
 * record with the first end.  We create a record with that start and end,
 * and send it on.  The overall runtime of this implementation is
 * O(n log m), where n is the total number of redaction records from all the
 * different redaction snapshots, and m is the number of redaction snapshots.
 *
 * If we redact with respect to zero snapshots, we create a single redaction
 * record with the start object and blkid set to 0 and the end object and
 * blkid set to UINT64_MAX; this results in us redacting every block.  (The
 * one exception is object 0, the meta-dnode, which can never be redacted,
 * so the record actually starts at object 1.)
 */
static int
perform_thread_merge(bqueue_t *q, uint32_t num_threads,
    struct redact_thread_arg *thread_args, boolean_t *cancel)
{
    struct redact_node *redact_nodes = NULL;
    avl_tree_t start_tree, end_tree;
    struct redact_record *record;
    struct redact_record *current_record = NULL;
    int err = 0;
    struct merge_data md = { {0} };
    list_create(&md.md_redact_block_pending,
        sizeof (struct redact_block_list_node),
        offsetof(struct redact_block_list_node, node));

    /*
     * If we're redacting with respect to zero snapshots, then no data is
     * permitted to be sent.  We enqueue a record that redacts all blocks,
     * and an eos marker.
     */
    if (num_threads == 0) {
        record = kmem_zalloc(sizeof (struct redact_record),
            KM_SLEEP);
        /* We can't redact object 0, so don't try. */
record->start_object = 1; record->start_blkid = 0; record->end_object = record->end_blkid = UINT64_MAX; bqueue_enqueue(q, record, sizeof (*record)); return (0); } - if (num_threads > 0) { - redact_nodes = kmem_zalloc(num_threads * - sizeof (*redact_nodes), KM_SLEEP); - } + redact_nodes = kmem_zalloc(num_threads * + sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_start)); avl_create(&end_tree, redact_node_compare_end, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_end)); for (int i = 0; i < num_threads; i++) { struct redact_node *node = &redact_nodes[i]; struct redact_thread_arg *targ = &thread_args[i]; node->record = bqueue_dequeue(&targ->q); node->rt_arg = targ; node->thread_num = i; avl_add(&start_tree, node); avl_add(&end_tree, node); } /* * Once the first record in the end tree has returned EOS, every record * must be an EOS record, so we should stop. */ while (err == 0 && !((struct redact_node *)avl_first(&end_tree))-> record->eos_marker) { if (*cancel) { err = EINTR; break; } struct redact_node *last_start = avl_last(&start_tree); struct redact_node *first_end = avl_first(&end_tree); /* * If the last start record is before the first end record, * then we have blocks that are redacted by all threads. * Therefore, we should redact them. Copy the record, and send * it to the main thread. */ if (redact_record_before(last_start->record, first_end->record)) { record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); *record = *first_end->record; record->start_object = last_start->record->start_object; record->start_blkid = last_start->record->start_blkid; record_merge_enqueue(q, ¤t_record, record); } err = update_avl_trees(&start_tree, &end_tree, first_end); } /* * We're done; if we were cancelled, we need to cancel our workers and * clear out their queues. Either way, we need to remove every thread's * redact_node struct from the avl trees. */ for (int i = 0; i < num_threads; i++) { if (err != 0) { thread_args[i].cancel = B_TRUE; while (!redact_nodes[i].record->eos_marker) { (void) update_avl_trees(&start_tree, &end_tree, &redact_nodes[i]); } } avl_remove(&start_tree, &redact_nodes[i]); avl_remove(&end_tree, &redact_nodes[i]); kmem_free(redact_nodes[i].record, sizeof (struct redact_record)); bqueue_destroy(&thread_args[i].q); } avl_destroy(&start_tree); avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } struct redact_merge_thread_arg { bqueue_t q; spa_t *spa; int numsnaps; struct redact_thread_arg *thr_args; boolean_t cancel; int error_code; }; static __attribute__((noreturn)) void redact_merge_thread(void *arg) { struct redact_merge_thread_arg *rmta = arg; rmta->error_code = perform_thread_merge(&rmta->q, rmta->numsnaps, rmta->thr_args, &rmta->cancel); struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP); rec->eos_marker = B_TRUE; bqueue_enqueue_flush(&rmta->q, rec, 1); thread_exit(); } /* * Find the next object in or after the redaction range passed in, and hold * its dnode with the provided tag. Also update *object to contain the new * object number. 
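 *
 * Illustrative sketch (not part of this patch): the skip-scan shape of
 * the loop below, for a hypothetical object iterator.  We advance until
 * we find an object that is both at or past the range start and not
 * metadata:
 */
#if 0	/* exposition only; the ex_* names are made up */
static int
ex_next_in_range(ex_iter_t *it, uint64_t start, uint64_t *obj)
{
    int err;

    if (*obj < start)
        *obj = start - 1;   /* ex_iter_next() returns > *obj */
    while ((err = ex_iter_next(it, obj)) == 0) {
        if (*obj >= start && !ex_is_metadata(it, *obj))
            return (0);     /* caller holds and uses *obj */
    }
    return (err);           /* e.g. ESRCH: no more objects */
}
#endif
/*
 * The real helper also re-holds the dnode with the provided tag: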
*/ static int hold_next_object(objset_t *os, struct redact_record *rec, const void *tag, uint64_t *object, dnode_t **dn) { int err = 0; if (*dn != NULL) dnode_rele(*dn, tag); *dn = NULL; if (*object < rec->start_object) { *object = rec->start_object - 1; } err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) return (err); err = dnode_hold(os, *object, tag, dn); while (err == 0 && (*object < rec->start_object || DMU_OT_IS_METADATA((*dn)->dn_type))) { dnode_rele(*dn, tag); *dn = NULL; err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) break; err = dnode_hold(os, *object, tag, dn); } return (err); } static int perform_redaction(objset_t *os, redaction_list_t *rl, struct redact_merge_thread_arg *rmta) { int err = 0; bqueue_t *q = &rmta->q; struct redact_record *rec = NULL; struct merge_data md = { {0} }; list_create(&md.md_redact_block_pending, sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); md.md_redaction_list = rl; for (int i = 0; i < TXG_SIZE; i++) { list_create(&md.md_blocks[i], sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); } dnode_t *dn = NULL; uint64_t prev_obj = 0; for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0; rec = get_next_redact_record(q, rec)) { ASSERT3U(rec->start_object, !=, 0); uint64_t object; if (prev_obj != rec->start_object) { object = rec->start_object - 1; err = hold_next_object(os, rec, FTAG, &object, &dn); } else { object = prev_obj; } while (err == 0 && object <= rec->end_object) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = EINTR; break; } /* * Part of the current object is contained somewhere in * the range covered by rec. */ uint64_t startblkid; uint64_t endblkid; uint64_t maxblkid = dn->dn_phys->dn_maxblkid; if (rec->start_object < object) startblkid = 0; else if (rec->start_blkid > maxblkid) break; else startblkid = rec->start_blkid; if (rec->end_object > object || rec->end_blkid > maxblkid) { endblkid = maxblkid; } else { endblkid = rec->end_blkid; } update_redaction_list(&md, os, object, startblkid, endblkid, dn->dn_datablksz); if (object == rec->end_object) break; err = hold_next_object(os, rec, FTAG, &object, &dn); } if (err == ESRCH) err = 0; if (dn != NULL) prev_obj = object; } if (err == 0 && dn != NULL) dnode_rele(dn, FTAG); if (err == ESRCH) err = 0; rmta->cancel = B_TRUE; while (!rec->eos_marker) rec = get_next_redact_record(q, rec); kmem_free(rec, sizeof (*rec)); /* * There may be a block that's being coalesced, sync that out before we * return. */ if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) { struct redact_block_list_node *rbln = kmem_alloc(sizeof (struct redact_block_list_node), KM_SLEEP); rbln->block = md.md_coalesce_block; list_insert_tail(&md.md_redact_block_pending, rbln); } commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX); /* * Wait for all the redaction info to sync out before we return, so that * anyone who attempts to resume this redaction will have all the data * they need. 
*/ dsl_pool_t *dp = spa_get_dsl(os->os_spa); if (md.md_latest_synctask_txg != 0) txg_wait_synced(dp, md.md_latest_synctask_txg); for (int i = 0; i < TXG_SIZE; i++) list_destroy(&md.md_blocks[i]); return (err); } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } int dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, const char *redactbook) { int err = 0; dsl_pool_t *dp = NULL; dsl_dataset_t *ds = NULL; int numsnaps = 0; objset_t *os; struct redact_thread_arg *args = NULL; redaction_list_t *new_rl = NULL; char *newredactbook; if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) return (err); newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) != 0) { goto out; } dsl_dataset_long_hold(ds, FTAG); if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { err = EINVAL; goto out; } if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; goto out; } numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { pair = nvlist_next_nvpair(redactnvl, pair); const char *name = nvpair_name(pair); struct redact_thread_arg *rta = &args[i]; err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, FTAG, &rta->ds); if (err != 0) break; /* * We want to do the long hold before we can get any other * errors, because the cleanup code will release the long * hold if rta->ds is filled in. */ dsl_dataset_long_hold(rta->ds, FTAG); err = dmu_objset_from_ds(rta->ds, &rta->os); if (err != 0) break; if (!dsl_dataset_is_before(rta->ds, ds, 0)) { err = EINVAL; break; } if (dsl_dataset_feature_is_active(rta->ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; break; } } if (err != 0) goto out; VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); boolean_t resuming = B_FALSE; zfs_bookmark_phys_t bookmark; (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(newredactbook, '@'); ASSERT3P(c, !=, NULL); int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), "#%s", redactbook); if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { dsl_pool_rele(dp, FTAG); kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) kmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); if (err == 0) { resuming = B_TRUE; if (bookmark.zbm_redaction_obj == 0) { err = EEXIST; goto out; } err = dsl_redaction_list_hold_obj(dp, bookmark.zbm_redaction_obj, FTAG, &new_rl); if (err != 0) { err = EIO; goto out; } dsl_redaction_list_long_hold(dp, new_rl, FTAG); if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { err = ESRCH; goto out; } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, new_rl->rl_phys->rlp_num_snaps, dsl_dataset_phys(rta->ds)->ds_guid)) { err = ESRCH; goto out; } } if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && new_rl->rl_phys->rlp_last_object == UINT64_MAX) { err = EEXIST; goto out; } dsl_pool_rele(dp, FTAG); dp = NULL; } else { uint64_t *guids = NULL; if (numsnaps > 0) { guids = kmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { struct 
redact_thread_arg *rta = &args[i]; guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; } dsl_pool_rele(dp, FTAG); dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); kmem_free(guids, numsnaps * sizeof (uint64_t)); if (err != 0) { goto out; } } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; (void) bqueue_init(&rta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); if (resuming) { rta->resume.zb_blkid = new_rl->rl_phys->rlp_last_blkid; rta->resume.zb_object = new_rl->rl_phys->rlp_last_object; } rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; (void) thread_create(NULL, 0, redact_traverse_thread, rta, 0, curproc, TS_RUN, minclsyspri); } struct redact_merge_thread_arg *rmta; rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP); (void) bqueue_init(&rmta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); rmta->numsnaps = numsnaps; rmta->spa = os->os_spa; rmta->thr_args = args; (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc, TS_RUN, minclsyspri); err = perform_redaction(os, new_rl, rmta); bqueue_destroy(&rmta->q); kmem_free(rmta, sizeof (struct redact_merge_thread_arg)); out: kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (new_rl != NULL) { dsl_redaction_list_long_rele(new_rl, FTAG); dsl_redaction_list_rele(new_rl, FTAG); } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; /* * rta->ds may be NULL if we got an error while filling * it in. */ if (rta->ds != NULL) { dsl_dataset_long_rele(rta->ds, FTAG); dsl_dataset_rele_flags(rta->ds, DS_HOLD_FLAG_DECRYPT, FTAG); } } if (args != NULL) kmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { dsl_dataset_long_rele(ds, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } return (SET_ERROR(err)); } diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 763b0c920f11..f9daaabbed3e 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1,2258 +1,2258 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. 
 * The data will be stored in the bonus buffer of the dnode and, if
 * necessary, a special "spill" block will be used to handle overflow
 * situations.  The spill block will be sized to fit the data, from 512B to
 * 128K.  When a spill block is used the BP (blkptr_t) for the spill block is
 * stored at the end of the current bonus buffer.  Any attributes that would
 * be in the way of the blkptr_t will be relocated into the spill block.
 *
 * Attribute registration:
 *
 * Stored persistently on a per-dataset basis is a mapping between attribute
 * "string" names and their actual attribute numeric values, lengths, and
 * byteswap functions.  The names are only used during registration.  All
 * attributes are known by their unique attribute id value.  If an attribute
 * can have a variable size then the value 0 will be used to indicate this.
 *
 * Attribute Layout:
 *
 * Attribute layouts are a way to compactly store multiple attributes, but
 * without taking the overhead associated with managing each attribute
 * individually.  Since you will typically have the same set of attributes
 * stored in the same order, a single table will be used to represent that
 * layout.  The ZPL, for example, will usually have only about 10 different
 * layouts (regular files, device files, symlinks, regular files +
 * scanstamp, files/dirs with extended attributes, and then you have the
 * possibility of all of those minus ACL, because it would be kicked out
 * into the spill block).
 *
 * Layouts are simply an array of the attributes and their
 * ordering, e.g. [0, 1, 4, 5, 2].
 *
 * Each distinct layout is given a unique layout number and that is what's
 * stored in the header at the beginning of the SA data buffer.
 *
 * A layout only covers a single dbuf (bonus or spill).  If a set of
 * attributes is split up between the bonus buffer and a spill buffer then
 * two different layouts will be used.  This allows us to byteswap the spill
 * without looking at the bonus buffer and keeps the on-disk format of the
 * bonus and spill buffer the same.
 *
 * Adding a single attribute will cause the entire set of attributes to be
 * rewritten and could result in a new layout number being constructed as
 * part of the rewrite if no such layout exists for the new set of
 * attributes.  The new attribute will be appended to the end of the already
 * existing attributes.
 *
 * Both the attribute registration and attribute layout information are
 * stored in normal ZAP attributes.  There should be a small number of known
 * layouts, and the set of attributes is assumed to typically be quite
 * small.
 *
 * The registered attributes and layout "table" information is maintained in
 * core and a special "sa_os_t" is attached to the objset_t.
 *
 * A special interface is provided to allow for quickly applying a large set
 * of attributes at once.  sa_replace_all_by_template() is used to set an
 * array of attributes.  This is used by the ZPL when creating a brand new
 * file.  The template that is passed into the function specifies the
 * attribute, size for variable length attributes, location of data, and a
 * special "data locator" function if the data isn't in a contiguous
 * location.
 *
 * Byteswap implications:
 *
 * Since the SA attributes are not entirely self describing we can't do the
 * normal byteswap processing.  The special ZAP layout attribute and
 * attribute registration attributes define the byteswap function and the
 * size of the attributes, unless they are variable sized.
 * The normal ZFS byteswapping infrastructure assumes you don't need to read
 * any objects in order to do the necessary byteswapping.
Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); static arc_byteswap_func_t sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS #define SA_COPY_DATA(f, s, t, l) \ do { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ memcpy(t, s, l); \ } \ } else { \ sa_copy_data(f, s, t, l); \ } \ } while (0) #else #define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l) #endif /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ static const sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * This is only used for objects of type DMU_OT_ZNODE */ static const sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. 
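 *
 * Illustrative sketch (not part of this patch): how a layout's ordering
 * becomes byte offsets.  Offsets are the 8-byte-aligned running sum of
 * the attribute lengths, so under the legacy layout above (ATIME = 16,
 * MTIME = 16, CTIME = 16, CRTIME = 16, GEN = 8, MODE = 8, ...) ATIME is
 * at offset 0, MTIME at 16, CTIME at 32, CRTIME at 48, GEN at 64, MODE
 * at 72, and so on.
 */
#if 0	/* exposition only; ex_layout_offsets is a made-up name */
static void
ex_layout_offsets(const uint16_t *lengths, int count, uint32_t *offsets)
{
    uint32_t off = 0;

    for (int i = 0; i < count; i++) {
        offsets[i] = off;
        off = P2ROUNDUP(off + lengths[i], 8);
    }
}
#endif
/*
 * The dummy layout itself: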
*/ static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; static int sa_cache_constructor(void *buf, void *unused, int kmflag) { (void) unused, (void) kmflag; sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void sa_cache_destructor(void *buf, void *unused) { (void) unused; sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } static boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count) { uint64_t crc = -1ULL; for (int i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ static int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? 
EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (SET_ERROR(ENOENT)); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; default: break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { memcpy(target, datastart, 
buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); memcpy(saptr, dataptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different values pertaining to system attribute * buffers. * * Return the size of the sa_hdr_phys_t header for the buffer. Each * variable length attribute except the first contributes two bytes to * the header size, which is then rounded up to an 8-byte boundary. * * The following output parameters are also computed. * * index - The index of the first attribute in attr_desc that will * spill over. Only valid if will_spill is set. * * total - The total number of bytes of all system attributes described * in attr_desc. * * will_spill - Set when spilling is necessary. It is only set when * the buftype is SA_BONUS. */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, int *total, boolean_t *will_spill) { int var_size_count = 0; int i; int hdrsize; int extra_hdrsize; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; *will_spill = B_FALSE; extra_hdrsize = 0; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz, might_spill_here; int tmp_hdrsize; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (*will_spill) continue; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) var_size_count++; /* * Calculate what the SA header size would be if this * attribute doesn't spill. */ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ? sizeof (uint16_t) : 0); /* * Check whether this attribute spans into the space * that would be used by the spill block pointer should * a spill block be needed. */ might_spill_here = buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(tmp_hdrsize, 8)) > (full_space - sizeof (blkptr_t)); if (is_var_sz && var_size_count > 1) { if (buftype == SA_SPILL || tmp_hdrsize + *total < full_space) { /* * Record the extra header size in case this * increase needs to be reversed due to * spill-over. */ hdrsize = tmp_hdrsize; if (*index != -1 || might_spill_here) extra_hdrsize += sizeof (uint16_t); } else { ASSERT(buftype == SA_BONUS); if (*index == -1) *index = i; *will_spill = B_TRUE; continue; } } /* * Store index of where spill *could* occur. Then * continue to count the remaining attribute sizes. The * sum is used later for sizing bonus and spill buffer. */ if (might_spill_here) *index = i; if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } if (*will_spill) hdrsize -= extra_hdrsize; hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. 
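 *
 * Illustrative sketch (not part of this patch): the header sizing rule
 * sa_find_sizes() computes above.  The base sa_hdr_phys_t is 8 bytes and
 * already includes one length slot, so each variable-length attribute
 * after the first adds two bytes, and the result is rounded up to an
 * 8-byte boundary (0 or 1 variable-length attrs -> 8 bytes, 2 through 5
 * -> 16 bytes, and so on):
 */
#if 0	/* exposition only; ex_sa_hdrsize is a made-up name */
static int
ex_sa_hdrsize(int var_count)
{
    int hdr = 8;    /* sizeof (sa_hdr_phys_t), one length included */

    if (var_count > 1)
        hdr += 2 * (var_count - 1);
    return (P2ROUNDUP(hdr, 8));
}
#endif
/*
 * sa_build_layouts lays the attributes down using those sizes: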
*/ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; int dnodesize; int spill_idx; int hdrsize; int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); bonuslen = DN_BONUS_SIZE(dnodesize); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, bonuslen, &spill_idx, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx], attr_count - spill_idx, hdl->sa_spill, SA_SPILL, hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; if (spilling && i == spill_idx) { /* switch to spill buffer */ VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + length), 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? 
hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = SET_ERROR(ENOENT); switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. 
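 *
 * Each value in the registry ZAP packs an attribute's number, length,
 * and byteswap-function index into a single integer; the ATTR_NUM(),
 * ATTR_LENGTH() and ATTR_BSWAP() macros used below unpack it.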
*/ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); ASSERT(error != 0); return (error); } int sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t 
*lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY0(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { sa_idx_tab_t *tab; while ((tab = list_head(&layout->lot_idx_tab))) { ASSERT(zfs_refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } static void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type)); if (reg_length) { 
attr_length = reg_length; idx_len = 0; } else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { (void) hdr, (void) length_idx, (void) variable_length; sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } static void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; int num_lengths = 1; int i; sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lengths in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? 
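 * A native header reads back SA_MAGIC; one written by a system of the
 * opposite endianness reads back BSWAP_32(SA_MAGIC); and an old-style
 * znode has no SA header at all, so its magic field reads as 0.  Any
 * other value means the buffer is damaged and we return EIO below.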
*/ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) { mutex_exit(&sa->sa_lock); zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x " "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC, (u_longlong_t)db->db.db_object); return (SET_ERROR(EIO)); } sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } static void sa_evict_sync(void *dbu) { (void) dbu; panic("evicting sa dbuf\n"); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); zfs_refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa __maybe_unused = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); } void sa_spill_rele(sa_handle_t *hdl) { mutex_enter(&hdl->sa_lock); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; } mutex_exit(&hdl->sa_lock); } void sa_handle_destroy(sa_handle_t *hdl) { dmu_buf_t *db = hdl->sa_bonus; mutex_enter(&hdl->sa_lock); (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); if (hdl->sa_spill_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele(hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ if (hdl_type == SA_HDL_SHARED) handle = dmu_buf_get_user(db); if (handle == NULL) { sa_handle_t *winner = NULL; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_dbu.dbu_evict_func_sync = NULL; handle->sa_dbu.dbu_evict_func_async = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; handle->sa_bonus_tab = NULL; handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); if (hdl_type == SA_HDL_SHARED) { dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, NULL); winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); } if (winner != NULL) { kmem_cache_free(sa_cache, handle); handle = winner; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if ((error = dmu_bonus_hold(objset, objid, NULL, &db))) return (error); return (sa_handle_get_from_db(objset, db, 
userp, hdl_type, handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, const void *tag) { dmu_buf_rele(db, tag); } static int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } static int sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); error = sa_lookup_impl(hdl, &bulk, 1); return (error); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; mutex_enter(&hdl->sa_lock); error = sa_lookup_locked(hdl, attr, buf, buflen); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, zfs_uio_resid(uio)), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } /* * For an existing object upgraded from an old system, the on-disk layout * has no slot for the project ID attribute. But the quota accounting logic * needs to access related slots by offset directly, so we adjust these old * objects' layouts to move the project ID to a unified, fixed offset. */ int sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) { znode_t *zp = sa_get_userdata(hdl); dmu_buf_t *db = sa_get_db(hdl); zfsvfs_t *zfsvfs = ZTOZSB(zp); int count = 0, err = 0; sa_bulk_attr_t *bulk, *attrs; zfs_acl_locator_cb_t locate = { 0 }; uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links; uint64_t crtime[2], mtime[2], ctime[2], atime[2]; zfs_acl_phys_t znode_acl = { 0 }; char scanstamp[AV_SCANSTAMP_SZ]; if (zp->z_acl_cached == NULL) { zfs_acl_t *aclp; mutex_enter(&zp->z_acl_lock); err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); mutex_exit(&zp->z_acl_lock); if (err != 0 && err != ENOENT) return (err); } bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); mutex_enter(&hdl->sa_lock); mutex_enter(&zp->z_lock); err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid, sizeof (uint64_t)); if (unlikely(err == 0)) /* Someone else added the project ID attribute in a race. 
*/ err = EEXIST; if (err != ENOENT) goto out; /* First do a bulk query of the attributes that aren't cached */ if (zp->z_is_sa) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &znode_acl, 88); } err = sa_bulk_lookup_locked(hdl, bulk, count); if (err != 0) goto out; err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8); if (err != 0 && err != ENOENT) goto out; zp->z_projid = projid; zp->z_pflags |= ZFS_PROJID; links = ZTONLNK(zp); count = 0; err = 0; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if (zp->z_acl_cached != NULL) { SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &zp->z_acl_cached->z_acl_count, 8); if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) zfs_acl_xform(zp, zp->z_acl_cached, CRED()); locate.cb_aclp = zp->z_acl_cached; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); } if (xattr) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { 
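/* * The old-style znode kept the antivirus scanstamp in the bonus buffer * just past znode_phys_t; migrate it to a real SA attribute and clear * the flag so the migration only happens once. */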
memcpy(scanstamp, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; } VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0); if (znode_acl.z_acl_extern_obj) { VERIFY(0 == dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); } zp->z_is_sa = B_TRUE; out: mutex_exit(&zp->z_lock); mutex_exit(&hdl->sa_lock); kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END); kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); return (err); } #endif static sa_idx_tab_t * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) { sa_idx_tab_t *idx_tab; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Determine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or one with crypto enabled, which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT((IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused. */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; zfs_refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE;
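/* * Every entry in the attribute table is now persisted in the registry * ZAP, so later syncs are no-ops until a new attribute is registered. */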
mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will * also be replaced, possibly with just an empty spill block. * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting an old-format znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; int bonus_data_size = 0; int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length, reg_length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make a copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); memcpy(old_data[0], hdl->sa_bonus->db_data, hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); memcpy(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * Loop through the bonus buffer, and the spill buffer if one exists, * and build up a new attr descriptor to reset the attributes. */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { + for (; ; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite * from sa_lengths[]. 
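Pass k == 0 walks the bonus * buffer; if a spill buffer exists, the bottom of the loop retargets * hdr, idx_tab and count at it for pass k == 1, after which the loop * breaks, so no explicit bound on k is needed. 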
*/ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; reg_length = SA_REGISTERED_LEN(sa, attr); if (reg_length == 0) { length = hdr->sa_lengths[length_idx]; length_idx++; } else { length = reg_length; } if (attr == newattr) { /* * There is nothing to do for SA_REMOVE, * so it is just skipped. */ if (action == SA_REMOVE) continue; /* * Duplicate attributes are not allowed, so the * action can not be SA_ADD here. */ ASSERT3S(action, ==, SA_REPLACE); /* * Only a variable-sized attribute can be * replaced here, and its size must be changing. */ ASSERT3U(reg_length, ==, 0); ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } else { SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { reg_length = SA_REGISTERED_LEN(sa, newattr); IMPLY(reg_length != 0, reg_length == buflen); SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) vmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; dmu_buf_t *saved_spill; ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); saved_spill = hdl->sa_spill; /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); /* * If saved_spill is NULL and current sa_spill is not NULL that * means we increased the refcount of the spill buffer through * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we * must release the hold before calling dmu_tx_commit() to avoid * making a copy of this buffer in dbuf_sync_leaf() due to the * reference count now being greater than 1. 
*/ if (!saved_spill && hdl->sa_spill) { if (hdl->sa_spill_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); hdl->sa_spill_tab = NULL; } dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; } return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db(hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db(hdl->sa_bonus, blksize, nblocks); } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return (hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } #ifdef _KERNEL EXPORT_SYMBOL(sa_handle_get); EXPORT_SYMBOL(sa_handle_get_from_db); EXPORT_SYMBOL(sa_handle_destroy); EXPORT_SYMBOL(sa_buf_hold); EXPORT_SYMBOL(sa_buf_rele); EXPORT_SYMBOL(sa_spill_rele); EXPORT_SYMBOL(sa_lookup); EXPORT_SYMBOL(sa_update); EXPORT_SYMBOL(sa_remove); 
EXPORT_SYMBOL(sa_bulk_lookup); EXPORT_SYMBOL(sa_bulk_lookup_locked); EXPORT_SYMBOL(sa_bulk_update); EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); EXPORT_SYMBOL(sa_handle_object); EXPORT_SYMBOL(sa_register_update_callback); EXPORT_SYMBOL(sa_setup); EXPORT_SYMBOL(sa_replace_all_by_template); EXPORT_SYMBOL(sa_replace_all_by_template_locked); EXPORT_SYMBOL(sa_enabled); EXPORT_SYMBOL(sa_cache_init); EXPORT_SYMBOL(sa_cache_fini); EXPORT_SYMBOL(sa_set_sa_object); EXPORT_SYMBOL(sa_hdrsize); EXPORT_SYMBOL(sa_handle_lock); EXPORT_SYMBOL(sa_handle_unlock); EXPORT_SYMBOL(sa_lookup_uio); EXPORT_SYMBOL(sa_add_projid); #endif /* _KERNEL */ diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index 2e8489c7dfcf..e6afb1c58c95 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -1,848 +1,848 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. 
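* Each chunk is one of: a zap_leaf_entry (an entry header), a piece of a * name or value byte stream (zap_leaf_array, chained through la_next), or * a free-list element (zap_leaf_free); chunk chains end in CHAIN_END. 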
*/ #include #include #include #include #include #include #include #include #include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ ((h) >> \ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) static void zap_memset(void *a, int c, size_t n) { char *cp = a; char *cpend = cp + n; while (cp < cpend) *cp++ = c; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; default: cmn_err(CE_PANIC, "bad int len %d", len); } } static uint64_t ldv(int len, const void *addr) { switch (len) { case 1: return (*(uint8_t *)addr); case 2: return (*(uint16_t *)addr); case 4: return (*(uint32_t *)addr); case 8: return (*(uint64_t *)addr); default: cmn_err(CE_PANIC, "bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { zap_leaf_t l; dmu_buf_t l_dbuf; l_dbuf.db_data = buf; l.l_bs = highbit64(size) - 1; l.l_dbuf = &l_dbuf; buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; switch (lc->l_free.lf_type) { case ZAP_CHUNK_ENTRY: le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; case ZAP_CHUNK_FREE: lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); break; case ZAP_CHUNK_ARRAY: lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); /* la_array doesn't need swapping */ break; default: cmn_err(CE_PANIC, "bad leaf type %d", lc->l_free.lf_type); } } } void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; zap_memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* * Routines which manipulate leaf chunks (l_chunk[]). 
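Free chunks form a singly-linked list * threaded through lf_next, rooted at lh_freelist and terminated by * CHAIN_END: allocation pops the head, freeing pushes the chunk back on. * An illustrative round trip (a sketch, not part of this change), assuming * 'l' is a held, dirtied leaf: * *	uint16_t c = zap_leaf_chunk_alloc(l); *	ZAP_LEAF_CHUNK(l, c).l_array.la_type = ZAP_CHUNK_ARRAY; *	... fill ZAP_LEAF_CHUNK(l, c).l_array.la_array ... *	zap_leaf_chunk_free(l, c); 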
*/ static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); zap_leaf_phys(l)->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; zap_leaf_phys(l)->l_hdr.lh_nfree--; return (chunk); } static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; zap_leaf_phys(l)->l_hdr.lh_nfree++; } /* * Routines which manipulate leaf arrays (zap_leaf_array type chunks). */ static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; if (++byten == integer_size) { byten = 0; buf += integer_size; if (--len == 0) break; } } *chunkp = chunk; chunkp = &la->la_next; } *chunkp = CHAIN_END; return (chunk_head); } static void zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) { uint16_t chunk = *chunkp; *chunkp = CHAIN_END; while (chunk != CHAIN_END) { int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); chunk = nextchunk; } } /* array_len and buf_len are in integers, not bytes */ static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); /* Fast path for one 8-byte integer */ if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; return; } /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ if (array_int_len == 1 && buf_int_len == 1 && buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; p += buf_int_len; } } chunk = la->la_next; } } static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_numints) { int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { uint64_t *thiskey = kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, thiskey); boolean_t match = memcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); } ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype & MT_NORMALIZE) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); boolean_t match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } /* * Fast path for exact matching. * First check that the lengths match, so that we don't read * past the end of the zn_key_orig array. */ if (array_numints != zn->zn_key_orig_numints) return (B_FALSE); while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } return (bseen == array_numints); } /* * Routines which manipulate leaf entries. */ int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_hash != zn->zn_hash) continue; /* * NB: the entry chain is always sorted by cd on * normalized zap objects, so this will find the * lowest-cd match for MT_NORMALIZE. */ ASSERT((zn->zn_matchtype == 0) || (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; zeh->zeh_leaf = l; return (0); } } return (SET_ERROR(ENOENT)); } /* Return (h1,cd1 >= h2,cd2) */ #define HCD_GTEQ(h1, cd1, h2, cd2) \ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? 
TRUE : FALSE)) int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { ASSERT3U(bestlh, >=, lh); bestlh = lh; besth = le->le_hash; bestcd = le->le_cd; zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; zeh->zeh_chunkp = &zeh->zeh_fakechunk; zeh->zeh_leaf = l; } } } return (bestcd == -1U ? SET_ERROR(ENOENT) : 0); } int zap_entry_read(const zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, void *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_value_intlen > integer_size) return (SET_ERROR(EINVAL)); zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_value_intlen, le->le_value_numints, integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, le->le_name_numints, 8, buflen / 8, buf); } else { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, le->le_name_numints, 1, buflen, buf); } if (le->le_name_numints > buflen) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) return (SET_ERROR(EAGAIN)); zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; return (0); } void zap_entry_remove(zap_entry_handle_t *zeh) { zap_leaf_t *l = zeh->zeh_leaf; ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); uint16_t entry_chunk = *zeh->zeh_chunkp; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); zap_leaf_array_free(l, &le->le_value_chunk); *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); zap_leaf_phys(l)->l_hdr.lh_nentries--; } int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; struct zap_leaf_entry *le; uint64_t h = zn->zn_hash; uint64_t valuelen = integer_size * num_integers; int numchunks = 1 + 
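/* one chunk for the entry itself, plus its name and value arrays */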
ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_cd > cd) break; if (le->le_hash == h) { ASSERT3U(cd, ==, le->le_cd); cd++; } } } else { /* old unsorted format; do it the O(n^2) way */ for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_hash == h && le->le_cd == cd) { break; } } /* If this cd is not in use, we are good. */ if (chunk == CHAIN_END) break; } } /* * We would run out of space in a block before we could * store enough entries to run out of CD values. */ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) return (SET_ERROR(EAGAIN)); /* make the entry */ chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, zn->zn_key_intlen, zn->zn_key_orig_numints); le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; return (0); } /* * Determine if there is another entry with the same normalized form. * For performance purposes, either zn or name must be provided (the * other can be NULL). Note, there usually won't be any hash * conflicts, in which case we don't need the concatenated/normalized * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). */ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) continue; if (le->le_cd == zeh->zeh_cd) continue; if (zn == NULL) { zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for transferring entries between leafs. */ static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; /* * keep the entry chain sorted by cd * NB: this will not cause problems for unsorted leafs, though * it is unnecessary there. 
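Insertion walks the bucket's chain and * splices the entry in front of the first element with a larger cd. 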
*/ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); *chunkp != CHAIN_END; chunkp = &le2->le_next) { le2 = ZAP_LEAF_ENTRY(l, *chunkp); if (le2->le_cd > le->le_cd) break; } le->le_next = *chunkp; *chunkp = entry; return (chunkp); } static uint16_t zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) { uint16_t new_chunk; uint16_t *nchunkp = &new_chunk; while (chunk != CHAIN_END) { uint16_t nchunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_array *nla = &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); *nla = *la; /* structure assignment */ zap_leaf_chunk_free(l, chunk); chunk = nextchunk; *nchunkp = nchunk; nchunkp = &nla->la_next; } *nchunkp = CHAIN_END; return (new_chunk); } static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); uint16_t chunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = zap_leaf_transfer_array(l, le->le_value_chunk, nl); zap_leaf_chunk_free(l, entry); zap_leaf_phys(l)->l_hdr.lh_nentries--; zap_leaf_phys(nl)->l_hdr.lh_nentries++; } /* * Transfer the entries whose hash prefix ends in 1 to the new leaf. */ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; zap_leaf_phys(l)->l_hdr.lh_prefix_len++; zap_leaf_phys(nl)->l_hdr.lh_prefix = zap_leaf_phys(l)->l_hdr.lh_prefix | 1; zap_leaf_phys(nl)->l_hdr.lh_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries * * NB: We could find entries via the hashtable instead. That * would be O(hashents+numents) rather than O(numblks+numents), * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
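'bit' is the hash bit that now * distinguishes the two leaves: entries with it set belong in nl, the rest * stay here and are rehashed into the freshly cleared hash table. 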
*/ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else (void) zap_leaf_rehash_entry(l, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<<FZAP_BLOCK_SHIFT(zap)) - zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<<FZAP_BLOCK_SHIFT(zap)); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; chunk = le->le_next; nentries++; } n = nentries; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_buckets_with_n_entries[n]++; } }