diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 1563f5d2792c..6c009bdc1235 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -1,565 +1,566 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  *
  * Copyright (c) 2016, Intel Corporation.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
  */
 
 /*
  * The ZFS retire agent is responsible for managing hot spares across all pools.
  * When we see a device fault or a device removal, we try to open the associated
  * pool and look for any hot spares.  We iterate over any available hot spares
  * and attempt a 'zpool replace' for each one.
  *
  * For vdevs diagnosed as faulty, the agent is also responsible for proactively
  * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
  */
 
 #include <sys/fs/zfs.h>
 #include <sys/fm/protocol.h>
 #include <sys/fm/fs/zfs.h>
 #include <libzutil.h>
 #include <libzfs.h>
 #include <string.h>
+#include <libgen.h>
 
 #include "zfs_agents.h"
 #include "fmd_api.h"
 
 
 /*
  * One entry in the singly linked list of (pool, vdev) GUID pairs recorded
  * by zfs_vdev_repair().
  */
 typedef struct zfs_retire_repaired {
 	struct zfs_retire_repaired	*zrr_next;	/* next list entry */
 	uint64_t			zrr_pool;	/* pool GUID */
 	uint64_t			zrr_vdev;	/* vdev GUID */
 } zfs_retire_repaired_t;
 
 /*
  * Per-module state, attached to the fmd handle in _zfs_retire_init().
  */
 typedef struct zfs_retire_data {
 	libzfs_handle_t			*zrd_hdl;	/* libzfs handle */
 	zfs_retire_repaired_t		*zrd_repaired;	/* repaired list */
 } zfs_retire_data_t;
 
 /*
  * Release every entry on the repaired list and leave the list empty.
  */
 static void
 zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
 {
 	zfs_retire_repaired_t *cur, *next;
 
 	for (cur = zdp->zrd_repaired; cur != NULL; cur = next) {
 		next = cur->zrr_next;
 		fmd_hdl_free(hdl, cur, sizeof (*cur));
 	}
 	zdp->zrd_repaired = NULL;
 }
 
 /*
  * Find a pool with a matching GUID.
  *
  * Callback state shared between find_by_guid() and find_pool().
  */
 typedef struct find_cbdata {
 	uint64_t	cb_guid;	/* pool GUID being searched for */
 	zpool_handle_t	*cb_zhp;	/* set by find_pool() on a match */
 	nvlist_t	*cb_vdev;	/* unused in the visible code */
 } find_cbdata_t;
 
 /*
  * Pool iteration callback used by find_by_guid().  When the pool's GUID
  * equals cb_guid the handle is stored in cb_zhp and 1 is returned;
  * otherwise the handle is closed and 0 is returned.
  */
 static int
 find_pool(zpool_handle_t *zhp, void *data)
 {
 	find_cbdata_t *cbp = data;
 	uint64_t guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
 
 	if (guid != cbp->cb_guid) {
 		zpool_close(zhp);
 		return (0);
 	}
 
 	cbp->cb_zhp = zhp;
 	return (1);
 }
 
 /*
  * Find a vdev within a tree with a matching GUID.
  *
  * Checks 'nv' itself and then recurses into its children, l2cache devices
  * and spares.  Each of the three arrays is optional and is searched
  * independently, so a configuration without l2cache devices does not hide
  * its spares.  Returns the matching nvlist, or NULL if no vdev in the tree
  * has 'search_guid'.  'zhdl' is only passed through to the recursive calls.
  */
 static nvlist_t *
 find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
 {
 	static const char *const lists[] = {
 		ZPOOL_CONFIG_CHILDREN,
 		ZPOOL_CONFIG_L2CACHE,
 		ZPOOL_CONFIG_SPARES
 	};
 	uint64_t guid;
 	nvlist_t **child;
 	uint_t c, children, l;
 	nvlist_t *ret;
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == search_guid) {
 		/* uint64_t is not 'unsigned long long' on every platform */
 		fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
 		    "matched vdev %llu", (unsigned long long)guid);
 		return (nv);
 	}
 
 	for (l = 0; l < sizeof (lists) / sizeof (lists[0]); l++) {
 		/* A missing array only means there is nothing to search. */
 		if (nvlist_lookup_nvlist_array(nv, lists[l],
 		    &child, &children) != 0)
 			continue;
 
 		for (c = 0; c < children; c++) {
 			ret = find_vdev(zhdl, child[c], search_guid);
 			if (ret != NULL)
 				return (ret);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
  *
  * Returns an open pool handle (callers in this file zpool_close() it), or
  * NULL when the pool, its vdev tree, or the requested vdev cannot be found.
  * A 'vdev_guid' of 0 skips the vdev lookup and leaves '*vdevp' untouched.
  */
 static zpool_handle_t *
 find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
     nvlist_t **vdevp)
 {
 	find_cbdata_t search;
 	zpool_handle_t *pool;
 	nvlist_t *tree;
 
 	/* Locate the pool first; find_pool() hands back the open handle. */
 	search.cb_guid = pool_guid;
 	if (zpool_iter(zhdl, find_pool, &search) != 1)
 		return (NULL);
 	pool = search.cb_zhp;
 
 	if (nvlist_lookup_nvlist(zpool_get_config(pool, NULL),
 	    ZPOOL_CONFIG_VDEV_TREE, &tree) != 0)
 		goto fail;
 
 	/* Make sure the vdev still exists, unless none was requested. */
 	if (vdev_guid != 0 &&
 	    (*vdevp = find_vdev(zhdl, tree, vdev_guid)) == NULL)
 		goto fail;
 
 	return (pool);
 
 fail:
 	zpool_close(pool);
 	return (NULL);
 }
 
 /*
  * Try to attach one of the pool's hot spares in place of 'vdev'.
  *
  * Spares are tried in configuration order until an attach succeeds or the
  * list is exhausted.  Returns B_TRUE when a spare was attached, B_FALSE
  * otherwise (including when the pool has no vdev tree or no spares).
  */
 static boolean_t
 replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 {
 	nvlist_t *pool_config, *root, *request;
 	nvlist_t **spare_list;
 	uint_t idx, spare_count;
 	char *target;
 	zprop_source_t ashift_src;
 	int ashift;
 	boolean_t replaced = B_FALSE;
 
 	pool_config = zpool_get_config(zhp, NULL);
 	if (nvlist_lookup_nvlist(pool_config, ZPOOL_CONFIG_VDEV_TREE,
 	    &root) != 0)
 		return (B_FALSE);
 
 	/* Nothing to do unless the pool has hot spares configured. */
 	if (nvlist_lookup_nvlist_array(root, ZPOOL_CONFIG_SPARES,
 	    &spare_list, &spare_count) != 0)
 		return (B_FALSE);
 
 	/* The pool's "ashift" property may be needed for the replacement. */
 	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &ashift_src);
 
 	request = fmd_nvl_alloc(hdl, FMD_SLEEP);
 	(void) nvlist_add_string(request, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
 
 	target = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
 
 	/* Walk the spares, stopping after the first successful attach. */
 	for (idx = 0; idx < spare_count && !replaced; idx++) {
 		nvlist_t *spare = spare_list[idx];
 		char *spare_path, *spare_type;
 		boolean_t rebuild = B_FALSE;
 
 		/* A spare without a path cannot be named in the attach. */
 		if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH,
 		    &spare_path) != 0)
 			continue;
 
 		/* prefer sequential resilvering for distributed spares */
 		if (nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE,
 		    &spare_type) == 0 &&
 		    strcmp(spare_type, VDEV_TYPE_DRAID_SPARE) == 0)
 			rebuild = B_TRUE;
 
 		/* if set, add the "ashift" pool property to the spare nvlist */
 		if (ashift_src != ZPROP_SRC_DEFAULT)
 			(void) nvlist_add_uint64(spare,
 			    ZPOOL_CONFIG_ASHIFT, ashift);
 
 		(void) nvlist_add_nvlist_array(request,
 		    ZPOOL_CONFIG_CHILDREN, &spare_list[idx], 1);
 
 		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
 		    target, zfs_basename(spare_path));
 
 		if (zpool_vdev_attach(zhp, target, spare_path,
 		    request, B_TRUE, rebuild) == 0)
 			replaced = B_TRUE;
 	}
 
 	free(target);
 	nvlist_free(request);
 
 	return (replaced);
 }
 
 /*
  * Record the (pool, vdev) GUID pair carried by 'nvl' on the repaired list.
  * Called when ZFS reports a vdev as healthy again or as removed.  Events
  * without both GUIDs, and pairs that are already recorded, are ignored.
  */
 /*ARGSUSED*/
 static void
 zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
 {
 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
 	zfs_retire_repaired_t *entry;
 	uint64_t pool_guid, vdev_guid;
 
 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
 	    &pool_guid) != 0)
 		return;
 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 	    &vdev_guid) != 0)
 		return;
 
 	/*
 	 * Skip ASRUs we have already tried to repair.  The list is cleared
 	 * whenever a list event is received; its purpose is to prevent a
 	 * feedback loop when repairs are attempted against a faulted pool.
 	 * Checking whether the ASRU is unusable can involve opening the
 	 * pool, which may post statechange events while leaving the pool
 	 * faulted, so the list lets us recognize statechange events that
 	 * were caused by our own request.
 	 */
 	for (entry = zdp->zrd_repaired; entry != NULL;
 	    entry = entry->zrr_next) {
 		if (entry->zrr_pool == pool_guid &&
 		    entry->zrr_vdev == vdev_guid)
 			return;
 	}
 
 	/* Not seen before: push a new record onto the head of the list. */
 	entry = fmd_hdl_alloc(hdl, sizeof (*entry), FMD_SLEEP);
 	entry->zrr_pool = pool_guid;
 	entry->zrr_vdev = vdev_guid;
 	entry->zrr_next = zdp->zrd_repaired;
 	zdp->zrd_repaired = entry;
 
 	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
 	    vdev_guid, pool_guid);
 }
 
 /*
  * fmd receive callback (installed as fmdo_recv in fmd_ops).
  *
  * Handles, in order:
  *  - "resource.fs.zfs.removed", or "resource.fs.zfs.statechange" with a
  *    REMOVED or FAULTED state: offline an l2cache device, otherwise try to
  *    attach a hot spare when the "spare_on_remove" property is set.
  *  - FM_LIST_RESOLVED_CLASS: ignored.
  *  - "resource.fs.zfs.statechange" with a HEALTHY state, or
  *    "sysevent.fs.zfs.vdev_remove": record the vdev via zfs_vdev_repair().
  *  - anything else: the repaired list is cleared and the event is treated
  *    as a suspect list; each ZFS fault is faulted, degraded or (for repair
  *    lists) cleared, and a hot spare is substituted for non-repair faults.
  */
 /*ARGSUSED*/
 static void
 zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
     const char *class)
 {
 	uint64_t pool_guid, vdev_guid;
 	zpool_handle_t *zhp;
 	nvlist_t *resource, *fault;
 	nvlist_t **faults;
 	uint_t f, nfaults;
 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
 	libzfs_handle_t *zhdl = zdp->zrd_hdl;
 	boolean_t fault_device, degrade_device;
 	boolean_t is_repair;
 	char *scheme;
 	nvlist_t *vdev = NULL;
 	char *uuid;
 	int repair_done = 0;
 	boolean_t retire;
 	boolean_t is_disk;
 	vdev_aux_t aux;
 	uint64_t state = 0;
 
 	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
 
 	/* 'state' keeps its initial 0 if the payload carries no vdev state. */
 	nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
 
 	/*
 	 * If this is a resource notifying us of device removal then simply
 	 * check for an available spare and continue unless the device is a
 	 * l2arc vdev, in which case we just offline it.
 	 */
 	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
 	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
 	    (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) {
 		char *devtype;
 		char *devname;
 
 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
 		    &pool_guid) != 0 ||
 		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 		    &vdev_guid) != 0)
 			return;
 
 		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
 		    &vdev)) == NULL)
 			return;
 
 		devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
 
 		/* Can't replace l2arc with a spare: offline the device */
 		if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 		    &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
 			fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
 			zpool_vdev_offline(zhp, devname, B_TRUE);
 		} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
 		    replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
 			/* Could not handle with spare */
 			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
 		}
 
 		free(devname);
 		zpool_close(zhp);
 		return;
 	}
 
 	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
 		return;
 
 	/*
 	 * Note: on Linux statechange events are more than just
 	 * healthy ones so we need to confirm the actual state value.
 	 */
 	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
 	    state == VDEV_STATE_HEALTHY) {
 		zfs_vdev_repair(hdl, nvl);
 		return;
 	}
 	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
 		zfs_vdev_repair(hdl, nvl);
 		return;
 	}
 
 	/* Any other event resets the record of already-repaired vdevs. */
 	zfs_retire_clear_data(hdl, zdp);
 
 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
 		is_repair = B_TRUE;
 	else
 		is_repair = B_FALSE;
 
 	/*
 	 * We subscribe to zfs faults as well as all repair events.
 	 */
 	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
 	    &faults, &nfaults) != 0)
 		return;
 
 	for (f = 0; f < nfaults; f++) {
 		fault = faults[f];
 
 		fault_device = B_FALSE;
 		degrade_device = B_FALSE;
 		is_disk = B_FALSE;
 
 		/* Skip faults that explicitly ask not to be retired. */
 		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
 		    &retire) == 0 && retire == 0)
 			continue;
 
 		/*
 		 * While we subscribe to fault.fs.zfs.*, we only take action
 		 * for faults targeting a specific vdev (open failure or SERD
 		 * failure).  We also subscribe to fault.io.* events, so that
 		 * faulty disks will be faulted in the ZFS configuration.
 		 */
 		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
 			fault_device = B_TRUE;
 		} else if (fmd_nvl_class_match(hdl, fault,
 		    "fault.fs.zfs.vdev.checksum")) {
 			degrade_device = B_TRUE;
 		} else if (fmd_nvl_class_match(hdl, fault,
 		    "fault.fs.zfs.device")) {
 			fault_device = B_FALSE;
 		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
 			is_disk = B_TRUE;
 			fault_device = B_TRUE;
 		} else {
 			continue;
 		}
 
 		if (is_disk) {
 			/* fault.io.* faults are recognized but not acted on. */
 			continue;
 		} else {
 			/*
 			 * This is a ZFS fault.  Lookup the resource, and
 			 * attempt to find the matching vdev.
 			 */
 			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
 			    &resource) != 0 ||
 			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
 			    &scheme) != 0)
 				continue;
 
 			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
 				continue;
 
 			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
 			    &pool_guid) != 0)
 				continue;
 
 			/*
 			 * A resource without a vdev GUID is only meaningful
 			 * for repair lists, where it selects the whole pool.
 			 */
 			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
 			    &vdev_guid) != 0) {
 				if (is_repair)
 					vdev_guid = 0;
 				else
 					continue;
 			}
 
 			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
 			    &vdev)) == NULL)
 				continue;
 
 			aux = VDEV_AUX_ERR_EXCEEDED;
 		}
 
 		if (vdev_guid == 0) {
 			/*
 			 * For pool-level repair events, clear the entire pool.
 			 */
 			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
 			    zpool_get_name(zhp));
 			(void) zpool_clear(zhp, NULL, NULL);
 			zpool_close(zhp);
 			continue;
 		}
 
 		/*
 		 * If this is a repair event, then mark the vdev as repaired and
 		 * continue.
 		 */
 		if (is_repair) {
 			repair_done = 1;
 			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
 			    zpool_get_name(zhp), vdev_guid);
 			(void) zpool_vdev_clear(zhp, vdev_guid);
 			zpool_close(zhp);
 			continue;
 		}
 
 		/*
 		 * Actively fault the device if needed.
 		 */
 		if (fault_device)
 			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
 		if (degrade_device)
 			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);
 
 		if (fault_device || degrade_device)
 			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
 			    fault_device ? "fault" : "degrade", vdev_guid,
 			    zpool_get_name(zhp));
 
 		/*
 		 * Attempt to substitute a hot spare.
 		 */
 		(void) replace_with_spare(hdl, zhp, vdev);
 
 		zpool_close(zhp);
 	}
 
 	/* Resolve the case once at least one vdev in a repair list was cleared. */
 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
 	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
 		fmd_case_uuresolved(hdl, uuid);
 }
 
 /* Module entry points; only the receive callback is provided. */
 static const fmd_hdl_ops_t fmd_ops = {
 	zfs_retire_recv,	/* fmdo_recv */
 	NULL,			/* fmdo_timeout */
 	NULL,			/* fmdo_close */
 	NULL,			/* fmdo_stats */
 	NULL,			/* fmdo_gc */
 };
 
 /* Module properties; "spare_on_remove" is read in zfs_retire_recv(). */
 static const fmd_prop_t fmd_props[] = {
 	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
 	{ NULL, 0, NULL }
 };
 
 /* Module description handed to fmd_hdl_register(). */
 static const fmd_hdl_info_t fmd_info = {
 	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
 };
 
 /*
  * Module initialization: open libzfs, register with fmd, and attach a
  * zeroed zfs_retire_data_t holding the libzfs handle to 'hdl'.  Returns
  * without attaching anything if libzfs cannot be initialized or the
  * registration fails.
  */
 void
 _zfs_retire_init(fmd_hdl_t *hdl)
 {
 	libzfs_handle_t *zhdl = libzfs_init();
 	zfs_retire_data_t *zdp;
 
 	if (zhdl == NULL)
 		return;
 
 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
 		/* Registration failed: do not keep the libzfs handle. */
 		libzfs_fini(zhdl);
 		return;
 	}
 
 	zdp = fmd_hdl_zalloc(hdl, sizeof (*zdp), FMD_SLEEP);
 	zdp->zrd_hdl = zhdl;
 	fmd_hdl_setspecific(hdl, zdp);
 }
 
 /*
  * Module teardown: free the repaired list, close libzfs and release the
  * per-module data.  Does nothing when no module data is attached.
  */
 void
 _zfs_retire_fini(fmd_hdl_t *hdl)
 {
 	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
 
 	if (zdp == NULL)
 		return;
 
 	zfs_retire_clear_data(hdl, zdp);
 	libzfs_fini(zdp->zrd_hdl);
 	fmd_hdl_free(hdl, zdp, sizeof (*zdp));
 }
diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c
index 0aa03fded468..e45176c00bf2 100644
--- a/cmd/zed/zed.c
+++ b/cmd/zed/zed.c
@@ -1,308 +1,308 @@
 /*
  * This file is part of the ZFS Event Daemon (ZED).
  *
  * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
  * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
  * Refer to the OpenZFS git commit log for authoritative copyright attribution.
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
  * You can obtain a copy of the license from the top-level file
  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
  * You may not use this file except in compliance with the license.
  */
 
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include "zed.h"
 #include "zed_conf.h"
 #include "zed_event.h"
 #include "zed_file.h"
 #include "zed_log.h"
 
 /* Set from signal handlers; polled by the main loop. */
 static volatile sig_atomic_t _got_exit = 0;
 static volatile sig_atomic_t _got_hup = 0;
 
 /*
  * SIGINT/SIGTERM handler: request that the daemon exit.
  */
 static void
 _exit_handler(int signum)
 {
 	(void) signum;
 	_got_exit = 1;
 }
 
 /*
  * SIGHUP handler: request that the daemon rescan its configuration.
  */
 static void
 _hup_handler(int signum)
 {
 	(void) signum;
 	_got_hup = 1;
 }
 
 /*
  * Register signal handlers.
  *
  * SIGPIPE is ignored, SIGINT and SIGTERM set _got_exit, SIGHUP sets
  * _got_hup, and SIGCHLD is blocked in the calling thread.  Any failure is
  * fatal.
  */
 static void
 _setup_sig_handlers(void)
 {
 	struct sigaction sa;
 
 	if (sigemptyset(&sa.sa_mask) < 0)
 		zed_log_die("Failed to initialize sigset");
 
 	sa.sa_flags = SA_RESTART;
 
 	sa.sa_handler = SIG_IGN;
 	if (sigaction(SIGPIPE, &sa, NULL) < 0)
 		zed_log_die("Failed to ignore SIGPIPE");
 
 	sa.sa_handler = _exit_handler;
 	if (sigaction(SIGINT, &sa, NULL) < 0)
 		zed_log_die("Failed to register SIGINT handler");
 
 	if (sigaction(SIGTERM, &sa, NULL) < 0)
 		zed_log_die("Failed to register SIGTERM handler");
 
 	sa.sa_handler = _hup_handler;
 	if (sigaction(SIGHUP, &sa, NULL) < 0)
 		zed_log_die("Failed to register SIGHUP handler");
 
 	/*
 	 * The (so far empty) mask is reused to block SIGCHLD.  Unlike the
 	 * calls above, pthread_sigmask() reports failure by returning a
 	 * positive error number, never a negative value.
 	 */
 	(void) sigaddset(&sa.sa_mask, SIGCHLD);
 	if (pthread_sigmask(SIG_BLOCK, &sa.sa_mask, NULL) != 0)
 		zed_log_die("Failed to block SIGCHLD");
 }
 
 /*
  * Lock all current and future pages in the virtual memory address space,
  * so that access to them is never delayed by a page fault.
  *
  * mlockall() is retried a bounded number of times while it fails with
  * EAGAIN, in case the error is transient; any other error, or running out
  * of retries, is fatal.
  *
  * Note that memory locks are not inherited by a child created via fork()
  * and are automatically removed during an execve().  As such, this must
  * be called after the daemon fork()s (when running in the background).
  */
 static void
 _lock_memory(void)
 {
 #if HAVE_MLOCKALL
 	int tries_left = 10;
 
 	while (tries_left-- > 0) {
 		if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) {
 			zed_log_msg(LOG_INFO, "Locked all pages in memory");
 			return;
 		}
 		/* Only EAGAIN is worth another attempt. */
 		if (errno != EAGAIN)
 			break;
 	}
 	zed_log_die("Failed to lock memory pages: %s", strerror(errno));
 
 #else /* HAVE_MLOCKALL */
 	zed_log_die("Failed to lock memory pages: mlockall() not supported");
 #endif /* HAVE_MLOCKALL */
 }
 
 /*
  * Start daemonization of the process including the double fork().
  *
  * The parent process will block here until _finish_daemonize() is called
  * (in the grandchild process), at which point the parent process will exit.
  * This prevents the parent process from exiting until initialization is
  * complete.
  */
 static void
 _start_daemonize(void)
 {
 	pid_t pid;
 	struct sigaction sa;
 
 	/* Create pipe for communicating with child during daemonization. */
 	zed_log_pipe_open();
 
 	/* Background process and ensure child is not process group leader. */
 	pid = fork();
 	if (pid < 0) {
 		zed_log_die("Failed to create child process: %s",
 		    strerror(errno));
 	} else if (pid > 0) {
 
 		/* Close writes since parent will only read from pipe. */
 		zed_log_pipe_close_writes();
 
 		/* Wait for notification that daemonization is complete. */
 		zed_log_pipe_wait();
 
 		zed_log_pipe_close_reads();
 		_exit(EXIT_SUCCESS);
 	}
 
 	/* Close reads since child will only write to pipe. */
 	zed_log_pipe_close_reads();
 
 	/* Create independent session and detach from terminal. */
 	if (setsid() < 0)
 		zed_log_die("Failed to create new session: %s",
 		    strerror(errno));
 
 	/* Prevent child from terminating on HUP when session leader exits. */
 	if (sigemptyset(&sa.sa_mask) < 0)
 		zed_log_die("Failed to initialize sigset");
 
 	sa.sa_flags = 0;
 	sa.sa_handler = SIG_IGN;
 
 	if (sigaction(SIGHUP, &sa, NULL) < 0)
 		zed_log_die("Failed to ignore SIGHUP");
 
 	/* Ensure process cannot re-acquire terminal. */
 	pid = fork();
 	if (pid < 0) {
 		zed_log_die("Failed to create grandchild process: %s",
 		    strerror(errno));
 	} else if (pid > 0) {
 		_exit(EXIT_SUCCESS);
 	}
 }
 
 /*
  * Complete daemonization by pointing stdin/stdout/stderr at /dev/null and
  * signalling the waiting parent.
  *
  * This must be called at the end of initialization after all external
  * communication channels are established and accessible.
  */
 static void
 _finish_daemonize(void)
 {
 	int null_fd;
 
 	/* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */
 	null_fd = open("/dev/null", O_RDWR);
 	if (null_fd < 0)
 		zed_log_die("Failed to open /dev/null: %s", strerror(errno));
 
 	if (dup2(null_fd, STDIN_FILENO) < 0)
 		zed_log_die("Failed to dup /dev/null onto stdin: %s",
 		    strerror(errno));
 
 	if (dup2(null_fd, STDOUT_FILENO) < 0)
 		zed_log_die("Failed to dup /dev/null onto stdout: %s",
 		    strerror(errno));
 
 	if (dup2(null_fd, STDERR_FILENO) < 0)
 		zed_log_die("Failed to dup /dev/null onto stderr: %s",
 		    strerror(errno));
 
 	/* Drop the extra descriptor unless open() returned fd 0, 1 or 2. */
 	if (null_fd > STDERR_FILENO) {
 		if (close(null_fd) < 0)
 			zed_log_die("Failed to close /dev/null: %s",
 			    strerror(errno));
 	}
 
 	/* Notify parent that daemonization is complete. */
 	zed_log_pipe_close_writes();
 }
 
 /*
  * ZFS Event Daemon (ZED).
  *
  * Parses the options, requires root, scans the zedlet directory,
  * optionally daemonizes and locks memory, writes the PID file and restores
  * the saved event state.  Events are then serviced until an exit signal is
  * seen or the service loop breaks out.  With the idle option (-I) the
  * daemon goes back to waiting for zed_event_init() instead of exiting.
  */
 int
 main(int argc, char *argv[])
 {
 	struct zed_conf zcp;
 	uint64_t saved_eid;
 	int64_t saved_etime[2];
 
 	zed_log_init(argv[0]);
 	zed_log_stderr_open(LOG_NOTICE);
 	zed_conf_init(&zcp);
 	zed_conf_parse_opts(&zcp, argc, argv);
 	if (zcp.do_verbose)
 		zed_log_stderr_open(LOG_INFO);
 
 	if (geteuid() != 0)
 		zed_log_die("Must be run as root");
 
 	/* Close every descriptor above stderr. */
 	zed_file_close_from(STDERR_FILENO + 1);
 
 	(void) umask(0);
 
 	if (chdir("/") < 0)
 		zed_log_die("Failed to change to root directory");
 
 	if (zed_conf_scan_dir(&zcp) < 0)
 		exit(EXIT_FAILURE);
 
 	if (!zcp.do_foreground) {
 		_start_daemonize();
 		zed_log_syslog_open(LOG_DAEMON);
 	}
 	_setup_sig_handlers();
 
 	if (zcp.do_memlock)
 		_lock_memory();
 
 	/* A PID file failure is only tolerated when forced (-f). */
 	if ((zed_conf_write_pid(&zcp) < 0) && (!zcp.do_force))
 		exit(EXIT_FAILURE);
 
 	if (!zcp.do_foreground)
 		_finish_daemonize();
 
 	zed_log_msg(LOG_NOTICE,
 	    "ZFS Event Daemon %s-%s (PID %d)",
 	    ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid());
 
 	if (zed_conf_open_state(&zcp) < 0)
 		exit(EXIT_FAILURE);
 
 	if (zed_conf_read_state(&zcp, &saved_eid, saved_etime) < 0)
 		exit(EXIT_FAILURE);
 
 idle:
 	/*
 	 * If -I is specified, attempt to open /dev/zfs repeatedly until
 	 * successful.
 	 */
 	do {
 		if (!zed_event_init(&zcp))
 			break;
 		/* Wait for some time and try again. tunable? */
 		sleep(30);
 	} while (!_got_exit && zcp.do_idle);
 
 	if (_got_exit)
 		goto out;
 
 	zed_event_seek(&zcp, saved_eid, saved_etime);
 
 	while (!_got_exit) {
 		int rv;
 		/* SIGHUP requests a rescan of the zedlet directory. */
 		if (_got_hup) {
 			_got_hup = 0;
 			(void) zed_conf_scan_dir(&zcp);
 		}
 		rv = zed_event_service(&zcp);
 
 		/* ENODEV: When kernel module is unloaded (osx) */
-		if (rv == ENODEV)
+		if (rv != 0)
 			break;
 	}
 
 	zed_log_msg(LOG_NOTICE, "Exiting");
 	zed_event_fini(&zcp);
 
 	/* In idle mode, wait for the event source to come back. */
 	if (zcp.do_idle && !_got_exit)
 		goto idle;
 
 out:
 	zed_conf_destroy(&zcp);
 	zed_log_fini();
 	exit(EXIT_SUCCESS);
 }
diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c
index 2cf2311dbb42..59935102f123 100644
--- a/cmd/zed/zed_conf.c
+++ b/cmd/zed/zed_conf.c
@@ -1,705 +1,706 @@
 /*
  * This file is part of the ZFS Event Daemon (ZED).
  *
  * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
  * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
  * Refer to the OpenZFS git commit log for authoritative copyright attribution.
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
  * You can obtain a copy of the license from the top-level file
  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
  * You may not use this file except in compliance with the license.
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <libgen.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <unistd.h>
 #include "zed.h"
 #include "zed_conf.h"
 #include "zed_file.h"
 #include "zed_log.h"
 #include "zed_strings.h"
 
 /*
  * Reset the configuration [zcp] to its defaults: all fields zeroed, the
  * file descriptors marked as not open, a limit of 16 concurrent jobs, and
  * freshly allocated copies of the default PID file, zedlet directory and
  * state file paths.  Allocation failure is fatal.
  */
 void
 zed_conf_init(struct zed_conf *zcp)
 {
 	memset(zcp, 0, sizeof (*zcp));
 
 	/* zcp->zfs_hdl opened in zed_event_init() */
 	/* zcp->zedlets created in zed_conf_scan_dir() */
 
 	zcp->max_jobs = 16;
 
 	zcp->zevent_fd = -1;		/* opened in zed_event_init() */
 	zcp->state_fd = -1;		/* opened in zed_conf_open_state() */
 	zcp->pid_fd = -1;		/* opened in zed_conf_write_pid() */
 
 	if ((zcp->pid_file = strdup(ZED_PID_FILE)) == NULL ||
 	    (zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)) == NULL ||
 	    (zcp->state_file = strdup(ZED_STATE_FILE)) == NULL)
 		zed_log_die("Failed to create conf: %s", strerror(errno));
 }
 
 /*
  * Destroy the configuration [zcp].
  *
  * Closes the state and PID file descriptors, removes the PID file, and
  * frees every string owned by the configuration (including the path set
  * with -P) together with the zedlet list.  Failures to close or unlink
  * are logged as warnings and do not stop the teardown.
  *
  * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini().
  */
 void
 zed_conf_destroy(struct zed_conf *zcp)
 {
 	if (zcp->state_fd >= 0) {
 		if (close(zcp->state_fd) < 0)
 			zed_log_msg(LOG_WARNING,
 			    "Failed to close state file \"%s\": %s",
 			    zcp->state_file, strerror(errno));
 		zcp->state_fd = -1;
 	}
 	/* Remove the PID file before its descriptor is closed. */
 	if (zcp->pid_file) {
 		if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT))
 			zed_log_msg(LOG_WARNING,
 			    "Failed to remove PID file \"%s\": %s",
 			    zcp->pid_file, strerror(errno));
 	}
 	if (zcp->pid_fd >= 0) {
 		if (close(zcp->pid_fd) < 0)
 			zed_log_msg(LOG_WARNING,
 			    "Failed to close PID file \"%s\": %s",
 			    zcp->pid_file, strerror(errno));
 		zcp->pid_fd = -1;
 	}
 
 	/* free(NULL) is a no-op, so no guards are needed here. */
 	free(zcp->pid_file);
 	zcp->pid_file = NULL;
 	free(zcp->zedlet_dir);
 	zcp->zedlet_dir = NULL;
 	free(zcp->state_file);
 	zcp->state_file = NULL;
 	/* Allocated by _zed_conf_parse_path() when -P is given. */
 	free(zcp->path);
 	zcp->path = NULL;
 
 	if (zcp->zedlets) {
 		zed_strings_destroy(zcp->zedlets);
 		zcp->zedlets = NULL;
 	}
 }
 
 /*
  * Print the command-line usage summary and exit.
  *
  * When [got_err] is 0 the text goes to stdout and the exit status is
  * EXIT_SUCCESS; otherwise it goes to stderr and the status is EXIT_FAILURE.
  * [prog] may be NULL, in which case "zed" is shown as the program name.
  */
 static void
 _zed_conf_display_help(const char *prog, boolean_t got_err)
 {
 	struct opt { const char *flag, *desc, *dflt; };
 
 	/* Each table ends with an entry whose flag is NULL. */
 	const struct opt info_opts[] = {
 		{ .flag = "-h", .desc = "Display help" },
 		{ .flag = "-L", .desc = "Display license information" },
 		{ .flag = "-V", .desc = "Display version information" },
 		{ .flag = NULL },
 	};
 	const struct opt bool_opts[] = {
 		{ .flag = "-v", .desc = "Be verbose" },
 		{ .flag = "-f", .desc = "Force daemon to run" },
 		{ .flag = "-F", .desc = "Run daemon in the foreground" },
 		{ .flag = "-I",
 		    .desc = "Idle daemon until kernel module is (re)loaded" },
 		{ .flag = "-M", .desc = "Lock all pages in memory" },
 		{ .flag = "-P",
 		    .desc = "$PATH for ZED to use (only used by ZTS)" },
 		{ .flag = "-Z", .desc = "Zero state file" },
 		{ .flag = NULL },
 	};
 	const struct opt value_opts[] = {
 		{ .flag = "-d DIR", .desc = "Read enabled ZEDLETs from DIR.",
 		    .dflt = ZED_ZEDLET_DIR },
 		{ .flag = "-p FILE", .desc = "Write daemon's PID to FILE.",
 		    .dflt = ZED_PID_FILE },
 		{ .flag = "-s FILE", .desc = "Write daemon's state to FILE.",
 		    .dflt = ZED_STATE_FILE },
 		{ .flag = "-j JOBS", .desc = "Start at most JOBS at once.",
 		    .dflt = "16" },
 		{ .flag = NULL },
 	};
 	const struct opt *op;
 	FILE *out = got_err ? stderr : stdout;
 
 	fprintf(out, "Usage: %s [OPTION]...\n", (prog ? prog : "zed"));
 	fprintf(out, "\n");
 	for (op = info_opts; op->flag != NULL; op++)
 		fprintf(out, "    %*s %s\n", -8, op->flag, op->desc);
 	fprintf(out, "\n");
 	for (op = bool_opts; op->flag != NULL; op++)
 		fprintf(out, "    %*s %s\n", -8, op->flag, op->desc);
 	fprintf(out, "\n");
 	/* Options taking a value also show their default in brackets. */
 	for (op = value_opts; op->flag != NULL; op++)
 		fprintf(out, "    %*s %s [%s]\n", -8, op->flag, op->desc,
 		    op->dflt);
 	fprintf(out, "\n");
 
 	exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS);
 }
 
 /*
  * Write the license notice to stdout and exit successfully.
  */
 static void
 _zed_conf_display_license(void)
 {
 	static const char notice[] =
 	    "The ZFS Event Daemon (ZED) is distributed under the terms of the\n"
 	    "  Common Development and Distribution License (CDDL-1.0)\n"
 	    "  <http://opensource.org/licenses/CDDL-1.0>.\n"
 	    "\n"
 	    "Developed at Lawrence Livermore National Laboratory"
 	    " (LLNL-CODE-403049).\n"
 	    "\n";
 
 	/* The notice contains no conversions, so it is written verbatim. */
 	(void) fputs(notice, stdout);
 
 	exit(EXIT_SUCCESS);
 }
 
 /*
  * Write "<name>-<version>-<release>" to stdout and exit successfully.
  */
 static void
 _zed_conf_display_version(void)
 {
 	(void) fprintf(stdout, "%s-%s-%s\n",
 	    ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE);
 
 	exit(EXIT_SUCCESS);
 }
 
 /*
  * Store a newly allocated copy of [path] in [resultp], releasing any string
  * that was stored there before.  A relative [path] is made absolute by
  * prefixing the current working directory.  Failure to determine the
  * working directory, an over-long result, or allocation failure is fatal.
  */
 static void
 _zed_conf_parse_path(char **resultp, const char *path)
 {
 	char abs_path[PATH_MAX];
 	const char *src = path;
 
 	assert(resultp != NULL);
 	assert(path != NULL);
 
 	/* free(NULL) is a no-op, so an unset result needs no special case. */
 	free(*resultp);
 
 	if (path[0] != '/') {
 		if (!getcwd(abs_path, sizeof (abs_path)))
 			zed_log_die("Failed to get current working dir: %s",
 			    strerror(errno));
 
 		if (strlcat(abs_path, "/", sizeof (abs_path)) >=
 		    sizeof (abs_path) ||
 		    strlcat(abs_path, path, sizeof (abs_path)) >=
 		    sizeof (abs_path))
 			zed_log_die("Failed to copy path: %s",
 			    strerror(ENAMETOOLONG));
 
 		src = abs_path;
 	}
 
 	*resultp = strdup(src);
 	if (*resultp == NULL)
 		zed_log_die("Failed to copy path: %s", strerror(ENOMEM));
 }
 
 /*
  * Parse the command-line options into the configuration [zcp].
  *
  * The informational options (-h, -L, -V) print their text and exit.  An
  * unknown option, or an option that is missing its argument, prints a
  * diagnostic followed by the help text on stderr and exits with a failure
  * status.  An out-of-range or zero job count (-j) is fatal.
  */
 void
 zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
 {
 	/* The leading ':' makes getopt() return ':' for a missing argument. */
 	const char * const opts = ":hLVd:p:P:s:vfFMZIj:";
 	int opt;
 	unsigned long raw;
 
 	if (!zcp || !argv || !argv[0])
 		zed_log_die("Failed to parse options: Internal error");
 
 	opterr = 0;			/* suppress default getopt err msgs */
 
 	while ((opt = getopt(argc, argv, opts)) != -1) {
 		switch (opt) {
 		case 'h':
 			_zed_conf_display_help(argv[0], B_FALSE);
 			break;
 		case 'L':
 			_zed_conf_display_license();
 			break;
 		case 'V':
 			_zed_conf_display_version();
 			break;
 		case 'd':
 			_zed_conf_parse_path(&zcp->zedlet_dir, optarg);
 			break;
 		case 'I':
 			zcp->do_idle = 1;
 			break;
 		case 'p':
 			_zed_conf_parse_path(&zcp->pid_file, optarg);
 			break;
 		case 'P':
 			_zed_conf_parse_path(&zcp->path, optarg);
 			break;
 		case 's':
 			_zed_conf_parse_path(&zcp->state_file, optarg);
 			break;
 		case 'v':
 			zcp->do_verbose = 1;
 			break;
 		case 'f':
 			zcp->do_force = 1;
 			break;
 		case 'F':
 			zcp->do_foreground = 1;
 			break;
 		case 'M':
 			zcp->do_memlock = 1;
 			break;
 		case 'Z':
 			zcp->do_zero = 1;
 			break;
 		case 'j':
 			errno = 0;
 			raw = strtoul(optarg, NULL, 0);
 			if (errno == ERANGE || raw > INT16_MAX) {
 				zed_log_die("%lu is too many jobs", raw);
 			} else if (raw == 0) {
 				zed_log_die("0 jobs makes no sense");
 			} else {
 				zcp->max_jobs = raw;
 			}
 			break;
 		case ':':
 			/* optopt holds the option that lacks its argument. */
 			fprintf(stderr,
 			    "%s: Option '-%c' requires an argument\n\n",
 			    argv[0], optopt);
 			_zed_conf_display_help(argv[0], B_TRUE);
 			break;
 		case '?':
 		default:
 			/* A literal "-?" is a request for help, not an error. */
 			if (optopt == '?')
 				_zed_conf_display_help(argv[0], B_FALSE);
 
 			fprintf(stderr, "%s: Invalid option '-%c'\n\n",
 			    argv[0], optopt);
 			_zed_conf_display_help(argv[0], B_TRUE);
 			break;
 		}
 	}
 }
 
 /*
  * Rebuild the set of zedlets from the entries found in the [zcp] zedlet_dir.
  *
  * An entry is registered only if it is a regular file that is executable by
  * its owner.  Unless do_force is set it must also be owned by root and must
  * not be writable by group or other.  Names beginning with '.' are skipped
  * without a message; every other rejection is logged.
  *
  * Return 0 on success after replacing the [zcp] zedlets with the new set,
  * or -1 on error with errno set (the previous set is then left in place).
  */
 int
 zed_conf_scan_dir(struct zed_conf *zcp)
 {
 	zed_strings_t *found;
 	DIR *dir;
 	struct dirent *ent;
 	struct stat sb;
 	char path[PATH_MAX];
 	int len;
 	int saved_errno;
 
 	if (zcp == NULL) {
 		errno = EINVAL;
 		zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s",
 		    strerror(errno));
 		return (-1);
 	}
 
 	found = zed_strings_create();
 	if (found == NULL) {
 		errno = ENOMEM;
 		zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s",
 		    zcp->zedlet_dir, strerror(errno));
 		return (-1);
 	}
 
 	dir = opendir(zcp->zedlet_dir);
 	if (dir == NULL) {
 		/* Logging and cleanup may clobber errno; keep it. */
 		saved_errno = errno;
 		zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s",
 		    zcp->zedlet_dir, strerror(errno));
 		zed_strings_destroy(found);
 		errno = saved_errno;
 		return (-1);
 	}
 
 	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
 		const char *name = ent->d_name;
 
 		/* Dotfiles (including "." and "..") are never zedlets. */
 		if (name[0] == '.')
 			continue;
 
 		len = snprintf(path, sizeof (path),
 		    "%s/%s", zcp->zedlet_dir, name);
 
 		/* The first matching rejection is logged; else register. */
 		if ((len < 0) || (len >= sizeof (path))) {
 			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
 			    name, strerror(ENAMETOOLONG));
 		} else if (stat(path, &sb) < 0) {
 			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
 			    path, strerror(errno));
 		} else if (!S_ISREG(sb.st_mode)) {
 			zed_log_msg(LOG_INFO,
 			    "Ignoring \"%s\": not a regular file",
 			    name);
 		} else if ((sb.st_uid != 0) && !zcp->do_force) {
 			zed_log_msg(LOG_NOTICE,
 			    "Ignoring \"%s\": not owned by root",
 			    name);
 		} else if (!(sb.st_mode & S_IXUSR)) {
 			zed_log_msg(LOG_INFO,
 			    "Ignoring \"%s\": not executable by user",
 			    name);
 		} else if ((sb.st_mode & S_IWGRP) && !zcp->do_force) {
 			zed_log_msg(LOG_NOTICE,
 			    "Ignoring \"%s\": writable by group",
 			    name);
 		} else if ((sb.st_mode & S_IWOTH) && !zcp->do_force) {
 			zed_log_msg(LOG_NOTICE,
 			    "Ignoring \"%s\": writable by other",
 			    name);
 		} else if (zed_strings_add(found, NULL, name) < 0) {
 			zed_log_msg(LOG_WARNING,
 			    "Failed to register \"%s\": %s",
 			    name, strerror(errno));
 		} else if (zcp->do_verbose) {
 			zed_log_msg(LOG_INFO,
 			    "Registered zedlet \"%s\"", name);
 		}
 	}
 
 	if (closedir(dir) < 0) {
 		saved_errno = errno;
 		zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s",
 		    zcp->zedlet_dir, strerror(errno));
 		zed_strings_destroy(found);
 		errno = saved_errno;
 		return (-1);
 	}
 
 	/* Only now, with a complete scan in hand, drop the old set. */
 	if (zcp->zedlets)
 		zed_strings_destroy(zcp->zedlets);
 	zcp->zedlets = found;
 
 	return (0);
 }
 
 /*
  * Write the PID file specified in [zcp].
  * Return 0 on success, -1 on error.
  *
  * The directory holding the PID file is created first when the path has a
  * directory component other than "/".  The file is then opened, locked with
  * zed_file_lock(), and the current PID is written and synced to it.
  * On success the descriptor stays open in the [zcp] pid_fd; on any failure
  * after the argument check it is closed and pid_fd is reset to -1.
  *
  * This must be called after fork()ing to become a daemon (so the correct PID
  * is recorded), but before daemonization is complete and the parent process
  * exits (for synchronization with systemd).
  */
 int
 zed_conf_write_pid(struct zed_conf *zcp)
 {
 	char buf[PATH_MAX];
 	int n;
 	char *p;
 	mode_t mask;
 	int rv;
 
 	if (!zcp || !zcp->pid_file) {
 		errno = EINVAL;
 		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
 		    strerror(errno));
 		return (-1);
 	}
 	assert(zcp->pid_fd == -1);
 	/*
 	 * Create PID file directory if needed.
 	 */
 	n = strlcpy(buf, zcp->pid_file, sizeof (buf));
 	if (n >= sizeof (buf)) {
 		errno = ENAMETOOLONG;
 		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
 		    strerror(errno));
 		goto err;
 	}
 	/*
 	 * Without a '/' the file lives in the current directory, and with
 	 * the only '/' at the start it lives in the root directory; in
 	 * both cases there is nothing to create.  Running mkdirp() on the
 	 * bare file name would create a directory where the PID file
 	 * itself is about to be opened.
 	 */
 	p = strrchr(buf, '/');
 	if (p != NULL && p != buf) {
 		*p = '\0';
 
 		if ((mkdirp(buf, 0755) < 0) && (errno != EEXIST)) {
 			zed_log_msg(LOG_ERR,
 			    "Failed to create directory \"%s\": %s",
 			    buf, strerror(errno));
 			goto err;
 		}
 	}
 	/*
 	 * Obtain PID file lock.
 	 *
 	 * The umask is widened by 022 around open() so the file cannot be
 	 * created group- or other-writable, then restored.
 	 */
 	mask = umask(0);
 	umask(mask | 022);
 	zcp->pid_fd = open(zcp->pid_file, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
 	umask(mask);
 	if (zcp->pid_fd < 0) {
 		zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 		goto err;
 	}
 	rv = zed_file_lock(zcp->pid_fd);
 	if (rv < 0) {
 		zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 		goto err;
 	} else if (rv > 0) {
 		/* Lock not obtained: report who (if anyone) holds it. */
 		pid_t pid = zed_file_is_locked(zcp->pid_fd);
 		if (pid < 0) {
 			zed_log_msg(LOG_ERR,
 			    "Failed to test lock on PID file \"%s\"",
 			    zcp->pid_file);
 		} else if (pid > 0) {
 			zed_log_msg(LOG_ERR,
 			    "Found PID %d bound to PID file \"%s\"",
 			    pid, zcp->pid_file);
 		} else {
 			zed_log_msg(LOG_ERR,
 			    "Inconsistent lock state on PID file \"%s\"",
 			    zcp->pid_file);
 		}
 		goto err;
 	}
 	/*
 	 * Write PID file.
 	 */
 	n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid());
 	if ((n < 0) || (n >= sizeof (buf))) {
 		errno = ERANGE;
 		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 	} else if (write(zcp->pid_fd, buf, n) != n) {
 		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 	} else if (fdatasync(zcp->pid_fd) < 0) {
 		zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 	} else {
 		return (0);
 	}
 
 err:
 	if (zcp->pid_fd >= 0) {
 		(void) close(zcp->pid_fd);
 		zcp->pid_fd = -1;
 	}
 	return (-1);
 }
 
 /*
  * Open and lock the [zcp] state_file.
  * Return 0 on success, -1 on error.
  *
  * The directory holding the state file is created first when the path has
  * a directory component other than "/".  A state_fd that is already open is
  * closed before the file is reopened, and the file is unlinked first when
  * do_zero is set.
  *
  * FIXME: Move state information into kernel.
  */
 int
 zed_conf_open_state(struct zed_conf *zcp)
 {
 	char dirbuf[PATH_MAX];
 	int n;
 	char *p;
 	int rv;
 
 	if (!zcp || !zcp->state_file) {
 		errno = EINVAL;
 		zed_log_msg(LOG_ERR, "Failed to open state file: %s",
 		    strerror(errno));
 		return (-1);
 	}
 	n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf));
 	if (n >= sizeof (dirbuf)) {
 		errno = ENAMETOOLONG;
 		zed_log_msg(LOG_WARNING, "Failed to open state file: %s",
 		    strerror(errno));
 		return (-1);
 	}
 	/*
 	 * Without a '/' the file lives in the current directory, and with
 	 * the only '/' at the start it lives in the root directory; in
 	 * both cases there is nothing to create.  Running mkdirp() on the
 	 * bare file name would create a directory where the state file
 	 * itself is about to be opened.
 	 */
 	p = strrchr(dirbuf, '/');
 	if (p != NULL && p != dirbuf) {
 		*p = '\0';
 
 		if ((mkdirp(dirbuf, 0755) < 0) && (errno != EEXIST)) {
 			zed_log_msg(LOG_WARNING,
 			    "Failed to create directory \"%s\": %s",
 			    dirbuf, strerror(errno));
 			return (-1);
 		}
 	}
 	if (zcp->state_fd >= 0) {
 		if (close(zcp->state_fd) < 0) {
 			zed_log_msg(LOG_WARNING,
 			    "Failed to close state file \"%s\": %s",
 			    zcp->state_file, strerror(errno));
 			return (-1);
 		}
 	}
 	/* Best effort: a failed unlink is deliberately ignored. */
 	if (zcp->do_zero)
 		(void) unlink(zcp->state_file);
 
 	zcp->state_fd = open(zcp->state_file,
 	    O_RDWR | O_CREAT | O_CLOEXEC, 0644);
 	if (zcp->state_fd < 0) {
 		zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
 		return (-1);
 	}
 	rv = zed_file_lock(zcp->state_fd);
 	if (rv < 0) {
 		zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
 		return (-1);
 	}
 	if (rv > 0) {
 		/* Lock not obtained: report who (if anyone) holds it. */
 		pid_t pid = zed_file_is_locked(zcp->state_fd);
 		if (pid < 0) {
 			zed_log_msg(LOG_WARNING,
 			    "Failed to test lock on state file \"%s\"",
 			    zcp->state_file);
 		} else if (pid > 0) {
 			zed_log_msg(LOG_WARNING,
 			    "Found PID %d bound to state file \"%s\"",
 			    pid, zcp->state_file);
 		} else {
 			zed_log_msg(LOG_WARNING,
 			    "Inconsistent lock state on state file \"%s\"",
 			    zcp->state_file);
 		}
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Read the opened [zcp] state_file to obtain the eid & etime of the last event
  * processed.  Write the state from the last event to the [eidp] & [etime] args
  * passed by reference.  Note that etime[] is an array of size 2.
  *
  * An empty state file is not an error: [eidp] is set to 0 and [etime] is
  * left unmodified.  A short read is reported as EIO.
  *
  * Return 0 on success, -1 on error.
  */
 int
 zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[])
 {
 	ssize_t len;
 	struct iovec iov[3];
 	ssize_t n;
 
 	if (!zcp || !eidp || !etime) {
 		errno = EINVAL;
 		zed_log_msg(LOG_ERR,
 		    "Failed to read state file: %s", strerror(errno));
 		return (-1);
 	}
 	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to reposition state file offset: %s",
 		    strerror(errno));
 		return (-1);
 	}
 	/* Scatter the record (eid, etime[0], etime[1]) into the outputs. */
 	len = 0;
 	iov[0].iov_base = eidp;
 	len += iov[0].iov_len = sizeof (*eidp);
 	iov[1].iov_base = &etime[0];
 	len += iov[1].iov_len = sizeof (etime[0]);
 	iov[2].iov_base = &etime[1];
 	len += iov[2].iov_len = sizeof (etime[1]);
 
 	n = readv(zcp->state_fd, iov, 3);
 	if (n == 0) {
 		*eidp = 0;
 	} else if (n < 0) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to read state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
 		return (-1);
 	} else if (n != len) {
 		errno = EIO;
 		/* n and len are ssize_t, so they need %zd rather than %d. */
 		zed_log_msg(LOG_WARNING,
 		    "Failed to read state file \"%s\": Read %zd of %zd bytes",
 		    zcp->state_file, n, len);
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Write the [eid] & [etime] of the last processed event to the opened
  * [zcp] state_file.  Note that etime[] is an array of size 2.
  *
  * The record is written at offset 0 and synced with fdatasync().  A short
  * write is reported as EIO.
  *
  * Return 0 on success, -1 on error.
  */
 int
 zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[])
 {
 	ssize_t len;
 	struct iovec iov[3];
 	ssize_t n;
 
 	/* etime is validated here too, as zed_conf_read_state() does. */
 	if (!zcp || !etime) {
 		errno = EINVAL;
 		zed_log_msg(LOG_ERR,
 		    "Failed to write state file: %s", strerror(errno));
 		return (-1);
 	}
 	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to reposition state file offset: %s",
 		    strerror(errno));
 		return (-1);
 	}
 	/* Gather the record (eid, etime[0], etime[1]) for a single writev. */
 	len = 0;
 	iov[0].iov_base = &eid;
 	len += iov[0].iov_len = sizeof (eid);
 	iov[1].iov_base = &etime[0];
 	len += iov[1].iov_len = sizeof (etime[0]);
 	iov[2].iov_base = &etime[1];
 	len += iov[2].iov_len = sizeof (etime[1]);
 
 	n = writev(zcp->state_fd, iov, 3);
 	if (n < 0) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to write state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
 		return (-1);
 	}
 	if (n != len) {
 		errno = EIO;
 		/* n and len are ssize_t, so they need %zd rather than %d. */
 		zed_log_msg(LOG_WARNING,
 		    "Failed to write state file \"%s\": Wrote %zd of %zd bytes",
 		    zcp->state_file, n, len);
 		return (-1);
 	}
 	if (fdatasync(zcp->state_fd) < 0) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to sync state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
 		return (-1);
 	}
 	return (0);
 }
diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c
index 1eecfa0a92c4..03dcd03aceb7 100644
--- a/cmd/zed/zed_exec.c
+++ b/cmd/zed/zed_exec.c
@@ -1,368 +1,370 @@
 /*
  * This file is part of the ZFS Event Daemon (ZED).
  *
  * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
  * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
  * Refer to the OpenZFS git commit log for authoritative copyright attribution.
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
  * You can obtain a copy of the license from the top-level file
  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
  * You may not use this file except in compliance with the license.
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stddef.h>
 #include <sys/avl.h>
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <time.h>
 #include <unistd.h>
 #include <pthread.h>
+#include <signal.h>
+
 #include "zed_exec.h"
 #include "zed_log.h"
 #include "zed_strings.h"
 
 #define	ZEVENT_FILENO	3
 
 /*
  * Bookkeeping record for one forked zedlet, kept in the _launched_processes
  * AVL tree (ordered by pid) until the child has been reaped.
  */
 struct launched_process_node {
 	avl_node_t node;	/* AVL linkage */
 	pid_t pid;		/* child pid as returned by fork(); tree key */
 	uint64_t eid;		/* id of the event the child was started for */
 	char *name;		/* strdup()ed zedlet name; freed by the owner */
 };
 
 /*
  * AVL comparator for launched_process_node entries: orders them by pid.
  * Return -1, 0 or 1 when the pid of [x1] is respectively less than, equal
  * to, or greater than the pid of [x2].
  */
 static int
 _launched_process_node_compare(const void *x1, const void *x2)
 {
 	const struct launched_process_node *lhs = x1;
 	const struct launched_process_node *rhs = x2;
 
 	assert(x1 != NULL);
 	assert(x2 != NULL);
 
 	if (lhs->pid == rhs->pid)
 		return (0);
 
 	return ((lhs->pid < rhs->pid) ? -1 : 1);
 }
 
 /* Reaper thread id; (pthread_t)-1 means the thread has not been started. */
 static pthread_t _reap_children_tid = (pthread_t)-1;
 /* Set by zed_exec_fini() to make the reaper thread leave its loop. */
 static volatile boolean_t _reap_children_stop;
 /* Children forked but not yet reaped, keyed by pid. */
 static avl_tree_t _launched_processes;
 /* Held across fork()+insert and across wait4()+remove on the tree above. */
 static pthread_mutex_t _launched_processes_lock = PTHREAD_MUTEX_INITIALIZER;
 /*
  * Number of further children that may be started: initialised from
  * max_jobs, decremented for every fork and incremented for every reap.
  */
 static int16_t _launched_processes_limit;
 
 /*
  * Build an execve()-style environment from the NAME=VALUE strings held in
  * container [zsp].
  *
  * The result is a single allocation: the NULL-terminated pointer array
  * comes first and is immediately followed by the string bytes it points to.
  * Return the newly-allocated environment, or NULL on error.
  */
 static char **
 _zed_exec_create_env(zed_strings_t *zsp)
 {
 	const char *str;
 	char **envp;
 	char *block;
 	char *cursor;
 	int nslots;
 	int total;
 	int slot;
 	int nbytes;
 
 	/* One slot per string, plus one for the terminating NULL. */
 	nslots = zed_strings_count(zsp) + 1;
 	total = nslots * sizeof (char *);
 	str = zed_strings_first(zsp);
 	while (str != NULL) {
 		total += strlen(str) + 1;
 		str = zed_strings_next(zsp);
 	}
 
 	block = calloc(1, total);
 	if (block == NULL)
 		return (NULL);
 
 	envp = (char **)block;
 	cursor = block + (nslots * sizeof (char *));
 	slot = 0;
 	str = zed_strings_first(zsp);
 	while (str != NULL) {
 		nbytes = strlen(str) + 1;
 		envp[slot] = cursor;
 		memcpy(cursor, str, nbytes);
 		cursor += nbytes;
 		slot++;
 		str = zed_strings_next(zsp);
 	}
 	envp[slot] = NULL;
 
 	/* Both passes over [zsp] must have seen the same bytes. */
 	assert(block + total == cursor);
 	return (envp);
 }
 
 /*
  * Fork a child process to handle event [eid].  The program [prog]
  * in directory [dir] is executed with the environment [env].
  *
  * The file descriptor [zfd] is the zevent_fd used to track the
  * current cursor location within the zevent nvlist.
  *
  * When [in_foreground] is set, the child's stdin, stdout and stderr are
  * redirected to /dev/null before the exec.
  *
  * The call blocks (polling every 200ms) while the job limit is exhausted.
  * It does not wait for the child: the child is recorded in the
  * _launched_processes tree so that it can be accounted for when reaped.
  */
 static void
 _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
     char *env[], int zfd, boolean_t in_foreground)
 {
 	char path[PATH_MAX];
 	int n;
 	pid_t pid;
 	int fd;
 	struct launched_process_node *node;
 	sigset_t mask;
 	struct timespec launch_timeout =
 		{ .tv_sec = 0, .tv_nsec = 200 * 1000 * 1000, };
 
 	assert(dir != NULL);
 	assert(prog != NULL);
 	assert(env != NULL);
 	assert(zfd >= 0);
 
 	/* Throttle: wait until a job slot is available. */
 	while (__atomic_load_n(&_launched_processes_limit,
 	    __ATOMIC_SEQ_CST) <= 0)
 		(void) nanosleep(&launch_timeout, NULL);
 
 	n = snprintf(path, sizeof (path), "%s/%s", dir, prog);
 	if ((n < 0) || (n >= sizeof (path))) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to fork \"%s\" for eid=%llu: %s",
 		    prog, eid, strerror(ENAMETOOLONG));
 		return;
 	}
 	/*
 	 * The lock is held from fork() until the node is in the tree so
 	 * that the child cannot be looked up by pid before it is recorded.
 	 */
 	(void) pthread_mutex_lock(&_launched_processes_lock);
 	pid = fork();
 	if (pid < 0) {
 		(void) pthread_mutex_unlock(&_launched_processes_lock);
 		zed_log_msg(LOG_WARNING,
 		    "Failed to fork \"%s\" for eid=%llu: %s",
 		    prog, eid, strerror(errno));
 		return;
 	} else if (pid == 0) {
 		/* child process: start the zedlet with no signals blocked */
 		(void) sigemptyset(&mask);
 		(void) sigprocmask(SIG_SETMASK, &mask, NULL);
 
 		(void) umask(022);
 		if (in_foreground && /* we're already devnulled if daemonised */
 		    (fd = open("/dev/null", O_RDWR | O_CLOEXEC)) != -1) {
 			(void) dup2(fd, STDIN_FILENO);
 			(void) dup2(fd, STDOUT_FILENO);
 			(void) dup2(fd, STDERR_FILENO);
 		}
 		(void) dup2(zfd, ZEVENT_FILENO);
 		/*
 		 * The argument list of a variadic exec must be terminated
 		 * by a null pointer of type char *, not by a bare NULL.
 		 */
 		execle(path, prog, (char *)NULL, env);
 		_exit(127);
 	}
 
 	/* parent process */
 
 	node = calloc(1, sizeof (*node));
 	if (node) {
 		node->pid = pid;
 		node->eid = eid;
 		node->name = strdup(prog);
 
 		avl_add(&_launched_processes, node);
 	}
 	(void) pthread_mutex_unlock(&_launched_processes_lock);
 
 	/* The slot is consumed even when the child could not be recorded. */
 	__atomic_sub_fetch(&_launched_processes_limit, 1, __ATOMIC_SEQ_CST);
 	zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d",
 	    prog, eid, pid);
 }
 
 /*
  * Signal handler that does nothing; it is installed for SIGCHLD by
  * _reap_children().
  */
 static void
 _nop(int sig)
 {
 	(void) sig;
 }
 
 /*
  * Thread body: reap terminated zedlets until _reap_children_stop is set.
  *
  * The thread blocks every signal except SIGCHLD and installs a no-op
  * SIGCHLD handler, so pause() returns whenever a SIGCHLD is delivered
  * (SA_NOCLDSTOP suppresses the signal for children that merely stop).
  *
  * For every reaped child the matching node is removed from the
  * _launched_processes tree, a job slot is returned to
  * _launched_processes_limit, and the outcome is logged together with the
  * child's combined user and system CPU time.
  *
  * [arg] is unused.  Always returns NULL.
  */
 static void *
 _reap_children(void *arg)
 {
 	struct launched_process_node node, *pnode;
 	pid_t pid;
 	int status;
 	struct rusage usage;
 	struct sigaction sa = {};
 
 	(void) sigfillset(&sa.sa_mask);
 	(void) sigdelset(&sa.sa_mask, SIGCHLD);
 	(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL);
 
 	(void) sigemptyset(&sa.sa_mask);
 	sa.sa_handler = _nop;
 	sa.sa_flags = SA_NOCLDSTOP;
 	(void) sigaction(SIGCHLD, &sa, NULL);
 
 	for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) {
 		/*
 		 * The lock is taken around wait4() because the forking side
 		 * holds it from fork() until the node is in the tree.
 		 */
 		(void) pthread_mutex_lock(&_launched_processes_lock);
 		pid = wait4(0, &status, WNOHANG, &usage);
 
 		if (pid == 0 || pid == (pid_t)-1) {
 			(void) pthread_mutex_unlock(&_launched_processes_lock);
 			/* Nothing to reap right now: sleep until a signal. */
 			if (pid == 0 || errno == ECHILD)
 				pause();
 			else if (errno != EINTR)
 				zed_log_msg(LOG_WARNING,
 				    "Failed to wait for children: %s",
 				    strerror(errno));
 		} else {
 			/*
 			 * Copy the node out of the tree so that it can be
 			 * logged after the lock is dropped.  If the pid is
 			 * not in the tree, the zeroed node is logged.
 			 */
 			memset(&node, 0, sizeof (node));
 			node.pid = pid;
 			pnode = avl_find(&_launched_processes, &node, NULL);
 			if (pnode) {
 				memcpy(&node, pnode, sizeof (node));
 
 				avl_remove(&_launched_processes, pnode);
 				free(pnode);
 			}
 			(void) pthread_mutex_unlock(&_launched_processes_lock);
 			__atomic_add_fetch(&_launched_processes_limit, 1,
 			    __ATOMIC_SEQ_CST);
 
 			/* Fold system time into user time, then normalise. */
 			usage.ru_utime.tv_sec += usage.ru_stime.tv_sec;
 			usage.ru_utime.tv_usec += usage.ru_stime.tv_usec;
 			usage.ru_utime.tv_sec +=
 			    usage.ru_utime.tv_usec / (1000 * 1000);
 			usage.ru_utime.tv_usec %= 1000 * 1000;
 
 			if (WIFEXITED(status)) {
 				zed_log_msg(LOG_INFO,
 				    "Finished \"%s\" eid=%llu pid=%d "
 				    "time=%llu.%06us exit=%d",
 				    node.name, node.eid, pid,
 				    (unsigned long long) usage.ru_utime.tv_sec,
 				    (unsigned int) usage.ru_utime.tv_usec,
 				    WEXITSTATUS(status));
 			} else if (WIFSIGNALED(status)) {
 				zed_log_msg(LOG_INFO,
 				    "Finished \"%s\" eid=%llu pid=%d "
 				    "time=%llu.%06us sig=%d/%s",
 				    node.name, node.eid, pid,
 				    (unsigned long long) usage.ru_utime.tv_sec,
 				    (unsigned int) usage.ru_utime.tv_usec,
 				    WTERMSIG(status),
 				    strsignal(WTERMSIG(status)));
 			} else {
 				/* pid must be passed to match the "pid=%d". */
 				zed_log_msg(LOG_INFO,
 				    "Finished \"%s\" eid=%llu pid=%d "
 				    "time=%llu.%06us status=0x%X",
 				    node.name, node.eid, pid,
 				    (unsigned long long) usage.ru_utime.tv_sec,
 				    (unsigned int) usage.ru_utime.tv_usec,
 				    (unsigned int) status);
 			}
 
 			free(node.name);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Stop the reaper thread and release the launched-process bookkeeping.
  * Does nothing if the reaper thread was never started.
  */
 void
 zed_exec_fini(void)
 {
 	struct launched_process_node *entry;
 	void *cookie = NULL;
 
 	if (_reap_children_tid == (pthread_t)-1)
 		return;
 
 	/* Raise the stop flag, then signal the thread and wait for it. */
 	_reap_children_stop = B_TRUE;
 	(void) pthread_kill(_reap_children_tid, SIGCHLD);
 	(void) pthread_join(_reap_children_tid, NULL);
 
 	/* Free the records of children that were never reaped. */
 	for (;;) {
 		entry = avl_destroy_nodes(&_launched_processes, &cookie);
 		if (entry == NULL)
 			break;
 		free(entry->name);
 		free(entry);
 	}
 	avl_destroy(&_launched_processes);
 
 	/* Leave the lock freshly initialised. */
 	(void) pthread_mutex_destroy(&_launched_processes_lock);
 	(void) pthread_mutex_init(&_launched_processes_lock, NULL);
 
 	_reap_children_tid = (pthread_t)-1;
 }
 
 /*
  * Process the event [eid] by invoking all zedlets with a matching class
  * prefix.  Each zedlet is forked as a child process and is not waited for
  * here; terminated children are collected by the reaper thread, which is
  * started on the first call.
  *
  * Each executable in [zcp->zedlets] from the directory [zcp->zedlet_dir]
  * is matched against the event's [class], [subclass], and the "all" class
  * (which matches all events).
  * Every zedlet with a matching class prefix is invoked.
  * The NAME=VALUE strings in [envs] will be passed to the zedlet as
  * environment variables.
  *
  * The file descriptor [zcp->zevent_fd] is the zevent_fd used to track the
  * current cursor location within the zevent nvlist.
  *
  * Return 0 on success, -1 on error.
  */
 int
 zed_exec_process(uint64_t eid, const char *class, const char *subclass,
     struct zed_conf *zcp, zed_strings_t *envs)
 {
 	const char *class_strings[4];
 	const char *allclass = "all";
 	const char **csp;
 	const char *z;
 	char **e;
 	int n;
 
 	if (!zcp->zedlet_dir || !zcp->zedlets || !envs || zcp->zevent_fd < 0)
 		return (-1);
 
 	/* First use: set the job limit and start the reaper thread. */
 	if (_reap_children_tid == (pthread_t)-1) {
 		_launched_processes_limit = zcp->max_jobs;
 
 		if (pthread_create(&_reap_children_tid, NULL,
 		    _reap_children, NULL) != 0)
 			return (-1);
 		pthread_setname_np(_reap_children_tid, "reap ZEDLETs");
 
 		avl_create(&_launched_processes, _launched_process_node_compare,
 		    sizeof (struct launched_process_node),
 		    offsetof(struct launched_process_node, node));
 	}
 
 	/* NULL-terminated list of the prefixes a zedlet name may match. */
 	csp = class_strings;
 
 	if (class)
 		*csp++ = class;
 
 	if (subclass)
 		*csp++ = subclass;
 
 	if (allclass)
 		*csp++ = allclass;
 
 	*csp = NULL;
 
 	/* Without an environment no zedlet can be started. */
 	e = _zed_exec_create_env(envs);
 	if (e == NULL)
 		return (-1);
 
 	for (z = zed_strings_first(zcp->zedlets); z;
 	    z = zed_strings_next(zcp->zedlets)) {
 		for (csp = class_strings; *csp; csp++) {
 			n = strlen(*csp);
 			/*
 			 * A prefix matches only when it is not followed by
 			 * a letter.  isalpha() requires a value that is
 			 * representable as unsigned char.
 			 */
 			if ((strncmp(z, *csp, n) == 0) &&
 			    !isalpha((unsigned char)z[n]))
 				_zed_exec_fork_child(eid, zcp->zedlet_dir,
 				    z, e, zcp->zevent_fd, zcp->do_foreground);
 		}
 	}
 	free(e);
 	return (0);
 }
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index 6491606d328b..cd080c8ee667 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -1,126 +1,135 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  *  Copyright (c) 2020 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_FM_FS_ZFS_H
 #define	_SYS_FM_FS_ZFS_H
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /* Class string for ZFS error events. */
 #define	ZFS_ERROR_CLASS				"fs.zfs"
 
 /*
  * Name strings of the individual ZFS ereports.
  * NOTE(review): presumably combined with ZFS_ERROR_CLASS to form the full
  * event class -- confirm at the use sites.
  */
 #define	FM_EREPORT_ZFS_CHECKSUM			"checksum"
 #define	FM_EREPORT_ZFS_AUTHENTICATION		"authentication"
 #define	FM_EREPORT_ZFS_IO			"io"
 #define	FM_EREPORT_ZFS_DATA			"data"
 #define	FM_EREPORT_ZFS_DELAY			"delay"
 #define	FM_EREPORT_ZFS_DEADMAN			"deadman"
 #define	FM_EREPORT_ZFS_POOL			"zpool"
 #define	FM_EREPORT_ZFS_DEVICE_UNKNOWN		"vdev.unknown"
 #define	FM_EREPORT_ZFS_DEVICE_OPEN_FAILED	"vdev.open_failed"
 #define	FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA	"vdev.corrupt_data"
 #define	FM_EREPORT_ZFS_DEVICE_NO_REPLICAS	"vdev.no_replicas"
 #define	FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM	"vdev.bad_guid_sum"
 #define	FM_EREPORT_ZFS_DEVICE_TOO_SMALL		"vdev.too_small"
 #define	FM_EREPORT_ZFS_DEVICE_BAD_LABEL		"vdev.bad_label"
 #define	FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT	"vdev.bad_ashift"
 #define	FM_EREPORT_ZFS_IO_FAILURE		"io_failure"
 #define	FM_EREPORT_ZFS_PROBE_FAILURE		"probe_failure"
 #define	FM_EREPORT_ZFS_LOG_REPLAY		"log_replay"
 #define	FM_EREPORT_ZFS_CONFIG_CACHE_WRITE	"config_cache_write"
 
 /*
  * Member name strings.
  * NOTE(review): going by the FM_EREPORT_PAYLOAD_ prefix these name the
  * members of an event payload -- confirm at the use sites.
  */
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL		"pool"
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE	"pool_failmode"
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL_GUID	"pool_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT	"pool_context"
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL_STATE	"pool_state"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID	"vdev_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE	"vdev_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH	"vdev_path"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH	"vdev_physpath"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH	"vdev_enc_sysfs_path"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID	"vdev_devid"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU		"vdev_fru"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE	"vdev_state"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE	"vdev_laststate"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT	"vdev_ashift"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS	"vdev_complete_ts"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS	"vdev_delta_ts"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS	"vdev_spare_paths"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS	"vdev_spare_guids"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS	"vdev_read_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS	"vdev_delays"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH	"parent_path"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID	"parent_devid"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET	"zio_objset"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT	"zio_object"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL	"zio_level"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID	"zio_blkid"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR		"zio_err"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET	"zio_offset"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE		"zio_size"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS	"zio_flags"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE	"zio_stage"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY	"zio_priority"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE	"zio_pipeline"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY	"zio_delay"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP	"zio_timestamp"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA	"zio_delta"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL	"cksum_actual"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO	"cksum_algorithm"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP	"cksum_byteswap"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS	"bad_range_sets"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS	"bad_range_clears"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS	"bad_set_bits"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS	"bad_cleared_bits"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
+#define	FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME	"snapshot_name"
+#define	FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME	"device_name"
+#define	FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME	"raw_name"
+#define	FM_EREPORT_PAYLOAD_ZFS_VOLUME	"volume"
 
 /*
  * Failure mode strings.
  * NOTE(review): presumably the values carried by the "pool_failmode"
  * member -- confirm at the use sites.
  */
 #define	FM_EREPORT_FAILMODE_WAIT		"wait"
 #define	FM_EREPORT_FAILMODE_CONTINUE		"continue"
 #define	FM_EREPORT_FAILMODE_PANIC		"panic"
 
 /* Name strings of resource events. */
 #define	FM_RESOURCE_REMOVED			"removed"
 #define	FM_RESOURCE_AUTOREPLACE			"autoreplace"
 #define	FM_RESOURCE_STATECHANGE			"statechange"
 
+#define	FM_RESOURCE_ZFS_SNAPSHOT_MOUNT		"snapshot_mount"
+#define	FM_RESOURCE_ZFS_SNAPSHOT_UNMOUNT		"snapshot_unmount"
+#define	FM_RESOURCE_ZVOL_CREATE_SYMLINK		"zvol_create"
+#define	FM_RESOURCE_ZVOL_REMOVE_SYMLINK		"zvol_remove"
+
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_FM_FS_ZFS_H */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index f811d6f5a743..2ae467877ddf 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1,1211 +1,1213 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, Klara Inc.
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/kstat.h>
 #include <sys/nvpair.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 #include <sys/spa_checksum.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Forward references that lots of things need.
  */
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct metaslab_group metaslab_group_t;
 typedef struct metaslab_class metaslab_class_t;
 typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 typedef struct zbookmark_phys zbookmark_phys_t;
 
 struct bpobj;
 struct bplist;
 struct dsl_pool;
 struct dsl_dataset;
 struct dsl_crypto_params;
 
 /*
  * Alignment Shift (ashift) is an immutable, internal top-level vdev property
  * which can only be set at vdev creation time. Physical writes are always done
  * according to it, which makes 2^ashift the smallest possible IO on a vdev.
  *
  * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB
  * (2^16 = 65,536).
  */
 #define	ASHIFT_MIN		9
 #define	ASHIFT_MAX		16
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
  * overhead, three DVAs per bp, plus one more bit in case we do anything
  * else that expands the ASIZE.
  */
 #define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 #define	SPA_COMPRESSBITS	7
 #define	SPA_VDEVBITS		24
 #define	SPA_COMPRESSMASK	((1U << SPA_COMPRESSBITS) - 1)
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
  */
 typedef struct dva {
 	uint64_t	dva_word[2];
 } dva_t;
 
 
 /*
  * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
  * secret and is suitable for use in MAC algorithms as the key.
  */
 typedef struct zio_cksum_salt {
 	uint8_t		zcs_bytes[32];
 } zio_cksum_salt_t;
 
 /*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			checksum[2]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			checksum[3]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * vdev		virtual device ID
  * offset	offset into virtual device
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
  * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
  * X		encryption
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg when dva[0] was written; zero if same as logical birth txg
  *              note that typically all the dva's would be written in this
  *              txg, but they could be different if they were moved by
  *              device removal.
  * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
 
 /*
  * The blkptr_t's of encrypted blocks also need to store the encryption
  * parameters so that the block can be decrypted. This layout is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|		vdev1		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|		vdev2		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|			salt					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|			IV1					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|		IV2		|	    fill count		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			MAC[0]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			MAC[1]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * salt		Salt for generating encryption keys
  * IV1		First 64 bits of encryption IV
  * X		Block requires encryption handling (set to 1)
  * E		blkptr_t contains embedded data (set to 0, see below)
  * fill count	number of non-zero blocks under this bp (truncated to 32 bits)
  * IV2		Last 32 bits of encryption IV
  * checksum[2]	128-bit checksum of the data this bp describes
  * MAC[2]	128-bit message authentication code for this data
  *
  * The X bit being set indicates that this block is one of 3 types. If this is
  * a level 0 block with an encrypted object type, the block is encrypted
  * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted
  * object type, this block is authenticated with an HMAC (see
  * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC
  * words to store a checksum-of-MACs from the level below (see
  * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED()
  * refers to both encrypted and authenticated blocks and BP_USES_CRYPT()
  * refers to any of these 3 kinds of blocks.
  *
  * The additional encryption parameters are the salt, IV, and MAC which are
  * explained in greater detail in the block comment at the top of zio_crypt.c.
  * The MAC occupies half of the checksum space since it serves a very similar
  * purpose: to prevent data corruption on disk. The only functional difference
  * is that the checksum is used to detect on-disk corruption whether or not the
  * encryption key is loaded and the MAC provides additional protection against
  * malicious disk tampering. We use the 3rd DVA to store the salt and first
  * 64 bits of the IV. As a result encrypted blocks can only have 2 copies
  * maximum instead of the normal 3. The last 32 bits of the IV are stored in
  * the upper bits of what is usually the fill count. Note that only blocks at
  * level 0 or -2 are ever encrypted, which allows us to guarantee that these
  * 32 bits are not trampled over by other code (see zio_crypt.c for details).
  * The salt and IV are not used for authenticated bps or bps with an indirect
  * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits
  * for the fill count.
  */
 
 /*
  * "Embedded" blkptr_t's don't actually point to a block, instead they
  * have a data payload embedded in the blkptr_t itself.  See the comment
  * in blkptr.c for more details.
  *
  * The blkptr_t is laid out as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|      payload                                                  |
  * 1	|      payload                                                  |
  * 2	|      payload                                                  |
  * 3	|      payload                                                  |
  * 4	|      payload                                                  |
  * 5	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|      payload                                                  |
  * 8	|      payload                                                  |
  * 9	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|      payload                                                  |
  * c	|      payload                                                  |
  * d	|      payload                                                  |
  * e	|      payload                                                  |
  * f	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * payload		contains the embedded data
  * B (byteorder)	byteorder (endianness)
  * D (dedup)		padding (set to zero)
  * X			encryption (set to zero)
  * E (embedded)		set to one
  * lvl			indirection level
  * type			DMU object type
  * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp			compression function of payload
  * PSIZE		size of payload after compression, in bytes
  * LSIZE		logical size of payload, in bytes
  *			note that 25 bits is enough to store the largest
  *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
  * log. birth		transaction group in which the block was logically born
  *
  * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
  * bp's they are stored in units of SPA_MINBLOCKSHIFT.
  * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
  * The B, D, X, lvl, type, and comp fields are stored the same as with normal
  * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
  * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
  * other macros, as they assert that they are only used on BP's of the correct
  * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use
  * the payload space for encryption parameters (see the comment above on
  * how encryption parameters are stored).
  */
 
 #define	BPE_GET_ETYPE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BPE_SET_ETYPE(bp, t)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, t); \
 } while (0)
 
 #define	BPE_GET_LSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
 #define	BPE_SET_LSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
 } while (0)
 
 #define	BPE_GET_PSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
 #define	BPE_SET_PSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
 } while (0)
 
 typedef enum bp_embedded_type {
 	BP_EMBEDDED_TYPE_DATA,
 	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */
 	BP_EMBEDDED_TYPE_REDACTED,
 	NUM_BP_EMBEDDED_TYPES
 } bp_embedded_type_t;
 
 #define	BPE_NUM_WORDS 14
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
 	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 #define	SPA_SYNC_MIN_VDEVS 3		/* min vdevs to update during sync */
 
 /*
  * A block is a hole when it has either 1) never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the
  * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
  * done through the BP_IS_HOLE macro. For holes, the logical size, level,
  * DMU object type, and birth times are all also stored for holes that
  * were written to at some point (i.e. were punched after having been filled).
  */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
 	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
 	uint64_t	blk_birth;	/* transaction group at birth	    */
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
 
 /*
  * Macros to get and set fields in a bp or DVA.
  */
 
 /*
  * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
  * this gang DVA including its children BP's.  The space allocated at this
  * DVA's vdev/offset is vdev_gang_header_asize(vdev).
  */
 #define	DVA_GET_ASIZE(dva)	\
 	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
 
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
 #define	DVA_SET_VDEV(dva, x)	\
 	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
 
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_OFFSET(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ?	\
 	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
 	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_LSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_PSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 } while (0)
 
 #define	BP_GET_COMPRESS(bp)		\
 	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
 #define	BP_SET_COMPRESS(bp, x)		\
 	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
 #define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
 #define	BP_GET_CHECKSUM(bp)		\
 	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BP_SET_CHECKSUM(bp, x)		do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, x); \
 } while (0)
 
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
 /* encrypted, authenticated, and MAC cksum bps use the same bit */
 #define	BP_USES_CRYPT(bp)		BF64_GET((bp)->blk_prop, 61, 1)
 #define	BP_SET_CRYPT(bp, x)		BF64_SET((bp)->blk_prop, 61, 1, x)
 
 #define	BP_IS_ENCRYPTED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_IS_AUTHENTICATED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	!DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_HAS_INDIRECT_MAC_CKSUM(bp)		\
 	(BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0)
 
 #define	BP_IS_PROTECTED(bp)			\
 	(BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp))
 
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
 #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
 #define	BP_PHYSICAL_BIRTH(bp)		\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
 #define	BP_GET_FILL(bp)				\
 	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
 	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
 
 #define	BP_SET_FILL(bp, fill)			\
 {						\
 	if (BP_IS_ENCRYPTED(bp))			\
 		BF64_SET((bp)->blk_fill, 0, 32, fill); \
 	else					\
 		(bp)->blk_fill = fill;		\
 }
 
 #define	BP_GET_IV2(bp)				\
 	(ASSERT(BP_IS_ENCRYPTED(bp)),		\
 	BF64_GET((bp)->blk_fill, 32, 32))
 #define	BP_SET_IV2(bp, iv2)			\
 {						\
 	ASSERT(BP_IS_ENCRYPTED(bp));		\
 	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
 }
 
 #define	BP_IS_METADATA(bp)	\
 	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 #define	BP_GET_UCSIZE(bp)	\
 	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 #define	BP_COUNT_GANG(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
 	(DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
 	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
 	(bp1)->blk_birth == (bp2)->blk_birth &&			\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
 
 
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		\
 	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
 #define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
 				(dva)->dva_word[1] == 0ULL)
 #define	BP_IS_HOLE(bp) \
 	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 #define	BP_SET_REDACTED(bp) \
 {							\
 	BP_SET_EMBEDDED(bp, B_TRUE);			\
 	BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED);	\
 }
 #define	BP_IS_REDACTED(bp) \
 	(BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED)
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
 	(bp)->blk_dva[1].dva_word[0] = 0;	\
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
 	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
 #ifdef _ZFS_BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #else
 #define	ZFS_HOST_BYTEORDER	(1ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
 #define	BP_SPRINTF_LEN	400
 
 /*
  * This macro allows code sharing between zfs, libzpool, and mdb.
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  */
 
 #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
 	int len = 0;							\
 	int copies = 0;							\
 	const char *crypt_type;						\
 	if (bp != NULL) {						\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			crypt_type = "encrypted";			\
 			/* LINTED E_SUSPICIOUS_COMPARISON */		\
 		} else if (BP_IS_AUTHENTICATED(bp)) {			\
 			crypt_type = "authenticated";			\
 		} else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {		\
 			crypt_type = "indirect-MAC";			\
 		} else {						\
 			crypt_type = "unencrypted";			\
 		}							\
 	}								\
 	if (bp == NULL) {						\
 		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
 		len += func(buf + len, size - len,			\
 		    "HOLE [L%llu %s] "					\
 		    "size=%llxL birth=%lluL",				\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len = func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
 		    "size=%llxL/%llxP birth=%lluL",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (int)BPE_GET_ETYPE(bp),				\
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_REDACTED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "REDACTED [L%llu %s] size=%llxL birth=%lluL",	\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
 			if (DVA_IS_VALID(dva))				\
 				copies++;				\
 			len += func(buf + len, size - len,		\
 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
 			    ws);					\
 		}							\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			len += func(buf + len, size - len,		\
 			    "salt=%llx iv=%llx:%llx%c",			\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[0],	\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[1],	\
 			    (u_longlong_t)BP_GET_IV2(bp),		\
 			    ws);					\
 		}							\
 		if (BP_IS_GANG(bp) &&					\
 		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
 		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
 			copies--;					\
 		len += func(buf + len, size - len,			\
 		    "[L%llu %s] %s %s %s %s %s %s %s%c"			\
 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
 		    "cksum=%llx:%llx:%llx:%llx",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    checksum,						\
 		    compress,						\
 		    crypt_type,						\
 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
 		    copyname[copies],					\
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
 	}								\
 	ASSERT(len < size);						\
 }
 
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING,
 	SPA_IMPORT_ASSEMBLE
 } spa_import_type_t;
 
 typedef enum spa_mode {
 	SPA_MODE_UNINIT = 0,
 	SPA_MODE_READ = 1,
 	SPA_MODE_WRITE = 2,
 } spa_mode_t;
 
 /*
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
  * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON,
 #ifdef IN_FREEBSD_BASE
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
 #else
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
 #endif
 } spa_autotrim_t;
 
 /*
  * Reason TRIM command was issued, used internally for accounting purposes.
  */
 typedef enum trim_type {
 	TRIM_TYPE_MANUAL = 0,
 	TRIM_TYPE_AUTO = 1,
 	TRIM_TYPE_SIMPLE = 2
 } trim_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, struct dsl_crypto_params *dcp);
 extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(const char *pool);
 extern int spa_checkpoint(const char *pool);
 extern int spa_checkpoint_discard(const char *pool);
 extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(const char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern int spa_async_tasks(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 #define	SPA_ASYNC_CONFIG_UPDATE			0x01
 #define	SPA_ASYNC_REMOVE			0x02
 #define	SPA_ASYNC_PROBE				0x04
 #define	SPA_ASYNC_RESILVER_DONE			0x08
 #define	SPA_ASYNC_RESILVER			0x10
 #define	SPA_ASYNC_AUTOEXPAND			0x20
 #define	SPA_ASYNC_REMOVE_DONE			0x40
 #define	SPA_ASYNC_REMOVE_STOP			0x80
 #define	SPA_ASYNC_INITIALIZE_RESTART		0x100
 #define	SPA_ASYNC_TRIM_RESTART			0x200
 #define	SPA_ASYNC_AUTOTRIM_RESTART		0x400
 #define	SPA_ASYNC_L2CACHE_REBUILD		0x800
 #define	SPA_ASYNC_L2CACHE_TRIM			0x1000
 #define	SPA_ASYNC_REBUILD_DONE			0x2000
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist);
 extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 extern int zfs_sync_pass_deferred_free;
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
 /*
  * SPA configuration functions in spa_config.c
  */
 
 #define	SPA_CONFIG_UPDATE_POOL	0
 #define	SPA_CONFIG_UPDATE_VDEVS	1
 
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern nvlist_t *spa_all_configs(uint64_t *);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
 extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
     vdev_t *parent, uint_t id, int atype);
 
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 /*
  * SCL_* lock flags.  Each of SCL_CONFIG through SCL_VDEV is a distinct bit,
  * so they can be OR-ed together (as SCL_STATE_ALL does).  SCL_LOCKS is the
  * number of flag bits defined, which makes SCL_ALL the mask with all seven
  * bits set (0x7f).
  * NOTE(review): presumably these are the values passed as the 'locks'
  * argument of the spa_config_*() functions declared below -- confirm.
  */
 #define	SCL_NONE	0x00
 #define	SCL_CONFIG	0x01
 #define	SCL_STATE	0x02
 #define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
 #define	SCL_ALLOC	0x08
 #define	SCL_ZIO		0x10
 #define	SCL_FREE	0x20
 #define	SCL_VDEV	0x40
 #define	SCL_LOCKS	7
 #define	SCL_ALL		((1 << SCL_LOCKS) - 1)
 #define	SCL_STATE_ALL	(SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
 /* Historical pool statistics */
 typedef struct spa_history_kstat {
 	kmutex_t		lock;
 	uint64_t		count;
 	uint64_t		size;
 	kstat_t			*kstat;
 	void			*priv;
 	list_t			list;
 } spa_history_kstat_t;
 
 typedef struct spa_history_list {
 	uint64_t		size;
 	procfs_list_t		procfs_list;
 } spa_history_list_t;
 
 typedef struct spa_stats {
 	spa_history_list_t	read_history;
 	spa_history_list_t	txg_history;
 	spa_history_kstat_t	tx_assign_histogram;
 	spa_history_list_t	mmp_history;
 	spa_history_kstat_t	state;		/* pool state */
 	spa_history_kstat_t	iostats;
 } spa_stats_t;
 
 /*
  * txg states, numbered in ascending order from TXG_STATE_BIRTH (0) to
  * TXG_STATE_COMMITTED (5).  spa_txg_history_set() takes one of these as
  * its 'completed_state' argument.
  */
 typedef enum txg_state {
 	TXG_STATE_BIRTH		= 0,
 	TXG_STATE_OPEN		= 1,
 	TXG_STATE_QUIESCED	= 2,
 	TXG_STATE_WAIT_FOR_SYNC	= 3,
 	TXG_STATE_SYNCED	= 4,
 	TXG_STATE_COMMITTED	= 5,
 } txg_state_t;
 
 typedef struct txg_stat {
 	vdev_stat_t		vs1;
 	vdev_stat_t		vs2;
 	uint64_t		txg;
 	uint64_t		ndirty;
 } txg_stat_t;
 
 /*
  * Assorted pool IO kstats.
  *
  * The counters come in three groups (trim_*, autotrim_*, simple_trim_*),
  * each holding extents/bytes pairs for the written, skipped and failed
  * cases.  spa_iostats_trim_add() takes the same six quantities plus a
  * trim_type_t.
  * NOTE(review): presumably the trim_type_t selects which group is
  * updated -- confirm against the implementation.
  */
 typedef struct spa_iostats {
 	kstat_named_t	trim_extents_written;
 	kstat_named_t	trim_bytes_written;
 	kstat_named_t	trim_extents_skipped;
 	kstat_named_t	trim_bytes_skipped;
 	kstat_named_t	trim_extents_failed;
 	kstat_named_t	trim_bytes_failed;
 	kstat_named_t	autotrim_extents_written;
 	kstat_named_t	autotrim_bytes_written;
 	kstat_named_t	autotrim_extents_skipped;
 	kstat_named_t	autotrim_bytes_skipped;
 	kstat_named_t	autotrim_extents_failed;
 	kstat_named_t	autotrim_bytes_failed;
 	kstat_named_t	simple_trim_extents_written;
 	kstat_named_t	simple_trim_bytes_written;
 	kstat_named_t	simple_trim_extents_skipped;
 	kstat_named_t	simple_trim_bytes_skipped;
 	kstat_named_t	simple_trim_extents_failed;
 	kstat_named_t	simple_trim_bytes_failed;
 } spa_iostats_t;
 
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
     uint32_t aflags);
 extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time);
 extern int spa_txg_history_set(spa_t *spa,  uint64_t txg,
     txg_state_t completed_state, hrtime_t completed_time);
 extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
     struct dsl_pool *);
 extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
 extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
 extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
 extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
     hrtime_t duration);
 extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
 extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_written, uint64_t bytes_written,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed);
 extern void spa_import_progress_add(spa_t *spa);
 extern void spa_import_progress_remove(uint64_t spa_guid);
 extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining);
 extern int spa_import_progress_set_max_txg(uint64_t pool_guid,
     uint64_t max_txg);
 extern int spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t spa_load_state);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw);
 extern void spa_config_exit(spa_t *spa, int locks, const void *tag);
 extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
 extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Log state */
 typedef enum spa_log_state {
 	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
 	SPA_LOG_MISSING,	/* missing log(s) */
 	SPA_LOG_CLEAR,		/* clear the log(s) */
 	SPA_LOG_GOOD,		/* log(s) are good */
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
 extern int spa_reset_logs(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 extern void spa_deadman(void *);
 
 /*
  * Accessor functions.
  *
  * Each of these takes the pool's spa_t as its first argument.  Every
  * function is declared exactly once here; spa_version() used to appear
  * twice in this list.
  */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
 extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
 extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
     dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
 
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern uint64_t spa_get_failmode(spa_t *spa);
 extern uint64_t spa_get_deadman_failmode(spa_t *spa);
 extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern space_map_t *spa_syncing_log_sm(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
 extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_load_failed(spa_t *spa, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void spa_load_note(spa_t *spa, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern int spa_maxdnodesize(spa_t *spa);
 extern boolean_t spa_has_checkpoint(spa_t *spa);
 extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
 extern boolean_t spa_suspend_async_destroy(spa_t *spa);
 extern uint64_t spa_min_claim_txg(spa_t *spa);
 extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
     const blkptr_t *bp);
 typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg);
 extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
     spa_remap_cb_t callback, void *arg);
 extern uint64_t spa_get_last_removal_txg(spa_t *spa);
 extern boolean_t spa_trust_config(spa_t *spa);
 extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern uint64_t spa_total_metaslabs(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
 extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 extern boolean_t spa_livelist_delete_check(spa_t *spa);
 
 extern spa_mode_t spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf);
 extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
 extern void spa_history_log_version(spa_t *spa, const char *operation,
     dmu_tx_t *tx);
 extern void spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
     dmu_tx_t *tx, const char *fmt, ...)  __printflike(4, 5);
 extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 
 extern const char *spa_state_to_name(spa_t *spa);
 
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
 extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
 extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
     zio_t *zio);
 extern void zfs_ereport_taskq_fini(void);
 extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
     const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
 extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
 extern void vdev_cache_stat_fini(void);
 
 /* vdev mirror */
 extern void vdev_mirror_stat_init(void);
 extern void vdev_mirror_stat_fini(void);
 
 /* Initialization and termination */
 extern void spa_init(spa_mode_t mode);
 extern void spa_fini(void);
 extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
     const char *name);
+extern void zfs_ereport_zvol_post(const char *subclass, const char *name,
+    const char *device_name, const char *raw_name);
 
 /* waiting for pool activities to complete */
 extern int spa_wait(const char *pool, zpool_wait_activity_t activity,
     boolean_t *waited);
 extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
     uint64_t tag, boolean_t *waited);
 extern void spa_notify_waiters(spa_t *spa);
 extern void spa_wake_waiters(spa_t *spa);
 
 /* module param call functions */
 int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
 int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * dprintf_bp(bp, fmt, ...): emit a debug message followed by a textual
  * rendering of the block pointer 'bp'.
  *
  * With ZFS_DEBUG defined, and only when zfs_flags has ZFS_DEBUG_DPRINTF
  * set, the macro allocates a BP_SPRINTF_LEN-byte buffer, formats 'bp'
  * into it with snprintf_blkptr(), hands it to dprintf() after the
  * caller's arguments, and frees the buffer.
  *
  * 'fmt' must be a string literal (it is concatenated with " %s\n"), and
  * at least one argument must follow it, because __VA_ARGS__ is followed
  * by a comma in the expansion.
  *
  * Without ZFS_DEBUG the macro expands to nothing, so none of its
  * arguments are evaluated.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
 } while (0)
 #else
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
 extern spa_mode_t spa_mode_global;
 extern int zfs_deadman_enabled;
 extern unsigned long zfs_deadman_synctime_ms;
 extern unsigned long zfs_deadman_ziotime_ms;
 extern unsigned long zfs_deadman_checktime_ms;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_SPA_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 2d34481f6be6..5b606eaf8d50 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,690 +1,692 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019-2020, Michael Niewöhner
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
 #include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Embedded checksum
  */
 #define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
 typedef struct zio_eck {
 	uint64_t	zec_magic;	/* for validation, endianness	*/
 	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
 } zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
  * of block pointers.
  */
 /*
  * SPA_GBH_NBLKPTRS is how many blkptr_t fit in a gang block of
  * SPA_GANGBLOCKSIZE bytes once room for the trailing zio_eck_t has been
  * set aside.  SPA_GBH_FILLER is whatever space is then left over, counted
  * in uint64_t words.
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 /*
  * Gang block header layout: the block pointer array, then the filler
  * words, then the embedded checksum as the last member.
  */
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
 	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 /*
  * Checksum function identifiers, numbered consecutively from
  * ZIO_CHECKSUM_INHERIT (0).  ZIO_CHECKSUM_FUNCTIONS is the number of
  * identifiers; it is one smaller on FreeBSD, where ZIO_CHECKSUM_EDONR
  * is compiled out.
  */
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
 	ZIO_CHECKSUM_ON,
 	ZIO_CHECKSUM_OFF,
 	ZIO_CHECKSUM_LABEL,
 	ZIO_CHECKSUM_GANG_HEADER,
 	ZIO_CHECKSUM_ZILOG,
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_NOPARITY,
 	ZIO_CHECKSUM_SHA512,
 	ZIO_CHECKSUM_SKEIN,
 #if !defined(__FreeBSD__)
 	ZIO_CHECKSUM_EDONR,
 #endif
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
 /*
  * The number of "legacy" checksum functions which can be set on individual
  * objects.
  */
 #define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
 
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
 #define	ZIO_CHECKSUM_MASK	0xffULL
 #define	ZIO_CHECKSUM_VERIFY	(1 << 8)
 
 #define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
 
 /* macros defining encryption lengths */
 #define	ZIO_OBJSET_MAC_LEN		32
 #define	ZIO_DATA_IV_LEN			12
 #define	ZIO_DATA_SALT_LEN		8
 #define	ZIO_DATA_MAC_LEN		16
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
 /*
  * The meaning of "compress = on" selected by the compression features enabled
  * on a given pool.
  */
 #define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
 
 #define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_OFF
 
 /*
  * Evaluates to non-zero when 'compress' equals one of the compression
  * values listed below.  The argument is expanded once per comparison, so
  * pass an expression without side effects.
  */
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_1 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_2 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_3 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_5 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_6 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_7 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_8 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_9 ||		\
 	(compress) == ZIO_COMPRESS_ZLE ||		\
 	(compress) == ZIO_COMPRESS_ZSTD ||		\
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 
 /*
  * Helpers for a compression setting that packs an algorithm and a level
  * into one integer: the algorithm occupies the bits covered by
  * SPA_COMPRESSMASK and the level sits above them, shifted left by
  * SPA_COMPRESSBITS.
  *
  *	ZIO_COMPRESS_ALGO(x)		extract the algorithm bits of x
  *	ZIO_COMPRESS_LEVEL(x)		extract the level bits of x
  *	ZIO_COMPRESS_RAW(type, level)	combine an algorithm and a level
  *
  * Every argument is parenthesized so that a compound expression such as
  * "a | b" or "c ? x : y" is treated as a single operand.
  */
 #define	ZIO_COMPRESS_ALGO(x)	((x) & SPA_COMPRESSMASK)
 #define	ZIO_COMPRESS_LEVEL(x)	\
 	(((x) & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS)
 #define	ZIO_COMPRESS_RAW(type, level)	\
 	((type) | ((level) << SPA_COMPRESSBITS))
 
 #define	ZIO_COMPLEVEL_ZSTD(level)	\
 	ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
 typedef enum zio_suspend_reason {
 	ZIO_SUSPEND_NONE = 0,
 	ZIO_SUSPEND_IOERR,
 	ZIO_SUSPEND_MMP,
 } zio_suspend_reason_t;
 
 /*
  * Flag bits for a zio, grouped by which kinds of child zio inherit them.
  *
  * Each *_INHERIT macro is defined as "the first flag of the following
  * group, minus one".  Since every flag is a single bit, that yields a mask
  * of all lower-numbered flags, which is why the first flag of each group
  * is marked "must be first for INHERIT".
  *
  * NOTE(review): ZIO_FLAG_FASTWRITE is written as 1 << 31, which shifts a
  * signed int into its sign bit -- confirm that the supported compilers
  * treat this as intended.
  */
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
 	 * and that must be equal for two zios to aggregate
 	 */
 	ZIO_FLAG_DONT_AGGREGATE	= 1 << 0,
 	ZIO_FLAG_IO_REPAIR	= 1 << 1,
 	ZIO_FLAG_SELF_HEAL	= 1 << 2,
 	ZIO_FLAG_RESILVER	= 1 << 3,
 	ZIO_FLAG_SCRUB		= 1 << 4,
 	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
 	ZIO_FLAG_PHYSICAL	= 1 << 6,
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
 	ZIO_FLAG_CANFAIL	= 1 << 7,	/* must be first for INHERIT */
 	ZIO_FLAG_SPECULATIVE	= 1 << 8,
 	ZIO_FLAG_CONFIG_WRITER	= 1 << 9,
 	ZIO_FLAG_DONT_RETRY	= 1 << 10,
 	ZIO_FLAG_DONT_CACHE	= 1 << 11,
 	ZIO_FLAG_NODATA		= 1 << 12,
 	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
 	ZIO_FLAG_IO_ALLOCATING  = 1 << 14,
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 
 	/*
 	 * Flags inherited by vdev children.
 	 */
 	ZIO_FLAG_IO_RETRY	= 1 << 15,	/* must be first for INHERIT */
 	ZIO_FLAG_PROBE		= 1 << 16,
 	ZIO_FLAG_TRYHARD	= 1 << 17,
 	ZIO_FLAG_OPTIONAL	= 1 << 18,
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
 	ZIO_FLAG_DONT_QUEUE	= 1 << 19,	/* must be first for INHERIT */
 	ZIO_FLAG_DONT_PROPAGATE	= 1 << 20,
 	ZIO_FLAG_IO_BYPASS	= 1 << 21,
 	ZIO_FLAG_IO_REWRITE	= 1 << 22,
 	ZIO_FLAG_RAW_COMPRESS	= 1 << 23,
 	ZIO_FLAG_RAW_ENCRYPT	= 1 << 24,
 	ZIO_FLAG_GANG_CHILD	= 1 << 25,
 	ZIO_FLAG_DDT_CHILD	= 1 << 26,
 	ZIO_FLAG_GODFATHER	= 1 << 27,
 	ZIO_FLAG_NOPWRITE	= 1 << 28,
 	ZIO_FLAG_REEXECUTED	= 1 << 29,
 	ZIO_FLAG_DELEGATED	= 1 << 30,
 	ZIO_FLAG_FASTWRITE	= 1 << 31,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
 /*
  * Compute the flags for a child of 'zio'.  Each macro keeps only the bits
  * of the parent's io_flags selected by the matching *_INHERIT mask and
  * then ORs in ZIO_FLAG_CANFAIL plus one more flag: ZIO_FLAG_DDT_CHILD,
  * ZIO_FLAG_GANG_CHILD, or (for vdev children) ZIO_FLAG_DONT_PROPAGATE.
  */
 #define	ZIO_DDT_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
 	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_VDEV_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_CHILD_BIT(x)		(1 << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1 << (x)))
 
 /*
  * Kinds of child zio.  The values serve as bit positions for
  * ZIO_CHILD_BIT() and as array indices: ZIO_CHILD_TYPES is the number of
  * kinds and sizes the per-kind arrays in struct zio (io_child_error and
  * the first dimension of io_children).
  */
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
 	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
 
 #define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
 #define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
 #define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
 #define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
 #define	ZIO_CHILD_ALL_BITS					\
 	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT |		\
 	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
 
 /*
  * Wait types.  ZIO_WAIT_TYPES is the number of types and sizes
  * io_state[] and the second dimension of io_children[][] in struct zio.
  */
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
 	ZIO_WAIT_TYPES
 };
 
 typedef void zio_done_func_t(zio_t *zio);
 
 extern int zio_exclude_metadata;
 extern int zio_dva_throttle_enabled;
 extern const char *zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
  * is objset 0, and the meta-dnode is object 0.  This covers all blocks
  * except root blocks and ZIL blocks, which are defined as follows:
  *
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel, and is
  * stored on disk (by virtue of being incorporated into other on-disk
  * structures, e.g. dsl_scan_phys_t).
  */
 struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 };
 
 /*
  * Store objset, object, level and blkid into the bookmark that 'zb' points
  * to.  Expands to a brace-enclosed block of four assignments; 'zb' is
  * expanded once per assignment.
  */
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
 {							\
 	(zb)->zb_objset = (objset);			\
 	(zb)->zb_object = (object);			\
 	(zb)->zb_level = (level);			\
 	(zb)->zb_blkid = (blkid);			\
 }
 
 #define	ZB_DESTROYED_OBJSET	(-1ULL)
 
 #define	ZB_ROOT_OBJECT		(0ULL)
 #define	ZB_ROOT_LEVEL		(-1LL)
 #define	ZB_ROOT_BLKID		(0ULL)
 
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
 #define	ZB_DNODE_LEVEL		(-3LL)
 #define	ZB_DNODE_BLKID		(0ULL)
 
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
 #define	ZB_IS_ROOT(zb)				\
 	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
 	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
 	(zb)->zb_blkid == ZB_ROOT_BLKID)
 
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	uint8_t			zp_complevel;
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
 	boolean_t		zp_nopwrite;
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
 	uint32_t		zp_zpl_smallblk;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
 
 typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
     const abd_t *good_data);
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
 struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
 	uint64_t		zcr_sector;
 	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
 
 	/* internal use only */
 	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
 };
 
 typedef struct zio_vsd_ops {
 	zio_done_func_t		*vsd_free;
 } zio_vsd_ops_t;
 
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
 	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
     zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
 typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
 	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
 typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
  * be able to propagate them to the parent.  The normal io_flags are local
  * to the zio, not protected by any lock, and not modifiable by children;
  * the reexecute flags are protected by io_lock, modifiable by children,
  * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
 /*
  * The io_trim flags are used to specify the type of TRIM to perform.  They
  * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
  */
 enum trim_flag {
 	ZIO_TRIM_SECURE		= 1 << 0,
 };
 
 typedef struct zio_alloc_list {
 	list_t  zal_list;
 	uint64_t zal_size;
 } zio_alloc_list_t;
 
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
 	list_node_t	zl_parent_node;
 	list_node_t	zl_child_node;
 } zio_link_t;
 
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
 	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
 	zio_done_func_t	*io_physdone;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 	/* io_lsize != io_orig_size iff this is a raw write */
 	uint64_t	io_lsize;
 
 	/* Data represented by this I/O */
 	struct abd	*io_abd;
 	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
 	void		*io_vsd;
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
 	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
 	avl_node_t	io_alloc_node;
 	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;
 	enum zio_stage	io_stage;
 	enum zio_stage	io_pipeline;
 	enum zio_flag	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
 	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	io_child_count;
 	uint64_t	io_phys_children;
 	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
 	void		*io_bio;
 	kmutex_t	io_lock;
 	kcondvar_t	io_cv;
 	int		io_allocator;
 
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
 
 enum blk_verify_flag {
 	BLK_VERIFY_ONLY,
 	BLK_VERIFY_LOG,
 	BLK_VERIFY_HALT
 };
 
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *physdone, zio_done_func_t *done,
     void *priv, zio_priority_t priority, enum zio_flag flags,
     const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
     boolean_t nopwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     enum zio_flag flags, enum trim_flag trim_flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(void *zio);
 extern void zio_interrupt(void *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 extern void zio_deadman(zio_t *zio, char *tag);
 
 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
 extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
 extern enum zio_compress zio_compress_select(spa_t *spa,
     enum zio_compress child, enum zio_compress parent);
 extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress,
     uint8_t child, uint8_t parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
 extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     boolean_t config_held, enum blk_verify_flag blk_verify);
 
 /*
  * Initial setup and teardown.
  */
 extern void zio_init(void);
 extern void zio_fini(void);
 
 /*
  * Fault injection
  */
 struct zinject_record;
 extern uint32_t zio_injection_enabled;
 extern int zio_inject_fault(char *name, int flags, int *id,
     struct zinject_record *record);
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
 extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
 extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
     uint64_t type, int error);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
     int err2);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
  */
 extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, struct zio_bad_cksum *info);
 extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
     const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical);
 
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
 extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, const abd_t *good_data, const abd_t *bad_data,
     struct zio_bad_cksum *info);
 
 void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr);
+extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa,
+    const char *name);
 
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
 boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZIO_H */
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 60e631567a89..007f31b4e7b3 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1,1458 +1,1510 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012,2021 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 
 #include <sys/fm/fs/zfs.h>
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/sysevent.h>
 
 /*
  * This general routine is responsible for generating all the different ZFS
  * ereports.  The payload is dependent on the class, and which arguments are
  * supplied to the function:
  *
  * 	EREPORT			POOL	VDEV	IO
  * 	block			X	X	X
  * 	data			X		X
  * 	device			X	X
  * 	pool			X
  *
  * If we are in a loading state, all errors are chained together by the same
  * SPA-wide ENA (Error Numeric Association).
  *
  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
  * to chain together all ereports associated with a logical piece of data.  For
  * read I/Os, there  are basically three 'types' of I/O, which form a roughly
  * layered diagram:
  *
  *      +---------------+
  * 	| Aggregate I/O |	No associated logical data or device
  * 	+---------------+
  *              |
  *              V
  * 	+---------------+	Reads associated with a piece of logical data.
  * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
  * 	+---------------+       mirrors, gang blocks, retries, etc.
  *              |
  *              V
  * 	+---------------+	Reads associated with a particular device, but
  * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
  * 	+---------------+	and I/O aggregation.
  *
  * Note that 'physical I/O' here is not the same terminology as used in the rest
  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
  * blockpointer.  But I/O with no associated block pointer can still be related
  * to a logical piece of data (i.e. RAID-Z requests).
  *
  * Purely physical I/O always have unique ENAs.  They are not related to a
  * particular piece of logical data, and therefore cannot be chained together.
  * We still generate an ereport, but the DE doesn't correlate it with any
  * logical piece of data.  When such an I/O fails, the delegated I/O requests
  * will issue a retry, which will trigger the 'real' ereport with the correct
  * ENA.
  *
  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
  * then inherit this pointer, so that when it is first set subsequent failures
  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
  * this pointer is set to NULL, and no ereport will be generated (since it
  * doesn't actually correspond to any particular device or piece of data,
  * and the caller will always retry without caching or queueing anyway).
  *
  * For checksum errors, we want to include more information about the actual
  * error which occurs.  Accordingly, we build an ereport when the error is
  * noticed, but instead of sending it in immediately, we hang it off of the
  * io_cksum_report field of the logical IO.  When the logical IO completes
  * (successfully or not), zfs_ereport_finish_checksum() is called with the
  * good and bad versions of the buffer (if available), and we annotate the
  * ereport with information about the differences.
  */
 
 #ifdef _KERNEL
 /*
  * Duplicate ereport Detection
  *
  * Some ereports are retained momentarily for detecting duplicates.  These
  * are kept in a recent_events_node_t in both a time-ordered list and an AVL
  * tree of recent unique ereports.
  *
  * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
  * task is used to purge stale entries.
  */
 /* Recent ereports; new or refreshed nodes go on the head, oldest at tail */
 static list_t recent_events_list;
 /* The same nodes, searched with avl_find() to spot duplicates */
 static avl_tree_t recent_events_tree;
 /* Held by the routines in this file while touching the list/tree/tqid */
 static kmutex_t recent_events_lock;
 /* Id of the pending delayed cleaner dispatch; 0 when none is scheduled */
 static taskqid_t recent_events_cleaner_tqid;
 
 /*
  * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
  *
  * This setting can be changed dynamically and setting it to zero
  * disables duplicate detection.
  */
 unsigned int zfs_zevent_retain_max = 2000;
 
 /*
  * The lifespan for a recent ereport entry. The default of 15 minutes is
  * intended to outlive the zfs diagnosis engine's threshold of 10 errors
  * over a period of 10 minutes.
  */
 unsigned int zfs_zevent_retain_expire_secs = 900;
 
 /*
  * Compact tag for the ereport subclasses that take part in duplicate
  * detection; zfs_ereport_is_duplicate() derives it from the subclass string.
  */
 typedef enum zfs_subclass {
 	ZSC_IO,		/* FM_EREPORT_ZFS_IO */
 	ZSC_DATA,	/* FM_EREPORT_ZFS_DATA */
 	ZSC_CHECKSUM	/* FM_EREPORT_ZFS_CHECKSUM */
 } zfs_subclass_t;
 
 /*
  * One remembered ereport.  The "criteria" members are the fields examined by
  * recent_events_compare(); the "internal state" members carry the tree/list
  * linkage and the time the node was last recorded.
  */
 typedef struct {
 	/* common criteria */
 	uint64_t	re_pool_guid;
 	uint64_t	re_vdev_guid;
 	int		re_io_error;
 	uint64_t	re_io_size;
 	uint64_t	re_io_offset;
 	zfs_subclass_t	re_subclass;
 	zio_priority_t	re_io_priority;
 
 	/* logical zio criteria (optional) */
 	zbookmark_phys_t re_io_bookmark;
 
 	/* internal state */
 	avl_node_t	re_tree_link;
 	list_node_t	re_list_link;
 	uint64_t	re_timestamp;	/* gethrtime() value, nanoseconds */
 } recent_events_node_t;
 
 /*
  * Ordering function over two recent_events_node_t objects.
  *
  * Fields are compared one after another and the first difference decides
  * the result.  The particular field order carries no meaning; what matters
  * is that 0 is returned only when every criterion (including the bookmark)
  * is equal, which is what identifies a duplicate ereport.
  */
 static int
 recent_events_compare(const void *a, const void *b)
 {
 	const recent_events_node_t *lhs = a;
 	const recent_events_node_t *rhs = b;
 	const zbookmark_phys_t *lzb = &lhs->re_io_bookmark;
 	const zbookmark_phys_t *rzb = &rhs->re_io_bookmark;
 	int order;
 
 	/* common criteria */
 	order = TREE_CMP(lhs->re_subclass, rhs->re_subclass);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_pool_guid, rhs->re_pool_guid);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_vdev_guid, rhs->re_vdev_guid);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_io_error, rhs->re_io_error);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_io_priority, rhs->re_io_priority);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_io_size, rhs->re_io_size);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lhs->re_io_offset, rhs->re_io_offset);
 	if (order != 0)
 		return (order);
 
 	/* logical bookmark criteria */
 	order = TREE_CMP(lzb->zb_objset, rzb->zb_objset);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lzb->zb_object, rzb->zb_object);
 	if (order != 0)
 		return (order);
 	order = TREE_CMP(lzb->zb_level, rzb->zb_level);
 	if (order != 0)
 		return (order);
 
 	/* last criterion: its result is the overall result */
 	return (TREE_CMP(lzb->zb_blkid, rzb->zb_blkid));
 }
 
 static void zfs_ereport_schedule_cleaner(void);
 
 /*
  * Delayed-task callback that discards recent event nodes whose age exceeds
  * zfs_zevent_retain_expire_secs, and re-arms itself while nodes remain.
  */
 /*ARGSUSED*/
 static void
 zfs_ereport_cleaner(void *arg)
 {
 	uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	/*
 	 * Nodes are inserted at the head, so the tail is the least recently
 	 * recorded one; stop at the first node that has not expired yet.
 	 */
 	for (;;) {
 		recent_events_node_t *oldest = list_tail(&recent_events_list);
 
 		if (oldest == NULL)
 			break;
 
 		uint64_t age = NSEC2SEC(now - oldest->re_timestamp);
 		if (age <= zfs_zevent_retain_expire_secs)
 			break;
 
 		avl_remove(&recent_events_tree, oldest);
 		list_remove(&recent_events_list, oldest);
 		kmem_free(oldest, sizeof (*oldest));
 	}
 
 	/* This dispatch is done; schedule another only if work remains */
 	recent_events_cleaner_tqid = 0;
 	if (!list_is_empty(&recent_events_list))
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Queue zfs_ereport_cleaner() on system_delay_taskq and remember the
  * dispatch id.  The caller must hold recent_events_lock.
  */
 static void
 zfs_ereport_schedule_cleaner(void)
 {
 	uint64_t delay_ns;
 
 	ASSERT(MUTEX_HELD(&recent_events_lock));
 
 	/* fire one second after the retention period has elapsed */
 	delay_ns = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
 
 	recent_events_cleaner_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    zfs_ereport_cleaner, NULL, TQ_SLEEP,
 	    ddi_get_lbolt() + NSEC_TO_TICK(delay_ns));
 }
 
 /*
  * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
  *
  * spa - pool whose entries are dropped; only consulted when vd is NULL
  * vd  - vdev whose entries are dropped, or NULL to select by pool
  *
  * At least one of the two must be non-NULL.  The selector that is not in
  * use is set to 0, and a node is removed when either of its guids equals
  * the corresponding selector.
  * NOTE(review): this relies on recorded nodes never carrying a guid of 0
  * -- confirm.
  */
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 	uint64_t vdev_guid, pool_guid;
 
 	ASSERT(vd != NULL || spa != NULL);
 	if (vd == NULL) {
 		vdev_guid = 0;
 		pool_guid = spa_guid(spa);
 	} else {
 		vdev_guid = vd->vdev_guid;
 		pool_guid = 0;
 	}
 
 	mutex_enter(&recent_events_lock);
 
 	recent_events_node_t *next = list_head(&recent_events_list);
 	while (next != NULL) {
 		recent_events_node_t *entry = next;
 
 		/* advance first: 'entry' may be freed below */
 		next = list_next(&recent_events_list, next);
 
 		if (entry->re_vdev_guid == vdev_guid ||
 		    entry->re_pool_guid == pool_guid) {
 			avl_remove(&recent_events_tree, entry);
 			list_remove(&recent_events_list, entry);
 			kmem_free(entry, sizeof (*entry));
 		}
 	}
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Check if an ereport would be a duplicate of one recently posted.
  *
  * An ereport is considered a duplicate if the set of criteria in
  * recent_events_node_t all match.
  *
  * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
  * are candidates for duplicate checking.
  *
  * Returns B_TRUE when a matching node exists and was last recorded no more
  * than zfs_zevent_retain_expire_secs ago.  Returns B_FALSE otherwise,
  * including when vd or zio is NULL, when zfs_zevent_retain_max is 0, or
  * when the subclass is not one of the three listed above.
  *
  * Side effects: a matching node is moved to the head of the list and its
  * timestamp refreshed; a non-matching event is recorded as a new node.
  */
 static boolean_t
 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
 {
 	recent_events_node_t search = {0}, *entry;
 
 	if (vd == NULL || zio == NULL)
 		return (B_FALSE);
 
 	if (zfs_zevent_retain_max == 0)
 		return (B_FALSE);
 
 	if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
 		search.re_subclass = ZSC_IO;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
 		search.re_subclass = ZSC_DATA;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
 		search.re_subclass = ZSC_CHECKSUM;
 	else
 		return (B_FALSE);
 
 	search.re_pool_guid = spa_guid(spa);
 	search.re_vdev_guid = vd->vdev_guid;
 	search.re_io_error = zio->io_error;
 	search.re_io_priority = zio->io_priority;
 	/* if size is supplied use it over what's in zio */
 	if (size) {
 		search.re_io_size = size;
 		search.re_io_offset = offset;
 	} else {
 		search.re_io_size = zio->io_size;
 		search.re_io_offset = zio->io_offset;
 	}
 
 	/* grab optional logical zio criteria */
 	if (zb != NULL) {
 		search.re_io_bookmark.zb_objset = zb->zb_objset;
 		search.re_io_bookmark.zb_object = zb->zb_object;
 		search.re_io_bookmark.zb_level = zb->zb_level;
 		search.re_io_bookmark.zb_blkid = zb->zb_blkid;
 	}
 
 	uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	/* check if we have seen this one recently */
 	entry = avl_find(&recent_events_tree, &search, NULL);
 	if (entry != NULL) {
 		/* age is measured against the previous sighting */
 		uint64_t age = NSEC2SEC(now - entry->re_timestamp);
 
 		/*
 		 * There is still an active cleaner (since we're here).
 		 * Reset the last seen time for this duplicate entry
 		 * so that its lifespan gets extended.
 		 */
 		list_remove(&recent_events_list, entry);
 		list_insert_head(&recent_events_list, entry);
 		entry->re_timestamp = now;
 
 		zfs_zevent_track_duplicate();
 		mutex_exit(&recent_events_lock);
 
 		return (age <= zfs_zevent_retain_expire_secs);
 	}
 
 	if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
 		/* recycle oldest node */
 		entry = list_tail(&recent_events_list);
 		ASSERT(entry != NULL);
 		list_remove(&recent_events_list, entry);
 		avl_remove(&recent_events_tree, entry);
 	} else {
 		entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
 	}
 
 	/* record this as a recent ereport */
 	*entry = search;
 	avl_add(&recent_events_tree, entry);
 	list_insert_head(&recent_events_list, entry);
 	entry->re_timestamp = now;
 
 	/* Start a cleaner if not already scheduled */
 	if (recent_events_cleaner_tqid == 0)
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 	return (B_FALSE);
 }
 
 /*
  * Destroy the ereport and detector nvlists with FM_NVA_FREE.
  * Either argument may be NULL, in which case it is skipped.
  */
 void
 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
 {
 	if (nvl != NULL)
 		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 	if (detector != NULL)
 		fm_nvlist_destroy(detector, FM_NVA_FREE);
 }
 
 /*
  * ZIO delay, deadman, and checksum events are rate limited per vdev so that
  * a misbehaving disk does not flood zevent consumers.
  *
  * Returns 1 when the event must be suppressed (the dropped-ereport counter
  * is bumped in that case) and 0 when it may go out.  Subclasses other than
  * the three above are never limited here.
  */
 static int
 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
 {
 	int limited;
 
 	/*
 	 * zfs_ratelimit() answers 1 for "not limited" and 0 for "limited",
 	 * hence the negation.
 	 */
 	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_delay_rl);
 	else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_deadman_rl);
 	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_checksum_rl);
 	else
 		return (0);
 
 	/* account for the event we are about to drop */
 	if (limited)
 		fm_erpt_dropped_increment();
 
 	return (limited);
 }
 
 /*
  * Build the ereport and detector nvlists for the given subclass.  Nothing is
  * posted here; the lists are handed back through the out parameters.
  *
  * ereport_out, detector_out - receive the newly created nvlists on success
  * subclass      - appended to ZFS_ERROR_CLASS to form the full class name
  * spa           - pool the event belongs to (dereferenced unconditionally)
  * vd            - affected vdev, or NULL
  * zb            - bookmark of the logical block, or NULL
  * zio           - I/O that triggered the event, or NULL
  * stateoroffset - physical offset when zio != NULL and size != 0; previous
  *                 vdev state when vd != NULL and zio == NULL
  * size          - physical length; 0 means "use zio->io_offset/io_size"
  *
  * Return B_FALSE if either nvlist could not be created (nothing is stored in
  * the out parameters in that case), B_TRUE otherwise.
  */
 static boolean_t
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
     const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     zio_t *zio, uint64_t stateoroffset, uint64_t size)
 {
 	nvlist_t *ereport, *detector;
 
 	uint64_t ena;
 	char class[64];
 
 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
 		return (B_FALSE);
 
 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
 		return (B_FALSE);
 	}
 
 	/*
 	 * Serialize ereport generation
 	 */
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * Determine the ENA to use for this event.  If we are in a loading
 	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
 	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE) {
 		if (spa->spa_ena == 0)
 			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
 		ena = spa->spa_ena;
 	} else if (zio != NULL && zio->io_logical != NULL) {
 		if (zio->io_logical->io_ena == 0)
 			zio->io_logical->io_ena =
 			    fm_ena_generate(0, FM_ENA_FMT1);
 		ena = zio->io_logical->io_ena;
 	} else {
 		ena = fm_ena_generate(0, FM_ENA_FMT1);
 	}
 
 	/*
 	 * Construct the full class, detector, and other standard FMA fields.
 	 */
 	(void) snprintf(class, sizeof (class), "%s.%s",
 	    ZFS_ERROR_CLASS, subclass);
 
 	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
 	    vd != NULL ? vd->vdev_guid : 0);
 
 	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
 
 	/*
 	 * Construct the per-ereport payload, depending on which parameters are
 	 * passed in.
 	 */
 
 	/*
 	 * Generic payload members common to all ereports.
 	 */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
 	    (uint64_t)spa_state(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
 	    (int32_t)spa_load_state(spa), NULL);
 
 	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
 	    DATA_TYPE_STRING,
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
 	    FM_EREPORT_FAILMODE_WAIT :
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
 	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
 	    NULL);
 
 	if (vd != NULL) {
 		vdev_t *pvd = vd->vdev_parent;
 		/* vq and vs point at members of *vd, so they are never NULL */
 		vdev_queue_t *vq = &vd->vdev_queue;
 		vdev_stat_t *vs = &vd->vdev_stat;
 		vdev_t *spare_vd;
 		uint64_t *spare_guids;
 		char **spare_paths;
 		int i, spare_count;
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 		    DATA_TYPE_UINT64, vd->vdev_guid,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
 		if (vd->vdev_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
 			    DATA_TYPE_STRING, vd->vdev_path, NULL);
 		if (vd->vdev_devid != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
 			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
 		if (vd->vdev_fru != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
 			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
 		if (vd->vdev_enc_sysfs_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
 		if (vd->vdev_ashift)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
 			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
 
 		if (vq != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
 		}
 
 		if (vs != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_read_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_write_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
 			    DATA_TYPE_UINT64, vs->vs_slow_ios,
 			    NULL);
 		}
 
 		if (pvd != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
 			    DATA_TYPE_UINT64, pvd->vdev_guid,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
 			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
 			    NULL);
 			if (pvd->vdev_path)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
 				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
 			if (pvd->vdev_devid)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
 				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
 		}
 
 		/*
 		 * Report the pool's spares as two parallel arrays (paths and
 		 * guids).  The temporary arrays only borrow vdev_path; they
 		 * are freed right after being handed to fm_payload_set().
 		 */
 		spare_count = spa->spa_spares.sav_count;
 		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
 		    KM_SLEEP);
 		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
 		    KM_SLEEP);
 
 		for (i = 0; i < spare_count; i++) {
 			spare_vd = spa->spa_spares.sav_vdevs[i];
 			if (spare_vd) {
 				spare_paths[i] = spare_vd->vdev_path;
 				spare_guids[i] = spare_vd->vdev_guid;
 			}
 		}
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
 		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
 		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
 
 		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
 		kmem_free(spare_paths, sizeof (char *) * spare_count);
 	}
 
 	if (zio != NULL) {
 		/*
 		 * Payload common to all I/Os.
 		 */
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
 		    DATA_TYPE_INT32, zio->io_flags, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
 		    DATA_TYPE_UINT32, zio->io_stage, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
 		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
 		    DATA_TYPE_UINT64, zio->io_delay, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
 		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
 		    DATA_TYPE_UINT64, zio->io_delta, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
 		    DATA_TYPE_UINT32, zio->io_priority, NULL);
 
 		/*
 		 * If the 'size' parameter is non-zero, it indicates this is a
 		 * RAID-Z or other I/O where the physical offset and length are
 		 * provided for us, instead of within the zio_t.
 		 */
 		if (vd != NULL) {
 			if (size)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, stateoroffset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, size, NULL);
 			else
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, zio->io_offset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, zio->io_size, NULL);
 		}
 	} else if (vd != NULL) {
 		/*
 		 * If we have a vdev but no zio, this is a device fault, and the
 		 * 'stateoroffset' parameter indicates the previous state of the
 		 * vdev.
 		 */
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
 		    DATA_TYPE_UINT64, stateoroffset, NULL);
 	}
 
 	/*
 	 * Payload for I/Os with corresponding logical information.
 	 */
 	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 		    DATA_TYPE_UINT64, zb->zb_objset,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
 		    DATA_TYPE_UINT64, zb->zb_object,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
 		    DATA_TYPE_INT64, zb->zb_level,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
 	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	/* ownership of both nvlists passes to the caller */
 	*ereport_out = ereport;
 	*detector_out = detector;
 	return (B_TRUE);
 }
 
 /* if it's <= 128 bytes, save the corruption directly */
 #define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))
 
 #define	MAX_RANGES		16
 
 /*
  * Scratch state gathered while comparing a good and a bad buffer for a
  * checksum ereport: bit histograms, inline bit dumps, and a bounded list of
  * the word ranges in which the two buffers differ.
  */
 typedef struct zfs_ecksum_info {
 	/* histograms of set and cleared bits by bit number in a 64-bit word */
 	uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
 	uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
 
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
 	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
 
 	/*
 	 * for each range, the number of bits set and cleared.  The Hamming
 	 * distance between the good and bad buffers is the sum of them all.
 	 */
 	uint32_t zei_range_sets[MAX_RANGES];
 	uint32_t zei_range_clears[MAX_RANGES];
 
 	/*
 	 * Differing regions, in ascending order.  Both bounds are indices
 	 * of 64-bit words; zr_end is the first word past the region.
 	 */
 	struct zei_ranges {
 		uint32_t	zr_start;
 		uint32_t	zr_end;
 	} zei_ranges[MAX_RANGES];
 
 	/* number of zei_ranges[] entries currently in use */
 	size_t	zei_range_count;
 	/* smallest gap recorded between neighbouring ranges */
 	uint32_t zei_mingap;
 	/* a new range closer than this to the last one is merged into it */
 	uint32_t zei_allowed_mingap;
 
 } zfs_ecksum_info_t;
 
 /*
  * Account for every bit set in value_arg.
  *
  * value_arg - word whose set bits are tallied (converted with BE_64 first)
  * hist      - 64 counters; bit i of the converted word bumps hist[63 - i]
  * count     - running total, increased by the number of set bits
  */
 static void
 update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
 {
 	const uint64_t word = BE_64(value_arg);
 	size_t nset = 0;
 	size_t bit;
 
 	/* We store the bits in big-endian (largest-first) order */
 	for (bit = 0; bit < 64; bit++) {
 		if (((word >> bit) & 1) == 0)
 			continue;
 
 		hist[63 - bit]++;
 		nset++;
 	}
 
 	/* update the count of bits changed */
 	*count += nset;
 }
 
 /*
  * We've now filled up the range array, and need to increase "mingap" and
  * shrink the range list accordingly.  zei_mingap is always the smallest
  * distance between array entries, so we set the new_allowed_gap to be
  * one greater than that.  We then go through the list, joining together
  * any ranges which are closer than the new_allowed_gap.
  *
  * By construction, there will be at least one.  We also update zei_mingap
  * to the new smallest gap, to prepare for our next invocation.
  *
  * Every input range ends up in the compacted list, either as part of a
  * merged range or on its own.  In particular, a final range that is too
  * far from its predecessor to be merged is kept as its own entry instead
  * of being dropped.
  *
  * zei_add_range() calls this only when the array is full, which is what
  * makes the "at least one merge" assertion below hold.
  */
 static void
 zei_shrink_ranges(zfs_ecksum_info_t *eip)
 {
 	uint32_t mingap = UINT32_MAX;
 	uint32_t new_allowed_gap = eip->zei_mingap + 1;
 
 	size_t idx, output;
 	size_t max = eip->zei_range_count;
 
 	struct zei_ranges *r = eip->zei_ranges;
 
 	ASSERT3U(eip->zei_range_count, >, 0);
 	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
 
 	/*
 	 * Compact in place.  'output' never passes 'idx', so entries that
 	 * have not been read yet are never overwritten.
 	 */
 	output = idx = 0;
 	while (idx < max) {
 		uint32_t start = r[idx].zr_start;
 		uint32_t end = r[idx].zr_end;
 
 		/* absorb the following ranges that are now too close */
 		for (idx++; idx < max; idx++) {
 			uint32_t gap = r[idx].zr_start - end;
 
 			if (gap >= new_allowed_gap) {
 				/* r[idx] starts the next output range */
 				if (gap < mingap)
 					mingap = gap;
 				break;
 			}
 			end = r[idx].zr_end;
 		}
 		r[output].zr_start = start;
 		r[output].zr_end = end;
 		output++;
 	}
 	ASSERT3U(output, <, eip->zei_range_count);
 	eip->zei_range_count = output;
 	eip->zei_mingap = mingap;
 	eip->zei_allowed_mingap = new_allowed_gap;
 }
 
 /*
  * Record the differing region [start, end) in eip->zei_ranges.
  *
  * A full table is compacted with zei_shrink_ranges() first.  A region whose
  * distance to the last recorded range is below zei_allowed_mingap extends
  * that range instead of taking a new slot; otherwise it is appended and
  * zei_mingap is lowered if this gap is the smallest seen.
  */
 static void
 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
 {
 	struct zei_ranges *ranges = eip->zei_ranges;
 	size_t used = eip->zei_range_count;
 
 	/* make room before looking at the last entry */
 	if (used >= MAX_RANGES) {
 		zei_shrink_ranges(eip);
 		used = eip->zei_range_count;
 	}
 
 	if (used != 0) {
 		struct zei_ranges *last = &ranges[used - 1];
 		int gap = start - last->zr_end;
 
 		if (gap < eip->zei_allowed_mingap) {
 			/* close enough: grow the previous range */
 			last->zr_end = end;
 			return;
 		}
 		if (gap < eip->zei_mingap)
 			eip->zei_mingap = gap;
 	} else {
 		/* first range: (re)initialize the gap bookkeeping */
 		eip->zei_mingap = UINT32_MAX;
 		eip->zei_allowed_mingap = 1;
 	}
 
 	ranges[used].zr_start = start;
 	ranges[used].zr_end = end;
 	eip->zei_range_count++;
 }
 
 /*
  * Return the sum of (zr_end - zr_start) over all recorded ranges.
  */
 static size_t
 zei_range_total_size(zfs_ecksum_info_t *eip)
 {
 	const struct zei_ranges *cur = eip->zei_ranges;
 	const struct zei_ranges *stop = cur + eip->zei_range_count;
 	size_t total = 0;
 
 	while (cur != stop) {
 		total += (cur->zr_end - cur->zr_start);
 		cur++;
 	}
 
 	return (total);
 }
 
 /*
  * Annotate a checksum ereport with details about how the bad data differs
  * from the good data.
  *
  * ereport		nvlist that receives the payload members
  * info		optional checksum details (expected/actual checksum,
  *			algorithm name, byteswap and injected flags)
  * goodabd/badabd	the two buffers to compare; either may be NULL
  * size		number of bytes to compare
  * drop_if_identical	when set and no difference is found, return NULL
  *
  * Returns a zfs_ecksum_info_t allocated with kmem_zalloc(); the callers in
  * this file release it with kmem_free().  Returns NULL only when the buffers
  * are identical and drop_if_identical is set.
  */
 static zfs_ecksum_info_t *
 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
     const abd_t *goodabd, const abd_t *badabd, size_t size,
     boolean_t drop_if_identical)
 {
 	const uint64_t *good;
 	const uint64_t *bad;
 
 	/* union of all bits flipped 0->1 and 1->0 across every range */
 	uint64_t allset = 0;
 	uint64_t allcleared = 0;
 
 	/* the buffers are compared one uint64_t word at a time */
 	size_t nui64s = size / sizeof (uint64_t);
 
 	size_t inline_size;
 	int no_inline = 0;
 	size_t idx;
 	size_t range;
 
 	size_t offset = 0;
 	/* word index where the current run of differences began, or -1 */
 	ssize_t start = -1;
 
 	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
 
 	/* don't do any annotation for injected checksum errors */
 	if (info != NULL && info->zbc_injected)
 		return (eip);
 
 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
 		    DATA_TYPE_UINT64_ARRAY,
 		    sizeof (info->zbc_expected) / sizeof (uint64_t),
 		    (uint64_t *)&info->zbc_expected,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
 		    DATA_TYPE_UINT64_ARRAY,
 		    sizeof (info->zbc_actual) / sizeof (uint64_t),
 		    (uint64_t *)&info->zbc_actual,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
 		    NULL);
 
 		if (info->zbc_byteswapped) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
 			    DATA_TYPE_BOOLEAN, 1,
 			    NULL);
 		}
 	}
 
 	/* without both buffers there is nothing to compare */
 	if (badabd == NULL || goodabd == NULL)
 		return (eip);
 
 	ASSERT3U(nui64s, <=, UINT32_MAX);
 	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, <=, UINT32_MAX);
 
 	/*
 	 * Borrow a buffer for each abd; both are handed back with
 	 * abd_return_buf() on every exit path below.
 	 */
 	good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
 	bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
 
 	/* build up the range list by comparing the two buffers. */
 	for (idx = 0; idx < nui64s; idx++) {
 		if (good[idx] == bad[idx]) {
 			if (start == -1)
 				continue;
 
 			/* a run of differing words just ended at idx */
 			zei_add_range(eip, start, idx);
 			start = -1;
 		} else {
 			if (start != -1)
 				continue;
 
 			start = idx;
 		}
 	}
 	/* close a run that extends to the end of the buffers */
 	if (start != -1)
 		zei_add_range(eip, start, idx);
 
 	/* See if it will fit in our inline buffers */
 	inline_size = zei_range_total_size(eip);
 	if (inline_size > ZFM_MAX_INLINE)
 		no_inline = 1;
 
 	/*
 	 * If there is no change and we want to drop if the buffers are
 	 * identical, do so.
 	 */
 	if (inline_size == 0 && drop_if_identical) {
 		kmem_free(eip, sizeof (*eip));
 		abd_return_buf((abd_t *)goodabd, (void *)good, size);
 		abd_return_buf((abd_t *)badabd, (void *)bad, size);
 		return (NULL);
 	}
 
 	/*
 	 * Now walk through the ranges, filling in the details of the
 	 * differences.  Also convert our uint64_t-array offsets to byte
 	 * offsets.
 	 */
 	for (range = 0; range < eip->zei_range_count; range++) {
 		size_t start = eip->zei_ranges[range].zr_start;
 		size_t end = eip->zei_ranges[range].zr_end;
 
 		for (idx = start; idx < end; idx++) {
 			uint64_t set, cleared;
 
 			// bits set in bad, but not in good
 			set = ((~good[idx]) & bad[idx]);
 			// bits set in good, but not in bad
 			cleared = (good[idx] & (~bad[idx]));
 
 			allset |= set;
 			allcleared |= cleared;
 
 			/* keep the per-word masks only when they all fit */
 			if (!no_inline) {
 				ASSERT3U(offset, <, inline_size);
 				eip->zei_bits_set[offset] = set;
 				eip->zei_bits_cleared[offset] = cleared;
 				offset++;
 			}
 
 			update_histogram(set, eip->zei_histogram_set,
 			    &eip->zei_range_sets[range]);
 			update_histogram(cleared, eip->zei_histogram_cleared,
 			    &eip->zei_range_clears[range]);
 		}
 
 		/* convert to byte offsets */
 		eip->zei_ranges[range].zr_start	*= sizeof (uint64_t);
 		eip->zei_ranges[range].zr_end	*= sizeof (uint64_t);
 	}
 
 	abd_return_buf((abd_t *)goodabd, (void *)good, size);
 	abd_return_buf((abd_t *)badabd, (void *)bad, size);
 
 	/* these were counted in uint64_t words; report them in bytes */
 	eip->zei_allowed_mingap	*= sizeof (uint64_t);
 	inline_size		*= sizeof (uint64_t);
 
 	/* fill in ereport */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
 	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
 	    (uint32_t *)eip->zei_ranges,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
 	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
 	    NULL);
 
 	/*
 	 * Small differences are reported as exact set/cleared bit masks;
 	 * larger ones are summarized by the two histograms instead.
 	 */
 	if (!no_inline) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_set,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
 		    NULL);
 	} else {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
 		    DATA_TYPE_UINT32_ARRAY,
 		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
 		    DATA_TYPE_UINT32_ARRAY,
 		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
 		    NULL);
 	}
 	return (eip);
 }
 #else
 /*
  * No-op variant of zfs_ereport_clear(), compiled when the preprocessor
  * condition that guards the code above is false (that condition is not
  * visible in this part of the file).  Both arguments are ignored.
  */
 /*ARGSUSED*/
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 }
 #endif
 
 /*
  * Make sure our event is still valid for the given zio/vdev/pool.  For example,
  * we don't want to keep logging events for a faulted or missing vdev.
  */
 boolean_t
 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 {
 #ifdef _KERNEL
 	/* Errors seen during spa_tryimport() or in recovery mode are ignored. */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
 	    spa_load_state(spa) == SPA_LOAD_RECOVER) {
 		return (B_FALSE);
 	}
 
 	/*
 	 * While a pool is being opened after a previous failed attempt, new
 	 * ereports would only lead to the same diagnosis again, so skip them.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE &&
 	    spa->spa_last_open_failed) {
 		return (B_FALSE);
 	}
 
 	/*
 	 * Only read and write zios are interesting.  Anything else (for
 	 * example a failed DKIOCFLUSHWRITECACHE ioctl) is ignored.
 	 */
 	if (zio != NULL &&
 	    !(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE)) {
 		return (B_FALSE);
 	}
 
 	if (zio != NULL && vd != NULL) {
 		/*
 		 * Once a vdev has been marked as failing because of a failed
 		 * probe, later I/O errors add nothing: the DE faults the vdev
 		 * on the first such failure.  The same check covers the window
 		 * where vdev_remove_wanted is set but the device has not yet
 		 * been asynchronously moved to the REMOVED state.
 		 */
 		if (zio->io_vd == vd && !vdev_accessible(vd, zio))
 			return (B_FALSE);
 
 		/*
 		 * Checksum errors on reads from DTL regions of leaf vdevs are
 		 * not reported.
 		 */
 		if (zio->io_type == ZIO_TYPE_READ &&
 		    zio->io_error == ECKSUM &&
 		    vd->vdev_ops->vdev_op_leaf &&
 		    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) {
 			return (B_FALSE);
 		}
 	}
 
 	/*
 	 * A probe failure is not worth posting when the device has been (or
 	 * is about to be) removed in the meantime.
 	 */
 	if (vd != NULL &&
 	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0) {
 		if (vd->vdev_remove_wanted ||
 		    vd->vdev_state == VDEV_STATE_REMOVED)
 			return (B_FALSE);
 	}
 
 	/* Drop bogus delay events (like from ioctls or unqueued IOs). */
 	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0 &&
 	    zio != NULL && zio->io_timestamp == 0) {
 		return (B_FALSE);
 	}
 #endif
 	return (B_TRUE);
 }
 
 /*
  * Post an ereport for the given subclass
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 
 	/*
 	 * Use SET_ERROR() here as well, matching the identical validity
 	 * checks in zfs_ereport_start_checksum() and
 	 * zfs_ereport_post_checksum().
 	 */
 	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(subclass, vd))
 		return (SET_ERROR(EBUSY));
 
 	/* Either failure mode means the event could not be built. */
 	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
 	    zb, zio, state, 0) || (ereport == NULL)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Cleanup is handled by the callback function */
 	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #endif
 	return (rc);
 }
 
 /*
  * Prepare a checksum ereport
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
 {
 	zio_cksum_report_t *rpt;
 
 #ifdef _KERNEL
 	/* Filter out events that should not be reported at all. */
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length)) {
 		return (SET_ERROR(EALREADY));
 	}
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) {
 		return (SET_ERROR(EBUSY));
 	}
 #endif
 
 	rpt = kmem_zalloc(sizeof (*rpt), KM_SLEEP);
 
 	zio_vsd_default_cksum_report(zio, rpt);
 
 	/* Keep a private copy of the caller's checksum details, if any. */
 	if (info != NULL) {
 		rpt->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
 		bcopy(info, rpt->zcr_ckinfo, sizeof (*info));
 	}
 
 	rpt->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
 	rpt->zcr_align =
 	    vdev_psize_to_asize(vd->vdev_top, rpt->zcr_sector);
 	rpt->zcr_length = length;
 
 #ifdef _KERNEL
 	(void) zfs_ereport_start(&rpt->zcr_ereport, &rpt->zcr_detector,
 	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
 
 	/* No ereport was created: discard the report and succeed. */
 	if (rpt->zcr_ereport == NULL) {
 		zfs_ereport_free_checksum(rpt);
 		return (0);
 	}
 #endif
 
 	/* Push the report onto the logical zio's list of pending reports. */
 	mutex_enter(&spa->spa_errlist_lock);
 	rpt->zcr_next = zio->io_logical->io_cksum_report;
 	zio->io_logical->io_cksum_report = rpt;
 	mutex_exit(&spa->spa_errlist_lock);
 
 	return (0);
 }
 
 /*
  * Complete a checksum report: annotate its ereport with the difference
  * between good_data and bad_data and post it.  If annotate_ecksum() returns
  * NULL, the ereport and detector are passed straight to zfs_zevent_post_cb()
  * instead of being posted.  Either way the report no longer references the
  * ereport or detector afterwards.
  */
 void
 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
     const abd_t *bad_data, boolean_t drop_if_identical)
 {
 #ifdef _KERNEL
 	zfs_ecksum_info_t *eip = annotate_ecksum(report->zcr_ereport,
 	    report->zcr_ckinfo, good_data, bad_data, report->zcr_length,
 	    drop_if_identical);
 
 	if (eip == NULL) {
 		zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
 	} else {
 		zfs_zevent_post(report->zcr_ereport,
 		    report->zcr_detector, zfs_zevent_post_cb);
 		kmem_free(eip, sizeof (*eip));
 	}
 
 	report->zcr_detector = NULL;
 	report->zcr_ereport = NULL;
 #endif
 }
 
 /*
  * Release a checksum report and everything it still owns: any ereport and
  * detector nvlists that were never posted, the data handled by the report's
  * zcr_free callback, the copied checksum info, and the report itself.
  */
 void
 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 {
 #ifdef _KERNEL
 	/* The detector is only destroyed together with the ereport. */
 	if (rpt->zcr_ereport != NULL) {
 		fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE);
 		fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE);
 	}
 #endif
 	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
 
 	if (rpt->zcr_ckinfo != NULL) {
 		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
 	}
 
 	kmem_free(rpt, sizeof (*rpt));
 }
 
 /*
  * Post a checksum ereport
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length,
     const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 	zfs_ecksum_info_t *eip;
 
 	/* Filter out events that should not be reported at all. */
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length)) {
 		return (SET_ERROR(EALREADY));
 	}
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) {
 		return (SET_ERROR(EBUSY));
 	}
 
 	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
 	    spa, vd, zb, zio, offset, length)) {
 		return (SET_ERROR(EINVAL));
 	}
 	if (ereport == NULL) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Describe how the bad data differs, then post the event. */
 	eip = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 	    B_FALSE);
 	if (eip == NULL) {
 		return (rc);
 	}
 
 	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 	kmem_free(eip, sizeof (*eip));
 #endif
 	return (rc);
 }
 
 /*
  * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
  * change in the pool.  All sysevents are listed in sys/sysevent/eventdefs.h
  * and are designed to be consumed by the ZFS Event Daemon (ZED).  For
  * additional details refer to the zed(8) man page.
  */
 nvlist_t *
 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 	nvlist_t *resource = NULL;
 #ifdef _KERNEL
 	char evclass[64];
 	nvpair_t *pair;
 
 	/* No events are generated for a pool that is only being tryimported. */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return (NULL);
 
 	resource = fm_nvlist_create(NULL);
 	if (resource == NULL)
 		return (NULL);
 
 	/* The event class is "<type>.<ZFS_ERROR_CLASS>.<name>". */
 	(void) snprintf(evclass, sizeof (evclass), "%s.%s.%s", type,
 	    ZFS_ERROR_CLASS, name);
 
 	/* Members common to every event: version, class and pool details. */
 	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
 	VERIFY0(nvlist_add_string(resource, FM_CLASS, evclass));
 	VERIFY0(nvlist_add_string(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
 	VERIFY0(nvlist_add_int32(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
 
 	/* Members describing the vdev, when one was supplied. */
 	if (vd != NULL) {
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
 
 		/* The string members are optional and added only when set. */
 		if (vd->vdev_path != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
 		}
 		if (vd->vdev_devid != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
 		}
 		if (vd->vdev_fru != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
 		}
 		if (vd->vdev_enc_sysfs_path != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path));
 		}
 	}
 
 	/* Finally copy over every pair of the optional caller payload. */
 	if (aux != NULL) {
 		for (pair = nvlist_next_nvpair(aux, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(aux, pair)) {
 			(void) nvlist_add_nvpair(resource, pair);
 		}
 	}
 
 #endif
 	return (resource);
 }
 
 /*
  * Build an event with zfs_event_create() and post it.  Nothing is posted
  * when no event could be created.
  */
 static void
 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 #ifdef _KERNEL
 	nvlist_t *event = zfs_event_create(spa, vd, type, name, aux);
 
 	if (event == NULL)
 		return;
 
 	zfs_zevent_post(event, NULL, zfs_zevent_post_cb);
 #endif
 }
 
 /*
  * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
  * has been removed from the system.  This will cause the DE to ignore any
  * recent I/O errors, inferring that they are due to the asynchronous device
  * removal.
  */
 void
 zfs_post_remove(spa_t *spa, vdev_t *vd)
 {
 	/* posted as a resource-class event with no auxiliary payload */
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
 }
 
 /*
  * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
  * has the 'autoreplace' property set, and therefore any broken vdevs will be
  * handled by higher level logic, and no vdev fault should be generated.
  */
 void
 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 {
 	/* posted as a resource-class event with no auxiliary payload */
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
 }
 
 /*
  * The 'resource.fs.zfs.statechange' event is an internal signal that the
  * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
  * cause the retire agent to repair any outstanding fault management cases
  * open because the device was not found (fault.fs.zfs.device).
  */
 void
 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
 {
 #ifdef _KERNEL
 	/* Supplemental payload; the event is still posted if this is NULL. */
 	nvlist_t *aux = fm_nvlist_create(NULL);
 
 	if (aux != NULL && vd != NULL) {
 		/* The path members are optional and added only when set. */
 		if (vd->vdev_physpath != NULL) {
 			(void) nvlist_add_string(aux,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
 			    vd->vdev_physpath);
 		}
 		if (vd->vdev_enc_sysfs_path != NULL) {
 			(void) nvlist_add_string(aux,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path);
 		}
 
 		(void) nvlist_add_uint64(aux,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
 	}
 
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
 	    aux);
 
 	if (aux != NULL) {
 		fm_nvlist_destroy(aux, FM_NVA_FREE);
 	}
 #endif
 }
 
 #ifdef _KERNEL
 /*
  * Set up the recent-events bookkeeping: the lock, plus the list and the AVL
  * tree that both link recent_events_node_t entries.
  */
 void
 zfs_ereport_init(void)
 {
 	mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	list_create(&recent_events_list,
 	    sizeof (recent_events_node_t),
 	    offsetof(recent_events_node_t, re_list_link));
 
 	avl_create(&recent_events_tree, recent_events_compare,
 	    sizeof (recent_events_node_t),
 	    offsetof(recent_events_node_t, re_tree_link));
 }
 
 /*
  * This 'early' fini needs to run before zfs_fini() which on Linux waits
  * for the system_delay_taskq to drain.
  */
 void
 zfs_ereport_taskq_fini(void)
 {
 	mutex_enter(&recent_events_lock);
 	/* a non-zero id means a cleaner task is still scheduled; cancel it */
 	if (recent_events_cleaner_tqid != 0) {
 		taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
 		recent_events_cleaner_tqid = 0;
 	}
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Tear down the recent-events bookkeeping: free every remaining node, then
  * destroy the tree, the list and the lock.
  */
 void
 zfs_ereport_fini(void)
 {
 	recent_events_node_t *node;
 
 	/* Each node is linked into both containers; unlink from both. */
 	for (node = list_head(&recent_events_list); node != NULL;
 	    node = list_head(&recent_events_list)) {
 		avl_remove(&recent_events_tree, node);
 		list_remove(&recent_events_list, node);
 		kmem_free(node, sizeof (*node));
 	}
 
 	avl_destroy(&recent_events_tree);
 	list_destroy(&recent_events_list);
 	mutex_destroy(&recent_events_lock);
 }
 
+/*
+ * Post a resource-class event for a snapshot.
+ *
+ * subclass: event name, used as the last component of the event class
+ * spa: pool the snapshot belongs to
+ * name: snapshot name, carried in the event payload
+ *
+ * fm_nvlist_create() can return NULL.  As in zfs_post_state_change(), the
+ * event is then still posted, just without the auxiliary payload.
+ */
+void
+zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
+{
+	nvlist_t *aux;
+
+	aux = fm_nvlist_create(NULL);
+	if (aux != NULL) {
+		(void) nvlist_add_string(aux,
+		    FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
+	}
+
+	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
+
+	if (aux != NULL)
+		fm_nvlist_destroy(aux, FM_NVA_FREE);
+}
+
+/*
+ * Post an event when a zvol is created or removed
+ *
+ * This is currently only used by macOS, since it uses the event to create
+ * symlinks between the volume name (mypool/myvol) and the actual /dev
+ * device (/dev/disk3).  For example:
+ *
+ * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
+ *
+ * name: The full name of the zvol ("mypool/myvol")
+ * dev_name: The full /dev name for the zvol ("/dev/disk3")
+ * raw_name: The raw  /dev name for the zvol ("/dev/rdisk3")
+ */
+void
+zfs_ereport_zvol_post(const char *subclass, const char *name,
+    const char *dev_name, const char *raw_name)
+{
+	nvlist_t *aux;
+	const char *r;
+	spa_t *spa;
+	boolean_t locked;
+
+	/*
+	 * Look up the pool from the volume name, taking spa_namespace_lock
+	 * only if this thread does not already own it.
+	 *
+	 * NOTE(review): when the lock is taken here, spa is used after the
+	 * lock has been dropped and no reference is held on it; confirm that
+	 * callers guarantee the pool cannot go away in the meantime.
+	 */
+	locked = mutex_owned(&spa_namespace_lock);
+	if (!locked)
+		mutex_enter(&spa_namespace_lock);
+	spa = spa_lookup(name);
+	if (!locked)
+		mutex_exit(&spa_namespace_lock);
+
+	if (spa == NULL)
+		return;
+
+	/*
+	 * fm_nvlist_create() can return NULL.  As in zfs_post_state_change(),
+	 * the event is then still posted, just without the payload.
+	 */
+	aux = fm_nvlist_create(NULL);
+	if (aux != NULL) {
+		(void) nvlist_add_string(aux,
+		    FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
+		(void) nvlist_add_string(aux,
+		    FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, raw_name);
+
+		/* The volume member is whatever follows the first '/'. */
+		r = strchr(name, '/');
+		if (r != NULL && r[1] != '\0') {
+			(void) nvlist_add_string(aux,
+			    FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
+		}
+	}
+
+	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
+
+	if (aux != NULL)
+		fm_nvlist_destroy(aux, FM_NVA_FREE);
+}
+
 /* Symbols from this file that are exported via EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(zfs_ereport_post);
 EXPORT_SYMBOL(zfs_ereport_is_valid);
 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 EXPORT_SYMBOL(zfs_post_remove);
 EXPORT_SYMBOL(zfs_post_autoreplace);
 EXPORT_SYMBOL(zfs_post_state_change);
 
 /* Read-write (ZMOD_RW) tunables; each description string is given below. */
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
 	"Maximum recent zevents records to retain for duplicate checking");
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
 	"Expiration time for recent zevents records");
 #endif /* _KERNEL */