diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c
index 024a931d7816..b08916b317f8 100644
--- a/module/os/freebsd/zfs/zio_crypt.c
+++ b/module/os/freebsd/zfs/zio_crypt.c
@@ -1,1832 +1,1837 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2017, Datto, Inc. All rights reserved.
  */
 
 #include <sys/zio_crypt.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sha2.h>
 #include <sys/hkdf.h>
 
 /*
  * This file is responsible for handling all of the details of generating
  * encryption parameters and performing encryption and authentication.
  *
  * BLOCK ENCRYPTION PARAMETERS:
  * Encryption/Authentication Algorithm Suite (crypt):
  * The encryption algorithm, mode, and key length we are going to use. We
  * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
  * keys. All authentication is currently done with SHA512-HMAC.
  *
  * Plaintext:
  * The unencrypted data that we want to encrypt.
  *
  * Initialization Vector (IV):
  * An initialization vector for the encryption algorithms. This is used to
  * "tweak" the encryption algorithms so that two blocks of the same data are
  * encrypted into different ciphertext outputs, thus obfuscating block patterns.
  * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
  * never reused with the same encryption key. This value is stored unencrypted
  * and must simply be provided to the decryption function. We use a 96 bit IV
  * (as recommended by NIST) for all block encryption. For non-dedup blocks we
  * derive the IV randomly. The first 64 bits of the IV are stored in the second
  * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
  * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
  * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
  * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
  * level 0 blocks is the number of allocated dnodes in that block. The on-disk
  * format supports at most 2^15 slots per L0 dnode block, because the maximum
  * block size is 16MB (2^24). In either case, for level 0 blocks this number
  * will still be smaller than UINT32_MAX so it is safe to store the IV in the
  * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
  * for the dnode code.
  *
  * Master key:
  * This is the most important secret data of an encrypted dataset. It is used
  * along with the salt to generate the actual encryption keys via HKDF. We
  * do not use the master key to directly encrypt any data because there are
  * theoretical limits on how much data can actually be safely encrypted with
  * any encryption mode. The master key is stored encrypted on disk with the
  * user's wrapping key. Its length is determined by the encryption algorithm.
  * For details on how this is stored see the block comment in dsl_crypt.c
  *
  * Salt:
  * Used as an input to the HKDF function, along with the master key. We use a
  * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
  * can be used for encrypting many blocks, so we cache the current salt and the
  * associated derived key in zio_crypt_t so we do not need to derive it again
  * needlessly.
  *
  * Encryption Key:
  * A secret binary key, generated from an HKDF function used to encrypt and
  * decrypt data.
  *
  * Message Authentication Code (MAC)
  * The MAC is an output of authenticated encryption modes such as AES-GCM and
  * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
  * data on disk and return garbage to the application. Effectively, it is a
  * checksum that can not be reproduced by an attacker. We store the MAC in the
  * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
  * regular checksum of the ciphertext which can be used for scrubbing.
  *
  * OBJECT AUTHENTICATION:
  * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
  * they contain some info that always needs to be readable. To prevent this
  * data from being altered, we authenticate this data using SHA512-HMAC. This
  * will produce a MAC (similar to the one produced via encryption) which can
  * be used to verify the object was not modified. HMACs do not require key
  * rotation or IVs, so we can keep up to the full 3 copies of authenticated
  * data.
  *
  * ZIL ENCRYPTION:
  * ZIL blocks have their bp written to disk ahead of the associated data, so we
  * cannot store the MAC there as we normally do. For these blocks the MAC is
  * stored in the embedded checksum within the zil_chain_t header. The salt and
  * IV are generated for the block on bp allocation instead of at encryption
  * time. In addition, ZIL blocks have some pieces that must be left in plaintext
  * for claiming even though all of the sensitive user data still needs to be
  * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
  * pieces of the block need to be encrypted. All data that is not encrypted is
  * authenticated using the AAD mechanisms that the supported encryption modes
  * provide for. In order to preserve the semantics of the ZIL for encrypted
  * datasets, the ZIL is not protected at the objset level as described below.
  *
  * DNODE ENCRYPTION:
  * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
  * in plaintext for scrubbing and claiming, but the bonus buffers might contain
  * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
  * which pieces of the block need to be encrypted. For more details about
  * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
  *
  * OBJECT SET AUTHENTICATION:
  * Up to this point, everything we have encrypted and authenticated has been
  * at level 0 (or -2 for the ZIL). If we did not do any further work the
  * on-disk format would be susceptible to attacks that deleted or rearranged
  * the order of level 0 blocks. Ideally, the cleanest solution would be to
  * maintain a tree of authentication MACs going up the bp tree. However, this
  * presents a problem for raw sends. Send files do not send information about
  * indirect blocks so there would be no convenient way to transfer the MACs and
  * they cannot be recalculated on the receive side without the master key which
  * would defeat one of the purposes of raw sends in the first place. Instead,
  * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
  * from the level below. We also include some portable fields from blk_prop such
  * as the lsize and compression algorithm to prevent the data from being
  * misinterpreted.
  *
  * At the objset level, we maintain 2 separate 256 bit MACs in the
  * objset_phys_t. The first one is "portable" and is the logical root of the
  * MAC tree maintained in the metadnode's bps. The second, is "local" and is
  * used as the root MAC for the user accounting objects, which are also not
  * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
  * of the send file. The useraccounting code ensures that the useraccounting
  * info is not present upon a receive, so the local MAC can simply be cleared
  * out at that time. For more info about objset_phys_t authentication, see
  * zio_crypt_do_objset_hmacs().
  *
  * CONSIDERATIONS FOR DEDUP:
  * In order for dedup to work, blocks that we want to dedup with one another
  * need to use the same IV and encryption key, so that they will have the same
  * ciphertext. Normally, one should never reuse an IV with the same encryption
  * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
  * blocks. In this case, however, since we are using the same plaintext as
  * well all that we end up with is a duplicate of the original ciphertext we
  * already had. As a result, an attacker with read access to the raw disk will
  * be able to tell which blocks are the same but this information is given away
  * by dedup anyway. In order to get the same IVs and encryption keys for
  * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
  * here so that a reproducible checksum of the plaintext is never available to
  * the attacker. The HMAC key is kept alongside the master key, encrypted on
  * disk. The first 64 bits of the HMAC are used in place of the random salt, and
  * the next 96 bits are used as the IV. As a result of this mechanism, dedup
  * will only work within a clone family since encrypted dedup requires use of
  * the same master and HMAC keys.
  */
 
 /*
  * After encrypting many blocks with the same key we may start to run up
  * against the theoretical limits of how much data can securely be encrypted
  * with a single key using the supported encryption modes. The most obvious
  * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
  * the more IVs we generate (which both GCM and CCM modes strictly forbid).
  * This risk actually grows surprisingly quickly over time according to the
  * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
  * generated n IVs with a cryptographically secure RNG, the approximate
  * probability p(n) of a collision is given as:
  *
  * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
  *
  * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
  *
  * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
  * we must not write more than 398,065,730 blocks with the same encryption key.
  * Therefore, we rotate our keys after 400,000,000 blocks have been written by
  * generating a new random 64 bit salt for our HKDF encryption key generation
  * function.
  */
 /* Default (and maximum) number of times one salt may be handed out. */
 #define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
 /*
  * Effective rotation threshold: the value of zfs_key_max_salt_uses, clamped
  * so that it can never exceed ZFS_KEY_MAX_SALT_USES_DEFAULT.
  */
 #define	ZFS_CURRENT_MAX_SALT_USES	\
 	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
 /*
  * Requested salt-use limit; starts out at the default.
  * NOTE(review): presumably adjustable as a tunable somewhere outside the
  * visible part of this file -- confirm.
  */
 static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
 
 /*
  * The per-block-pointer data that gets authenticated. It is filled in by
  * zio_crypt_bp_auth_init() and then fed to an HMAC, a SHA2 checksum or an
  * AAD buffer by the zio_crypt_bp_do_*_updates() helpers.
  */
 typedef struct blkptr_auth_buf {
 	uint64_t bab_prop;			/* blk_prop - portable mask */
 	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
 	uint64_t bab_pad;			/* reserved for future use */
 } blkptr_auth_buf_t;
 
 /*
  * Table of encryption suites, indexed by the "crypt" value that is passed
  * around in this file. Each row holds, in order: the mechanism name, the
  * cipher mode type (ci_crypt_type), the key length in bytes (ci_keylen) and
  * a human readable suite name. The first three rows are not real ciphers
  * (ZC_TYPE_NONE, key length 0); zio_crypt_key_init() rejects them with
  * ENOTSUP.
  */
 const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
 	{"",			ZC_TYPE_NONE,	0,	"inherit"},
 	{"",			ZC_TYPE_NONE,	0,	"on"},
 	{"",			ZC_TYPE_NONE,	0,	"off"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
 };
 
 /*
  * Release everything in a key except its OpenCrypto session: destroy the
  * salt lock and wipe the whole structure. This is the cleanup used on the
  * error paths of key setup; zio_crypt_key_destroy() frees the session first
  * and then calls this.
  */
 static void
 zio_crypt_key_destroy_early(zio_crypt_key_t *key)
 {
 	/* the lock has to go before its storage is wiped below */
 	rw_destroy(&key->zk_salt_lock);
 
 	/* clear the session bookkeeping (nothing is freed here) */
 	memset(&key->zk_session, 0, sizeof (key->zk_session));
 
 	/* scrub the key material along with the rest of the structure */
 	memset(key, 0, sizeof (*key));
 }
 
 /*
  * Fully destroy a key: free its OpenCrypto session, then destroy the salt
  * lock and zero the structure.
  */
 void
 zio_crypt_key_destroy(zio_crypt_key_t *key)
 {
 	/* the session must be freed before the wipe erases its state */
 	freebsd_crypt_freesession(&key->zk_session);
 	zio_crypt_key_destroy_early(key);
 }
 
 /*
  * Initialize a brand new key for the encryption suite selected by crypt.
  *
  * The guid, master key, HMAC key and salt are filled with random bytes, the
  * current encryption key is derived from the master key and the salt with
  * HKDF-SHA512, and an OpenCrypto session is created for the derived key.
  *
  * Returns 0 on success, ENOTSUP if crypt does not name an AES-CCM or
  * AES-GCM suite, or the error of the step that failed. On any failure after
  * the suite check the key is left destroyed and zeroed.
  */
 int
 zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
 {
 	int ret;
 	uint_t keydata_len;
 	const zio_crypt_info_t *ci;
 
 	ASSERT3P(key, !=, NULL);
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	/* reject suites that are not real ciphers before touching the key */
 	ci = &zio_crypt_table[crypt];
 	if (ci->ci_crypt_type != ZC_TYPE_GCM &&
 	    ci->ci_crypt_type != ZC_TYPE_CCM)
 		return (ENOTSUP);
 
 	keydata_len = ci->ci_keylen;
 	memset(key, 0, sizeof (zio_crypt_key_t));
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	/* fill keydata buffers and salt with random data */
 	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	/* derive the current key from the master key */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keydata_len);
 	if (ret != 0)
 		goto error;
 
 	/* initialize keys for the ICP */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	/*
 	 * The HMAC key descriptor must reference the HMAC key bytes, exactly
 	 * as zio_crypt_key_unwrap() sets it up. Pointing it at the descriptor
 	 * itself would make any HMAC computed with a freshly initialized key
 	 * use the wrong key material and read past the descriptor.
 	 */
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	ret = freebsd_crypt_newsession(&key->zk_session, ci,
 	    &key->zk_current_key);
 	if (ret != 0)
 		goto error;
 
 	key->zk_crypt = crypt;
 	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	zio_crypt_key_destroy_early(key);
 	return (ret);
 }
 
 /*
  * Rotate the salt of a key: pick a new random salt, derive a new current
  * key from the master key with it, reset the salt use counter and replace
  * the OpenCrypto session.
  *
  * The rotation happens under the salt lock held as writer. If the use
  * counter is already below the limit by then, another thread has done the
  * rotation and nothing is changed.
  *
  * Returns 0 on success (including the "nothing to do" case) or the error of
  * the step that failed.
  */
 static int
 zio_crypt_key_change_salt(zio_crypt_key_t *key)
 {
 	uint_t keylen = zio_crypt_table[key->zk_crypt].ci_keylen;
 	uint8_t new_salt[ZIO_DATA_SALT_LEN];
 	int err;
 
 	/* draw the candidate salt before taking the lock */
 	err = random_get_bytes(new_salt, ZIO_DATA_SALT_LEN);
 	if (err != 0)
 		return (err);
 
 	rw_enter(&key->zk_salt_lock, RW_WRITER);
 
 	/* only rotate if nobody beat us to it */
 	if (key->zk_salt_count >= ZFS_CURRENT_MAX_SALT_USES) {
 		/* derive the current key from the master key and new salt */
 		err = hkdf_sha512(key->zk_master_keydata, keylen, NULL, 0,
 		    new_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 		    keylen);
 		if (err == 0) {
 			/* publish the salt and restart the use count */
 			memcpy(key->zk_salt, new_salt, ZIO_DATA_SALT_LEN);
 			key->zk_salt_count = 0;
 
 			/* the session is bound to the old derived key */
 			freebsd_crypt_freesession(&key->zk_session);
 			err = freebsd_crypt_newsession(&key->zk_session,
 			    &zio_crypt_table[key->zk_crypt],
 			    &key->zk_current_key);
 		}
 	}
 
 	rw_exit(&key->zk_salt_lock);
 
 	return (err);
 }
 
 /*
  * Copy the key's current salt into salt (ZIO_DATA_SALT_LEN bytes) and count
  * the use. Once the use count reaches ZFS_CURRENT_MAX_SALT_USES the salt is
  * rotated before returning; see the comment above the
  * zfs_key_max_salt_uses definition for why.
  *
  * Returns 0, or the error from the salt rotation.
  */
 int
 zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
 {
 	boolean_t need_rotation;
 
 	rw_enter(&key->zk_salt_lock, RW_READER);
 
 	memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
 	need_rotation = (atomic_inc_64_nv(&key->zk_salt_count) >=
 	    ZFS_CURRENT_MAX_SALT_USES);
 
 	rw_exit(&key->zk_salt_lock);
 
 	/* the rotation takes the lock as writer, so do it after dropping it */
 	if (!need_rotation)
 		return (0);
 
 	return (zio_crypt_key_change_salt(key));
 }
 
 /*
  * NOTE(review): nothing in the visible part of this file reads or writes
  * these two globals. Judging by the names they are debugging aids for
  * inspecting a buffer that failed to decrypt -- confirm against their users.
  */
 void *failed_decrypt_buf;
 int failed_decrypt_size;
 
 /*
  * This function handles all encryption and decryption in zfs, using
  * FreeBSD's OpenCrypto (FOC) rather than the ICP.
  *
  * The big difference between the two is that FOC works in place on a single
  * buffer. The caller must therefore copy the source into the destination
  * beforehand and pass one uio that already has the AAD and the room for the
  * tag/MAC associated with it. Since the auth data is part of the iovec
  * array, only its length is needed: an auth_len of 0 means there is no AAD.
  * datalen is the length of the plaintext / ciphertext alone.
  *
  * Returns ENOTSUP if crypt is not an AES-GCM or AES-CCM suite. A failure of
  * freebsd_crypt_uio() is reported as EIO when encrypting and as ECKSUM when
  * decrypting.
  */
 static int
 zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
     uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
     zfs_uio_t *uio, uint_t auth_len)
 {
 	const zio_crypt_info_t *ci = &zio_crypt_table[crypt];
 	boolean_t supported = (ci->ci_crypt_type == ZC_TYPE_GCM ||
 	    ci->ci_crypt_type == ZC_TYPE_CCM);
 	int ret;
 
 	if (!supported)
 		return (ENOTSUP);
 
 	ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
 	    datalen, auth_len);
 	if (ret == 0)
 		return (0);
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%d):  Returning error %s\n",
 	    __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
 #endif
 	/* the underlying error code is deliberately not passed through */
 	ret = SET_ERROR(encrypt ? EIO : ECKSUM);
 
 	return (ret);
 }
 
 /*
  * Wrap (encrypt) the master key and the HMAC key of key with the wrapping
  * key cwkey.
  *
  * A fresh IV is written to iv (WRAPPING_IV_LEN bytes), the wrapped master
  * key to keydata_out, the wrapped HMAC key to hmac_keydata_out and the
  * resulting MAC to mac (WRAPPING_MAC_LEN bytes). The key's guid and, for
  * keys that are not version 0, the crypt suite and version are
  * authenticated as AAD.
  *
  * Returns 0 on success or the error of the step that failed.
  */
 int
 zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
     uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
 {
 	uint64_t crypt = key->zk_crypt;
 	uint64_t aad[3];
 	/*
 	 * With OpenCrypto in FreeBSD, the same buffer is used for
 	 * input and output.  Also, the AAD (for AES-GCM at least)
 	 * needs to logically go in front.
 	 */
 	zfs_uio_t cuio;
 	struct uio cuio_s;
 	iovec_t iov[4];
 	uint_t keylen, enclen, aadlen;
 	int err;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	zfs_uio_init(&cuio, &cuio_s);
 
 	keylen = zio_crypt_table[crypt].ci_keylen;
 	enclen = keylen + SHA512_HMAC_KEYLEN;
 
 	/* generate iv for wrapping the master and hmac key */
 	err = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Although we don't support writing to the old format, we do
 	 * support rewrapping the key so that the user can move and
 	 * quarantine datasets on the old format. Version 0 only
 	 * authenticates the guid.
 	 */
 	aad[0] = LE_64(key->zk_guid);
 	aadlen = sizeof (uint64_t);
 	if (key->zk_version != 0) {
 		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(key->zk_version);
 		aadlen = sizeof (uint64_t) * 3;
 	}
 
 	/*
 	 * Since we only support one buffer, the plain text (source) is
 	 * copied to the cipher buffers (dest) and encrypted in place.
 	 */
 	memcpy(keydata_out, key->zk_master_keydata, keylen);
 	memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 
 	/* layout: AAD, master key, hmac key, MAC */
 	iov[0].iov_base = aad;
 	iov[0].iov_len = aadlen;
 	iov[1].iov_base = keydata_out;
 	iov[1].iov_len = keylen;
 	iov[2].iov_base = hmac_keydata_out;
 	iov[2].iov_len = SHA512_HMAC_KEYLEN;
 	iov[3].iov_base = mac;
 	iov[3].iov_len = WRAPPING_MAC_LEN;
 
 	GET_UIO_STRUCT(&cuio)->uio_iov = iov;
 	zfs_uio_iovcnt(&cuio) = 4;
 	zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
 
 	/* encrypt the keys and store the resulting ciphertext and mac */
 	return (zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
 	    iv, enclen, &cuio, aadlen));
 }
 
 /*
  * Unwrap (decrypt and authenticate) a wrapped master key and HMAC key with
  * the wrapping key cwkey and build a usable key from them.
  *
  * keydata and hmac_keydata hold the wrapped keys, iv and mac the values
  * produced when they were wrapped. guid and, for versions other than 0,
  * crypt and version are authenticated as AAD. On success key gets a fresh
  * random salt, a current key derived from the master key and a new
  * OpenCrypto session.
  *
  * Returns 0 on success or the error of the step that failed, in which case
  * key is left destroyed and zeroed.
  */
 int
 zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
     uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
     uint8_t *mac, zio_crypt_key_t *key)
 {
 	uint64_t aad[3];
 	/*
 	 * With OpenCrypto in FreeBSD, the same buffer is used for
 	 * input and output.  Also, the AAD (for AES-GCM at least)
 	 * needs to logically go in front.
 	 */
 	zfs_uio_t cuio;
 	struct uio cuio_s;
 	iovec_t iov[4];
 	uint_t keylen, enclen, aadlen;
 	int err;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	keylen = zio_crypt_table[crypt].ci_keylen;
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	zfs_uio_init(&cuio, &cuio_s);
 
 	/*
 	 * Since we only support one buffer, the encrypted keys (source)
 	 * are copied into the key structure (dest) and decrypted in place.
 	 */
 	memcpy(key->zk_master_keydata, keydata, keylen);
 	memcpy(key->zk_hmac_keydata, hmac_keydata, SHA512_HMAC_KEYLEN);
 
 	/* version 0 only authenticates the guid */
 	aad[0] = LE_64(guid);
 	aadlen = sizeof (uint64_t);
 	if (version != 0) {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(version);
 		aadlen = sizeof (uint64_t) * 3;
 	}
 
 	enclen = keylen + SHA512_HMAC_KEYLEN;
 
 	/* layout: AAD, master key, hmac key, MAC */
 	iov[0].iov_base = aad;
 	iov[0].iov_len = aadlen;
 	iov[1].iov_base = key->zk_master_keydata;
 	iov[1].iov_len = keylen;
 	iov[2].iov_base = key->zk_hmac_keydata;
 	iov[2].iov_len = SHA512_HMAC_KEYLEN;
 	iov[3].iov_base = mac;
 	iov[3].iov_len = WRAPPING_MAC_LEN;
 
 	GET_UIO_STRUCT(&cuio)->uio_iov = iov;
 	zfs_uio_iovcnt(&cuio) = 4;
 	zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
 
 	/* decrypt the keys and store the result in the output buffers */
 	err = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
 	    iv, enclen, &cuio, aadlen);
 	if (err != 0)
 		goto fail;
 
 	/* generate a fresh salt */
 	err = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (err != 0)
 		goto fail;
 
 	/* derive the current key from the master key */
 	err = hkdf_sha512(key->zk_master_keydata, keylen, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keylen);
 	if (err != 0)
 		goto fail;
 
 	/* initialize keys for ICP */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keylen);
 
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	err = freebsd_crypt_newsession(&key->zk_session,
 	    &zio_crypt_table[crypt], &key->zk_current_key);
 	if (err != 0)
 		goto fail;
 
 	key->zk_crypt = crypt;
 	key->zk_version = version;
 	key->zk_guid = guid;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 fail:
 	/* also wipes whatever key material was copied or decrypted so far */
 	zio_crypt_key_destroy_early(key);
 	return (err);
 }
 
 /*
  * Fill ivbuf (ZIO_DATA_IV_LEN bytes) with a randomly generated IV.
  *
  * Returns 0 on success. On failure the error from the random source is
  * returned and ivbuf is zeroed.
  */
 int
 zio_crypt_generate_iv(uint8_t *ivbuf)
 {
 	int err = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
 
 	/* never hand back a partially filled IV */
 	if (err != 0)
 		memset(ivbuf, 0, ZIO_DATA_IV_LEN);
 
 	return (err);
 }
 
 /*
  * Compute the HMAC of data (datalen bytes) with the key's HMAC key and copy
  * the first digestlen bytes of it to digestbuf. digestlen may not exceed
  * SHA512_DIGEST_LENGTH.
  *
  * Always returns 0.
  */
 int
 zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
     uint8_t *digestbuf, uint_t digestlen)
 {
 	uint8_t full_digest[SHA512_DIGEST_LENGTH];
 
 	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
 
 	/* compute the full digest, then truncate to what was asked for */
 	crypto_mac(&key->zk_hmac_key, data, datalen, full_digest,
 	    SHA512_DIGEST_LENGTH);
 	memcpy(digestbuf, full_digest, digestlen);
 
 	return (0);
 }
 
 /*
  * Derive the salt and IV for a dedup block from an HMAC of its plaintext:
  * the first ZIO_DATA_SALT_LEN bytes of the HMAC become the salt and the
  * following ZIO_DATA_IV_LEN bytes become the IV.
  *
  * Returns 0 on success or the error from zio_crypt_do_hmac(), in which case
  * salt and ivbuf are not written.
  */
 int
 zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
     uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
 {
 	uint8_t hmac[SHA512_DIGEST_LENGTH];
 	int err;
 
 	err = zio_crypt_do_hmac(key, data, datalen, hmac,
 	    SHA512_DIGEST_LENGTH);
 	if (err == 0) {
 		memcpy(salt, hmac, ZIO_DATA_SALT_LEN);
 		memcpy(ivbuf, hmac + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN);
 	}
 
 	return (err);
 }
 
 /*
  * The following functions are used to encode and decode encryption parameters
  * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
  * byte strings, which normally means that these strings would not need to deal
  * with byteswapping at all. However, both blkptr_t and zil_header_t may be
  * byteswapped by lower layers and so we must "undo" that byteswap here upon
  * decoding and encoding in a non-native byteorder. These functions require
  * that the byteorder bit is correct before being called.
  */
 /*
  * Store the salt (8 bytes) and the IV (8 + 4 bytes) in an encrypted bp: the
  * salt goes to word 0 of DVA[2], the first 8 IV bytes to word 1 of DVA[2]
  * and the last 4 IV bytes are stored with BP_SET_IV2(). If the bp should be
  * byteswapped, each value is swapped before it is stored.
  */
 void
 zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt_word, iv_word;
 	uint32_t iv_tail;
 
 	ASSERT(BP_IS_ENCRYPTED(bp));
 
 	/* pull the byte strings into integers without assuming alignment */
 	memcpy(&salt_word, salt, sizeof (uint64_t));
 	memcpy(&iv_word, iv, sizeof (uint64_t));
 	memcpy(&iv_tail, iv + sizeof (uint64_t), sizeof (uint32_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt_word = BSWAP_64(salt_word);
 		iv_word = BSWAP_64(iv_word);
 		iv_tail = BSWAP_32(iv_tail);
 	}
 
 	bp->blk_dva[2].dva_word[0] = salt_word;
 	bp->blk_dva[2].dva_word[1] = iv_word;
 	BP_SET_IV2(bp, iv_tail);
 }
 
 /*
  * Read the salt (8 bytes) and the IV (8 + 4 bytes) back out of a protected
  * bp, undoing the byteswap if the bp should be byteswapped. For a bp that
  * is only authenticated, salt and iv are zeroed instead.
  */
 void
 zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt_word, iv_word;
 	uint32_t iv_tail;
 
 	ASSERT(BP_IS_PROTECTED(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		memset(salt, 0, ZIO_DATA_SALT_LEN);
 		memset(iv, 0, ZIO_DATA_IV_LEN);
 		return;
 	}
 
 	salt_word = bp->blk_dva[2].dva_word[0];
 	iv_word = bp->blk_dva[2].dva_word[1];
 	iv_tail = (uint32_t)BP_GET_IV2(bp);
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt_word = BSWAP_64(salt_word);
 		iv_word = BSWAP_64(iv_word);
 		iv_tail = BSWAP_32(iv_tail);
 	}
 
 	memcpy(salt, &salt_word, sizeof (uint64_t));
 	memcpy(iv, &iv_word, sizeof (uint64_t));
 	memcpy(iv + sizeof (uint64_t), &iv_tail, sizeof (uint32_t));
 }
 
 /*
  * Store a 16 byte MAC in words 2 and 3 of the bp's checksum, byteswapping
  * each word if the bp should be byteswapped. Must not be used on
  * DMU_OT_OBJSET bps.
  */
 void
 zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t words[2];
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
 
 	memcpy(&words[0], mac, sizeof (uint64_t));
 	memcpy(&words[1], mac + sizeof (uint64_t), sizeof (uint64_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		words[0] = BSWAP_64(words[0]);
 		words[1] = BSWAP_64(words[1]);
 	}
 
 	bp->blk_cksum.zc_word[2] = words[0];
 	bp->blk_cksum.zc_word[3] = words[1];
 }
 
 /*
  * Read the 16 byte MAC out of words 2 and 3 of the bp's checksum, undoing
  * the byteswap if the bp should be byteswapped. For DMU_OT_OBJSET bps the
  * MAC is reported as all zeroes (ZIO_DATA_MAC_LEN bytes).
  */
 void
 zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t words[2];
 
 	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		memset(mac, 0, ZIO_DATA_MAC_LEN);
 		return;
 	}
 
 	words[0] = bp->blk_cksum.zc_word[2];
 	words[1] = bp->blk_cksum.zc_word[3];
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		words[0] = BSWAP_64(words[0]);
 		words[1] = BSWAP_64(words[1]);
 	}
 
 	memcpy(mac, &words[0], sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &words[1], sizeof (uint64_t));
 }
 
 void
 zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
 {
 	zil_chain_t *zilc = data;
 
 	memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
 	memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
 	    sizeof (uint64_t));
 }
 
 void
 zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
 {
 	/*
 	 * The ZIL MAC is embedded in the block it protects, which will
 	 * not have been byteswapped by the time this function has been called.
 	 * As a result, we don't need to worry about byteswapping the MAC.
 	 */
 	const zil_chain_t *zilc = data;
 
 	memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
 	    sizeof (uint64_t));
 }
 
 /*
  * Copy only the bonus buffers of a block of dnodes (src_abd) to the same
  * offsets in dst. datalen is the size of both src_abd and the dst buffer
  * (not just the length of the bonus buffers).
  *
  * A dnode's bonus area is copied only if the dnode is allocated, its bonus
  * type is an encrypted one and its bonus length is not zero. The walk moves
  * from dnode to dnode by dn_extra_slots + 1 slots.
  */
 void
 zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
 {
 	uint_t nslots = datalen >> DNODE_SHIFT;
 	uint_t slot = 0;
 	uint8_t *srcbuf;
 	dnode_phys_t *src_dn, *dst_dn;
 
 	srcbuf = abd_borrow_buf_copy(src_abd, datalen);
 
 	src_dn = (dnode_phys_t *)srcbuf;
 	dst_dn = (dnode_phys_t *)dst;
 
 	while (slot < nslots) {
 		dnode_phys_t *cur = &src_dn[slot];
 
 		if (cur->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(cur->dn_bonustype) &&
 		    cur->dn_bonuslen != 0) {
 			memcpy(DN_BONUS(&dst_dn[slot]), DN_BONUS(cur),
 			    DN_MAX_BONUS_LEN(cur));
 		}
 
 		/* skip over the extra slots this dnode occupies */
 		slot += cur->dn_extra_slots + 1;
 	}
 
 	abd_return_buf(src_abd, srcbuf, datalen);
 }
 
 /*
  * This function decides which fields from blk_prop are included in the
  * various on-disk MAC algorithms, by clearing (or pinning to a constant)
  * every field of bp->blk_prop that must not take part. The bp is modified
  * in place.
  */
 static void
 zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
 {
 	/* kept in a variable rather than passed as a literal */
 	int fixed_psize = SPA_MINBLOCKSIZE;
 
 	if (version == 0) {
 		/*
 		 * Version 0 did not properly zero out all non-portable
 		 * fields as it should have done. We maintain this code so
 		 * that we can do read-only imports of pools on this version.
 		 */
 		BP_SET_DEDUP(bp, 0);
 		BP_SET_CHECKSUM(bp, 0);
 		BP_SET_PSIZE(bp, fixed_psize);
 	} else {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 
 		if (BP_IS_HOLE(bp)) {
 			/*
 			 * The hole_birth feature might set these fields even
 			 * if this bp is a hole. We zero them out here to
 			 * guarantee that raw sends will function with or
 			 * without the feature.
 			 */
 			bp->blk_prop = 0ULL;
 		} else {
 			/*
 			 * At L0 we want to verify these fields to ensure that
 			 * data blocks can not be reinterpreted. For instance,
 			 * we do not want an attacker to trick us into
 			 * returning raw lz4 compressed data to the user by
 			 * modifying the compression bits. At higher levels,
 			 * we cannot enforce this policy since raw sends do
 			 * not convey any information about indirect blocks,
 			 * so these values might be different on the receive
 			 * side. Fortunately, this does not open any new
 			 * attack vectors, since any alterations that can be
 			 * made to a higher level bp must still verify the
 			 * correct order of the layer below it.
 			 */
 			if (BP_GET_LEVEL(bp) != 0) {
 				BP_SET_BYTEORDER(bp, 0);
 				BP_SET_COMPRESS(bp, 0);
 
 				/*
 				 * psize cannot be set to zero or it will
 				 * trigger asserts, but the value doesn't
 				 * really matter as long as it is constant.
 				 */
 				BP_SET_PSIZE(bp, fixed_psize);
 			}
 
 			BP_SET_DEDUP(bp, 0);
 			BP_SET_CHECKSUM(bp, 0);
 		}
 	}
 }
 
 /*
  * Build the authentication buffer for a bp.
  *
  * Works on a private copy of *bp (byteswapped first if should_bswap is
  * set): the MAC is decoded into bab->bab_mac, the non-portable blk_prop
  * fields are zeroed and the result is stored little-endian in
  * bab->bab_prop. *bab_len is set to the number of leading bytes of *bab
  * that are to be authenticated, which leaves out the trailing pad for
  * version 0.
  */
 static void
 zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
     blkptr_auth_buf_t *bab, uint_t *bab_len)
 {
 	blkptr_t scratch = *bp;
 
 	if (should_bswap)
 		byteswap_uint64_array(&scratch, sizeof (blkptr_t));
 
 	ASSERT(BP_USES_CRYPT(&scratch) || BP_IS_HOLE(&scratch));
 	ASSERT0(BP_IS_EMBEDDED(&scratch));
 
 	zio_crypt_decode_mac_bp(&scratch, bab->bab_mac);
 
 	/*
 	 * We always MAC blk_prop in LE to ensure portability. This
 	 * must be done after decoding the mac, since the endianness
 	 * will get zero'd out here.
 	 */
 	zio_crypt_bp_zero_nonportable_blkprop(&scratch, version);
 	bab->bab_prop = LE_64(scratch.blk_prop);
 	bab->bab_pad = 0ULL;
 
 	/* version 0 did not include the padding */
 	if (version == 0)
 		*bab_len = sizeof (blkptr_auth_buf_t) - sizeof (uint64_t);
 	else
 		*bab_len = sizeof (blkptr_auth_buf_t);
 }
 
 /*
  * Feed the authentication buffer of bp into the MAC context ctx.
  * Always returns 0.
  */
 static int
 zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 	crypto_mac_update(ctx, &auth, auth_len);
 
 	return (0);
 }
 
 /*
  * Feed the authentication buffer of bp into the SHA2 context ctx.
  */
 static void
 zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 	SHA2Update(ctx, &auth, auth_len);
 }
 
 /*
  * Append the authentication buffer of bp to an AAD buffer: the bytes are
  * copied to *aadp, then *aadp is advanced past them and *aad_len is
  * increased by their count.
  * NOTE(review): no bounds are checked here; presumably the caller sizes the
  * AAD buffer for every bp it appends -- confirm.
  */
 static void
 zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 
 	memcpy(*aadp, &auth, auth_len);
 	*aadp += auth_len;
 	*aad_len += auth_len;
 }
 
 /*
  * Feed one dnode into the MAC context ctx.
  *
  * The core of the dnode (everything before dn_blkptr) is authenticated
  * from a scratch copy in which the multi-byte fields are byteswapped when
  * should_bswap equals ZFS_HOST_BYTEORDER, dn_flags is reduced to
  * DNODE_CRYPT_PORTABLE_FLAGS_MASK and dn_used is zeroed. After that every
  * block pointer of the dnode and, if DNODE_FLAG_SPILL_BLKPTR is set, its
  * spill block pointer are authenticated.
  *
  * Returns 0 on success or the first error from
  * zio_crypt_bp_do_hmac_updates().
  */
 static int
 zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, dnode_phys_t *dnp)
 {
 	uint8_t core[offsetof(dnode_phys_t, dn_blkptr)];
 	dnode_phys_t *core_dn = (dnode_phys_t *)core;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	int err, idx;
 
 	/* authenticate the core dnode (masking out non-portable bits) */
 	memcpy(core, dnp, sizeof (core));
 	if (le_bswap) {
 		core_dn->dn_datablkszsec = BSWAP_16(core_dn->dn_datablkszsec);
 		core_dn->dn_bonuslen = BSWAP_16(core_dn->dn_bonuslen);
 		core_dn->dn_maxblkid = BSWAP_64(core_dn->dn_maxblkid);
 		core_dn->dn_used = BSWAP_64(core_dn->dn_used);
 	}
 	core_dn->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 	core_dn->dn_used = 0;
 
 	crypto_mac_update(ctx, core_dn, sizeof (core));
 
 	/* the block pointers are read from the original dnode */
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		err = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, &dnp->dn_blkptr[idx]);
 		if (err != 0)
 			return (err);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		err = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, DN_SPILL_BLKPTR(dnp));
 		if (err != 0)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * objset_phys_t blocks introduce a number of exceptions to the normal
  * authentication process. objset_phys_t's contain 2 separate HMACS for
  * protecting the integrity of their data. The portable_mac protects the
  * metadnode. This MAC can be sent with a raw send and protects against
  * reordering of data within the metadnode. The local_mac protects the user
  * accounting objects which are not sent from one system to another.
  *
  * In addition, objset blocks are the only blocks that can be modified and
  * written to disk without the key loaded under certain circumstances. During
  * zil_claim() we need to be able to update the zil_header_t to complete
  * claiming log blocks and during raw receives we need to write out the
  * portable_mac from the send file. Both of these actions are possible
  * because these fields are not protected by either MAC so neither one will
  * need to modify the MACs without the key. However, when the modified blocks
  * are written out they will be byteswapped into the host machine's native
  * endianness which will modify fields protected by the MAC. As a result, MAC
  * calculation for objset blocks works slightly differently from other block
  * types. Where other block types MAC the data in whatever endianness is
  * written to disk, objset blocks always MAC little endian version of their
  * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
  * and le_bswap indicates whether a byteswap is needed to get this block
  * into little endian format.
  */
 int
 zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
     boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
 {
 	int ret;
 	struct hmac_ctx hash_ctx;
 	struct hmac_ctx *ctx = &hash_ctx;
 	objset_phys_t *osp = data;
 	uint64_t intval;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
 	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
 
 
 	/* calculate the portable MAC from the portable fields and metadnode */
 	crypto_mac_init(ctx, &key->zk_hmac_key);
 
 	/* add in the os_type */
 	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/* add in the portable os_flags */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/* add in fields from the metadnode */
 	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 	    should_bswap, &osp->os_meta_dnode);
 	if (ret)
 		goto error;
 
 	crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
 
 	memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN);
 
 	/*
 	 * This is necessary here as we check next whether
 	 * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to
 	 * decide if the local_mac should be zeroed out. That flag will always
 	 * be set by dmu_objset_id_quota_upgrade_cb() and
 	 * dmu_objset_userspace_upgrade_cb() if useraccounting has been
 	 * completed.
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	boolean_t uacct_incomplete =
 	    !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 
 	/*
 	 * The local MAC protects the user, group and project accounting.
 	 * If these objects are not present, the local MAC is zeroed out.
 	 */
 	if (uacct_incomplete ||
 	    (datalen >= OBJSET_PHYS_SIZE_V3 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
 		memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 		return (0);
 	}
 
 	/* calculate the local MAC from the userused and groupused dnodes */
 	crypto_mac_init(ctx, &key->zk_hmac_key);
 
 	/* add in the non-portable os_flags */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/* XXX check dnode type ... */
 	/* add in fields from the user accounting dnodes */
 	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_userused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_groupused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
 	    datalen >= OBJSET_PHYS_SIZE_V3) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_projectused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
 
 	memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN);
 
 	return (0);
 
 error:
 	memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN);
 	memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 	return (ret);
 }
 
 static void
 zio_crypt_destroy_uio(zfs_uio_t *uio)
 {
 	if (GET_UIO_STRUCT(uio)->uio_iov)
 		kmem_free(GET_UIO_STRUCT(uio)->uio_iov,
 		    zfs_uio_iovcnt(uio) * sizeof (iovec_t));
 }
 
 /*
  * This function parses an uncompressed indirect block and returns a checksum
  * of all the portable fields from all of the contained bps. The portable
  * fields are the MAC and all of the fields from blk_prop except for the dedup,
  * checksum, and psize bits. For an explanation of the purpose of this, see
  * the comment block on object set authentication.
  */
 static int
 zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
     uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
 {
 	blkptr_t *bp;
 	int i, epb = datalen >> SPA_BLKPTRSHIFT;
 	SHA2_CTX ctx;
 	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
 
 	/* checksum all of the MACs from the layer below */
 	SHA2Init(SHA512, &ctx);
 	for (i = 0, bp = buf; i < epb; i++, bp++) {
 		zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
 		    byteswap, bp);
 	}
 	SHA2Final(digestbuf, &ctx);
 
 	if (generate) {
 		memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN);
 		return (0);
 	}
 
 	if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
 #ifdef FCRYPTO_DEBUG
 		printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
 #endif
 		return (SET_ERROR(ECKSUM));
 	}
 	return (0);
 }
 
 int
 zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	int ret;
 
 	/*
 	 * Unfortunately, callers of this function will not always have
 	 * easy access to the on-disk format version. This info is
 	 * normally found in the DSL Crypto Key, but the checksum-of-MACs
 	 * is expected to be verifiable even when the key isn't loaded.
 	 * Here, instead of doing a ZAP lookup for the version for each
 	 * zio, we simply try both existing formats.
 	 */
 	ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
 	if (ret == ECKSUM) {
 		ASSERT(!generate);
 		ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
 		    buf, datalen, 0, byteswap, cksum);
 	}
 
 	return (ret);
 }
 
 int
 zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	int ret;
 	void *buf;
 
 	buf = abd_borrow_buf_copy(abd, datalen);
 	ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
 	    byteswap, cksum);
 	abd_return_buf(abd, buf, datalen);
 
 	return (ret);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting ZIL blocks.
  * We do not check for the older ZIL chain because the encryption feature
  * was not available before the newer ZIL chain was introduced. The goal
  * here is to encrypt everything except the blkptr_t of a lr_write_t and
  * the zil_chain_t header. Everything that is not encrypted is authenticated.
  */
 /*
  * The OpenCrypto used in FreeBSD does not use separate source and
  * destination buffers; instead, the same buffer is used.  Further, to
  * accommodate some of the drivers, the authbuf needs to be logically before
  * the data.  This means that we need to copy the source to the destination,
  * and set up an extra iovec_t at the beginning to handle the authbuf.
  * It also means we'll only return one zfs_uio_t.
  */
 
 static int
 zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
     zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
     boolean_t *no_crypt)
 {
 	(void) puio;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
 	iovec_t *dst_iovecs;
 	zil_chain_t *zilc;
 	lr_t *lr;
-	uint64_t txtype, lr_len;
+	uint64_t txtype, lr_len, nused;
 	uint_t crypt_len, nr_iovecs, vec;
 	uint_t aad_len = 0, total_len = 0;
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 	memcpy(dst, src, datalen);
 
 	/* Find the start and end record of the log block. */
 	zilc = (zil_chain_t *)src;
 	slrp = src + sizeof (zil_chain_t);
 	aadp = aadbuf;
-	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+	nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+	ASSERT3U(nused, >=, sizeof (zil_chain_t));
+	ASSERT3U(nused, <=, datalen);
+	blkend = src + nused;
 
 	/*
 	 * Calculate the number of encrypted iovecs we will need.
 	 */
 
 	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
 	nr_iovecs = 2;
 
 	for (; slrp < blkend; slrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (byteswap) {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		} else {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		}
+		ASSERT3U(lr_len, >=, sizeof (lr_t));
+		ASSERT3U(lr_len, <=, blkend - slrp);
 
 		nr_iovecs++;
 		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
 			nr_iovecs++;
 	}
 
 	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
 
 	/*
 	 * Copy the plain zil header over and authenticate everything except
 	 * the checksum that will store our MAC. If we are writing the data
 	 * the embedded checksum will not have been calculated yet, so we don't
 	 * authenticate that.
 	 */
 	memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t));
 	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 
 	slrp = src + sizeof (zil_chain_t);
 	dlrp = dst + sizeof (zil_chain_t);
 
 	/*
 	 * Loop over records again, filling in iovecs.
 	 */
 
 	/* The first iovec will contain the authbuf. */
 	vec = 1;
 
 	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 
 		/* copy the common lr_t */
 		memcpy(dlrp, slrp, sizeof (lr_t));
 		memcpy(aadp, slrp, sizeof (lr_t));
 		aadp += sizeof (lr_t);
 		aad_len += sizeof (lr_t);
 
 		/*
 		 * If this is a TX_WRITE record we want to encrypt everything
 		 * except the bp if exists. If the bp does exist we want to
 		 * authenticate it.
 		 */
 		if (txtype == TX_WRITE) {
 			crypt_len = sizeof (lr_write_t) -
 			    sizeof (lr_t) - sizeof (blkptr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp +
 			    sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			/* copy the bp now since it will not be encrypted */
 			memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    sizeof (blkptr_t));
 			memcpy(aadp,
 			    slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    sizeof (blkptr_t));
 			aadp += sizeof (blkptr_t);
 			aad_len += sizeof (blkptr_t);
 			vec++;
 			total_len += crypt_len;
 
 			if (lr_len != sizeof (lr_write_t)) {
 				crypt_len = lr_len - sizeof (lr_write_t);
 				dst_iovecs[vec].iov_base = (char *)
 				    dlrp + sizeof (lr_write_t);
 				dst_iovecs[vec].iov_len = crypt_len;
 				vec++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			vec++;
 			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp +
 			    sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 			vec++;
 			total_len += crypt_len;
 		}
 	}
 
 	/* The last iovec will contain the MAC. */
 	ASSERT3U(vec, ==, nr_iovecs - 1);
 
 	/* AAD */
 	dst_iovecs[0].iov_base = aadbuf;
 	dst_iovecs[0].iov_len = aad_len;
 	/* MAC */
 	dst_iovecs[vec].iov_base = 0;
 	dst_iovecs[vec].iov_len = 0;
 
 	*no_crypt = (vec == 1);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_iovecs;
 
 	return (0);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting dnode blocks.
  */
 static int
 zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
     uint_t *auth_len, boolean_t *no_crypt)
 {
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 	uint8_t *src, *dst, *aadp;
 	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
 	iovec_t *dst_iovecs;
 	uint_t nr_iovecs, crypt_len, vec;
 	uint_t aad_len = 0, total_len = 0;
 	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 	memcpy(dst, src, datalen);
 
 	sdnp = (dnode_phys_t *)src;
 	ddnp = (dnode_phys_t *)dst;
 	aadp = aadbuf;
 
 	/*
 	 * Count the number of iovecs we will need to do the encryption by
 	 * counting the number of bonus buffers that need to be encrypted.
 	 */
 
 	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
 	nr_iovecs = 2;
 
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		/*
 		 * This block may still be byteswapped. However, all of the
 		 * values we use are either uint8_t's (for which byteswapping
 		 * is a noop) or a * != 0 check, which will work regardless
 		 * of whether or not we byteswap.
 		 */
 		if (sdnp[i].dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
 		    sdnp[i].dn_bonuslen != 0) {
 			nr_iovecs++;
 		}
 	}
 
 	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
 
 	/*
 	 * Iterate through the dnodes again, this time filling in the uios
 	 * we allocated earlier. We also concatenate any data we want to
 	 * authenticate onto aadbuf.
 	 */
 
 	/* The first iovec will contain the authbuf. */
 	vec = 1;
 
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		dnp = &sdnp[i];
 
 		/* copy over the core fields and blkptrs (kept as plaintext) */
 		memcpy(&ddnp[i], dnp,
 		    (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp),
 			    sizeof (blkptr_t));
 		}
 
 		/*
 		 * Handle authenticated data. We authenticate everything in
 		 * the dnode that can be brought over when we do a raw send.
 		 * This includes all of the core fields as well as the MACs
 		 * stored in the bp checksums and all of the portable bits
 		 * from blk_prop. We include the dnode padding here in case it
 		 * ever gets used in the future. Some dn_flags and dn_used are
 		 * not portable so we mask those out values out of the
 		 * authenticated data.
 		 */
 		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
 		memcpy(aadp, dnp, crypt_len);
 		adnp = (dnode_phys_t *)aadp;
 		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 		adnp->dn_used = 0;
 		aadp += crypt_len;
 		aad_len += crypt_len;
 
 		for (j = 0; j < dnp->dn_nblkptr; j++) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, &dnp->dn_blkptr[j]);
 		}
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, DN_SPILL_BLKPTR(dnp));
 		}
 
 		/*
 		 * If this bonus buffer needs to be encrypted, we prepare an
 		 * iovec_t. The encryption / decryption functions will fill
 		 * this in for us with the encrypted or decrypted data.
 		 * Otherwise we add the bonus buffer to the authenticated
 		 * data buffer and copy it over to the destination. The
 		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
 		 * we can guarantee alignment with the AES block size
 		 * (128 bits).
 		 */
 		crypt_len = DN_MAX_BONUS_LEN(dnp);
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			vec++;
 			total_len += crypt_len;
 		} else {
 			memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len);
 			memcpy(aadp, DN_BONUS(dnp), crypt_len);
 			aadp += crypt_len;
 			aad_len += crypt_len;
 		}
 	}
 
 	/* The last iovec will contain the MAC. */
 	ASSERT3U(vec, ==, nr_iovecs - 1);
 
 	/* AAD */
 	dst_iovecs[0].iov_base = aadbuf;
 	dst_iovecs[0].iov_len = aad_len;
 	/* MAC */
 	dst_iovecs[vec].iov_base = 0;
 	dst_iovecs[vec].iov_len = 0;
 
 	*no_crypt = (vec == 1);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_iovecs;
 
 	return (0);
 }
 
 static int
 zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio,
     uint_t *enc_len)
 {
 	(void) puio;
 	int ret;
 	uint_t nr_plain = 1, nr_cipher = 2;
 	iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
 	void *src, *dst;
 
 	cipher_iovecs = kmem_zalloc(nr_cipher * sizeof (iovec_t),
 	    KM_SLEEP);
 	if (!cipher_iovecs) {
 		ret = SET_ERROR(ENOMEM);
 		goto error;
 	}
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 	memcpy(dst, src, datalen);
 	cipher_iovecs[0].iov_base = dst;
 	cipher_iovecs[0].iov_len = datalen;
 
 	*enc_len = datalen;
 	GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_cipher;
 
 	return (0);
 
 error:
 	if (plain_iovecs != NULL)
 		kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
 	if (cipher_iovecs != NULL)
 		kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
 
 	*enc_len = 0;
 	GET_UIO_STRUCT(out_uio)->uio_iov = NULL;
 	zfs_uio_iovcnt(out_uio) = 0;
 
 	return (ret);
 }
 
 /*
  * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
  * that they can be used for encryption and decryption by zio_do_crypt_uio().
  * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
  * requiring special handling to parse out pieces that are to be encrypted. The
  * authbuf is used by these special cases to store additional authenticated
  * data (AAD) for the encryption modes.
  */
 static int
 zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
     uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
 {
 	int ret;
 	iovec_t *mac_iov;
 
 	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
 
 	/* route to handler */
 	switch (ot) {
 	case DMU_OT_INTENT_LOG:
 		ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
 		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
 		    no_crypt);
 		break;
 	case DMU_OT_DNODE:
 		ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
 		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
 		    auth_len, no_crypt);
 		break;
 	default:
 		ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
 		    datalen, puio, cuio, enc_len);
 		*authbuf = NULL;
 		*auth_len = 0;
 		*no_crypt = B_FALSE;
 		break;
 	}
 
 	if (ret != 0)
 		goto error;
 
 	/* populate the uios */
 	zfs_uio_segflg(cuio) = UIO_SYSSPACE;
 
 	mac_iov =
 	    ((iovec_t *)&(GET_UIO_STRUCT(cuio)->
 	    uio_iov[zfs_uio_iovcnt(cuio) - 1]));
 	mac_iov->iov_base = (void *)mac;
 	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
 
 	return (0);
 
 error:
 	return (ret);
 }
 
 void *failed_decrypt_buf;	/* copy of cipherbuf from last failed decrypt */
 int failed_decrypt_size;	/* allocated size of failed_decrypt_buf */
 
 /*
  * Primary encryption / decryption entrypoint for zio data.
  */
 int
 zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
     dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
     uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
     boolean_t *no_crypt)
 {
 	int ret;
 	boolean_t locked = B_FALSE;
 	uint64_t crypt = key->zk_crypt;
 	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
 	uint_t enc_len, auth_len;
 	zfs_uio_t puio, cuio;
 	struct uio puio_s, cuio_s;
 	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
 	crypto_key_t tmp_ckey, *ckey = NULL;
 	freebsd_crypt_session_t *tmpl = NULL;
 	uint8_t *authbuf = NULL;
 
 
 	zfs_uio_init(&puio, &puio_s);
 	zfs_uio_init(&cuio, &cuio_s);
 	memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio));
 	memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio));
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
 	    __FUNCTION__,
 	    encrypt ? "encrypt" : "decrypt",
 	    key, salt, ot, iv, mac, datalen,
 	    byteswap ? "byteswap" : "native_endian", plainbuf,
 	    cipherbuf, no_crypt);
 
 	printf("\tkey = {");
 	for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
 		printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
 	printf("}\n");
 #endif
 	/* create uios for encryption */
 	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
 	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
 	    &authbuf, &auth_len, no_crypt);
 	if (ret != 0)
 		return (ret);
 
 	/*
 	 * If the needed key is the current one, just use it. Otherwise we
 	 * need to generate a temporary one from the given salt + master key.
 	 * If we are encrypting, we must return a copy of the current salt
 	 * so that it can be stored in the blkptr_t.
 	 */
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	locked = B_TRUE;
 
 	if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
 		ckey = &key->zk_current_key;
 		tmpl = &key->zk_session;
 	} else {
 		rw_exit(&key->zk_salt_lock);
 		locked = B_FALSE;
 
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
 		if (ret != 0)
 			goto error;
 		tmp_ckey.ck_data = enc_keydata;
 		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 		ckey = &tmp_ckey;
 		tmpl = NULL;
 	}
 
 	/* perform the encryption / decryption */
 	ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
 	    ckey, iv, enc_len, &cuio, auth_len);
 	if (ret != 0)
 		goto error;
 	if (locked) {
 		rw_exit(&key->zk_salt_lock);
 	}
 
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	if (ckey == &tmp_ckey)
 		memset(enc_keydata, 0, keydata_len);
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 
 	return (0);
 
 error:
 	if (!encrypt) {
 		if (failed_decrypt_buf != NULL)
 			kmem_free(failed_decrypt_buf, failed_decrypt_size);
 		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
 		failed_decrypt_size = datalen;
 		memcpy(failed_decrypt_buf, cipherbuf, datalen);
 	}
 	if (locked)
 		rw_exit(&key->zk_salt_lock);
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	if (ckey == &tmp_ckey)
 		memset(enc_keydata, 0, keydata_len);
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 	return (SET_ERROR(ret));
 }
 
 /*
  * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
  * linear buffers.
  */
 int
 zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
     boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
     uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
 {
 	int ret;
 	void *ptmp, *ctmp;
 
 	if (encrypt) {
 		ptmp = abd_borrow_buf_copy(pabd, datalen);
 		ctmp = abd_borrow_buf(cabd, datalen);
 	} else {
 		ptmp = abd_borrow_buf(pabd, datalen);
 		ctmp = abd_borrow_buf_copy(cabd, datalen);
 	}
 
 	ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
 	    datalen, ptmp, ctmp, no_crypt);
 	if (ret != 0)
 		goto error;
 
 	if (encrypt) {
 		abd_return_buf(pabd, ptmp, datalen);
 		abd_return_buf_copy(cabd, ctmp, datalen);
 	} else {
 		abd_return_buf_copy(pabd, ptmp, datalen);
 		abd_return_buf(cabd, ctmp, datalen);
 	}
 
 	return (0);
 
 error:
 	if (encrypt) {
 		abd_return_buf(pabd, ptmp, datalen);
 		abd_return_buf_copy(cabd, ctmp, datalen);
 	} else {
 		abd_return_buf_copy(pabd, ptmp, datalen);
 		abd_return_buf(cabd, ctmp, datalen);
 	}
 
 	return (SET_ERROR(ret));
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* CSTYLED */
 module_param(zfs_key_max_salt_uses, ulong, 0644);
 MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
 	"can be used for generating encryption keys before it is rotated");
 #endif
diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c
index 775ab8efbcdf..2114be281901 100644
--- a/module/os/linux/zfs/zio_crypt.c
+++ b/module/os/linux/zfs/zio_crypt.c
@@ -1,2079 +1,2084 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2017, Datto, Inc. All rights reserved.
  */
 
 #include <sys/zio_crypt.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sha2.h>
 #include <sys/hkdf.h>
 #include <sys/qat.h>
 
 /*
  * This file is responsible for handling all of the details of generating
  * encryption parameters and performing encryption and authentication.
  *
  * BLOCK ENCRYPTION PARAMETERS:
  * Encryption /Authentication Algorithm Suite (crypt):
  * The encryption algorithm, mode, and key length we are going to use. We
  * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
  * keys. All authentication is currently done with SHA512-HMAC.
  *
  * Plaintext:
  * The unencrypted data that we want to encrypt.
  *
  * Initialization Vector (IV):
  * An initialization vector for the encryption algorithms. This is used to
  * "tweak" the encryption algorithms so that two blocks of the same data are
  * encrypted into different ciphertext outputs, thus obfuscating block patterns.
  * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
  * never reused with the same encryption key. This value is stored unencrypted
  * and must simply be provided to the decryption function. We use a 96 bit IV
  * (as recommended by NIST) for all block encryption. For non-dedup blocks we
  * derive the IV randomly. The first 64 bits of the IV are stored in the second
  * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
  * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
  * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
  * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
  * level 0 blocks is the number of allocated dnodes in that block. The on-disk
  * format supports at most 2^15 slots per L0 dnode block, because the maximum
  * block size is 16MB (2^24). In either case, for level 0 blocks this number
  * will still be smaller than UINT32_MAX so it is safe to store the IV in the
  * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
  * for the dnode code.
  *
  * Master key:
  * This is the most important secret data of an encrypted dataset. It is used
  * along with the salt to generate the actual encryption keys via HKDF. We
  * do not use the master key to directly encrypt any data because there are
  * theoretical limits on how much data can actually be safely encrypted with
  * any encryption mode. The master key is stored encrypted on disk with the
  * user's wrapping key. Its length is determined by the encryption algorithm.
  * For details on how this is stored see the block comment in dsl_crypt.c
  *
  * Salt:
  * Used as an input to the HKDF function, along with the master key. We use a
  * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
  * can be used for encrypting many blocks, so we cache the current salt and the
  * associated derived key in zio_crypt_key_t so we do not need to derive it
  * again needlessly.
  *
  * Encryption Key:
  * A secret binary key, generated from an HKDF function used to encrypt and
  * decrypt data.
  *
  * Message Authentication Code (MAC)
  * The MAC is an output of authenticated encryption modes such as AES-GCM and
  * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
  * data on disk and return garbage to the application. Effectively, it is a
  * checksum that can not be reproduced by an attacker. We store the MAC in the
  * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
  * regular checksum of the ciphertext which can be used for scrubbing.
  *
  * OBJECT AUTHENTICATION:
  * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
  * they contain some info that always needs to be readable. To prevent this
  * data from being altered, we authenticate this data using SHA512-HMAC. This
  * will produce a MAC (similar to the one produced via encryption) which can
  * be used to verify the object was not modified. HMACs do not require key
  * rotation or IVs, so we can keep up to the full 3 copies of authenticated
  * data.
  *
  * ZIL ENCRYPTION:
  * ZIL blocks have their bp written to disk ahead of the associated data, so we
  * cannot store the MAC there as we normally do. For these blocks the MAC is
  * stored in the embedded checksum within the zil_chain_t header. The salt and
  * IV are generated for the block on bp allocation instead of at encryption
  * time. In addition, ZIL blocks have some pieces that must be left in plaintext
  * for claiming even though all of the sensitive user data still needs to be
  * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
  * pieces of the block need to be encrypted. All data that is not encrypted is
  * authenticated using the AAD mechanisms that the supported encryption modes
  * provide for. In order to preserve the semantics of the ZIL for encrypted
  * datasets, the ZIL is not protected at the objset level as described below.
  *
  * DNODE ENCRYPTION:
  * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
  * in plaintext for scrubbing and claiming, but the bonus buffers might contain
  * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
  * which pieces of the block need to be encrypted. For more details about
  * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
  *
  * OBJECT SET AUTHENTICATION:
  * Up to this point, everything we have encrypted and authenticated has been
  * at level 0 (or -2 for the ZIL). If we did not do any further work the
  * on-disk format would be susceptible to attacks that deleted or rearranged
  * the order of level 0 blocks. Ideally, the cleanest solution would be to
  * maintain a tree of authentication MACs going up the bp tree. However, this
  * presents a problem for raw sends. Send files do not send information about
  * indirect blocks so there would be no convenient way to transfer the MACs and
  * they cannot be recalculated on the receive side without the master key which
  * would defeat one of the purposes of raw sends in the first place. Instead,
  * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
  * from the level below. We also include some portable fields from blk_prop such
  * as the lsize and compression algorithm to prevent the data from being
  * misinterpreted.
  *
  * At the objset level, we maintain 2 separate 256 bit MACs in the
  * objset_phys_t. The first one is "portable" and is the logical root of the
  * MAC tree maintained in the metadnode's bps. The second, is "local" and is
  * used as the root MAC for the user accounting objects, which are also not
  * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
  * of the send file. The useraccounting code ensures that the useraccounting
  * info is not present upon a receive, so the local MAC can simply be cleared
  * out at that time. For more info about objset_phys_t authentication, see
  * zio_crypt_do_objset_hmacs().
  *
  * CONSIDERATIONS FOR DEDUP:
  * In order for dedup to work, blocks that we want to dedup with one another
  * need to use the same IV and encryption key, so that they will have the same
  * ciphertext. Normally, one should never reuse an IV with the same encryption
  * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
  * blocks. In this case, however, since we are using the same plaintext as
  * well all that we end up with is a duplicate of the original ciphertext we
  * already had. As a result, an attacker with read access to the raw disk will
  * be able to tell which blocks are the same but this information is given away
  * by dedup anyway. In order to get the same IVs and encryption keys for
  * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
  * here so that a reproducible checksum of the plaintext is never available to
  * the attacker. The HMAC key is kept alongside the master key, encrypted on
  * disk. The first 64 bits of the HMAC are used in place of the random salt, and
  * the next 96 bits are used as the IV. As a result of this mechanism, dedup
  * will only work within a clone family since encrypted dedup requires use of
  * the same master and HMAC keys.
  */
 
 /*
  * After encrypting many blocks with the same key we may start to run up
  * against the theoretical limits of how much data can securely be encrypted
  * with a single key using the supported encryption modes. The most obvious
  * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
  * the more IVs we generate (which both GCM and CCM modes strictly forbid).
  * This risk actually grows surprisingly quickly over time according to the
  * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
  * generated n IVs with a cryptographically secure RNG, the approximate
  * probability p(n) of a collision is given as:
  *
  * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
  *
  * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
  *
  * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
  * we must not write more than 398,065,730 blocks with the same encryption key.
  * Therefore, we rotate our keys after 400,000,000 blocks have been written by
  * generating a new random 64 bit salt for our HKDF encryption key generation
  * function.
  */
 /* default value of the tunable, and the ceiling applied to it below */
 #define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
 /* effective limit: the tunable, clamped so it never exceeds the default */
 #define	ZFS_CURRENT_MAX_SALT_USES	\
 	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
 /* tunable limit; zk_salt_count is compared against it via the macro above */
 static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
 
 /*
  * Per-block-pointer buffer that is fed to the MAC / digest routines. It is
  * filled in by zio_crypt_bp_auth_init(), which also reports how many bytes
  * of it are used (version 0 keys leave out the trailing pad).
  */
 typedef struct blkptr_auth_buf {
 	uint64_t bab_prop;			/* blk_prop - portable mask */
 	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
 	uint64_t bab_pad;			/* reserved for future use */
 } blkptr_auth_buf_t;
 
 /*
  * Table of encryption suites, indexed by the crypt value (callers look up
  * zio_crypt_table[crypt]). Each entry holds the ICP mechanism name
  * (ci_mechname), the cipher mode (ci_crypt_type), the key length in bytes
  * (ci_keylen) and a name string for the suite. The first three entries have
  * no mechanism and a key length of 0.
  */
 const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
 	{"",			ZC_TYPE_NONE,	0,	"inherit"},
 	{"",			ZC_TYPE_NONE,	0,	"on"},
 	{"",			ZC_TYPE_NONE,	0,	"off"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
 };
 
 /*
  * Tear down a zio_crypt_key_t: destroy its salt lock, release both ICP
  * context templates, and wipe the whole structure.
  */
 void
 zio_crypt_key_destroy(zio_crypt_key_t *key)
 {
 	rw_destroy(&key->zk_salt_lock);
 
 	/* release the cached ICP context templates */
 	crypto_destroy_ctx_template(key->zk_hmac_tmpl);
 	crypto_destroy_ctx_template(key->zk_current_tmpl);
 
 	/* scrub the key material along with every other field */
 	memset(key, 0, sizeof (*key));
 }
 
 /*
  * Initialize `key` as a brand new key for the encryption suite `crypt`.
  *
  * The guid, master key, HMAC key and salt are filled with random bytes, the
  * current encryption key is derived from the master key and the salt with
  * hkdf_sha512(), and the ICP key descriptors and context templates are set
  * up. Returns 0 on success. If generating random data or deriving the key
  * fails, the key is torn down with zio_crypt_key_destroy() and that error
  * is returned.
  */
 int
 zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
 {
 	int ret;
 	crypto_mechanism_t mech = {0};
 	uint_t keydata_len;
 
 	ASSERT(key != NULL);
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 /*
  * Workaround for GCC 12+ with UBSan enabled deficiencies.
  *
  * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code
  * below as violating -Warray-bounds
  */
 #if defined(__GNUC__) && !defined(__clang__) && \
 	((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
 	    defined(CONFIG_UBSAN))
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 #if defined(__GNUC__) && !defined(__clang__) && \
 	((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
 	    defined(CONFIG_UBSAN))
 #pragma GCC diagnostic pop
 #endif
 	memset(key, 0, sizeof (zio_crypt_key_t));
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	/* fill keydata buffers and salt with random data */
 	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	/* derive the current key from the master key */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keydata_len);
 	if (ret != 0)
 		goto error;
 
 	/* initialize keys for the ICP */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	/*
 	 * The HMAC key descriptor must reference the HMAC key bytes (as
 	 * zio_crypt_key_unwrap() does), not the descriptor structure itself.
 	 */
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	/*
 	 * Initialize the crypto templates. It's ok if this fails because
 	 * this is just an optimization.
 	 */
 	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
 	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
 	    &key->zk_hmac_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_hmac_tmpl = NULL;
 
 	key->zk_crypt = crypt;
 	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	zio_crypt_key_destroy(key);
 	return (ret);
 }
 
 /*
  * Rotate the salt of `key`: generate a new random salt, re-derive the
  * current encryption key from the master key and that salt, reset the usage
  * count and rebuild the context template for the current key.
  *
  * Nothing is changed if, once the write lock is held, zk_salt_count is
  * already below ZFS_CURRENT_MAX_SALT_USES. Returns 0 on success or the
  * error from random_get_bytes() / hkdf_sha512(). A failure to recreate the
  * context template is not treated as an error.
  */
 static int
 zio_crypt_key_change_salt(zio_crypt_key_t *key)
 {
 	int ret = 0;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	crypto_mechanism_t mech = {0};
 	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
 
 	/* generate a new salt */
 	ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	rw_enter(&key->zk_salt_lock, RW_WRITER);
 
 	/* someone beat us to the salt rotation, just unlock and return */
 	if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
 		goto out_unlock;
 
 	/* derive the current key from the master key and the new salt */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
 	if (ret != 0)
 		goto out_unlock;
 
 	/* assign the salt and reset the usage count */
 	memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
 	key->zk_salt_count = 0;
 
 	/*
 	 * Destroy the old context template and create the new one. The
 	 * mechanism has to be filled in first, the same way it is when the
 	 * key is initialized or unwrapped; otherwise the template would be
 	 * created from an uninitialized mechanism.
 	 */
 	crypto_destroy_ctx_template(key->zk_current_tmpl);
 	mech.cm_type =
 	    crypto_mech2id(zio_crypt_table[key->zk_crypt].ci_mechname);
 	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	rw_exit(&key->zk_salt_lock);
 
 	return (0);
 
 out_unlock:
 	rw_exit(&key->zk_salt_lock);
 error:
 	return (ret);
 }
 
 /*
  * Copy the key's current salt into `salt` and count one more use of it.
  * When the count reaches ZFS_CURRENT_MAX_SALT_USES the salt is rotated
  * before returning. See comment above zfs_key_max_salt_uses definition for
  * details. Returns 0, or the error from the salt rotation.
  */
 int
 zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
 {
 	boolean_t rotate;
 
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
 	rotate = (atomic_inc_64_nv(&key->zk_salt_count) >=
 	    ZFS_CURRENT_MAX_SALT_USES);
 	rw_exit(&key->zk_salt_lock);
 
 	if (!rotate)
 		return (0);
 
 	/* the rotation takes the lock as writer itself */
 	return (zio_crypt_key_change_salt(key));
 }
 
 /*
  * Common encrypt / decrypt routine used for everything in zfs. puio
  * describes the plaintext and cuio the ciphertext; cuio must have room for
  * the ciphertext plus a MAC, and the length of its last iovec is taken as
  * the MAC length. datalen counts only the plaintext / ciphertext bytes.
  * Returns 0 on success, EIO if encryption fails and ECKSUM if decryption
  * fails.
  */
 static int
 zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
     crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
     zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
 {
 	const zio_crypt_info_t *info;
 	crypto_mechanism_t mech;
 	crypto_data_t plain, cipher;
 	CK_AES_CCM_PARAMS ccm;
 	CK_AES_GCM_PARAMS gcm;
 	uint_t mac_len, plain_len;
 	int rc;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 	info = &zio_crypt_table[crypt];
 
 	/* by convention the MAC occupies the last iovec of the cipher uio */
 	mac_len = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
 	ASSERT(mac_len <= ZIO_DATA_MAC_LEN);
 
 	/*
 	 * The ICP wants the plaintext length to include the MAC when
 	 * decrypting, even though puio has no space set aside for it.
 	 */
 	plain_len = encrypt ? datalen : datalen + mac_len;
 
 	/* the mechanism is the one named by the crypt suite */
 	mech.cm_type = crypto_mech2id(info->ci_mechname);
 
 	/* mode parameters: anything that is not CCM is treated as GCM */
 	if (info->ci_crypt_type == ZC_TYPE_CCM) {
 		ccm.ulDataSize = plain_len;
 		ccm.ulMACSize = mac_len;
 		ccm.ulNonceSize = ZIO_DATA_IV_LEN;
 		ccm.nonce = ivbuf;
 		ccm.ulAuthDataSize = auth_len;
 		ccm.authData = authbuf;
 
 		mech.cm_param = (char *)(&ccm);
 		mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
 	} else {
 		gcm.pIv = ivbuf;
 		gcm.ulIvLen = ZIO_DATA_IV_LEN;
 		gcm.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
 		gcm.pAAD = authbuf;
 		gcm.ulAADLen = auth_len;
 		gcm.ulTagBits = CRYPTO_BYTES2BITS(mac_len);
 
 		mech.cm_param = (char *)(&gcm);
 		mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
 	}
 
 	/* describe both sides to the ICP as uio backed data */
 	plain.cd_format = CRYPTO_DATA_UIO;
 	plain.cd_offset = 0;
 	plain.cd_length = plain_len;
 	plain.cd_uio = puio;
 
 	cipher.cd_format = CRYPTO_DATA_UIO;
 	cipher.cd_offset = 0;
 	cipher.cd_length = datalen + mac_len;
 	cipher.cd_uio = cuio;
 
 	if (encrypt) {
 		rc = crypto_encrypt(&mech, &plain, key, tmpl, &cipher);
 		if (rc != CRYPTO_SUCCESS)
 			return (SET_ERROR(EIO));
 
 		return (0);
 	}
 
 	rc = crypto_decrypt(&mech, &cipher, key, tmpl, &plain);
 	if (rc != CRYPTO_SUCCESS) {
 		/* the only failure expected here is a MAC mismatch */
 		ASSERT3U(rc, ==, CRYPTO_INVALID_MAC);
 		return (SET_ERROR(ECKSUM));
 	}
 
 	return (0);
 }
 
 /*
  * Encrypt the master key and HMAC key of `key` with the wrapping key cwkey.
  * A new IV is generated into `iv`, the wrapped keys are written to
  * keydata_out and hmac_keydata_out, and the MAC is written to `mac`. The
  * guid (plus, for non-zero key versions, the crypt suite and the version)
  * is authenticated as AAD. Returns 0 on success, or the error from IV
  * generation or encryption.
  */
 int
 zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
     uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
 {
 	uint64_t crypt = key->zk_crypt;
 	uint64_t aad[3];
 	uint_t keydata_len, aad_len;
 	iovec_t plain_iovecs[2], cipher_iovecs[3];
 	zfs_uio_t puio, cuio;
 	int ret;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 
 	/* every wrap of the master and hmac key gets its own iv */
 	ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
 	if (ret != 0)
 		return (ret);
 
 	/* plaintext side: master key followed by hmac key */
 	plain_iovecs[0].iov_base = key->zk_master_keydata;
 	plain_iovecs[0].iov_len = keydata_len;
 	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
 	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 
 	puio.uio_iov = plain_iovecs;
 	puio.uio_iovcnt = 2;
 	puio.uio_segflg = UIO_SYSSPACE;
 
 	/* ciphertext side: both wrapped keys, then the mac */
 	cipher_iovecs[0].iov_base = keydata_out;
 	cipher_iovecs[0].iov_len = keydata_len;
 	cipher_iovecs[1].iov_base = hmac_keydata_out;
 	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 	cipher_iovecs[2].iov_base = mac;
 	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
 
 	cuio.uio_iov = cipher_iovecs;
 	cuio.uio_iovcnt = 3;
 	cuio.uio_segflg = UIO_SYSSPACE;
 
 	/*
 	 * Although we don't support writing to the old format, we do
 	 * support rewrapping the key so that the user can move and
 	 * quarantine datasets on the old format. The old format (version
 	 * 0) authenticates only the guid.
 	 */
 	aad[0] = LE_64(key->zk_guid);
 	aad_len = sizeof (uint64_t);
 	if (key->zk_version != 0) {
 		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(key->zk_version);
 		aad_len = sizeof (aad);
 	}
 
 	/* encrypt the keys and store the resulting ciphertext and mac */
 	return (zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv,
 	    keydata_len + SHA512_HMAC_KEYLEN, &puio, &cuio, (uint8_t *)aad,
 	    aad_len));
 }
 
 /*
  * Decrypt a wrapped master key / HMAC key pair with the wrapping key cwkey
  * and build a usable zio_crypt_key_t from it. keydata, hmac_keydata, iv and
  * mac are the wrapped inputs; guid, crypt and version are authenticated as
  * AAD the same way zio_crypt_key_wrap() does. A fresh salt is generated and
  * the current encryption key is derived from it. Returns 0 on success; on
  * failure the key is destroyed and the error is returned.
  */
 int
 zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
     uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
     uint8_t *mac, zio_crypt_key_t *key)
 {
 	int ret;
 	uint64_t aad[3];
 	uint_t keydata_len, enc_len, aad_len;
 	iovec_t plain_iovecs[2], cipher_iovecs[3];
 	zfs_uio_t puio, cuio;
 	crypto_mechanism_t mech;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
 
 	/* plaintext side: the keys land directly in the key structure */
 	plain_iovecs[0].iov_base = key->zk_master_keydata;
 	plain_iovecs[0].iov_len = keydata_len;
 	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
 	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 
 	puio.uio_iov = plain_iovecs;
 	puio.uio_segflg = UIO_SYSSPACE;
 	puio.uio_iovcnt = 2;
 
 	/* ciphertext side: both wrapped keys, then the mac */
 	cipher_iovecs[0].iov_base = keydata;
 	cipher_iovecs[0].iov_len = keydata_len;
 	cipher_iovecs[1].iov_base = hmac_keydata;
 	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 	cipher_iovecs[2].iov_base = mac;
 	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
 
 	cuio.uio_iov = cipher_iovecs;
 	cuio.uio_iovcnt = 3;
 	cuio.uio_segflg = UIO_SYSSPACE;
 
 	/* version 0 authenticates only the guid */
 	aad[0] = LE_64(guid);
 	aad_len = sizeof (uint64_t);
 	if (version != 0) {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(version);
 		aad_len = sizeof (aad);
 	}
 
 	/* decrypt the keys and store the result in the output buffers */
 	ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
 	    &puio, &cuio, (uint8_t *)aad, aad_len);
 
 	/* generate a fresh salt */
 	if (ret == 0)
 		ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 
 	/* derive the current key from the master key */
 	if (ret == 0) {
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL,
 		    0, key->zk_salt, ZIO_DATA_SALT_LEN,
 		    key->zk_current_keydata, keydata_len);
 	}
 
 	/* any failure so far leaves the key unusable, so tear it down */
 	if (ret != 0) {
 		zio_crypt_key_destroy(key);
 		return (ret);
 	}
 
 	/* initialize keys for ICP */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	/*
 	 * The context templates are only an optimization, so a failure to
 	 * create one simply leaves that template NULL.
 	 */
 	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
 	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
 	    &key->zk_hmac_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_hmac_tmpl = NULL;
 
 	key->zk_guid = guid;
 	key->zk_crypt = crypt;
 	key->zk_version = version;
 	key->zk_salt_count = 0;
 
 	return (0);
 }
 
 /*
  * Fill ivbuf with ZIO_DATA_IV_LEN randomly generated bytes. Returns 0 on
  * success; on failure ivbuf is zeroed and the error is returned.
  */
 int
 zio_crypt_generate_iv(uint8_t *ivbuf)
 {
 	int err = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
 
 	/* do not leave partial output behind on failure */
 	if (err != 0)
 		memset(ivbuf, 0, ZIO_DATA_IV_LEN);
 
 	return (err);
 }
 
 /*
  * Compute the SHA512-HMAC of `data` (datalen bytes) with the key's HMAC key
  * and copy the first digestlen bytes of it into digestbuf. digestlen may
  * not exceed SHA512_DIGEST_LENGTH. Returns 0 on success; on failure
  * digestbuf is zeroed and EIO is returned.
  */
 int
 zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
     uint8_t *digestbuf, uint_t digestlen)
 {
 	uint8_t full_digest[SHA512_DIGEST_LENGTH];
 	crypto_data_t input, output;
 	crypto_mechanism_t mech;
 
 	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
 
 	/* sha512-hmac takes no mechanism parameters */
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	mech.cm_param = NULL;
 	mech.cm_param_len = 0;
 
 	/* the message to authenticate */
 	input.cd_format = CRYPTO_DATA_RAW;
 	input.cd_offset = 0;
 	input.cd_length = datalen;
 	input.cd_raw.iov_base = (char *)data;
 	input.cd_raw.iov_len = input.cd_length;
 
 	/* always produce the full digest locally, truncate on copy out */
 	output.cd_format = CRYPTO_DATA_RAW;
 	output.cd_offset = 0;
 	output.cd_length = SHA512_DIGEST_LENGTH;
 	output.cd_raw.iov_base = (char *)full_digest;
 	output.cd_raw.iov_len = output.cd_length;
 
 	if (crypto_mac(&mech, &input, &key->zk_hmac_key, key->zk_hmac_tmpl,
 	    &output) != CRYPTO_SUCCESS) {
 		memset(digestbuf, 0, digestlen);
 		return (SET_ERROR(EIO));
 	}
 
 	memcpy(digestbuf, full_digest, digestlen);
 
 	return (0);
 }
 
 /*
  * Derive the salt and IV for a dedup block from the HMAC of its data: the
  * first ZIO_DATA_SALT_LEN bytes of the digest become the salt and the
  * following ZIO_DATA_IV_LEN bytes become the IV. Returns 0 on success or
  * the error from zio_crypt_do_hmac(), in which case neither output is
  * written.
  */
 int
 zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
     uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
 {
 	uint8_t hmac[SHA512_DIGEST_LENGTH];
 	int err;
 
 	err = zio_crypt_do_hmac(key, data, datalen, hmac,
 	    SHA512_DIGEST_LENGTH);
 	if (err == 0) {
 		memcpy(salt, &hmac[0], ZIO_DATA_SALT_LEN);
 		memcpy(ivbuf, &hmac[ZIO_DATA_SALT_LEN], ZIO_DATA_IV_LEN);
 	}
 
 	return (err);
 }
 
 /*
  * The following functions are used to encode and decode encryption parameters
  * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
  * byte strings, which normally means that these strings would not need to deal
  * with byteswapping at all. However, both blkptr_t and zil_header_t may be
  * byteswapped by lower layers and so we must "undo" that byteswap here upon
  * decoding and encoding in a non-native byteorder. These functions require
  * that the byteorder bit is correct before being called.
  */
 /*
  * Store the salt and IV in an encrypted bp: the salt goes into word 0 of
  * DVA[2], the first 64 bits of the IV into word 1 of DVA[2], and the
  * remaining 32 bits of the IV are set with BP_SET_IV2(). Each value is
  * byteswapped first if the bp is marked as needing a byteswap.
  */
 void
 zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt64, iv64;
 	uint32_t iv32;
 
 	ASSERT(BP_IS_ENCRYPTED(bp));
 
 	/* pull the byte strings into integers of the right width */
 	memcpy(&salt64, salt, sizeof (uint64_t));
 	memcpy(&iv64, iv, sizeof (uint64_t));
 	memcpy(&iv32, iv + sizeof (uint64_t), sizeof (uint32_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt64 = BSWAP_64(salt64);
 		iv64 = BSWAP_64(iv64);
 		iv32 = BSWAP_32(iv32);
 	}
 
 	bp->blk_dva[2].dva_word[0] = salt64;
 	bp->blk_dva[2].dva_word[1] = iv64;
 	BP_SET_IV2(bp, iv32);
 }
 
 /*
  * Extract the salt and IV from a protected bp, undoing a byteswap if the
  * bp is marked as needing one. For bps that are only authenticated, both
  * outputs are simply zeroed.
  */
 void
 zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt64, iv64;
 	uint32_t iv32;
 
 	ASSERT(BP_IS_PROTECTED(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		memset(salt, 0, ZIO_DATA_SALT_LEN);
 		memset(iv, 0, ZIO_DATA_IV_LEN);
 		return;
 	}
 
 	salt64 = bp->blk_dva[2].dva_word[0];
 	iv64 = bp->blk_dva[2].dva_word[1];
 	iv32 = (uint32_t)BP_GET_IV2(bp);
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt64 = BSWAP_64(salt64);
 		iv64 = BSWAP_64(iv64);
 		iv32 = BSWAP_32(iv32);
 	}
 
 	/* the IV is the 64 bit part followed by the 32 bit part */
 	memcpy(salt, &salt64, sizeof (uint64_t));
 	memcpy(iv, &iv64, sizeof (uint64_t));
 	memcpy(iv + sizeof (uint64_t), &iv32, sizeof (uint32_t));
 }
 
 /*
  * Store a 128 bit MAC in words 2 and 3 of the bp's checksum, byteswapping
  * each word first if the bp is marked as needing a byteswap. Must not be
  * used on DMU_OT_OBJSET bps.
  */
 void
 zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t lo, hi;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
 
 	memcpy(&lo, mac, sizeof (uint64_t));
 	memcpy(&hi, mac + sizeof (uint64_t), sizeof (uint64_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		lo = BSWAP_64(lo);
 		hi = BSWAP_64(hi);
 	}
 
 	bp->blk_cksum.zc_word[2] = lo;
 	bp->blk_cksum.zc_word[3] = hi;
 }
 
 /*
  * Read the 128 bit MAC out of words 2 and 3 of the bp's checksum, undoing a
  * byteswap if the bp is marked as needing one. For DMU_OT_OBJSET bps the
  * MAC output is zeroed instead.
  */
 void
 zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t lo, hi;
 
 	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		memset(mac, 0, ZIO_DATA_MAC_LEN);
 		return;
 	}
 
 	lo = bp->blk_cksum.zc_word[2];
 	hi = bp->blk_cksum.zc_word[3];
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		lo = BSWAP_64(lo);
 		hi = BSWAP_64(hi);
 	}
 
 	memcpy(mac, &lo, sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &hi, sizeof (uint64_t));
 }
 
 /*
  * Store a 128 bit MAC in words 2 and 3 of the embedded checksum of the
  * zil_chain_t at `data`. No byteswapping is performed.
  */
 void
 zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
 {
 	zil_chain_t *chain = data;
 	uint64_t *words = chain->zc_eck.zec_cksum.zc_word;
 	uint_t w;
 
 	/* the MAC fills the upper two checksum words, one word at a time */
 	for (w = 0; w < 2; w++) {
 		memcpy(&words[2 + w], mac + w * sizeof (uint64_t),
 		    sizeof (uint64_t));
 	}
 }
 
 /*
  * Read the 128 bit MAC out of words 2 and 3 of the embedded checksum of the
  * zil_chain_t at `data`.
  *
  * The ZIL MAC lives inside the block it protects, and that block will not
  * have been byteswapped by the time this function is called, so the MAC
  * never needs to be byteswapped here.
  */
 void
 zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
 {
 	const zil_chain_t *chain = data;
 	const uint64_t *words = chain->zc_eck.zec_cksum.zc_word;
 	uint_t w;
 
 	for (w = 0; w < 2; w++) {
 		memcpy(mac + w * sizeof (uint64_t), &words[2 + w],
 		    sizeof (uint64_t));
 	}
 }
 
 /*
  * Walk a block of dnodes (src_abd) and copy just the bonus buffers of the
  * dnodes with an encrypted bonus type to the same offsets in dst. datalen
  * is the size of both src_abd and dst, not the length of the bonus buffers
  * alone.
  */
 void
 zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
 {
 	uint_t slot = 0, nslots = datalen >> DNODE_SHIFT;
 	dnode_phys_t *src_dnodes, *dst_dnodes;
 	uint8_t *srcbuf;
 
 	srcbuf = abd_borrow_buf_copy(src_abd, datalen);
 
 	src_dnodes = (dnode_phys_t *)srcbuf;
 	dst_dnodes = (dnode_phys_t *)dst;
 
 	while (slot < nslots) {
 		dnode_phys_t *sdn = &src_dnodes[slot];
 
 		if (sdn->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(sdn->dn_bonustype) &&
 		    sdn->dn_bonuslen != 0) {
 			memcpy(DN_BONUS(&dst_dnodes[slot]), DN_BONUS(sdn),
 			    DN_MAX_BONUS_LEN(sdn));
 		}
 
 		/* step over this dnode and the extra slots it occupies */
 		slot += sdn->dn_extra_slots + 1;
 	}
 
 	abd_return_buf(src_abd, srcbuf, datalen);
 }
 
 /*
  * This function decides which fields of blk_prop are covered by the
  * various on-disk MAC algorithms, by clearing (or pinning to a constant)
  * every field that is not.
  */
 static void
 zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
 {
 	/*
 	 * Version 0 did not properly zero out all non-portable fields
 	 * as it should have done. This behavior is kept so that pools on
 	 * this version can still be imported read-only.
 	 */
 	if (version == 0) {
 		BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
 		BP_SET_CHECKSUM(bp, 0);
 		BP_SET_DEDUP(bp, 0);
 		return;
 	}
 
 	ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 
 	/*
 	 * With the hole_birth feature a hole may still have these fields
 	 * set. Clearing all of blk_prop guarantees that raw sends behave
 	 * the same with or without the feature.
 	 */
 	if (BP_IS_HOLE(bp)) {
 		bp->blk_prop = 0ULL;
 		return;
 	}
 
 	/* these two are never portable, whatever the level */
 	BP_SET_CHECKSUM(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 
 	/*
 	 * At L0 the remaining fields are verified so that data blocks
 	 * cannot be reinterpreted; for instance, an attacker must not be
 	 * able to make us return raw lz4 compressed data to the user by
 	 * flipping the compression bits. Above L0 this cannot be enforced,
 	 * because raw sends carry no information about indirect blocks and
 	 * the values may differ on the receive side. That opens no new
 	 * attack vectors, since any change made to a higher level bp must
 	 * still verify the correct order of the layer below it.
 	 */
 	if (BP_GET_LEVEL(bp) == 0)
 		return;
 
 	BP_SET_BYTEORDER(bp, 0);
 	BP_SET_COMPRESS(bp, 0);
 
 	/*
 	 * A psize of zero would trigger asserts; the actual value is
 	 * irrelevant as long as it is constant.
 	 */
 	BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
 }
 
 /*
  * Build the authenticated view of a bp in *bab: its MAC, its normalized
  * blk_prop (always little endian) and zeroed padding. The caller's bp is
  * left untouched; *bab_len receives the number of bytes of *bab that should
  * be authenticated for the given on-disk format version.
  */
 static void
 zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
     blkptr_auth_buf_t *bab, uint_t *bab_len)
 {
 	blkptr_t scratch;
 
 	/* operate on a private copy, converted to native byte order */
 	scratch = *bp;
 	if (should_bswap)
 		byteswap_uint64_array(&scratch, sizeof (scratch));
 
 	ASSERT(BP_USES_CRYPT(&scratch) || BP_IS_HOLE(&scratch));
 	ASSERT0(BP_IS_EMBEDDED(&scratch));
 
 	/*
 	 * The MAC has to be decoded before blk_prop is normalized, because
 	 * normalizing zeroes out the endianness information.
 	 */
 	zio_crypt_decode_mac_bp(&scratch, bab->bab_mac);
 	zio_crypt_bp_zero_nonportable_blkprop(&scratch, version);
 
 	/* blk_prop is always MAC'd in LE form for portability */
 	bab->bab_prop = LE_64(scratch.blk_prop);
 	bab->bab_pad = 0ULL;
 
 	/* the trailing padding word was not part of the version 0 format */
 	*bab_len = (version == 0) ?
 	    sizeof (blkptr_auth_buf_t) - sizeof (uint64_t) :
 	    sizeof (blkptr_auth_buf_t);
 }
 
 /*
  * Feed the authenticated view of a single bp into an in-progress HMAC.
  * Returns 0 on success or EIO if the MAC update fails.
  */
 static int
 zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 	crypto_data_t chunk;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 
 	/* describe the auth buffer as one raw region */
 	chunk.cd_format = CRYPTO_DATA_RAW;
 	chunk.cd_offset = 0;
 	chunk.cd_length = auth_len;
 	chunk.cd_raw.iov_base = (char *)&auth;
 	chunk.cd_raw.iov_len = chunk.cd_length;
 
 	if (crypto_mac_update(ctx, &chunk) != CRYPTO_SUCCESS)
 		return (SET_ERROR(EIO));
 
 	return (0);
 }
 
 /*
  * Fold the authenticated view of a single bp into a running SHA2 context.
  */
 static void
 zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 	SHA2Update(ctx, &auth, auth_len);
 }
 
 /*
  * Append the authenticated view of a single bp to an AAD buffer. *aadp is
  * the write cursor and *aad_len the running total; both are advanced by
  * the number of bytes appended.
  */
 static void
 zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 	uint8_t *cursor = *aadp;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 
 	memcpy(cursor, &auth, auth_len);
 	*aadp = cursor + auth_len;
 	*aad_len += auth_len;
 }
 
 /*
  * Feed one dnode into an in-progress HMAC: first its core fields (with the
  * non-portable bits masked out), then each of its block pointers and, if
  * present, its spill block pointer. Returns 0 on success or a non-zero
  * error if any MAC update fails.
  */
 static int
 zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, dnode_phys_t *dnp)
 {
 	const size_t core_len = offsetof(dnode_phys_t, dn_blkptr);
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	dnode_phys_t core;
 	crypto_data_t chunk;
 	int err, idx;
 
 	/*
 	 * Only the leading core_len bytes (everything ahead of dn_blkptr)
 	 * are copied into the scratch dnode; the remainder of 'core' is
 	 * never read, which saves copying the rest of the dnode.
 	 */
 	memcpy(&core, dnp, core_len);
 
 	if (le_bswap) {
 		core.dn_datablkszsec = BSWAP_16(core.dn_datablkszsec);
 		core.dn_bonuslen = BSWAP_16(core.dn_bonuslen);
 		core.dn_maxblkid = BSWAP_64(core.dn_maxblkid);
 		core.dn_used = BSWAP_64(core.dn_used);
 	}
 
 	/* strip the fields that are not portable */
 	core.dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 	core.dn_used = 0;
 
 	chunk.cd_format = CRYPTO_DATA_RAW;
 	chunk.cd_offset = 0;
 	chunk.cd_length = core_len;
 	chunk.cd_raw.iov_base = (char *)&core;
 	chunk.cd_raw.iov_len = chunk.cd_length;
 
 	if (crypto_mac_update(ctx, &chunk) != CRYPTO_SUCCESS)
 		return (SET_ERROR(EIO));
 
 	/* the bps are read from the original dnode, not the scratch copy */
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		err = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, &dnp->dn_blkptr[idx]);
 		if (err != 0)
 			return (err);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		err = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, DN_SPILL_BLKPTR(dnp));
 		if (err != 0)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * objset_phys_t blocks introduce a number of exceptions to the normal
  * authentication process. objset_phys_t's contain 2 separate HMACS for
  * protecting the integrity of their data. The portable_mac protects the
  * metadnode. This MAC can be sent with a raw send and protects against
  * reordering of data within the metadnode. The local_mac protects the user
  * accounting objects which are not sent from one system to another.
  *
  * In addition, objset blocks are the only blocks that can be modified and
  * written to disk without the key loaded under certain circumstances. During
  * zil_claim() we need to be able to update the zil_header_t to complete
  * claiming log blocks and during raw receives we need to write out the
  * portable_mac from the send file. Both of these actions are possible
  * because these fields are not protected by either MAC so neither one will
  * need to modify the MACs without the key. However, when the modified blocks
  * are written out they will be byteswapped into the host machine's native
  * endianness which will modify fields protected by the MAC. As a result, MAC
  * calculation for objset blocks works slightly differently from other block
  * types. Where other block types MAC the data in whatever endianness is
  * written to disk, objset blocks always MAC little endian version of their
  * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
  * and le_bswap indicates whether a byteswap is needed to get this block
  * into little endian format.
  *
  * 'data' is treated as an objset_phys_t of 'datalen' bytes. On success 0 is
  * returned and ZIO_OBJSET_MAC_LEN bytes are written to each of portable_mac
  * and local_mac (local_mac is all zeroes when there is nothing for it to
  * protect). On failure both outputs are zeroed and a non-zero error is
  * returned.
  */
 int
 zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
     boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
 {
 	int ret;
 	crypto_mechanism_t mech;
 	crypto_context_t ctx;
 	crypto_data_t cd;
 	objset_phys_t *osp = data;
 	uint64_t intval;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
 	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
 
 	/* initialize HMAC mechanism */
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	mech.cm_param = NULL;
 	mech.cm_param_len = 0;
 
 	/* cd is reused for every update/final call; only length/base change */
 	cd.cd_format = CRYPTO_DATA_RAW;
 	cd.cd_offset = 0;
 
 	/* calculate the portable MAC from the portable fields and metadnode */
 	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in the os_type */
 	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/*
 	 * add in the portable os_flags: convert to native order, keep only
 	 * the portable bits, then convert to little endian for the MAC
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in fields from the metadnode */
 	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 	    should_bswap, &osp->os_meta_dnode);
 	if (ret)
 		goto error;
 
 	/* store the final digest in a temporary buffer and copy what we need */
 	cd.cd_length = SHA512_DIGEST_LENGTH;
 	cd.cd_raw.iov_base = (char *)raw_portable_mac;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_final(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* only the leading ZIO_OBJSET_MAC_LEN bytes of the digest are kept */
 	memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN);
 
 	/*
 	 * This is necessary here as we check next whether
 	 * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to
 	 * decide if the local_mac should be zeroed out. That flag will always
 	 * be set by dmu_objset_id_quota_upgrade_cb() and
 	 * dmu_objset_userspace_upgrade_cb() if useraccounting has been
 	 * completed.
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	boolean_t uacct_incomplete =
 	    !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 
 	/*
 	 * The local MAC protects the user, group and project accounting.
 	 * If these objects are not present, the local MAC is zeroed out.
 	 */
 	if (uacct_incomplete ||
 	    (datalen >= OBJSET_PHYS_SIZE_V3 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
 		memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 		return (0);
 	}
 
 	/* calculate the local MAC from the userused and groupused dnodes */
 	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in the non-portable os_flags */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in fields from the user accounting dnodes */
 	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_userused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_groupused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	/* the project dnode is only consulted for V3-sized (or larger) data */
 	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
 	    datalen >= OBJSET_PHYS_SIZE_V3) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_projectused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	/* store the final digest in a temporary buffer and copy what we need */
 	cd.cd_length = SHA512_DIGEST_LENGTH;
 	cd.cd_raw.iov_base = (char *)raw_local_mac;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_final(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN);
 
 	return (0);
 
 error:
 	/*
 	 * Never hand back a partially computed MAC.
 	 * NOTE(review): ctx is not explicitly released when we get here after
 	 * a successful crypto_mac_init() -- confirm whether the crypto API
 	 * expects a cancel call on this path.
 	 */
 	memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN);
 	memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 	return (ret);
 }
 
 /*
  * Release the iovec array attached to a uio, if there is one. The uio
  * structure itself is not freed.
  */
 static void
 zio_crypt_destroy_uio(zfs_uio_t *uio)
 {
 	if (!uio->uio_iov)
 		return;
 
 	kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
 }
 
 /*
  * Compute a checksum over the portable fields of every bp held in an
  * uncompressed indirect block. The portable fields are the MAC plus all of
  * blk_prop other than the dedup, checksum, and psize bits. See the comment
  * block on object set authentication for the motivation.
  *
  * When 'generate' is set the leading ZIO_DATA_MAC_LEN bytes of the digest
  * are written to 'cksum'. Otherwise they are compared against 'cksum' and
  * ECKSUM is returned on a mismatch.
  */
 static int
 zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
     uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
 {
 	blkptr_t *bps = buf;
 	int idx, nbps = datalen >> SPA_BLKPTRSHIFT;
 	uint8_t digest[SHA512_DIGEST_LENGTH];
 	SHA2_CTX sha;
 
 	/* hash the MACs of the layer below, one bp at a time */
 	SHA2Init(SHA512, &sha);
 	for (idx = 0; idx < nbps; idx++) {
 		zio_crypt_bp_do_indrect_checksum_updates(&sha, version,
 		    byteswap, &bps[idx]);
 	}
 	SHA2Final(digest, &sha);
 
 	if (!generate) {
 		if (memcmp(digest, cksum, ZIO_DATA_MAC_LEN) != 0)
 			return (SET_ERROR(ECKSUM));
 		return (0);
 	}
 
 	memcpy(cksum, digest, ZIO_DATA_MAC_LEN);
 	return (0);
 }
 
 /*
  * Generate or verify the checksum-of-MACs for an indirect block without
  * knowing which on-disk format version it was written with.
  */
 int
 zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	int err;
 
 	/*
 	 * The on-disk format version lives in the DSL Crypto Key, which
 	 * callers frequently cannot reach cheaply, and the checksum-of-MACs
 	 * must be verifiable even with no key loaded. Rather than do a ZAP
 	 * lookup per zio we just try each known format: the current one
 	 * first, then version 0 if that fails to verify.
 	 */
 	err = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
 	if (err != ECKSUM)
 		return (err);
 
 	/* a checksum mismatch can only come from a verification */
 	ASSERT(!generate);
 
 	return (zio_crypt_do_indirect_mac_checksum_impl(generate,
 	    buf, datalen, 0, byteswap, cksum));
 }
 
 /*
  * abd flavor of zio_crypt_do_indirect_mac_checksum(): the abd contents are
  * borrowed as a linear copy for the duration of the call.
  */
 int
 zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	void *linear = abd_borrow_buf_copy(abd, datalen);
 	int err = zio_crypt_do_indirect_mac_checksum(generate, linear,
 	    datalen, byteswap, cksum);
 
 	abd_return_buf(abd, linear, datalen);
 
 	return (err);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting ZIL blocks.
  * We do not check for the older ZIL chain because the encryption feature
  * was not available before the newer ZIL chain was introduced. The goal
  * here is to encrypt everything except the blkptr_t of a lr_write_t and
  * the zil_chain_t header. Everything that is not encrypted is authenticated.
  *
  * The block is walked twice: once to count the iovecs needed and once to
  * fill them in. On success 0 is returned and:
  *   - puio / cuio own freshly allocated iovec arrays (cuio has one extra,
  *     unfilled, trailing entry reserved for the MAC),
  *   - *enc_len is the total number of bytes covered by the iovecs,
  *   - *authbuf is a zio_buf_alloc(datalen) buffer holding *auth_len bytes
  *     of data to authenticate,
  *   - *no_crypt is B_TRUE when the block contained no records.
  * On failure everything allocated here is freed and the outputs are reset.
  */
 static int
 zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
     zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
     boolean_t *no_crypt)
 {
 	int ret;
-	uint64_t txtype, lr_len;
+	uint64_t txtype, lr_len, nused;
 	uint_t nr_src, nr_dst, crypt_len;
 	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
 	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
 	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
 	zil_chain_t *zilc;
 	lr_t *lr;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 
 	/* cipherbuf always needs an extra iovec for the MAC */
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 		nr_src = 0;
 		nr_dst = 1;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 		nr_src = 1;
 		nr_dst = 0;
 	}
 	/* start from a zeroed dst; only bytes up to zc_nused are filled below */
 	memset(dst, 0, datalen);
 
 	/* find the start and end record of the log block */
 	zilc = (zil_chain_t *)src;
 	slrp = src + sizeof (zil_chain_t);
 	aadp = aadbuf;
-	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+	nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+	ASSERT3U(nused, >=, sizeof (zil_chain_t));
+	ASSERT3U(nused, <=, datalen);
+	blkend = src + nused;
 
 	/* calculate the number of encrypted iovecs we will need */
 	for (; slrp < blkend; slrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 		/*
 		 * Each record must hold at least an lr_t and must not run
 		 * past the end of the used part of the block.
 		 * NOTE(review): the fill-in pass below re-reads lrc_reclen
 		 * without repeating these checks.
 		 */
+		ASSERT3U(lr_len, >=, sizeof (lr_t));
+		ASSERT3U(lr_len, <=, blkend - slrp);
 
 		/*
 		 * One iovec per record, plus a second one for a TX_WRITE
 		 * that carries data after the lr_write_t.
 		 */
 		nr_iovecs++;
 		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
 			nr_iovecs++;
 	}
 
 	nr_src += nr_iovecs;
 	nr_dst += nr_iovecs;
 
 	/* allocate the iovec arrays */
 	if (nr_src != 0) {
 		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
 		if (src_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	if (nr_dst != 0) {
 		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
 		if (dst_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	/*
 	 * Copy the plain zil header over and authenticate everything except
 	 * the checksum that will store our MAC. If we are writing the data
 	 * the embedded checksum will not have been calculated yet, so we don't
 	 * authenticate that.
 	 */
 	memcpy(dst, src, sizeof (zil_chain_t));
 	memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t));
 	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 
 	/* loop over records again, filling in iovecs */
 	nr_iovecs = 0;
 	slrp = src + sizeof (zil_chain_t);
 	dlrp = dst + sizeof (zil_chain_t);
 
 	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 
 		/* copy the common lr_t */
 		memcpy(dlrp, slrp, sizeof (lr_t));
 		memcpy(aadp, slrp, sizeof (lr_t));
 		aadp += sizeof (lr_t);
 		aad_len += sizeof (lr_t);
 
 		ASSERT3P(src_iovecs, !=, NULL);
 		ASSERT3P(dst_iovecs, !=, NULL);
 
 		/*
 		 * If this is a TX_WRITE record we want to encrypt everything
 		 * except the bp if exists. If the bp does exist we want to
 		 * authenticate it.
 		 */
 		if (txtype == TX_WRITE) {
 			/* first iovec: lr_write_t fields between lr_t and bp */
 			crypt_len = sizeof (lr_write_t) -
 			    sizeof (lr_t) - sizeof (blkptr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			/* copy the bp now since it will not be encrypted */
 			memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    sizeof (blkptr_t));
 			memcpy(aadp,
 			    slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
 			    sizeof (blkptr_t));
 			aadp += sizeof (blkptr_t);
 			aad_len += sizeof (blkptr_t);
 			nr_iovecs++;
 			total_len += crypt_len;
 
 			/* second iovec: whatever follows the lr_write_t */
 			if (lr_len != sizeof (lr_write_t)) {
 				crypt_len = lr_len - sizeof (lr_write_t);
 				src_iovecs[nr_iovecs].iov_base =
 				    slrp + sizeof (lr_write_t);
 				src_iovecs[nr_iovecs].iov_len = crypt_len;
 				dst_iovecs[nr_iovecs].iov_base =
 				    dlrp + sizeof (lr_write_t);
 				dst_iovecs[nr_iovecs].iov_len = crypt_len;
 				nr_iovecs++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			/* encrypt up to lr_nbps; the rest is authenticated */
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			nr_iovecs++;
 			total_len += crypt_len;
 		} else {
 			/* any other record: encrypt everything after the lr_t */
 			crypt_len = lr_len - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 			nr_iovecs++;
 			total_len += crypt_len;
 		}
 	}
 
 	*no_crypt = (nr_iovecs == 0);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 
 	/* src/dst map onto plain/cipher according to the direction */
 	if (encrypt) {
 		puio->uio_iov = src_iovecs;
 		puio->uio_iovcnt = nr_src;
 		cuio->uio_iov = dst_iovecs;
 		cuio->uio_iovcnt = nr_dst;
 	} else {
 		puio->uio_iov = dst_iovecs;
 		puio->uio_iovcnt = nr_dst;
 		cuio->uio_iov = src_iovecs;
 		cuio->uio_iovcnt = nr_src;
 	}
 
 	return (0);
 
 error:
 	/* undo every allocation and hand back empty outputs */
 	zio_buf_free(aadbuf, datalen);
 	if (src_iovecs != NULL)
 		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
 	if (dst_iovecs != NULL)
 		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
 
 	*enc_len = 0;
 	*authbuf = NULL;
 	*auth_len = 0;
 	*no_crypt = B_FALSE;
 	puio->uio_iov = NULL;
 	puio->uio_iovcnt = 0;
 	cuio->uio_iov = NULL;
 	cuio->uio_iovcnt = 0;
 	return (ret);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting dnode blocks.
  *
  * Only bonus buffers of an encrypted bonus type are encrypted; the rest of
  * each dnode is copied to the destination as-is and authenticated. The
  * block is walked twice: once to count the iovecs needed and once to fill
  * them in. On success 0 is returned and:
  *   - puio / cuio own freshly allocated iovec arrays (cuio has one extra,
  *     unfilled, trailing entry),
  *   - *enc_len is the total number of bytes covered by the iovecs,
  *   - *authbuf is a zio_buf_alloc(datalen) buffer holding *auth_len bytes
  *     of data to authenticate,
  *   - *no_crypt is B_TRUE when no bonus buffer needed encrypting.
  * On failure everything allocated here is freed and the outputs are reset.
  */
 static int
 zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
     uint_t *auth_len, boolean_t *no_crypt)
 {
 	int ret;
 	uint_t nr_src, nr_dst, crypt_len;
 	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
 	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
 	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
 	uint8_t *src, *dst, *aadp;
 	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 
 	/* the ciphertext side always gets one extra iovec */
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 		nr_src = 0;
 		nr_dst = 1;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 		nr_src = 1;
 		nr_dst = 0;
 	}
 
 	sdnp = (dnode_phys_t *)src;
 	ddnp = (dnode_phys_t *)dst;
 	aadp = aadbuf;
 
 	/*
 	 * Count the number of iovecs we will need to do the encryption by
 	 * counting the number of bonus buffers that need to be encrypted.
 	 */
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		/*
 		 * This block may still be byteswapped. However, all of the
 		 * values we use are either uint8_t's (for which byteswapping
 		 * is a noop) or a * != 0 check, which will work regardless
 		 * of whether or not we byteswap.
 		 */
 		if (sdnp[i].dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
 		    sdnp[i].dn_bonuslen != 0) {
 			nr_iovecs++;
 		}
 	}
 
 	nr_src += nr_iovecs;
 	nr_dst += nr_iovecs;
 
 	if (nr_src != 0) {
 		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
 		if (src_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	if (nr_dst != 0) {
 		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
 		if (dst_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	/* reused below as the index of the next iovec to fill */
 	nr_iovecs = 0;
 
 	/*
 	 * Iterate through the dnodes again, this time filling in the uios
 	 * we allocated earlier. We also concatenate any data we want to
 	 * authenticate onto aadbuf.
 	 */
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		dnp = &sdnp[i];
 
 		/* copy over the core fields and blkptrs (kept as plaintext) */
 		memcpy(&ddnp[i], dnp,
 		    (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp),
 			    sizeof (blkptr_t));
 		}
 
 		/*
 		 * Handle authenticated data. We authenticate everything in
 		 * the dnode that can be brought over when we do a raw send.
 		 * This includes all of the core fields as well as the MACs
 		 * stored in the bp checksums and all of the portable bits
 		 * from blk_prop. We include the dnode padding here in case it
 		 * ever gets used in the future. Some dn_flags and dn_used are
 		 * not portable so we mask those out values out of the
 		 * authenticated data.
 		 */
 		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
 		memcpy(aadp, dnp, crypt_len);
 		/* the masking is applied to the copy in aadbuf, not to dnp */
 		adnp = (dnode_phys_t *)aadp;
 		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 		adnp->dn_used = 0;
 		aadp += crypt_len;
 		aad_len += crypt_len;
 
 		for (j = 0; j < dnp->dn_nblkptr; j++) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, &dnp->dn_blkptr[j]);
 		}
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, DN_SPILL_BLKPTR(dnp));
 		}
 
 		/*
 		 * If this bonus buffer needs to be encrypted, we prepare an
 		 * iovec_t. The encryption / decryption functions will fill
 		 * this in for us with the encrypted or decrypted data.
 		 * Otherwise we add the bonus buffer to the authenticated
 		 * data buffer and copy it over to the destination. The
 		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
 		 * we can guarantee alignment with the AES block size
 		 * (128 bits).
 		 */
 		crypt_len = DN_MAX_BONUS_LEN(dnp);
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			/* must agree with the count made in the first pass */
 			ASSERT3U(nr_iovecs, <, nr_src);
 			ASSERT3U(nr_iovecs, <, nr_dst);
 			ASSERT3P(src_iovecs, !=, NULL);
 			ASSERT3P(dst_iovecs, !=, NULL);
 			src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			nr_iovecs++;
 			total_len += crypt_len;
 		} else {
 			memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len);
 			memcpy(aadp, DN_BONUS(dnp), crypt_len);
 			aadp += crypt_len;
 			aad_len += crypt_len;
 		}
 	}
 
 	*no_crypt = (nr_iovecs == 0);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 
 	/* src/dst map onto plain/cipher according to the direction */
 	if (encrypt) {
 		puio->uio_iov = src_iovecs;
 		puio->uio_iovcnt = nr_src;
 		cuio->uio_iov = dst_iovecs;
 		cuio->uio_iovcnt = nr_dst;
 	} else {
 		puio->uio_iov = dst_iovecs;
 		puio->uio_iovcnt = nr_dst;
 		cuio->uio_iov = src_iovecs;
 		cuio->uio_iovcnt = nr_src;
 	}
 
 	return (0);
 
 error:
 	/* undo every allocation and hand back empty outputs */
 	zio_buf_free(aadbuf, datalen);
 	if (src_iovecs != NULL)
 		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
 	if (dst_iovecs != NULL)
 		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
 
 	*enc_len = 0;
 	*authbuf = NULL;
 	*auth_len = 0;
 	*no_crypt = B_FALSE;
 	puio->uio_iov = NULL;
 	puio->uio_iovcnt = 0;
 	cuio->uio_iov = NULL;
 	cuio->uio_iovcnt = 0;
 	return (ret);
 }
 
 /*
  * Build the uios for a block that is encrypted as a single contiguous
  * buffer: one iovec covering plainbuf and one covering cipherbuf, each
  * datalen bytes long. The cipher uio is given a second iovec that is left
  * unfilled here. Returns 0 on success; on failure the outputs are reset
  * and ENOMEM is returned.
  */
 static int
 zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio,
     uint_t *enc_len)
 {
 	(void) encrypt;
 	const uint_t plain_cnt = 1, cipher_cnt = 2;
 	iovec_t *piov, *ciov = NULL;
 
 	/* allocate both iovec arrays; the second only if the first worked */
 	piov = kmem_alloc(plain_cnt * sizeof (iovec_t), KM_SLEEP);
 	if (piov != NULL)
 		ciov = kmem_alloc(cipher_cnt * sizeof (iovec_t), KM_SLEEP);
 
 	if (piov == NULL || ciov == NULL) {
 		int err = SET_ERROR(ENOMEM);
 
 		if (piov != NULL)
 			kmem_free(piov, plain_cnt * sizeof (iovec_t));
 
 		*enc_len = 0;
 		puio->uio_iov = NULL;
 		puio->uio_iovcnt = 0;
 		cuio->uio_iov = NULL;
 		cuio->uio_iovcnt = 0;
 		return (err);
 	}
 
 	piov[0].iov_base = plainbuf;
 	piov[0].iov_len = datalen;
 	ciov[0].iov_base = cipherbuf;
 	ciov[0].iov_len = datalen;
 
 	puio->uio_iov = piov;
 	puio->uio_iovcnt = plain_cnt;
 	cuio->uio_iov = ciov;
 	cuio->uio_iovcnt = cipher_cnt;
 	*enc_len = datalen;
 
 	return (0);
 }
 
 /*
  * Set up the plaintext (puio) and ciphertext (cuio) uios consumed by
  * zio_do_crypt_uio(). ZIL and dnode blocks are handed to their dedicated
  * parsers, which pick out the pieces to encrypt and collect additional
  * authenticated data (AAD) in authbuf; every other block type goes through
  * zio_crypt_init_uios_normal() and has no AAD. In all cases the last iovec
  * of cuio is pointed at 'mac'.
  */
 static int
 zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
     uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
 {
 	iovec_t *mac_slot;
 	int err;
 
 	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
 
 	/* dispatch on the object type */
 	switch (ot) {
 	case DMU_OT_INTENT_LOG:
 		err = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
 		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
 		    no_crypt);
 		break;
 	case DMU_OT_DNODE:
 		err = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
 		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
 		    auth_len, no_crypt);
 		break;
 	default:
 		err = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
 		    datalen, puio, cuio, enc_len);
 		*authbuf = NULL;
 		*auth_len = 0;
 		*no_crypt = B_FALSE;
 		break;
 	}
 
 	if (err != 0)
 		return (err);
 
 	/* finish the uios: address space and the trailing MAC iovec */
 	puio->uio_segflg = UIO_SYSSPACE;
 	cuio->uio_segflg = UIO_SYSSPACE;
 
 	mac_slot = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
 	mac_slot->iov_base = mac;
 	mac_slot->iov_len = ZIO_DATA_MAC_LEN;
 
 	return (0);
 }
 
 /*
  * Primary encryption / decryption entrypoint for zio data.
  *
  * Encrypts plainbuf into cipherbuf (encrypt == B_TRUE) or decrypts cipherbuf
  * into plainbuf, both datalen bytes long, using the key selected by 'salt'.
  * 'ot' decides how the buffer is parsed (ZIL and dnode blocks are only
  * partially encrypted). On the software path *no_crypt is set to tell the
  * caller whether anything actually needed encrypting. Returns 0 on success
  * or a non-zero error.
  *
  * All exits go through a single cleanup path so that the salt lock, the AAD
  * buffer, the uio iovecs and any key material derived onto the stack are
  * released / scrubbed no matter how the function leaves.
  */
 int
 zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
     dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
     uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
     boolean_t *no_crypt)
 {
 	int ret;
 	boolean_t locked = B_FALSE;
 	boolean_t derived = B_FALSE;
 	uint64_t crypt = key->zk_crypt;
 	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
 	uint_t enc_len, auth_len;
 	zfs_uio_t puio, cuio;
 	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
 	crypto_key_t tmp_ckey, *ckey = NULL;
 	crypto_ctx_template_t tmpl;
 	uint8_t *authbuf = NULL;
 
 	/* zeroed uios make zio_crypt_destroy_uio() a no-op until filled in */
 	memset(&puio, 0, sizeof (puio));
 	memset(&cuio, 0, sizeof (cuio));
 
 	/*
 	 * If the needed key is the current one, just use it. Otherwise we
 	 * need to generate a temporary one from the given salt + master key.
 	 * If we are encrypting, we must return a copy of the current salt
 	 * so that it can be stored in the blkptr_t.
 	 */
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	locked = B_TRUE;
 
 	if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
 		ckey = &key->zk_current_key;
 		tmpl = key->zk_current_tmpl;
 	} else {
 		rw_exit(&key->zk_salt_lock);
 		locked = B_FALSE;
 
 		/*
 		 * From here on enc_keydata may hold (part of) a derived key,
 		 * so it must be scrubbed on every exit, including a failed
 		 * derivation.
 		 */
 		derived = B_TRUE;
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
 		if (ret != 0)
 			goto out;
 
 		tmp_ckey.ck_data = enc_keydata;
 		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 		ckey = &tmp_ckey;
 		tmpl = NULL;
 	}
 
 	/*
 	 * Attempt to use QAT acceleration if we can. We currently don't
 	 * do this for metadnode and ZIL blocks, since they have a much
 	 * more involved buffer layout and the qat_crypt() function only
 	 * works in-place.
 	 */
 	if (qat_crypt_use_accel(datalen) &&
 	    ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
 		uint8_t *srcbuf, *dstbuf;
 
 		if (encrypt) {
 			srcbuf = plainbuf;
 			dstbuf = cipherbuf;
 		} else {
 			srcbuf = cipherbuf;
 			dstbuf = plainbuf;
 		}
 
 		ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
 		    dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
 		if (ret == CPA_STATUS_SUCCESS) {
 			/*
 			 * NOTE(review): *no_crypt is not written on this
 			 * path (unchanged from before) -- confirm callers
 			 * initialize it.
 			 */
 			ret = 0;
 			goto out;
 		}
 		/* If the hardware implementation fails fall back to software */
 	}
 
 	/* create uios for encryption */
 	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
 	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
 	    &authbuf, &auth_len, no_crypt);
 	if (ret != 0)
 		goto out;
 
 	/* perform the encryption / decryption in software */
 	ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
 	    &puio, &cuio, authbuf, auth_len);
 
 out:
 	if (locked)
 		rw_exit(&key->zk_salt_lock);
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	/* never leave derived key material behind on the stack */
 	if (derived)
 		memset(enc_keydata, 0, sizeof (enc_keydata));
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 
 	return (ret);
 }
 
 /*
  * abd front end for zio_do_crypt_data(). Both abds are borrowed as linear
  * buffers for the duration of the call: the input side is borrowed with a
  * copy of its contents, and the output side is copied back when it is
  * returned. The buffers are returned the same way whether or not the
  * operation succeeded.
  */
 int
 zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
     boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
     uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
 {
 	void *plain, *cipher;
 	int err;
 
 	if (encrypt) {
 		plain = abd_borrow_buf_copy(pabd, datalen);
 		cipher = abd_borrow_buf(cabd, datalen);
 	} else {
 		plain = abd_borrow_buf(pabd, datalen);
 		cipher = abd_borrow_buf_copy(cabd, datalen);
 	}
 
 	err = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
 	    datalen, plain, cipher, no_crypt);
 
 	/* hand the buffers back; the steps do not depend on the outcome */
 	if (encrypt) {
 		abd_return_buf(pabd, plain, datalen);
 		abd_return_buf_copy(cabd, cipher, datalen);
 	} else {
 		abd_return_buf_copy(pabd, plain, datalen);
 		abd_return_buf(cabd, cipher, datalen);
 	}
 
 	return (err);
 }
 
 #if defined(_KERNEL)
 /*
  * Kernel builds only: register zfs_key_max_salt_uses as a module parameter
  * of type ulong; its meaning is given by the description string below.
  * NOTE(review): the variable itself is defined outside this part of the
  * file -- confirm its declaration matches the ulong type used here.
  */
 /* CSTYLED */
 module_param(zfs_key_max_salt_uses, ulong, 0644);
 MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
 	"can be used for generating encryption keys before it is rotated");
 #endif
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index 09c7be853bf9..2e0af60f6db4 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -1,1223 +1,1259 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 Cyril Plisko. All rights reserved.
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/thread.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_vnops.h>
 #include <sys/spa.h>
 #include <sys/zil.h>
 #include <sys/byteorder.h>
 #include <sys/stat.h>
 #include <sys/acl.h>
 #include <sys/atomic.h>
 #include <sys/cred.h>
 #include <sys/zpl.h>
 #include <sys/dmu_objset.h>
 #include <sys/zfeature.h>
 
 /*
  * NB: FreeBSD expects to be able to do vnode locking in lookup and
  * hold the locks across all subsequent VOPs until vput is called.
  * This means that its zfs vnops routines can't do any internal locking.
  * In order to have the same contract as the Linux vnops there would
  * need to be duplicate locked vnops. If the vnops were used more widely
  * in common code this would likely be preferable. However, currently
  * this is the only file where this is the case.
  */
 
 /*
  * Functions to replay ZFS intent log (ZIL) records
  * The functions are called through a function vector (zfs_replay_vector)
  * which is indexed by the transaction type.
  */
 
 /*
  * Reset *vap to all zeroes and populate it from the supplied values.
  * Ephemeral uids/gids (per IS_EPHEMERAL()) are stored as -1; rdev is
  * passed through zfs_cmpldev().  On FreeBSD and macOS the vnode type is
  * additionally derived from mode.
  */
 static void
 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
     uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
 {
 	memset(vap, 0, sizeof (vattr_t));
 
 	vap->va_mask = (uint_t)mask;
 	vap->va_mode = mode;
 #if defined(__FreeBSD__) || defined(__APPLE__)
 	vap->va_type = IFTOVT(mode);
 #endif
 
 	if ((uid_t)(IS_EPHEMERAL(uid)))
 		vap->va_uid = -1;
 	else
 		vap->va_uid = uid;
 
 	if ((gid_t)(IS_EPHEMERAL(gid)))
 		vap->va_gid = -1;
 	else
 		vap->va_gid = gid;
 
 	vap->va_rdev = zfs_cmpldev(rdev);
 	vap->va_nodeid = nodeid;
 }
 
 /*
  * Replay handler installed in slot 0 ("no such type") of
  * zfs_replay_vector: ignores all arguments and fails with ENOTSUP.
  */
 static int
 zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
 {
 	(void) arg1;
 	(void) arg2;
 	(void) byteswap;
 
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Decode the optional-attribute data that accompanies an lr_attr_t in a
  * log record into *xvap.
  *
  * The vattr is flagged with ATTR_XVATTR, the requested-attribute bitmap
  * is copied from the record, and then every attribute that the bitmap
  * requests is filled in from the record: boolean attributes from the
  * 64-bit flag word, the create time from the two words that follow it,
  * and the scanstamp or project id from the area after that.
  *
  * If xva_getxoptattr() returns NULL the ATTR_XVATTR flag is cleared
  * again and nothing else is decoded.
  */
 static void
 zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
 {
 	xoptattr_t *xoap = NULL;
 	uint64_t *attrs;
 	uint64_t *crtime;
 	uint32_t *bitmap;
 	void *scanstamp;
 	int i;
 
 	xvap->xva_vattr.va_mask |= ATTR_XVATTR;
 	if ((xoap = xva_getxoptattr(xvap)) == NULL) {
 		xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */
 		return;
 	}
 
 	ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
 
 	/* Copy lr_attr_masksize 32-bit bitmap words into the request map. */
 	bitmap = &lrattr->lr_attr_bitmap;
 	for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
 		xvap->xva_reqattrmap[i] = *bitmap;
 
 	/*
 	 * Locate the values: one 64-bit flag word (attrs), then the create
 	 * time (two 64-bit words), then the scanstamp / project id area.
 	 */
 	attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
 	crtime = attrs + 1;
 	scanstamp = (caddr_t)(crtime + 2);
 
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
 		xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
 		xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
 		xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
 		xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
 		xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
 		xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
 		xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
 		xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
 		xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
 		xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
 		xoap->xoa_av_quarantined =
 		    ((*attrs & XAT0_AV_QUARANTINED) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 		ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 		ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
 
 		memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
 	} else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 		/*
 		 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
 		 * at the same time, so we can share the same space.
 		 */
 		memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t));
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 		xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
 		xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
 		xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
 	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
 		xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0);
 }
 
 /*
  * Count the distinct non-zero FUID domain indexes carried by uid and
  * gid.  The result is 0, 1 or 2; a gid whose index equals the uid's
  * index is not counted a second time.
  */
 static int
 zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
 {
 	const uint64_t uid_domain = FUID_INDEX(uid);
 	const uint64_t gid_domain = FUID_INDEX(gid);
 	int count = (uid_domain != 0) ? 1 : 0;
 
 	if (gid_domain != 0 && gid_domain != uid_domain)
 		count++;
 
 	return (count);
 }
 
 /*
  * Point the first domcnt slots of fuid_infop->z_domain_table at the
  * consecutive NUL-terminated strings stored at start.  The strings are
  * referenced in place, not copied.
  *
  * Returns the address just past the last string consumed (start itself
  * when domcnt is 0).
  */
 static void *
 zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
     int domcnt)
 {
 	char *cursor = start;
 	int slot = 0;
 
 	while (slot != domcnt) {
 		fuid_infop->z_domain_table[slot] = cursor;
 		cursor += strlen(cursor) + 1;
 		slot++;
 	}
 
 	return (cursor);
 }
 
 /*
  * Record the owner and/or group in fuid_infop, but only for ids that
  * IS_EPHEMERAL() reports as ephemeral (log specific FUIDs); other ids
  * leave the corresponding field untouched.
  */
 static void
 zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
 {
 	if (IS_EPHEMERAL(uid)) {
 		fuid_infop->z_fuid_owner = uid;
 	}
 
 	if (IS_EPHEMERAL(gid)) {
 		fuid_infop->z_fuid_group = gid;
 	}
 }
 
 /*
  * Build a zfs_fuid_info_t for uid/gid from the domain strings at buf.
  *
  * When neither id references a domain, a freshly allocated, otherwise
  * untouched info structure is returned and *end is NOT written.
  * Otherwise the domain table is allocated and filled, the ephemeral
  * owner/group are recorded, and *end is set to the address following
  * the domain strings.
  */
 static zfs_fuid_info_t *
 zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
 {
 	zfs_fuid_info_t *info = zfs_fuid_info_alloc();
 	int ndomains = zfs_replay_domain_cnt(uid, gid);
 
 	if (ndomains != 0) {
 		info->z_domain_table =
 		    kmem_zalloc(ndomains * sizeof (char *), KM_SLEEP);
 
 		zfs_replay_fuid_ugid(info, uid, gid);
 
 		info->z_domain_cnt = ndomains;
 		*end = zfs_replay_fuid_domain_common(info, buf, ndomains);
 	}
 
 	return (info);
 }
 
 /*
  * Build a zfs_fuid_info_t from a record area that holds idcnt logged
  * 64-bit FUIDs followed by domcnt domain strings.
  *
  * Each logged FUID becomes a zfs_fuid_t (z_id = -1, z_domidx = 0)
  * appended to the info's z_fuids list.  *end receives the address
  * following the last domain string.
  */
 static zfs_fuid_info_t *
 zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
     uint64_t gid)
 {
 	zfs_fuid_info_t *info = zfs_fuid_info_alloc();
 	uint64_t *next_fuid = start;
 	int n;
 
 	info->z_domain_cnt = domcnt;
 	info->z_domain_table =
 	    kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
 
 	for (n = 0; n != idcnt; n++, next_fuid++) {
 		zfs_fuid_t *entry = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
 
 		entry->z_logfuid = *next_fuid;
 		entry->z_id = -1;
 		entry->z_domidx = 0;
 		list_insert_tail(&info->z_fuids, entry);
 	}
 
 	zfs_replay_fuid_ugid(info, uid, gid);
 
 	/* The domain strings start right after the last logged FUID. */
 	*end = zfs_replay_fuid_domain_common(info, next_fuid, domcnt);
 
 	return (info);
 }
 
 /*
  * Byteswap the xvattr portion of a log record in place: the lr_attr_t
  * header, the remaining bitmap words, and the three 64-bit words that
  * follow them (attribute flag word plus the two create time words).
  */
 static void
 zfs_replay_swap_attrs(lr_attr_t *lrattr)
 {
 	size_t extra_bitmap_len;
 	caddr_t values;
 
 	/* Header first; lr_attr_masksize is only read after this swap. */
 	byteswap_uint32_array(lrattr, sizeof (*lrattr));
 
 	/* The header holds one bitmap word; masksize - 1 more follow it. */
 	extra_bitmap_len = (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t);
 	byteswap_uint32_array(lrattr + 1, extra_bitmap_len);
 
 	/* Attribute flag word + create time live right after the bitmap. */
 	values = (caddr_t)(lrattr + 1) + extra_bitmap_len;
 	byteswap_uint64_array(values, 3 * sizeof (uint64_t));
 }
 
 /*
  * Replay file create with optional ACL, xvattr information as well
  * as optional FUID information.
  *
  * Handles TX_CREATE_ACL, TX_CREATE_ACL_ATTR, TX_MKDIR_ACL and
  * TX_MKDIR_ACL_ATTR (each optionally combined with TX_CI); any other
  * transaction type fails with ENOTSUP.
  *
  * Data following lr_acl_create_t, in order:
  *   - xvattr block (only for the *_ATTR types)
  *   - ACL entries (lr_acl_bytes, padded per ZIL_ACE_LENGTH())
  *   - logged FUIDs and domain strings
  *   - the name of the object to create
  *
  * arg1 is the zfsvfs_t, arg2 the log record.  Returns 0 or an errno.
  * zfsvfs->z_fuid_replay is always freed and reset before returning from
  * the common exit path.
  */
 static int
 zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_acl_create_t *lracl = arg2;
 	char *name = NULL;		/* location determined later */
 	lr_create_t *lr = (lr_create_t *)lracl;
 	znode_t *dzp;
 	/*
 	 * Start out NULL (as zfs_replay_create() does) so the check at
 	 * "bail" never depends on an indeterminate pointer value.
 	 */
 	znode_t *zp = NULL;
 	xvattr_t xva;
 	int vflg = 0;
 	vsecattr_t vsec = { 0 };
 	lr_attr_t *lrattr;
 	void *aclstart;
 	void *fuidstart;
 	size_t xvatlen = 0;
 	uint64_t txtype;
 	uint64_t objid;
 	uint64_t dnodesize;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl));
+
 	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 	if (byteswap) {
 		byteswap_uint64_array(lracl, sizeof (*lracl));
 		if (txtype == TX_CREATE_ACL_ATTR ||
 		    txtype == TX_MKDIR_ACL_ATTR) {
 			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 			zfs_replay_swap_attrs(lrattr);
 			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 		}
 
 		aclstart = (caddr_t)(lracl + 1) + xvatlen;
 		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
 		/* swap fuids */
 		if (lracl->lr_fuidcnt) {
 			byteswap_uint64_array((caddr_t)aclstart +
 			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
 			    lracl->lr_fuidcnt * sizeof (uint64_t));
 		}
 	}
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	objid = LR_FOID_GET_OBJ(lr->lr_foid);
 	dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
 
 	xva_init(&xva);
 	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
 	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
 
 	/*
 	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 	 * eventually end up in zfs_mknode(), which assigns the object's
 	 * creation time, generation number, and dnode size. The generic
 	 * zfs_create() has no concept of these attributes, so we smuggle
 	 * the values inside the vattr's otherwise unused va_ctime,
 	 * va_nblocks, and va_fsid fields.
 	 */
 	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
 	xva.xva_vattr.va_nblocks = lr->lr_gen;
 	xva.xva_vattr.va_fsid = dnodesize;
 
 	error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
 	if (error)
 		goto bail;
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		vflg |= FIGNORECASE;
 	switch (txtype) {
 	case TX_CREATE_ACL:
 		/* No xvattr block: the ACL starts right after the header. */
 		aclstart = (caddr_t)(lracl + 1);
 		fuidstart = (caddr_t)aclstart +
 		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
 		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 		    lr->lr_uid, lr->lr_gid);
 		zfs_fallthrough;
 	case TX_CREATE_ACL_ATTR:
 		/* name is still NULL only when entered as the _ATTR type. */
 		if (name == NULL) {
 			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 			xva.xva_vattr.va_mask |= ATTR_XVATTR;
 			zfs_replay_xvattr(lrattr, &xva);
 		}
 		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
 		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
 		vsec.vsa_aclcnt = lracl->lr_aclcnt;
 		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
 		vsec.vsa_aclflags = lracl->lr_acl_flags;
 		if (zfsvfs->z_fuid_replay == NULL) {
 			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
 			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 			zfsvfs->z_fuid_replay =
 			    zfs_replay_fuids(fuidstart,
 			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 			    lr->lr_uid, lr->lr_gid);
 		}
 
 #if defined(__linux__)
 		error = zfs_create(dzp, name, &xva.xva_vattr,
 		    0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap);
 #else
 		error = zfs_create(dzp, name, &xva.xva_vattr,
 		    0, 0, &zp, kcred, vflg, &vsec, NULL);
 #endif
 		break;
 	case TX_MKDIR_ACL:
 		/* No xvattr block: the ACL starts right after the header. */
 		aclstart = (caddr_t)(lracl + 1);
 		fuidstart = (caddr_t)aclstart +
 		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
 		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 		    lr->lr_uid, lr->lr_gid);
 		zfs_fallthrough;
 	case TX_MKDIR_ACL_ATTR:
 		/* name is still NULL only when entered as the _ATTR type. */
 		if (name == NULL) {
 			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 			zfs_replay_xvattr(lrattr, &xva);
 		}
 		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
 		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
 		vsec.vsa_aclcnt = lracl->lr_aclcnt;
 		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
 		vsec.vsa_aclflags = lracl->lr_acl_flags;
 		if (zfsvfs->z_fuid_replay == NULL) {
 			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
 			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 			zfsvfs->z_fuid_replay =
 			    zfs_replay_fuids(fuidstart,
 			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 			    lr->lr_uid, lr->lr_gid);
 		}
 #if defined(__linux__)
 		error = zfs_mkdir(dzp, name, &xva.xva_vattr,
 		    &zp, kcred, vflg, &vsec, zfs_init_idmap);
 #else
 		error = zfs_mkdir(dzp, name, &xva.xva_vattr,
 		    &zp, kcred, vflg, &vsec, NULL);
 #endif
 		break;
 	default:
 		error = SET_ERROR(ENOTSUP);
 	}
 
 bail:
 	if (error == 0 && zp != NULL) {
 #ifdef __FreeBSD__
 		VOP_UNLOCK1(ZTOV(zp));
 #endif
 		zrele(zp);
 	}
 	zrele(dzp);
 
 	if (zfsvfs->z_fuid_replay)
 		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 	zfsvfs->z_fuid_replay = NULL;
 
 	return (error);
 }
 
 /*
  * Replay the create-style records that carry no ACL: TX_CREATE,
  * TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR, TX_MKXATTR and TX_SYMLINK
  * (each optionally combined with TX_CI).  Any other type fails with
  * ENOTSUP.
  *
  * Data following lr_create_t, in order:
  *   - xvattr block (only for the *_ATTR types)
  *   - FUID domain strings (not for TX_SYMLINK), present only when the
  *     uid/gid reference a domain
  *   - the name of the object to create
  *   - for TX_SYMLINK, the link target after the name
  *
  * "start" tracks the position of the name: it begins right after the
  * header (or xvattr block) and is advanced past the domain strings by
  * zfs_replay_fuid_domain() when there are any.
  *
  * arg1 is the zfsvfs_t, arg2 the log record.  Returns 0 or an errno.
  */
 static int
 zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_create_t *lr = arg2;
 	char *name = NULL;		/* location determined later */
 	char *link;			/* symlink content follows name */
 	znode_t *dzp;
 	znode_t *zp = NULL;
 	xvattr_t xva;
 	int vflg = 0;
 	lr_attr_t *lrattr;
 	void *start;
 	size_t xvatlen;
 	uint64_t txtype;
 	uint64_t objid;
 	uint64_t dnodesize;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
 	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 	if (byteswap) {
 		byteswap_uint64_array(lr, sizeof (*lr));
 		if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
 			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 	}
 
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	objid = LR_FOID_GET_OBJ(lr->lr_foid);
 	dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
 
 	xva_init(&xva);
 	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
 	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
 
 	/*
 	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 	 * eventually end up in zfs_mknode(), which assigns the object's
 	 * creation time, generation number, and dnode slot count. The
 	 * generic zfs_create() has no concept of these attributes, so
 	 * we smuggle the values inside the vattr's otherwise unused
 	 * va_ctime, va_nblocks, and va_fsid fields.
 	 */
 	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
 	xva.xva_vattr.va_nblocks = lr->lr_gen;
 	xva.xva_vattr.va_fsid = dnodesize;
 
 	error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
 	if (error)
 		goto out;
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		vflg |= FIGNORECASE;
 
 	/*
 	 * Symlinks don't have fuid info, and CIFS never creates
 	 * symlinks.
 	 *
 	 * The _ATTR versions will grab the fuid info in their subcases.
 	 */
 	if (txtype != TX_SYMLINK &&
 	    txtype != TX_MKDIR_ATTR &&
 	    txtype != TX_CREATE_ATTR) {
 		start = (lr + 1);
 		zfsvfs->z_fuid_replay =
 		    zfs_replay_fuid_domain(start, &start,
 		    lr->lr_uid, lr->lr_gid);
 	}
 
 	switch (txtype) {
 	case TX_CREATE_ATTR:
 		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
 		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 		zfs_replay_xvattr(lrattr, &xva);
 		start = (caddr_t)(lr + 1) + xvatlen;
 		zfsvfs->z_fuid_replay =
 		    zfs_replay_fuid_domain(start, &start,
 		    lr->lr_uid, lr->lr_gid);
 		name = (char *)start;
 		zfs_fallthrough;
 
 	case TX_CREATE:
 		if (name == NULL)
 			name = (char *)start;
 
 #if defined(__linux__)
 		error = zfs_create(dzp, name, &xva.xva_vattr,
 		    0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap);
 #else
 		error = zfs_create(dzp, name, &xva.xva_vattr,
 		    0, 0, &zp, kcred, vflg, NULL, NULL);
 #endif
 		break;
 	case TX_MKDIR_ATTR:
 		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
 		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 		zfs_replay_xvattr(lrattr, &xva);
 		start = (caddr_t)(lr + 1) + xvatlen;
 		zfsvfs->z_fuid_replay =
 		    zfs_replay_fuid_domain(start, &start,
 		    lr->lr_uid, lr->lr_gid);
 		name = (char *)start;
 		zfs_fallthrough;
 
 	case TX_MKDIR:
 		/*
 		 * As for TX_CREATE, the name sits at "start", i.e. after
 		 * any FUID domain strings; "lr + 1" is only correct when
 		 * the record carries no domains.
 		 */
 		if (name == NULL)
 			name = (char *)start;
 
 #if defined(__linux__)
 		error = zfs_mkdir(dzp, name, &xva.xva_vattr,
 		    &zp, kcred, vflg, NULL, zfs_init_idmap);
 #else
 		error = zfs_mkdir(dzp, name, &xva.xva_vattr,
 		    &zp, kcred, vflg, NULL, NULL);
 #endif
 
 		break;
 	case TX_MKXATTR:
 		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred);
 		break;
 	case TX_SYMLINK:
 		name = (char *)(lr + 1);
 		link = name + strlen(name) + 1;
 #if defined(__linux__)
 		error = zfs_symlink(dzp, name, &xva.xva_vattr,
 		    link, &zp, kcred, vflg, zfs_init_idmap);
 #else
 		error = zfs_symlink(dzp, name, &xva.xva_vattr,
 		    link, &zp, kcred, vflg, NULL);
 #endif
 		break;
 	default:
 		error = SET_ERROR(ENOTSUP);
 	}
 
 out:
 	if (error == 0 && zp != NULL) {
 #ifdef __FreeBSD__
 		VOP_UNLOCK1(ZTOV(zp));
 #endif
 		zrele(zp);
 	}
 	zrele(dzp);
 
 	if (zfsvfs->z_fuid_replay)
 		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 	zfsvfs->z_fuid_replay = NULL;
 	return (error);
 }
 
 /*
  * Replay TX_REMOVE / TX_RMDIR: remove the entry whose name follows the
  * lr_remove_t header from directory lr_doid.  Any other transaction
  * type fails with ENOTSUP.
  */
 static int
 zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_remove_t *lr = arg2;
 	char *entry_name = (char *)(lr + 1);
 	znode_t *dzp;
 	int flags = 0;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	error = zfs_zget(zfsvfs, lr->lr_doid, &dzp);
 	if (error != 0)
 		return (error);
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		flags |= FIGNORECASE;
 
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_RMDIR:
 		error = zfs_rmdir(dzp, entry_name, NULL, kcred, flags);
 		break;
 	case TX_REMOVE:
 		error = zfs_remove(dzp, entry_name, kcred, flags);
 		break;
 	default:
 		error = SET_ERROR(ENOTSUP);
 		break;
 	}
 
 	zrele(dzp);
 
 	return (error);
 }
 
 /*
  * Replay TX_LINK: create a link to object lr_link_obj in directory
  * lr_doid under the name that follows the lr_link_t header.
  */
 static int
 zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_link_t *lr = arg2;
 	char *link_name = (char *)(lr + 1);
 	znode_t *dzp, *zp;
 	int flags = 0;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	error = zfs_zget(zfsvfs, lr->lr_doid, &dzp);
 	if (error != 0)
 		return (error);
 
 	error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp);
 	if (error == 0) {
 		if (lr->lr_common.lrc_txtype & TX_CI)
 			flags |= FIGNORECASE;
 
 		error = zfs_link(dzp, zp, link_name, kcred, flags);
 		zrele(zp);
 	}
 
 	/* The directory hold is dropped on both outcomes. */
 	zrele(dzp);
 
 	return (error);
 }
 
 /*
  * Shared worker for the rename replay handlers: rename sname in
  * directory lr_sdoid to tname in directory lr_tdoid.
  *
  * rflags may only contain RENAME_EXCHANGE / RENAME_WHITEOUT on Linux
  * and must be zero elsewhere; on Linux wo_vap must be non-NULL exactly
  * when RENAME_WHITEOUT is requested.  Violations trip a VERIFY.
  */
 static int
 do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname,
     char *tname, uint64_t rflags, vattr_t *wo_vap)
 {
 	znode_t *src_dir, *dst_dir;
 	int flags = 0;
 	int error;
 
 	/* Only Linux currently supports RENAME_* flags. */
 #ifdef __linux__
 	VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT));
 
 	/* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */
 	VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
 #else
 	VERIFY0(rflags);
 #endif
 
 	error = zfs_zget(zfsvfs, lr->lr_sdoid, &src_dir);
 	if (error != 0)
 		return (error);
 
 	error = zfs_zget(zfsvfs, lr->lr_tdoid, &dst_dir);
 	if (error != 0) {
 		zrele(src_dir);
 		return (error);
 	}
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		flags |= FIGNORECASE;
 
 #if defined(__linux__)
 	error = zfs_rename(src_dir, sname, dst_dir, tname, kcred, flags,
 	    rflags, wo_vap, zfs_init_idmap);
 #else
 	error = zfs_rename(src_dir, sname, dst_dir, tname, kcred, flags,
 	    rflags, wo_vap, NULL);
 #endif
 
 	zrele(dst_dir);
 	zrele(src_dir);
 
 	return (error);
 }
 
 /*
  * Replay TX_RENAME: a plain rename (no rename flags, no whiteout
  * attributes).  The source name follows the lr_rename_t header and the
  * target name follows the source name's terminating NUL.
  */
 static int
 zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_rename_t *lr = arg2;
-	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
-	char *tname = sname + strlen(sname) + 1;
+
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
+	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
+	char *tname = sname + strlen(sname) + 1;
 	return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL));
 }
 
 /*
  * Replay TX_RENAME_EXCHANGE: same record layout as TX_RENAME, replayed
  * with RENAME_EXCHANGE.  Only implemented on Linux; other platforms
  * return ENOTSUP.
  */
 static int
 zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
 {
 #ifdef __linux__
 	zfsvfs_t *zfsvfs = arg1;
 	lr_rename_t *lr = arg2;
-	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
-	char *tname = sname + strlen(sname) + 1;
+
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
+	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
+	char *tname = sname + strlen(sname) + 1;
 	return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE,
 	    NULL));
 #else
 	return (SET_ERROR(ENOTSUP));
 #endif
 }
 
 /*
  * Replay TX_RENAME_WHITEOUT: a rename that also creates a whiteout
  * file.  The record's lr_w* fields describe the whiteout object; they
  * are turned into a vattr, the whiteout's object id is claimed with
  * dnode_try_claim(), and the rename is then replayed with
  * RENAME_WHITEOUT.  Only implemented on Linux; other platforms return
  * ENOTSUP.
  */
 static int
 zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
 {
 #ifdef __linux__
 	zfsvfs_t *zfsvfs = arg1;
 	lr_rename_whiteout_t *lr = arg2;
 	int error;
-	/* sname and tname follow lr_rename_whiteout_t */
-	char *sname = (char *)(lr + 1);
-	char *tname = sname + strlen(sname) + 1;
 	/* For the whiteout file. */
 	xvattr_t xva;
 	uint64_t objid;
 	uint64_t dnodesize;
 
+	ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	objid = LR_FOID_GET_OBJ(lr->lr_wfoid);
 	dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT;
 
 	xva_init(&xva);
 	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
 	    lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid);
 
 	/*
 	 * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which
 	 * assigns the object's creation time, generation number, and dnode
 	 * slot count. The generic zfs_rename() has no concept of these
 	 * attributes, so we smuggle the values inside the vattr's otherwise
 	 * unused va_ctime, va_nblocks, and va_fsid fields.
 	 */
 	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime);
 	xva.xva_vattr.va_nblocks = lr->lr_wgen;
 	xva.xva_vattr.va_fsid = dnodesize;
 
 	error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
 	if (error)
 		return (error);
 
+	/* sname and tname follow lr_rename_whiteout_t */
+	char *sname = (char *)(lr + 1);
+	char *tname = sname + strlen(sname) + 1;
 	return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname,
 	    RENAME_WHITEOUT, &xva.xva_vattr));
 #else
 	return (SET_ERROR(ENOTSUP));
 #endif
 }
 
 /*
  * Replay TX_WRITE: write the logged range back into file lr_foid.
  * A file that no longer exists (ENOENT) is not an error; the write is
  * simply dropped.
  */
 static int
 zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_write_t *lr = arg2;
 	char *payload = (char *)(lr + 1);	/* data follows lr_write_t */
 	uint64_t wr_off, wr_len, wr_end;
 	znode_t	*zp;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);
 	if (error != 0) {
 		/*
 		 * Writes can be logged out of order, so the file may
 		 * already have been removed.  Treat that as success.
 		 */
 		return (error == ENOENT ? 0 : error);
 	}
 
 	wr_off = lr->lr_offset;
 	wr_len = lr->lr_length;
 	wr_end = wr_off + wr_len;	/* end of data for this write */
 
 	/*
 	 * A record produced by dmu_sync() covers a whole block and may
 	 * reach past the current end of file.  Replaying only the logged
 	 * range is not enough, because a later TX_WRITE2 may extend the
 	 * eof and expects that block's data to be present.  So the whole
 	 * block is written and the eof is pulled back afterwards.  That
 	 * has to happen inside the single dmu transaction created within
 	 * vn_rdwr -> zfs_write, so the wanted end of file is handed over
 	 * through zfsvfs->z_replay_eof.
 	 */
 
 	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
 
 	/* A header-only record is a dmu_sync() block: write all of it. */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 
 		if (wr_len < blocksize) {
 			wr_off -= wr_off % blocksize;
 			wr_len = blocksize;
 		}
 		if (zp->z_size < wr_end)
 			zfsvfs->z_replay_eof = wr_end;
 	}
 
 	error = zfs_write_simple(zp, payload, wr_len, wr_off, NULL);
 	zrele(zp);
 	zfsvfs->z_replay_eof = 0;	/* safety */
 
 	return (error);
 }
 
 /*
  * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
  * meaning the pool block is already being synced. So now that we always write
  * out full blocks, all we have to do is expand the eof if
  * the file is grown.
  *
  * If the logged range ends beyond the current z_size, the new size is
  * stored in the znode and in its SA inside a transaction; otherwise
  * nothing is changed.  Returns 0 or the error from zfs_zget() /
  * dmu_tx_assign().
  */
 static int
 zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_write_t *lr = arg2;
 	znode_t	*zp;
 	int error;
 	uint64_t end;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
 top:
 	end = lr->lr_offset + lr->lr_length;
 	if (end > zp->z_size) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			if (error == ERESTART) {
 				/*
 				 * Keep the hold on zp: it is used again
 				 * after the jump and released exactly once
 				 * on the way out.
 				 */
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			dmu_tx_abort(tx);
 			zrele(zp);
 			return (error);
 		}
 
 		/*
 		 * Grow the in-core size only once the tx is assigned, so
 		 * a retry still sees the old size and a failure leaves
 		 * the znode unchanged.
 		 */
 		zp->z_size = end;
 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 		    (void *)&zp->z_size, sizeof (uint64_t), tx);
 
 		/* Ensure the replayed seq is updated */
 		(void) zil_replaying(zfsvfs->z_log, tx);
 
 		dmu_tx_commit(tx);
 	}
 
 	zrele(zp);
 
 	return (error);
 }
 
 /*
  * Replay TX_TRUNCATE: issue an F_FREESP zfs_space() request on file
  * lr_foid for the range starting at lr_offset with length lr_length.
  */
 static int
 zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_truncate_t *lr = arg2;
 	flock64_t range = {0};
 	znode_t *zp;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);
 	if (error != 0)
 		return (error);
 
 	range.l_type = F_WRLCK;
 	range.l_whence = SEEK_SET;
 	range.l_start = lr->lr_offset;
 	range.l_len = lr->lr_length;
 
 	error = zfs_space(zp, F_FREESP, &range, O_RDWR | O_LARGEFILE,
 	    lr->lr_offset, kcred);
 
 	zrele(zp);
 
 	return (error);
 }
 
 /*
  * Replay TX_SETATTR on file lr_foid.
  *
  * The basic attributes come from the lr_setattr_t header; ctime is set
  * to the current time.  If the logged mask contains ATTR_XVATTR, an
  * xvattr block follows the header and is decoded as well.  FUID domain
  * strings, if any, come after that and are made available through
  * zfsvfs->z_fuid_replay for the duration of the zfs_setattr() call.
  */
 static int
 zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_setattr_t *lr = arg2;
 	xvattr_t xva;
 	vattr_t *vap = &xva.xva_vattr;
 	void *cursor;
 	znode_t *zp;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	xva_init(&xva);
 	if (byteswap) {
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 		if ((lr->lr_mask & ATTR_XVATTR) &&
 		    zfsvfs->z_version >= ZPL_VERSION_INITIAL)
 			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 	}
 
 	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);
 	if (error != 0)
 		return (error);
 
 	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
 	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
 
 	vap->va_size = lr->lr_size;
 	ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
 	gethrestime(&vap->va_ctime);
 	vap->va_mask |= ATTR_CTIME;
 
 	/*
 	 * Decode the xvattr block when one was logged; cursor ends up at
 	 * the first byte after it (or right after the header otherwise).
 	 */
 	cursor = lr + 1;
 	if ((vap->va_mask & ATTR_XVATTR) == 0) {
 		xva.xva_vattr.va_mask &= ~ATTR_XVATTR;
 	} else {
 		lr_attr_t *lrattr = cursor;
 
 		zfs_replay_xvattr(lrattr, &xva);
 		cursor = (caddr_t)cursor +
 		    ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 	}
 
 	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(cursor, &cursor,
 	    lr->lr_uid, lr->lr_gid);
 
 #if defined(__linux__)
 	error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap);
 #else
 	error = zfs_setattr(zp, vap, 0, kcred, NULL);
 #endif
 
 	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 	zfsvfs->z_fuid_replay = NULL;
 	zrele(zp);
 
 	return (error);
 }
 
 /*
  * Replay TX_SETSAXATTR: set or remove one SA-based extended attribute
  * on file lr_foid.
  *
  * The xattr name follows the lr_setsaxattr_t header.  lr_size == 0
  * means "remove"; otherwise the value (lr_size bytes) follows the
  * name's terminating NUL.  The change is applied to the znode's cached
  * xattr nvlist and then written with zfs_sa_set_xattr().  On any error
  * after the nvlist was obtained, the cached nvlist is discarded.
  *
  * Returns 0 or an errno (EFBIG when the value or the encoded nvlist
  * exceeds its limit).
  */
 static int
 zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_setsaxattr_t *lr = arg2;
 	znode_t *zp;
 	nvlist_t *nvl;
 	size_t sa_size;
 	char *name;
 	char *value;
 	size_t size;
 	int error = 0;
 
 	ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa,
 	    SPA_FEATURE_ZILSAXATTR));
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	/*
 	 * Size checks come after the byteswap: lr_size is only meaningful
 	 * once the record is in native byte order.
 	 */
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size);
+
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
 	mutex_enter(&zp->z_lock);
 	if (zp->z_xattr_cached == NULL)
 		error = zfs_sa_get_xattr(zp);
 	mutex_exit(&zp->z_lock);
 
 	if (error)
 		goto out;
 
 	ASSERT(zp->z_xattr_cached);
 	nvl = zp->z_xattr_cached;
 
 	/* Get xattr name, value and size from log record */
 	size = lr->lr_size;
 	name = (char *)(lr + 1);
 	if (size == 0) {
 		value = NULL;
 		error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
 	} else {
 		value = name + strlen(name) + 1;
 		/* Limited to 32k to keep nvpair memory allocations small */
 		if (size > DXATTR_MAX_ENTRY_SIZE) {
 			error = SET_ERROR(EFBIG);
 			goto out;
 		}
 
 		/* Prevent the DXATTR SA from consuming the entire SA region */
 		error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
 		if (error)
 			goto out;
 
 		if (sa_size > DXATTR_MAX_SA_SIZE) {
 			error = SET_ERROR(EFBIG);
 			goto out;
 		}
 
 		error = nvlist_add_byte_array(nvl, name, (uchar_t *)value,
 		    size);
 	}
 
 	/*
 	 * Update the SA for additions, modifications, and removals. On
 	 * error drop the inconsistent cached version of the nvlist, it
 	 * will be reconstructed from the ARC when next accessed.
 	 */
 	if (error == 0)
 		error = zfs_sa_set_xattr(zp, name, value, size);
 
 	if (error) {
 		nvlist_free(nvl);
 		zp->z_xattr_cached = NULL;
 	}
 
 out:
 	rw_exit(&zp->z_xattr_lock);
 	zrele(zp);
 	return (error);
 }
 
 /*
  * Replay TX_ACL_V0: set the old-format ACL that follows the
  * lr_acl_v0_t header (lr_aclcnt entries of type ace_t) on file
  * lr_foid via zfs_setsecattr().
  */
 static int
 zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_acl_v0_t *lr = arg2;
 	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
 	vsecattr_t vsa = {0};
 	znode_t *zp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	/*
 	 * Check the record size once the header is in native byte order
 	 * (lr_aclcnt is not meaningful before that) and before the ACE
 	 * payload is touched.
 	 */
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) +
+	    sizeof (ace_t) * lr->lr_aclcnt);
+
 	if (byteswap)
 		zfs_oldace_byteswap(ace, lr->lr_aclcnt);
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
 	vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
 	vsa.vsa_aclflags = 0;
 	vsa.vsa_aclentp = ace;
 
 	error = zfs_setsecattr(zp, &vsa, 0, kcred);
 
 	zrele(zp);
 
 	return (error);
 }
 
 /*
  * Replaying ACLs is complicated by FUID support.
  * The log record may contain some optional data
  * to be used for replaying FUID's.  These pieces
  * are the actual FUIDs that were created initially.
  * The FUID table index may no longer be valid and
  * during zfs_create() a new index may be assigned.
  * Because of this the log will contain the original
  * domain+rid in order to create a new FUID.
  *
  * The individual ACEs may contain an ephemeral uid/gid which is no
  * longer valid and will need to be replaced with an actual FUID.
  *
  * Record layout after lr_acl_t: the ACL entries (lr_acl_bytes, padded
  * per ZIL_ACE_LENGTH()), then lr_fuidcnt logged FUIDs and lr_domcnt
  * domain strings.  The ACL is applied to file lr_foid with
  * zfs_setsecattr(); zfsvfs->z_fuid_replay is populated only when
  * lr_fuidcnt is non-zero and is always reset before returning.
  */
 static int
 zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_acl_t *lr = arg2;
 	ace_t *ace = (ace_t *)(lr + 1);
 	vsecattr_t vsa = {0};
 	znode_t *zp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	/*
 	 * Check the record size once the header is in native byte order
 	 * (lr_acl_bytes is not meaningful before that) and before the
 	 * ACE/FUID payload is touched.
 	 */
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes);
+
 	if (byteswap) {
 		zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
 		if (lr->lr_fuidcnt) {
 			byteswap_uint64_array((caddr_t)ace +
 			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
 			    lr->lr_fuidcnt * sizeof (uint64_t));
 		}
 	}
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
 	vsa.vsa_aclentp = ace;
 	vsa.vsa_aclentsz = lr->lr_acl_bytes;
 	vsa.vsa_aclflags = lr->lr_acl_flags;
 
 	if (lr->lr_fuidcnt) {
 		void *fuidstart = (caddr_t)ace +
 		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);
 
 		zfsvfs->z_fuid_replay =
 		    zfs_replay_fuids(fuidstart, &fuidstart,
 		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
 	}
 
 	error = zfs_setsecattr(zp, &vsa, 0, kcred);
 
 	if (zfsvfs->z_fuid_replay)
 		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 
 	zfsvfs->z_fuid_replay = NULL;
 	zrele(zp);
 
 	return (error);
 }
 
 /*
  * Replay TX_CLONE_RANGE: hand the logged block pointers (lr_bps,
  * lr_nbps entries) for the range lr_offset/lr_length of file lr_foid
  * to zfs_clone_range_replay().  A file that no longer exists (ENOENT)
  * is not an error.
  */
 static int
 zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
 {
 	zfsvfs_t *zfsvfs = arg1;
 	lr_clone_range_t *lr = arg2;
 	znode_t *zp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	/*
 	 * Size checks come after the byteswap: lr_nbps is only meaningful
 	 * once the record is in native byte order.
 	 */
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
+	    lr_bps[lr->lr_nbps]));
+
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 		/*
 		 * Clones can be logged out of order, so don't be surprised if
 		 * the file is gone - just return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 		return (error);
 	}
 
 	error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length,
 	    lr->lr_blksz, lr->lr_bps, lr->lr_nbps);
 
 	zrele(zp);
 	return (error);
 }
 
 /*
  * Callback vectors for replaying records
  *
  * The table has TX_MAX_TYPE slots and is indexed by transaction type;
  * the comment on each entry names the type it serves.  Slot 0 ("no
  * such type") is zfs_replay_error, which always fails with ENOTSUP.
  * Several types share one handler, which then distinguishes them by
  * the transaction type stored in the record.
  */
 zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_error,	/* no such type */
 	zfs_replay_create,	/* TX_CREATE */
 	zfs_replay_create,	/* TX_MKDIR */
 	zfs_replay_create,	/* TX_MKXATTR */
 	zfs_replay_create,	/* TX_SYMLINK */
 	zfs_replay_remove,	/* TX_REMOVE */
 	zfs_replay_remove,	/* TX_RMDIR */
 	zfs_replay_link,	/* TX_LINK */
 	zfs_replay_rename,	/* TX_RENAME */
 	zfs_replay_write,	/* TX_WRITE */
 	zfs_replay_truncate,	/* TX_TRUNCATE */
 	zfs_replay_setattr,	/* TX_SETATTR */
 	zfs_replay_acl_v0,	/* TX_ACL_V0 */
 	zfs_replay_acl,		/* TX_ACL */
 	zfs_replay_create_acl,	/* TX_CREATE_ACL */
 	zfs_replay_create,	/* TX_CREATE_ATTR */
 	zfs_replay_create_acl,	/* TX_CREATE_ACL_ATTR */
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
 	zfs_replay_create,	/* TX_MKDIR_ATTR */
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
 	zfs_replay_write2,	/* TX_WRITE2 */
 	zfs_replay_setsaxattr,	/* TX_SETSAXATTR */
 	zfs_replay_rename_exchange,	/* TX_RENAME_EXCHANGE */
 	zfs_replay_rename_whiteout,	/* TX_RENAME_WHITEOUT */
 	zfs_replay_clone_range,	/* TX_CLONE_RANGE */
 };
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index a11886136994..37fb792f5ddc 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1,4233 +1,4253 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2018 Datto Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
  * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
  * calls that change the file system. Each itx has enough information to
  * be able to replay them after a system crash, power loss, or
  * equivalent failure mode. These are stored in memory until either:
  *
  *   1. they are committed to the pool by the DMU transaction group
  *      (txg), at which point they can be discarded; or
  *   2. they are committed to the on-disk ZIL for the dataset being
  *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
  *      requirement).
  *
  * In the event of a crash or power loss, the itxs contained by each
  * dataset's on-disk ZIL will be replayed when that dataset is first
  * instantiated (e.g. if the dataset is a normal filesystem, when it is
  * first mounted).
  *
  * As hinted at above, there is one ZIL per dataset (both the in-memory
  * representation, and the on-disk representation). The on-disk format
  * consists of 3 parts:
  *
  * 	- a single, per-dataset, ZIL header; which points to a chain of
  * 	- zero or more ZIL blocks; each of which contains
  * 	- zero or more ZIL records
  *
  * A ZIL record holds the information necessary to replay a single
  * system call transaction. A ZIL block can hold many ZIL records, and
  * the blocks are chained together, similarly to a singly linked list.
  *
  * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
  * block in the chain, and the ZIL header points to the first block in
  * the chain.
  *
  * Note, there is not a fixed place in the pool to hold these ZIL
  * blocks; they are dynamically allocated and freed as needed from the
  * blocks available on the pool, though they can be preferentially
  * allocated from a dedicated "log" vdev.
  */
 
 /*
  * This controls the amount of time that a ZIL block (lwb) will remain
  * "open" when it isn't "full", and it has a thread waiting for it to be
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
 static uint_t zfs_commit_timeout_pct = 5;
 
 /*
  * Minimal time we care to delay commit waiting for more ZIL records.
  * At least FreeBSD kernel can't sleep for less than 2us at its best.
- * So requests to sleep for less then 5us is a waste of CPU time with
+ * So a request to sleep for less than 5us is a waste of CPU time with
  * a risk of significant log latency increase due to oversleep.
  */
 static uint64_t zil_min_commit_timeout = 5000;
 
 /*
  * See zil.h for more information about these fields.
  */
 static zil_kstat_values_t zil_stats = {
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_alloc",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_alloc",	KSTAT_DATA_UINT64 },
 };
 
+/*
+ * Global ZIL statistics.  zil_kstats_global_update() copies the current
+ * zil_sums_global values into zil_stats.
+ * NOTE(review): zil_kstats_global is presumably the kstat handle that
+ * exports zil_stats; its setup is not in this part of the file.
+ */
 static zil_sums_t zil_sums_global;
 static kstat_t *zil_kstats_global;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
  * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
  * the disk(s) by the ZIL after an LWB write has completed. Setting this
  * will cause ZIL corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 
+/*
+ * zil_lwb_cache backs lwb_t allocations (see zil_alloc_lwb() and
+ * zil_free_lwb()).
+ * NOTE(review): zil_zcw_cache is presumably the matching cache for commit
+ * waiters; it is not used in this part of the file.
+ */
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
+/* Forward declarations of file-local (static) helpers. */
 static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
 static itx_t *zil_itx_clone(itx_t *oitx);
 
+/*
+ * AVL comparator for zilog->zl_bp_tree (installed by zil_bp_tree_init()).
+ * Orders zil_bp_node_t entries by the vdev of their DVA first and by the
+ * DVA offset second.
+ * NOTE(review): zil_bp_tree_add() passes a bare dva_t * as the avl_find()
+ * key, so this presumably relies on zn_dva being the first member of
+ * zil_bp_node_t - confirm against the struct declaration.
+ */
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
 	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
+/*
+ * Create the AVL tree zilog->zl_bp_tree of zil_bp_node_t entries, ordered
+ * by zil_bp_compare().
+ */
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
+/*
+ * Free every zil_bp_node_t still in zilog->zl_bp_tree, then destroy the
+ * tree itself.
+ */
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	zil_bp_node_t *zn;
 	void *cookie = NULL;
 
 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
 		kmem_free(zn, sizeof (zil_bp_node_t));
 
 	avl_destroy(t);
 }
 
+/*
+ * Remember bp in zilog->zl_bp_tree, keyed by its BP_IDENTITY() DVA.
+ *
+ * Returns 0 when the DVA was inserted, and also for embedded block pointers,
+ * which are not tracked at all.  Returns EEXIST when the DVA is already in
+ * the tree.
+ */
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	const dva_t *dva;
 	zil_bp_node_t *zn;
 	avl_index_t where;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	dva = BP_IDENTITY(bp);
 
 	if (avl_find(t, dva, &where) != NULL)
 		return (SET_ERROR(EEXIST));
 
 	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 	zn->zn_dva = *dva;
 	avl_insert(t, zn, where);
 
 	return (0);
 }
 
+/*
+ * Return zilog->zl_header through an explicit (zil_header_t *) cast.
+ * NOTE(review): the cast presumably drops a const qualifier so that
+ * syncing-context callers may modify the header - confirm against the
+ * zilog_t declaration.
+ */
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
+/*
+ * Seed the checksum words of bp for a new log chain: two pseudo-random GUID
+ * words, the id of the zilog's objset, and a starting sequence number of 1.
+ * The return values of random_get_pseudo_bytes() are deliberately ignored.
+ */
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *zc = &bp->blk_cksum;
 
 	(void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0],
 	    sizeof (zc->zc_word[ZIL_ZC_GUID_0]));
 	(void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1],
 	    sizeof (zc->zc_word[ZIL_ZC_GUID_1]));
 	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
+/*
+ * kstat update callback for the global ZIL statistics.  ksp->ks_data is
+ * expected to be &zil_stats.  A KSTAT_WRITE request is refused with EACCES;
+ * any other request refreshes the kstat values from zil_sums_global and
+ * returns 0.
+ */
 static int
 zil_kstats_global_update(kstat_t *ksp, int rw)
 {
 	zil_kstat_values_t *zs = ksp->ks_data;
 	ASSERT3P(&zil_stats, ==, zs);
 
 	if (rw == KSTAT_WRITE) {
 		return (SET_ERROR(EACCES));
 	}
 
 	zil_kstat_values_update(zs, &zil_sums_global);
 
 	return (0);
 }
 
 /*
  * Read a log block and make sure it's valid.
+ *
+ * The block is read through the ARC into *abuf; with decrypt == B_FALSE the
+ * read is issued with ZIO_FLAG_RAW.  On success [*begin, *end) bounds the
+ * log records inside the buffer and *nbp is a copy of the block's pointer
+ * to the next block in the chain.
+ *
+ * Returns 0 on success, the arc_read() error if the read fails, or ECKSUM
+ * if the chain checksum or zc_nused fail validation.  This function never
+ * releases *abuf, so the caller must destroy it when it has been set (it is
+ * set on the ECKSUM path).
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
     blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	if (!decrypt)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
 	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+			/*
+			 * ZILOG2 layout: the zil_chain_t sits at the start of
+			 * the buffer, the records follow it, and zc_nused
+			 * counts the chain header as well as the records.
+			 */
 			zil_chain_t *zilc = (*abuf)->b_data;
 			char *lr = (char *)(zilc + 1);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused - sizeof (*zilc);
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
+			/*
+			 * Any other checksum: the records start at the
+			 * beginning of the buffer, the zil_chain_t occupies
+			 * the last sizeof (zil_chain_t) bytes, and zc_nused
+			 * counts the records only.
+			 */
 			char *lr = (*abuf)->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Read a TX_WRITE log data block.
+ *
+ * Reads the block referenced by lr->lr_blkptr.  When wbuf is non-NULL the
+ * block contents are copied into it (arc_buf_size() bytes); when wbuf is
+ * NULL the block is only read, in raw form, to check that it is readable.
+ * If the block pointer is a hole nothing is read: a non-NULL wbuf is
+ * zero-filled for MAX(BP_GET_LSIZE(bp), lr->lr_length) bytes and 0 is
+ * returned.
+ *
+ * Returns 0 on success or the error from arc_read().
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
 		if (wbuf != NULL)
 			memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		return (0);
 	}
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/*
 	 * If we are not using the resulting data, we are just checking that
 	 * it hasn't been corrupted so we don't need to waste CPU time
 	 * decompressing and decrypting it.
 	 */
 	if (wbuf == NULL)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		if (wbuf != NULL)
 			memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
+/*
+ * Initialize every wmsum counter in zs to 0.  The counters mirror, one for
+ * one, the fields filled in by zil_kstat_values_update(); zil_sums_fini()
+ * is the matching teardown.
+ */
 void
 zil_sums_init(zil_sums_t *zs)
 {
 	wmsum_init(&zs->zil_commit_count, 0);
 	wmsum_init(&zs->zil_commit_writer_count, 0);
 	wmsum_init(&zs->zil_itx_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_bytes, 0);
 	wmsum_init(&zs->zil_itx_copied_count, 0);
 	wmsum_init(&zs->zil_itx_copied_bytes, 0);
 	wmsum_init(&zs->zil_itx_needcopy_count, 0);
 	wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
 }
 
+/*
+ * Tear down every wmsum counter in zs; the counterpart of zil_sums_init().
+ */
 void
 zil_sums_fini(zil_sums_t *zs)
 {
 	wmsum_fini(&zs->zil_commit_count);
 	wmsum_fini(&zs->zil_commit_writer_count);
 	wmsum_fini(&zs->zil_itx_count);
 	wmsum_fini(&zs->zil_itx_indirect_count);
 	wmsum_fini(&zs->zil_itx_indirect_bytes);
 	wmsum_fini(&zs->zil_itx_copied_count);
 	wmsum_fini(&zs->zil_itx_copied_bytes);
 	wmsum_fini(&zs->zil_itx_needcopy_count);
 	wmsum_fini(&zs->zil_itx_needcopy_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_count);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_write);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_write);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
+/*
+ * Copy the current value of each wmsum counter in zil_sums into the ui64
+ * value of the same-named kstat entry in zs.
+ */
 void
 zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
 {
 	zs->zil_commit_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_count);
 	zs->zil_commit_writer_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_writer_count);
 	zs->zil_itx_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_count);
 	zs->zil_itx_indirect_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_count);
 	zs->zil_itx_indirect_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_bytes);
 	zs->zil_itx_copied_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_count);
 	zs->zil_itx_copied_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_bytes);
 	zs->zil_itx_needcopy_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_count);
 	zs->zil_itx_needcopy_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_bytes);
 	zs->zil_itx_metaslab_normal_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
 	zs->zil_itx_metaslab_normal_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
 	zs->zil_itx_metaslab_normal_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
 	zs->zil_itx_metaslab_normal_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
 	zs->zil_itx_metaslab_slog_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
 	zs->zil_itx_metaslab_slog_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
 	zs->zil_itx_metaslab_slog_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
 	zs->zil_itx_metaslab_slog_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
+ *
+ * parse_blk_func is called for every block pointer in the chain and
+ * parse_lr_func for every record in each block that reads back valid; arg
+ * and txg are passed through to both.  A non-zero return from either
+ * callback, or a failed block read, stops the walk and becomes the return
+ * value.  The result is also recorded in zilog: zl_parse_error, the highest
+ * block and record sequence numbers seen (zl_parse_blk_seq and
+ * zl_parse_lr_seq) and the block and record counts (zl_parse_blk_count and
+ * zl_parse_lr_count).
+ *
+ * Record lengths come from disk, so they are validated at runtime as well
+ * as asserted: a record whose lr_t header does not fit in the rest of the
+ * block, or whose lrc_reclen is smaller than an lr_t or larger than the
+ * bytes remaining, ends the parse with ECKSUM.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
 		char *lrp, *end;
 		arc_buf_t *abuf = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 
 		error = parse_blk_func(zilog, &blk, arg, txg);
 		if (error != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    &lrp, &end, &abuf);
 		if (error != 0) {
 			if (abuf)
 				arc_buf_destroy(abuf, &abuf);
 			if (claimed) {
 				char name[ZFS_MAX_DATASET_NAME_LEN];
 
 				dmu_objset_name(zilog->zl_os, name);
 
 				cmn_err(CE_WARN, "ZFS read log block error %d, "
 				    "dataset %s, seq 0x%llx\n", error, name,
 				    (u_longlong_t)blk_seq);
 			}
 			break;
 		}
 
 		for (; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
+			/* lrp < end here, so the difference is positive. */
+			uint64_t remain = end - lrp;
+
+			/*
+			 * ASSERTs are compiled out of production builds, so
+			 * check the on-disk length for real before trusting
+			 * it.  The full 64-bit lrc_reclen is tested (not the
+			 * truncated int copy): a zero length would otherwise
+			 * loop forever and an oversized one would walk lrp
+			 * out of the buffer.  The header itself must fit
+			 * before lrc_reclen may be read.
+			 */
+			if (remain < sizeof (lr_t) ||
+			    lr->lrc_reclen < sizeof (lr_t) ||
+			    lr->lrc_reclen > remain) {
+				cmn_err(CE_WARN, "zil_parse: log record "
+				    "has an invalid length");
+				error = SET_ERROR(ECKSUM);
+				arc_buf_destroy(abuf, &abuf);
+				goto done;
+			}
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
+			ASSERT3U(reclen, <=, end - lrp);
 			if (lr->lrc_seq > claim_lr_seq) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			error = parse_lr_func(zilog, lr, arg, txg);
 			if (error != 0) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 		arc_buf_destroy(abuf, &abuf);
 	}
 done:
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	zil_bp_tree_fini(zilog);
 
 	return (error);
 }
 
+/*
+ * zil_parse() block callback used when rewinding to a checkpoint (see the
+ * comment in the body); tx is unused.
+ *
+ * Returns -1 for a block born at or after first_txg, which makes zil_parse()
+ * stop.  Otherwise returns 0, freeing the block with zio_free() unless its
+ * DVA could not be added to the bp tree (for example because it was already
+ * seen).
+ */
 static int
 zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	(void) tx;
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * As we call this function from the context of a rewind to a
 	 * checkpoint, each ZIL block whose txg is later than the txg
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
 	if (bp->blk_birth >= first_txg)
 		return (-1);
 
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	zio_free(zilog->zl_spa, first_txg, bp);
 	return (0);
 }
 
+/*
+ * Log-record callback that ignores all of its arguments and returns 0.
+ */
 static int
 zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	(void) zilog, (void) lrc, (void) tx, (void) first_txg;
 	return (0);
 }
 
+/*
+ * Block callback for claiming the log.
+ *
+ * Returns 0 without issuing any I/O for a hole, for a block born before
+ * first_txg, and for a block whose DVA could not be added to the bp tree
+ * (already seen).  Otherwise returns the result of waiting on zio_claim(),
+ * which is issued with txg 0 when tx is NULL and first_txg otherwise.
+ */
 static int
 zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
 	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
 	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
+/*
+ * Claim the data block referenced by a write record.  lrc is treated as an
+ * lr_write_t and its length is asserted to cover that structure; the record
+ * type is not re-checked here (zil_claim_log_record() dispatches only
+ * TX_WRITE records to this function).
+ *
+ * If the data block was born at or after first_txg it is first read back
+ * without using the data; a read error is returned and ends the claim.
+ * Otherwise the result of zil_claim_log_block() on lr_blkptr is returned.
+ */
 static int
 zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	int error;
 
-	ASSERT(lrc->lrc_txtype == TX_WRITE);
+	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (lr->lr_blkptr.blk_birth >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
+/*
+ * Claim-time handler for a clone-range record.  lrc is treated as an
+ * lr_clone_range_t; its length is asserted to cover the fixed part of the
+ * structure and all lr_nbps block pointers.
+ *
+ * With a NULL tx nothing is done.  Otherwise brt_pending_add() is called for
+ * every block pointer in the record that is neither a hole nor embedded.
+ * Always returns 0.
+ */
 static int
 zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa;
 	uint_t ii;
 
-	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
+	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	/*
 	 * XXX: Do we need to byteswap lr?
 	 */
 
 	spa = zilog->zl_spa;
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		/*
-		 * When data in embedded into BP there is no need to create
+		 * When data is embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
 
 	return (0);
 }
 
+/*
+ * Record callback for claiming the log: dispatches TX_WRITE records to
+ * zil_claim_write() and TX_CLONE_RANGE records to zil_claim_clone_range().
+ * Every other record type is ignored and returns 0.
+ */
 static int
 zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 	case TX_CLONE_RANGE:
 		return (zil_claim_clone_range(zilog, lrc, tx));
 	default:
 		return (0);
 	}
 }
 
+/*
+ * Block callback that frees bp in the txg of tx; claim_txg is unused.
+ * Always returns 0.
+ */
 static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
 {
 	(void) claim_txg;
 
 	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
+/*
+ * Free the data block referenced by a write record.  lrc is treated as an
+ * lr_write_t and its length is asserted to cover that structure.
+ *
+ * The block is freed in the txg of tx only if it was born at or after
+ * claim_txg, its DVA is newly added to the bp tree, and it is not a hole.
+ * Always returns 0.
+ */
 static int
 zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
-	ASSERT(lrc->lrc_txtype == TX_WRITE);
+	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
 	    !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 	}
 
 	return (0);
 }
 
+/*
+ * Free-time handler for a clone-range record.  lrc is treated as an
+ * lr_clone_range_t; its length is asserted to cover the fixed part of the
+ * structure and all lr_nbps block pointers.
+ *
+ * With a NULL tx nothing is done.  Otherwise zio_free() is called, in the
+ * txg of tx, for every block pointer in the record that is not a hole.
+ * Always returns 0.
+ */
 static int
 zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa;
 	uint_t ii;
 
-	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
+	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	spa = zilog->zl_spa;
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		if (!BP_IS_HOLE(bp)) {
 			zio_free(spa, dmu_tx_get_txg(tx), bp);
 		}
 	}
 
 	return (0);
 }
 
+/*
+ * Record callback for freeing the log.  Does nothing when claim_txg is 0.
+ * Otherwise dispatches TX_WRITE records to zil_free_write() and
+ * TX_CLONE_RANGE records to zil_free_clone_range(); every other record type
+ * is ignored and returns 0.
+ */
 static int
 zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t claim_txg)
 {
 
 	if (claim_txg == 0) {
 		return (0);
 	}
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_free_write(zilog, lrc, tx, claim_txg));
 	case TX_CLONE_RANGE:
 		return (zil_free_clone_range(zilog, lrc, tx));
 	default:
 		return (0);
 	}
 }
 
+/*
+ * Comparator that orders zil_vdev_node_t entries by their zv_vdev value.
+ */
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	return (TREE_CMP(v1, v2));
 }
 
 /*
  * Allocate a new lwb.  We may already have a block pointer for it, in which
  * case we get size and version from there.  Or we may not yet, in which case
  * we choose them here and later make the block allocation match.
+ *
+ * When bp is non-NULL the sz argument is ignored and replaced by the
+ * block's logical size.  The new lwb gets a data buffer of sz bytes, is
+ * appended to zilog->zl_lwb_list under zl_lock and, unless state is
+ * LWB_STATE_NEW, also becomes zl_last_lwb_opened.  Returns the new lwb.
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
     uint64_t txg, lwb_state_t state)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	if (bp) {
 		lwb->lwb_blk = *bp;
 		lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
 		sz = BP_GET_LSIZE(bp);
 	} else {
 		BP_ZERO(&lwb->lwb_blk);
 		lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
 		    SPA_VERSION_SLIM_ZIL);
 	}
 	lwb->lwb_slog = slog;
 	lwb->lwb_error = 0;
 	if (lwb->lwb_slim) {
+		/* Whole buffer is usable; the chain header is already used. */
 		lwb->lwb_nmax = sz;
 		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
 	} else {
+		/* Room for the zil_chain_t is kept out of the usable space. */
 		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
 		lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	}
 	lwb->lwb_sz = sz;
 	lwb->lwb_state = state;
 	lwb->lwb_buf = zio_buf_alloc(sz);
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
 	lwb->lwb_issued_timestamp = 0;
 	lwb->lwb_issued_txg = 0;
 	lwb->lwb_alloc_txg = txg;
 	lwb->lwb_max_txg = 0;
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	if (state != LWB_STATE_NEW)
 		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
 }
 
+/*
+ * Return lwb to zil_lwb_cache.
+ *
+ * The caller must hold zilog->zl_lock.  The lwb must be in state
+ * LWB_STATE_NEW or LWB_STATE_FLUSH_DONE, must have no zios attached, and
+ * its itx list, waiter list and vdev tree must be empty.  This function does
+ * not free lwb_buf or remove the lwb from zl_lwb_list; it only clears
+ * zl_last_lwb_opened if that still points at this lwb.
+ */
 static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_child_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
 	 * valid, and prevent use-after-free errors.
 	 */
 	if (zilog->zl_last_lwb_opened == lwb)
 		zilog->zl_last_lwb_opened = NULL;
 
 	kmem_cache_free(zil_lwb_cache, lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
+ *
+ * Adds zilog to the pool's dp_dirty_zilogs list for txg.  When
+ * txg_list_add() returns non-zero, an extra reference is taken on the
+ * dataset's dbuf and zl_dirty_max_txg is raised to txg if txg is larger.
+ * Panics if the dataset is a snapshot.
  */
 static void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, zilog);
 
 		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
 	}
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
+ *
+ * Returns B_TRUE if zilog is a member of dp_dirty_zilogs for slot
+ * (txg & TXG_MASK), B_FALSE otherwise.
  */
 static boolean_t __maybe_unused
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
+ *
+ * Checks all TXG_SIZE slots of dp_dirty_zilogs and returns B_TRUE as soon
+ * as zilog is found in one of them, B_FALSE if it is in none.
  */
 static boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
- * Its called in zil_commit context (zil_process_commit_list()/zil_create()).
- * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled.
+ * It's called in zil_commit context (zil_process_commit_list()/zil_create()).
+ * It activates SPA_FEATURE_ZILSAXATTR feature, if it's enabled.
  * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every
  * zil_commit.
+ *
+ * Nothing is done for ZVOL objsets, when the feature is not enabled on the
+ * pool, or when it is already active on the dataset.  Otherwise the dataset
+ * is dirtied in a new tx, the activation is flagged in
+ * ds_feature_activation under ds_lock, and the function blocks until that
+ * txg has synced.
  */
 static void
 zil_commit_activate_saxattr_feature(zilog_t *zilog)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 
 	if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
 	    !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(ds, tx);
 		txg = dmu_tx_get_txg(tx);
 
 		mutex_enter(&ds->ds_lock);
 		ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 		    (void *)B_TRUE;
 		mutex_exit(&ds->ds_lock);
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 }
 
 /*
  * Create an on-disk intent log.
+ *
+ * Waits for any previous destroy to sync, then reuses the block in zh_log
+ * or, if there is none or it has the wrong endianness, allocates a new
+ * ZIL_MIN_BLKSZ block in a fresh tx and initializes its log chain.  When a
+ * tx was created the function blocks until that txg has synced.
+ *
+ * Returns the lwb allocated for the first log block, or NULL if the block
+ * allocation failed.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t slog = FALSE;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		/*
 		 * If "zilsaxattr" feature is enabled on zpool, then activate
 		 * it now when we're creating the ZIL chain. We can't wait with
 		 * this until we write the first xattr log record because we
 		 * need to wait for the feature activation to sync out.
 		 */
 		if (spa_feature_is_enabled(zilog->zl_spa,
 		    SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
 		    DMU_OST_ZVOL) {
 			mutex_enter(&ds->ds_lock);
 			ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 			    (void *)B_TRUE;
 			mutex_exit(&ds->ds_lock);
 		}
 
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	} else {
 		/*
 		 * This branch covers the case where we enable the feature on a
 		 * zpool that has existing ZIL headers.
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 	}
 	IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
 	    dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
 
 	ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 	IMPLY(error == 0, lwb != NULL);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header. If keep_first
  * is set, then we're replaying a log with no content. We want to keep the
  * first block, however, so that the first synchronous transaction doesn't
  * require a txg_wait_synced() in zil_create(). We don't need to
  * txg_wait_synced() here either when keep_first is set, because both
  * zil_create() and zil_destroy() will wait for any in-progress destroys
  * to complete.
  * Return B_TRUE if there were any entries to replay.
+ *
+ * Returns B_FALSE, without creating a tx, when the header's zh_log is a
+ * hole.  The header is copied to zl_old_header before that check.
  */
 boolean_t
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return (B_FALSE);
 
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
+		/*
+		 * In-memory lwbs exist: release each one's buffer and
+		 * on-disk block, then the lwb itself.
+		 */
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			if (!BP_IS_HOLE(&lwb->lwb_blk))
 				zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
 			zil_free_lwb(zilog, lwb);
 		}
 	} else if (!keep_first) {
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 
 	return (B_TRUE);
 }
 
 /*
  * Walk the log chain with zil_parse(), handing every block to
  * zil_free_log_block() and every record to zil_free_log_record() in the
  * context of 'tx'.  The in-memory lwb list must already be empty.  The
  * result of zil_parse() is deliberately ignored.
  */
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	uint64_t claim_txg;
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	claim_txg = zilog->zl_header->zh_claim_txg;
 	(void) zil_parse(zilog, zil_free_log_block, zil_free_log_record,
 	    tx, claim_txg, B_FALSE);
 }
 
 /*
  * Claim (or discard) the intent log of dataset 'ds'.
  *
  * dp	 - pool the dataset belongs to; used to take ownership of the objset.
  * ds	 - dataset whose ZIL header is examined.
  * txarg - the dmu_tx_t to operate in; its txg is asserted to be the pool's
  *	   first txg.
  *
  * Depending on the pool state the log chain is either cleared (header
  * block pointer zeroed) or its blocks are claimed and the claim txg and
  * sequence numbers are recorded in the header.  In both cases the dataset
  * is dirtied in 'tx' so the updated header gets written out.
  *
  * Always returns 0.  Failure to own the objset is reported with a warning
  * (except for EBUSY) and otherwise ignored.
  */
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	zilog_t *zilog;
 	uint64_t first_txg;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			/* 'error' is a signed int, so print it with %d. */
 			cmn_err(CE_WARN, "can't open objset for %llu, error %d",
 			    (unsigned long long)ds->ds_object, error);
 		}
 
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
 	first_txg = spa_min_claim_txg(zilog->zl_spa);
 
 	/*
 	 * If the spa_log_state is not set to be cleared, check whether
 	 * the current uberblock is a checkpoint one and if the current
 	 * header has been claimed before moving on.
 	 *
 	 * If the current uberblock is a checkpointed uberblock then
 	 * one of the following scenarios took place:
 	 *
 	 * 1] We are currently rewinding to the checkpoint of the pool.
 	 * 2] We crashed in the middle of a checkpoint rewind but we
 	 *    did manage to write the checkpointed uberblock to the
 	 *    vdev labels, so when we tried to import the pool again
 	 *    the checkpointed uberblock was selected from the import
 	 *    procedure.
 	 *
 	 * In both cases we want to zero out all the ZIL blocks, except
 	 * the ones that have been claimed at the time of the checkpoint
 	 * (their zh_claim_txg != 0). The reason is that these blocks
 	 * may be corrupted since we may have reused their locations on
 	 * disk after we took the checkpoint.
 	 *
 	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
 	 * when we first figure out whether the current uberblock is
 	 * checkpointed or not. Unfortunately, that would discard all
 	 * the logs, including the ones that are claimed, and we would
 	 * leak space.
 	 */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
 	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)) {
 		if (!BP_IS_HOLE(&zh->zh_log)) {
 			(void) zil_parse(zilog, zil_clear_log_block,
 			    zil_noop_log_record, tx, first_txg, B_FALSE);
 		}
 		BP_ZERO(&zh->zh_log);
 		/* Encrypted objsets flag this txg's write as raw. */
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, B_FALSE, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we are not rewinding and opening the pool normally, then
 	 * the min_claim_txg should be equal to the first txg of the pool.
 	 */
 	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg, B_FALSE);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		/*
 		 * Replay is only needed if the parse saw at least one
 		 * record or more than one block.
 		 */
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, B_FALSE, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain; ENOENT
  * from the walk is treated the same way and also reported as success.
  * Any other error (no device or read failure) returns an error.
  *
  * dp is unused.  tx must be NULL.  If the objset cannot be obtained from
  * the dataset a warning is logged and 0 is returned.
  */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	(void) dp;
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	/* The cast drops the const qualifier of the header; bp is only read. */
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		/*
 		 * Check the first block and determine if it's on a log device
 		 * which may have been removed or faulted prior to loading this
 		 * pool.  If so, there's no point in checking the rest of the
 		 * log as its content should have already been synced to the
 		 * pool.
 		 *
 		 * NOTE(review): vd is dereferenced without a NULL check,
 		 * whereas zil_lwb_write_done() checks the result of
 		 * vdev_lookup_top() for NULL -- confirm the lookup cannot
 		 * fail here.
 		 */
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 
 		/*
 		 * Check whether the current uberblock is checkpointed (e.g.
 		 * we are rewinding) and whether the current header has been
 		 * claimed or not. If it hasn't then skip verifying it. We
 		 * do this because its ZIL blocks may be part of the pool's
 		 * state before the rewind, which is no longer valid.
 		 */
 		zil_header_t *zh = zil_header_in_syncing_context(zilog);
 		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 		    zh->zh_claim_txg == 0)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 *
 	 * The txg passed to zil_parse() is -1ULL when the header already
 	 * carries a claim txg, and the pool's minimum claim txg otherwise.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL :
 	    spa_min_claim_txg(os->os_spa), B_FALSE);
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * When an itx is "skipped", this function is used to properly mark the
  * waiter as "done", and signal any thread(s) waiting on it. An itx can
  * be skipped (and not committed to an lwb) for a variety of reasons,
  * one of them being that the itx was committed via spa_sync(), prior to
  * it being committed to an lwb; this can happen if a thread calling
  * zil_commit() is racing with spa_sync().
  *
  * The waiter must not already be marked done.  The flag is set and the
  * condition variable broadcast while holding the waiter's own zcw_lock.
  */
 static void
 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when the given waiter is to be linked into an
  * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
  * At this point, the waiter will no longer be referenced by the itx,
  * and instead, will be referenced by the lwb.
  *
  * The waiter must not currently be on any list and must not yet point at
  * an lwb; the lwb must be in neither the NEW nor the FLUSH_DONE state.
  */
 static void
 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
 {
 	/*
 	 * The lwb_waiters field of the lwb is protected by the zilog's
 	 * zl_issuer_lock while the lwb is open and zl_lock otherwise.
 	 * zl_issuer_lock also protects leaving the open state.
 	 * zcw_lwb setting is protected by zl_issuer_lock and state !=
 	 * flush_done, which transition is protected by zl_lock.
 	 */
 	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
 	IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
 	    MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/* Append the waiter to the lwb and record the back-pointer. */
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(&lwb->lwb_waiters, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	zcw->zcw_lwb = lwb;
 }
 
 /*
  * Used when zio_alloc_zil() could not allocate a ZIL block: the waiter
  * cannot be attached to an lwb, so it is appended to the caller-supplied
  * "nolwb waiters" list maintained by zil_process_commit_list() instead.
  * The waiter must be unlinked and must not reference any lwb.
  */
 static void
 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 {
 	/* Preconditions: not on a list, not bound to an lwb. */
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 
 	list_insert_tail(nolwb, zcw);
 }
 
 /*
  * Remember, in the lwb's lwb_vdev_tree, the vdev id of every DVA of 'bp'
  * that is not already present in the tree.  Nothing is recorded when
  * zil_nocacheflush is set.  The lwb must not yet have reached the
  * WRITE_DONE or FLUSH_DONE state.
  */
 void
 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &lwb->lwb_vdev_tree;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	if (zil_nocacheflush)
 		return;
 
 	mutex_enter(&lwb->lwb_vdev_lock);
 	for (int d = 0; d < ndvas; d++) {
 		zil_vdev_node_t key, *node;
 		avl_index_t where;
 
 		key.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
 
 		/* Already tracked: no second node for the same vdev. */
 		if (avl_find(tree, &key, &where) != NULL)
 			continue;
 
 		node = kmem_alloc(sizeof (*node), KM_SLEEP);
 		node->zv_vdev = key.zv_vdev;
 		avl_insert(tree, node, where);
 	}
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
 /*
  * Hand the vdev nodes of 'lwb' over to 'nlwb'.  On return the vdev tree
  * of 'lwb' has been torn down; every node whose vdev was not yet present
  * in 'nlwb' now lives in nlwb's tree and every duplicate has been freed.
  */
 static void
 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 {
 	avl_tree_t *from = &lwb->lwb_vdev_tree;
 	avl_tree_t *to = &nlwb->lwb_vdev_tree;
 	zil_vdev_node_t *node;
 	void *cookie = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/*
 	 * 'lwb' is at a point in its lifetime where lwb_vdev_tree no longer
 	 * needs the protection of lwb_vdev_lock (it will only be modified
 	 * while holding zilog->zl_lock), because its writes and those of
 	 * its children have all completed.  The younger 'nlwb', however,
 	 * may still be waiting on future writes to additional vdevs, so
 	 * its lock is taken.
 	 */
 	mutex_enter(&nlwb->lwb_vdev_lock);
 	for (;;) {
 		avl_index_t where;
 
 		node = avl_destroy_nodes(from, &cookie);
 		if (node == NULL)
 			break;
 
 		/* A node for this vdev already exists in the target. */
 		if (avl_find(to, node, &where) != NULL) {
 			kmem_free(node, sizeof (*node));
 			continue;
 		}
 
 		avl_insert(to, node, where);
 	}
 	mutex_exit(&nlwb->lwb_vdev_lock);
 }
 
 /*
  * Raise the lwb's lwb_max_txg to 'txg' if 'txg' is larger than the value
  * currently recorded; otherwise leave it alone.
  */
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
 	if (txg > lwb->lwb_max_txg)
 		lwb->lwb_max_txg = txg;
 }
 
 /*
  * This function is called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
  * as the lwb write completes, if "zil_nocacheflush" is set. Further,
  * all "previous" lwb's will have completed before this function is
  * called; i.e. this function is called for all previous lwbs before
  * it's called for "this" lwb (enforced via the zio dependencies
  * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
  * any sudden loss of power. At this point, any threads waiting for the
  * lwb to reach this state are signalled, and the "waiter" structures
  * are marked "done".
  *
  * zio is the lwb's root zio; its io_private is the lwb and its io_error
  * is what every waiter receives in zcw_zio_error.
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
 
 	/* Release the SCL_STATE hold that was taken with the lwb as tag. */
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	/* Time elapsed since the lwb was issued. */
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
 	/* Weighted average: 7/8 of the old value plus 1/8 of this sample. */
 	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
 		 * for ztest. We only update this value when all the log
 		 * writes succeeded, because ztest wants to ASSERT that
 		 * it got the whole log chain.
 		 */
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
 	/* The itxs attached to this lwb are no longer needed. */
 	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
 
 	/* Detach, complete and wake every waiter attached to this lwb. */
 	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
 		 * propagated "up" to this specific LWB's root ZIO, in
 		 * order for this error handling to work correctly. This
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
 		 *
 		 * With that said, it's important to note that LWB flush
 		 * errors are not propagated up to the LWB root ZIO.
 		 * This is incorrect behavior, and results in VDEV flush
 		 * errors not being handled correctly here. See the
 		 * comment above the call to "zio_flush" for details.
 		 */
 
 		zcw->zcw_zio_error = zio->io_error;
 
 		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 		zcw->zcw_done = B_TRUE;
 		cv_broadcast(&zcw->zcw_cv);
 
 		mutex_exit(&zcw->zcw_lock);
 	}
 
 	/* Copy the txg out of the lwb while zl_lock is still held. */
 	uint64_t txg = lwb->lwb_issued_txg;
 
 	/* Once we drop the lock, lwb may be freed by zil_sync(). */
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * Account for this lwb no longer being in flight in its txg and
 	 * wake anyone waiting for the count to reach zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
 		cv_broadcast(&zilog->zl_lwb_io_cv);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 }
 
 /*
  * Block until no lwb write/flush issued in the given txg is in flight any
  * more, i.e. until zil_lwb_flush_vdevs_done() has been called and has
  * returned for each of them.  'txg' must be the currently syncing txg.
  */
 static void
 zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 {
 	ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
 		cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Debug builds only: verify the state of every lwb on the list
 	 * now that nothing is in flight for this txg.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	for (lwb_t *lwb = list_head(&zilog->zl_lwb_list); lwb != NULL;
 	    lwb = list_next(&zilog->zl_lwb_list, lwb)) {
 		if (lwb->lwb_issued_txg <= txg) {
 			ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
 			ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
 			IMPLY(lwb->lwb_issued_txg > 0,
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 		}
 		IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 		    lwb->lwb_state == LWB_STATE_FLUSH_DONE,
 		    lwb->lwb_buf == NULL);
 	}
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lock);
 #endif
 }
 
 /*
  * This is called when an lwb's write zio completes. The callback's
  * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
  * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
  * in writing out this specific lwb's data, and in the case that cache
  * flushes have been deferred, vdevs involved in writing the data for
  * previous lwbs. The writes corresponding to all the vdevs in the
  * lwb_vdev_tree will have completed by the time this is called, due to
  * the zio dependencies configured in zil_lwb_set_zio_dependency(),
  * which takes deferred flushes into account. The lwb will be "done"
  * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
  * completion callback for the lwb's root zio.
  *
  * zio is the lwb's write zio; its io_private is the lwb.  The zio's abd
  * and the lwb's data buffer are released here.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	/* The data has been written (or has failed); drop the buffers. */
 	abd_free(zio->io_abd);
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	lwb->lwb_buf = NULL;
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 
 	/*
 	 * If nlwb is not yet issued, zil_lwb_set_zio_dependency() has not
 	 * been called for it yet, and when it is, it won't be able to make
 	 * its write ZIO a parent of this ZIO.  In such a case we cannot
 	 * defer our flushes, or there may be a race between the done
 	 * callbacks below.
 	 */
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
 		nlwb = NULL;
 	mutex_exit(&zilog->zl_lock);
 
 	/* No vdevs were recorded for this lwb: nothing to flush. */
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/*
 	 * If there was an IO error, we're not going to call zio_flush()
 	 * on these vdevs, so we simply empty the tree and free the
 	 * nodes. We avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 *
 	 * Additionally, we don't perform any further error handling at
 	 * this point (e.g. setting "zcw_zio_error" appropriately), as
 	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
 	 */
 	if (zio->io_error != 0) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
 	}
 
 	/*
 	 * If this lwb does not have any threads waiting for it to
 	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
 	 * command to the vdevs written to by "this" lwb, and instead
 	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
 	 * command for those vdevs. Thus, we merge the vdev tree of
 	 * "this" lwb with the vdev tree of the "next" lwb in the list,
 	 * and assume the "next" lwb will handle flushing the vdevs (or
 	 * deferring the flush(s) again).
 	 *
 	 * This is a useful performance optimization, especially for
 	 * workloads with lots of async write activity and few sync
 	 * write and/or fsync activity, as it has the potential to
 	 * coalesce multiple flush commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
 	}
 
 	/*
 	 * Flush every recorded vdev that can still be looked up and does
 	 * not have vdev_nowritecache set, as a child of the lwb's root
 	 * zio, freeing the tree nodes as we go.
 	 */
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
 		if (vd != NULL && !vd->vdev_nowritecache) {
 			/*
 			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
 			 * always used within "zio_flush". This means,
 			 * any errors when flushing the vdev(s), will
 			 * (unfortunately) not be handled correctly,
 			 * since these "zio_flush" errors will not be
 			 * propagated up to "zil_lwb_flush_vdevs_done".
 			 */
 			zio_flush(lwb->lwb_root_zio, vd);
 		}
 		kmem_free(zv, sizeof (*zv));
 	}
 }
 
 /*
  * Chain this lwb's zios behind those of the lwb that precedes it on the
  * zilog's lwb list, so that lwbs complete in the order the ZIL requires.
  * The new lwb's zio becomes the parent of the previous lwb's zio and can
  * therefore not complete before it.
  *
  * zil_commit() depends on this: commit waiters are woken from the lwb
  * zio completion callback, so the dependency graph guarantees they are
  * woken in the order in which the lwbs were created.
  *
  * Must be called with zl_lock held.
  */
 static void
 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *older;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 
 	older = list_prev(&zilog->zl_lwb_list, lwb);
 
 	/* No predecessor, or one that is fully done: nothing to order. */
 	if (older == NULL)
 		return;
 	if (older->lwb_state == LWB_STATE_FLUSH_DONE)
 		return;
 
 	/*
 	 * Besides the root zios (ordered unconditionally below), the write
 	 * zios must be ordered too while the predecessor's write is still
 	 * outstanding.  The reason is flush deferral: a predecessor that
 	 * defers its DKIOCFLUSHWRITECACHE commands relies on this lwb to
 	 * flush the vdevs the predecessor wrote to, and those flushes are
 	 * issued from this lwb's write completion handler,
 	 * zil_lwb_write_done().  Without the parent/child link this lwb's
 	 * write could finish first and the vdevs would be flushed before
 	 * the predecessor's data had reached them.
 	 */
 	if (older->lwb_state != LWB_STATE_ISSUED) {
 		ASSERT3S(older->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	} else {
 		ASSERT3P(older->lwb_write_zio, !=, NULL);
 		zio_add_child(lwb->lwb_write_zio, older->lwb_write_zio);
 	}
 
 	ASSERT3P(older->lwb_root_zio, !=, NULL);
 	zio_add_child(lwb->lwb_root_zio, older->lwb_root_zio);
 }
 
 
 /*
  * "Open" an lwb so that itxs can be committed to it.  A NEW lwb is moved
  * to the OPENED state and recorded as the zilog's last opened lwb, both
  * under zl_lock.  Calling this on an lwb that is already OPENED changes
  * nothing, which makes the function idempotent.  The caller must hold
  * zl_issuer_lock.
  */
 static void
 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (lwb->lwb_state == LWB_STATE_NEW) {
 		mutex_enter(&zilog->zl_lock);
 		lwb->lwb_state = LWB_STATE_OPENED;
 		zilog->zl_last_lwb_opened = lwb;
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 
 	/* Anything that is not new must already be open. */
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 }
 
 /*
  * Define a limited set of intent log block sizes.
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
  * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  *
  * Each entry maps an upper bound on the space needed ("limit") to the
  * block size to suggest ("blksz"); zil_lwb_write_close() picks the first
  * entry whose limit is not exceeded.  The final entry's limit of
  * UINT64_MAX terminates that search.
  *
  * NOTE(review): the "128KB writes" entry suggests a block smaller than
  * its limit (64KB + 4KB) -- presumably so that such a write is spread
  * over two blocks; confirm the intent.
  */
 static const struct {
 	uint64_t	limit;
 	uint64_t	blksz;
 } zil_block_buckets[] = {
 	{ 4096,		4096 },			/* non TX_WRITE */
 	{ 8192 + 4096,	8192 + 4096 },		/* database */
 	{ 32768 + 4096,	32768 + 4096 },		/* NFS writes */
 	{ 65536 + 4096,	65536 + 4096 },		/* 64KB writes */
 	{ 131072,	131072 },		/* < 128KB writes */
 	{ 131072 +4096,	65536 + 4096 },		/* 128KB writes */
 	{ UINT64_MAX,	SPA_OLD_MAXBLOCKSIZE},	/* > 128KB writes */
 };
 
 /*
  * Maximum block size used by the ZIL.  This is picked up when the ZIL is
  * initialized.  Otherwise this should not be used directly; see
  * zl_max_block_size instead.  Defaults to SPA_OLD_MAXBLOCKSIZE.
  */
 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * Close the given (opened) log block so it can be issued, and allocate
  * the lwb that will follow it, created in the requested 'state'.
  * Must be called with zl_issuer_lock held so that more lwbs can be
  * chained.
  *
  * Returns the newly allocated lwb, or NULL when the closed lwb already
  * carries an error.
  */
 static lwb_t *
 zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
 {
 	uint64_t blksz;
 	int bucket, slot;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
 	 * A NULL return after an allocation failure makes the caller run
 	 * zil_commit_writer_stall().  The check is inherently racy, as the
 	 * allocation may not have happened yet.
 	 */
 	if (lwb->lwb_error != 0)
 		return (NULL);
 
 	/*
 	 * Log blocks are pre-allocated, so the size of the next block is
 	 * predicted from the space used in the last one:
 	 *
 	 * 1. Pick the smallest bucket, out of a limited set of sizes,
 	 *    that fits the used space plus the chain header.  Blocks
 	 *    allocated from the same metaslab are adjacent or close and
 	 *    therefore faster to write.
 	 * 2. Take the maximum of that suggestion and the recent history
 	 *    of suggestions.  This lessens the picket-fence effect of
 	 *    guessing wrong for a stream of, say, 2k, 64k, 2k, 64k
 	 *    requests.
 	 *
 	 * Only what is used gets written, but always allocating the
 	 * maximum block size could exhaust the available pool log space.
 	 */
 	blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
 	bucket = 0;
 	while (blksz > zil_block_buckets[bucket].limit)
 		bucket++;
 	blksz = MIN(zil_block_buckets[bucket].blksz,
 	    zilog->zl_max_block_size);
 
 	/* Record the suggestion, then widen it to the recent maximum. */
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = blksz;
 	for (slot = 0; slot < ZIL_PREV_BLKS; slot++)
 		blksz = MAX(blksz, zilog->zl_prev_blks[slot]);
 	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
 	    uint64_t, blksz,
 	    uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
 }
 
 /*
  * Finalize previously closed block and issue the write zio.
  *
  * The lwb's itxs are copied into its buffer and its root zio is created,
  * after which the lwb becomes READY.  If its block pointer has not been
  * allocated yet (and no error is recorded) the function returns and
  * leaves issuing to whichever thread later supplies the block pointer.
  * Otherwise the lwb is issued: its write zio (or a null zio carrying the
  * error) is created, the block pointer for the next lwb is allocated and
  * stored in this block's chain header, and the zios are started.  If the
  * next lwb was already READY when it received its block pointer, it is
  * issued from here as well, and so on down the list.
  */
 static void
 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	spa_t *spa = zilog->zl_spa;
 	zil_chain_t *zilc;
 	boolean_t slog;
 	zbookmark_phys_t zb;
 	zio_priority_t prio;
 	int error;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/* Actually fill the lwb with the data. */
 	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
 	    itx = list_next(&lwb->lwb_itxs, itx))
 		zil_lwb_commit(zilog, lwb, itx);
 	lwb->lwb_nused = lwb->lwb_nfilled;
+	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 
 	lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * The lwb is now ready to be issued, but it can be only if it already
 	 * got its block pointer allocated or the allocation has failed.
 	 * Otherwise leave it as-is, relying on some other thread to issue it
 	 * after allocating its block pointer via calling zil_lwb_write_issue()
 	 * for the previous lwb(s) in the chain.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_READY;
 	if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 next_lwb:
 	/* The chain header sits at the start of a slim block, else at nmax. */
 	if (lwb->lwb_slim)
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
 	int wsz = lwb->lwb_sz;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
 		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
 		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
 		if (lwb->lwb_slim) {
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
 			    int);
 			ASSERT3S(wsz, <=, lwb->lwb_sz);
 			zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		/* Zero the unused tail of what is going to be written. */
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 		zilc->zc_pad = 0;
 		zilc->zc_nused = lwb->lwb_nused;
 		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 	} else {
 		/*
 		 * We can't write the lwb if there was an allocation failure,
 		 * so create a null zio instead just to maintain dependencies.
 		 */
 		lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
 		    zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
 		lwb->lwb_write_zio->io_error = lwb->lwb_error;
 	}
 	if (lwb->lwb_child_zio)
 		zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
 
 	/*
 	 * Open transaction to allocate the next block pointer.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * Allocate the next block pointer unless we are already in error.
 	 *
 	 * slog is reset first: it is copied into nlwb below even when no
 	 * allocation is attempted, and must not be read uninitialized (or
 	 * carry a stale value from a previous pass through next_lwb).
 	 */
 	lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	blkptr_t *bp = &zilc->zc_next_blk;
 	BP_ZERO(bp);
 	slog = B_FALSE;
 	error = lwb->lwb_error;
 	if (error == 0) {
 		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
 		    &slog);
 	}
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		/* The next block continues the chain with the next sequence. */
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 	}
 
 	/*
 	 * Reduce TXG open time by incrementing inflight counter and committing
 	 * the transaction.  zil_sync() will wait for it to return to zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb->lwb_issued_txg = txg;
 	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
 	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	dmu_tx_commit(tx);
 
 	/* Held with the lwb as tag until zil_lwb_flush_vdevs_done() runs. */
 	spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
 
 	/*
 	 * We've completed all potentially blocking operations.  Update the
 	 * nlwb and allow it proceed without possible lock order reversals.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	zil_lwb_set_zio_dependency(zilog, lwb);
 	lwb->lwb_state = LWB_STATE_ISSUED;
 
 	if (nlwb) {
 		nlwb->lwb_blk = *bp;
 		nlwb->lwb_error = error;
 		nlwb->lwb_slog = slog;
 		nlwb->lwb_alloc_txg = txg;
 		if (nlwb->lwb_state != LWB_STATE_READY)
 			nlwb = NULL;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	if (lwb->lwb_slog) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	} else {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
 	lwb->lwb_issued_timestamp = gethrtime();
 	if (lwb->lwb_child_zio)
 		zio_nowait(lwb->lwb_child_zio);
 	zio_nowait(lwb->lwb_write_zio);
 	zio_nowait(lwb->lwb_root_zio);
 
 	/*
 	 * If nlwb was ready when we gave it the block pointer,
 	 * it is on us to issue it and possibly following ones.
 	 */
 	lwb = nlwb;
 	if (lwb)
 		goto next_lwb;
 }
 
 /*
  * Largest payload that fits into a single log block: the zilog's maximum
  * block size less the chain header and less a record header of 'hdrsize'
  * bytes.
  */
 uint64_t
 zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
 	uint64_t room = zilog->zl_max_block_size;
 
 	room -= sizeof (zil_chain_t);
 	room -= hdrsize;
 	return (room);
 }
 
 /*
  * Upper bound on the log space we are willing to leave unused in order
  * to cut down the number of WR_NEED_COPY chunks, and with it the
  * zl_get_data() overhead (~6%): one sixteenth of the maximum write
  * payload of a log block.
  */
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
 	uint64_t max_write_data;
 
 	max_write_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	return (max_write_data / 16);
 }
 
 /*
  * Upper limit on the amount of write data logged as WR_COPIED.  Every
  * WR_COPIED record has to fit into a single log block, so for correctness
  * consumers must fall back to WR_NEED_COPY whenever the whole record
  * would not fit into one maximum sized block.  Below that bound the value
  * trades an extra memory copy, and possibly worse log space efficiency,
  * against an extra range lock/unlock.
  */
 static uint_t zil_maxcopied = 7680;
 
 /*
  * Returns the smaller of zil_maxcopied and the maximum write payload of
  * one log block.
  */
 uint64_t
 zil_max_copied_data(zilog_t *zilog)
 {
 	uint64_t block_limit = zil_max_log_data(zilog, sizeof (lr_write_t));
 	uint64_t tunable_limit = zil_maxcopied;
 
 	if (block_limit < tunable_limit)
 		return (block_limit);
 	return (tunable_limit);
 }
 
 /*
  * Estimate space needed in the lwb for the itx.  Allocate more lwbs or
  * split the itx as needed, but don't touch the actual transaction data.
  * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
  * to chain more lwbs.
  *
  * Every lwb that gets closed along the way is appended to ilwbs.  Returns
  * the lwb that received the (last piece of the) itx, or NULL when
  * zil_lwb_write_close() did not return a following lwb.
  */
 static lwb_t *
 zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 {
 	itx_t *citx;
 	lr_t *lr, *clr;
 	lr_write_t *lrw;
 	uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
 	 * provides a mechanism for attaching a "commit waiter" onto the
 	 * correct lwb (such that the waiter can be signalled upon
 	 * completion of that lwb). Thus, we don't process this itx's
 	 * log record if it's a commit itx (these itx's don't have log
 	 * records), and instead link the itx's waiter onto the lwb's
 	 * list of waiters.
 	 *
 	 * For more details, see the comment above zil_commit().
 	 */
 	if (lr->lrc_txtype == TX_COMMIT) {
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		list_insert_tail(&lwb->lwb_itxs, itx);
 		return (lwb);
 	}
 
+	reclen = lr->lrc_reclen;
 	if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+		ASSERT3U(reclen, ==, sizeof (lr_write_t));
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 	} else {
+		ASSERT3U(reclen, >=, sizeof (lr_t));
 		dlen = 0;
 	}
-	reclen = lr->lrc_reclen;
+	ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
 	zilog->zl_cur_used += (reclen + dlen);
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		list_insert_tail(ilwbs, lwb);
 		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
 		if (lwb == NULL)
 			return (NULL);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
-
-		/*
-		 * There must be enough space in the new, empty log block to
-		 * hold reclen.  For WR_COPIED, we need to fit the whole
-		 * record in one block, and reclen is the header size + the
-		 * data size. For WR_NEED_COPY, we can create multiple
-		 * records, splitting the data into multiple blocks, so we
-		 * only need to fit one word of data per block; in this case
-		 * reclen is just the header size (no data).
-		 */
-		ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 	}
 
+	/*
+	 * There must be enough space in the log block to hold reclen.
+	 * For WR_COPIED, we need to fit the whole record in one block,
+	 * and reclen is the write record header size + the data size.
+	 * For WR_NEED_COPY, we can create multiple records, splitting
+	 * the data into multiple blocks, so we only need to fit one
+	 * word of data per block; in this case reclen is just the header
+	 * size (no data).
+	 */
+	ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+
 	/* Take as much of the data as fits behind the record header. */
 	dnow = MIN(dlen, lwb_sp - reclen);
 	if (dlen > dnow) {
 		ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
 		ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
 		/*
 		 * Only part of the data fits: queue a clone describing the
 		 * first dnow bytes and advance the original itx past them.
 		 */
 		citx = zil_itx_clone(itx);
 		clr = &citx->itx_lr;
 		lr_write_t *clrw = (lr_write_t *)clr;
 		clrw->lr_length = dnow;
 		lrw->lr_offset += dnow;
 		lrw->lr_length -= dnow;
 	} else {
 		citx = itx;
 		clr = lr;
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	clr->lrc_seq = ++zilog->zl_lr_seq;
 
 	lwb->lwb_nused += reclen + dnow;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	zil_lwb_add_txg(lwb, lr->lrc_txg);
 	list_insert_tail(&lwb->lwb_itxs, citx);
 
 	/* Go around again for whatever part of the data is still unassigned. */
 	dlen -= dnow;
 	if (dlen > 0) {
 		zilog->zl_cur_used += reclen;
 		goto cont;
 	}
 
 	/* Writes in a txg past the pool's freeze txg wait for that txg. */
 	if (lr->lrc_txtype == TX_WRITE &&
 	    lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
 		txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
 
 	return (lwb);
 }
 
 /*
  * Fill the actual transaction data into the lwb, following zil_lwb_assign().
  * Does not require locking.
  *
  * The log record is copied into lwb_buf at offset lwb_nfilled.  For
  * TX_WRITE records that are not WR_COPIED, zl_get_data() is then called:
  * with a data buffer placed right after the record for WR_NEED_COPY, and
  * with no buffer for WR_INDIRECT.  TX_COMMIT itxs are ignored.
  */
 static void
 zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
 {
 	lr_t *lr, *lrb;
 	lr_write_t *lrw, *lrwb;
 	char *lr_buf;
 	uint64_t dlen, reclen;
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return;
 
 	/* dlen is the 8-byte-rounded data size that follows the record. */
 	if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 	} else {
 		dlen = 0;
 	}
 	reclen = lr->lrc_reclen;
 	ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
 
 	lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
 	memcpy(lr_buf, lr, reclen);
 	lrb = (lr_t *)lr_buf;		/* Like lr, but inside lwb. */
 	lrwb = (lr_write_t *)lrb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zilog, zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lr->lrc_txtype == TX_WRITE) {
 		if (itx->itx_wr_state == WR_COPIED) {
 			ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
 			ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
 			    lrw->lr_length);
 		} else {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				/* Data lands directly behind the record. */
 				dbuf = lr_buf + reclen;
 				lrb->lrc_reclen += dlen;
 				ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
 				    dlen);
 			} else {
 				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				dbuf = NULL;
 				ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
 				ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
 				    lrw->lr_length);
 				if (lwb->lwb_child_zio == NULL) {
 					lwb->lwb_child_zio = zio_root(
 					    zilog->zl_spa, NULL, NULL,
 					    ZIO_FLAG_CANFAIL);
 				}
 			}
 
 			/*
 			 * The "lwb_child_zio" we pass in will become a child of
 			 * "lwb_write_zio", when one is created, so one will be
 			 * a parent of any zio's created by the "zl_get_data".
 			 * This way "lwb_write_zio" will first wait for children
 			 * block pointers before own writing, and then for their
 			 * writing completion before the vdev cache flushing.
 			 */
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_child_zio);
 			if (dbuf != NULL && error == 0) {
 				/* Zero any padding bytes in the last block. */
 				memset((char *)dbuf + lrwb->lr_length, 0,
 				    dlen - lrwb->lr_length);
 			}
 
 			/*
 			 * Typically, the only return values we should see from
 			 * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
 			 *  EALREADY. However, it is also possible to see other
 			 *  error values such as ENOSPC or EINVAL from
 			 *  dmu_read() -> dnode_hold() -> dnode_hold_impl() or
 			 *  ENXIO as well as a multitude of others from the
 			 *  block layer through dmu_buf_hold() -> dbuf_read()
 			 *  -> zio_wait(), as well as through dmu_read() ->
 			 *  dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
 			 *  zio_wait(). When these errors happen, we can assume
 			 *  that neither an immediate write nor an indirect
 			 *  write occurred, so we need to fall back to
 			 *  txg_wait_synced(). This is unusual, so we print to
 			 *  dmesg whenever one of these errors occurs.
 			 */
 			switch (error) {
 			case 0:
 				break;
 			default:
 				cmn_err(CE_WARN, "zil_lwb_commit() received "
 				    "unexpected error %d from ->zl_get_data()"
 				    ". Falling back to txg_wait_synced().",
 				    error);
 				zfs_fallthrough;
 			case EIO:
 				txg_wait_synced(zilog->zl_dmu_pool,
 				    lr->lrc_txg);
 				zfs_fallthrough;
 			case ENOENT:
 				zfs_fallthrough;
 			case EEXIST:
 				zfs_fallthrough;
 			case EALREADY:
 				/*
 				 * Every non-zero error ends up here; the
 				 * lwb_nfilled update below is skipped.
 				 */
 				return;
 			}
 		}
 	}
 
 	lwb->lwb_nfilled += reclen + dlen;
 	ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
 	ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
 }
 
 /*
  * Allocate an itx for a log record of type txtype.  The record size
  * olrsize is rounded up to a multiple of sizeof (uint64_t) and stored in
  * lrc_reclen; only the rounding pad behind the first olrsize bytes is
  * zeroed here.  The new itx is marked synchronous and has no callback.
  */
 itx_t *
 zil_itx_create(uint64_t txtype, size_t olrsize)
 {
 	size_t itxsize, lrsize;
 	itx_t *itx;
 
+	ASSERT3U(olrsize, >=, sizeof (lr_t));
 	lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
+	ASSERT3U(lrsize, >=, olrsize);
 	/* The record is the tail of the itx, so size the buffer to fit it. */
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;
 
 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	itx->itx_size = itxsize;
 
 	return (itx);
 }
 
 /*
  * Return a newly allocated byte-for-byte copy of oitx (itx_size bytes),
  * except that the copy's callback and callback data are cleared.
  */
 static itx_t *
 zil_itx_clone(itx_t *oitx)
 {
+	ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
+	ASSERT3U(oitx->itx_size, ==,
+	    offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
+
 	itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
 	memcpy(itx, oitx, oitx->itx_size);
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	return (itx);
 }
 
 /*
  * Run the itx's callback, if it has one, and then free the itx buffer.
  * Commit itxs are expected to have no callback.
  */
 void
 zil_itx_destroy(itx_t *itx)
 {
+	ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
+	ASSERT3U(itx->itx_lr.lrc_reclen, ==,
+	    itx->itx_size - offsetof(itx_t, itx_lr));
 	IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
 	IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 
 	if (itx->itx_callback != NULL)
 		itx->itx_callback(itx->itx_callback_data);
 
 	zio_data_buf_free(itx, itx->itx_size);
 }
 
 /*
  * Release every itx, sync and async, held by an itxs_t and then the
  * itxs_t itself.  The itxs_t has already been detached, so no locks
  * are needed.
  */
 static void
 zil_itxg_clean(void *arg)
 {
 	itxs_t *itxs = arg;
 	list_t *sync_list = &itxs->i_sync_list;
 	avl_tree_t *async_tree = &itxs->i_async_tree;
 	itx_async_node_t *node;
 	void *cookie = NULL;
 	itx_t *itx;
 
 	for (;;) {
 		itx = list_remove_head(sync_list);
 		if (itx == NULL)
 			break;
 
 		/*
 		 * Commit itxs normally do not show up here, because they
 		 * get committed to an lwb via zil_lwb_assign() and are
 		 * free'd in that function.  They still can, through the
 		 * following race:
 		 *
 		 *	- a thread calls zil_commit(), which assigns the
 		 *	  commit itx to a per-txg i_sync_list
 		 *	- zil_itxg_clean() runs (e.g. via spa_sync())
 		 *	  while the waiter is still on the i_sync_list
 		 *
 		 * Nothing prevents the txg from syncing while the waiter
 		 * is on the i_sync_list.  Usually spa_sync() is slower
 		 * than zil_commit(), but when zil_commit() calls
 		 * txg_wait_synced() (e.g. because zil_create() or
 		 * zil_commit_writer_stall() is called) this case is hit.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
 		zil_itx_destroy(itx);
 	}
 
 	for (;;) {
 		node = avl_destroy_nodes(async_tree, &cookie);
 		if (node == NULL)
 			break;
 
 		list_t *async_list = &node->ia_list;
 
 		for (;;) {
 			itx = list_remove_head(async_list);
 			if (itx == NULL)
 				break;
 
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(async_list);
 		kmem_free(node, sizeof (itx_async_node_t));
 	}
 	avl_destroy(async_tree);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (TREE_CMP(o1, o2));
 }
 
 /*
  * Remove all async itx with the given oid.
  *
  * For each candidate txg the matching itx list is moved off its tree node
  * under that txg's itxg_lock (the node itself stays in the tree); the
  * collected itxs are destroyed only after the locks have been dropped.
  */
 void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t otxg, txg;
-	itx_async_node_t *ian;
+	itx_async_node_t *ian, ian_search;
 	avl_tree_t *t;
 	avl_index_t where;
 	list_t clean_list;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		/* Skip slots that do not currently hold this txg. */
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * Locate the object node and append its list.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
-		ian = avl_find(t, &oid, &where);
+		ian_search.ia_foid = oid;
+		ian = avl_find(t, &ian_search, &where);
 		if (ian != NULL)
 			list_move_tail(&clean_list, &ian->ia_list);
 		mutex_exit(&itxg->itxg_lock);
 	}
 	while ((itx = list_remove_head(&clean_list)) != NULL) {
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&clean_list);
 }
 
 /*
  * Queue an itx on the in-memory itx lists of the txg that tx belongs to
  * (or of ZILTEST_TXG while the pool is frozen).  Synchronous itxs go on
  * the txg's sync list; all others go on the per-object list kept in the
  * txg's async tree, creating the tree node on first use.  The itx's
  * lrc_txg is set to the tx's txg and the zilog is dirtied in that txg.
  */
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", (u_longlong_t)itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
 		    KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian, ian_search;
 		avl_index_t where;
 
 		/*
 		 * zil_aitx_compare() treats both of its arguments as
 		 * itx_async_node_t, so look up with a node-typed key rather
 		 * than a pointer to a bare uint64_t.  Only ia_foid is read
 		 * by the comparison.
 		 */
 		ian_search.ia_foid = foid;
 		ian = avl_find(t, &ian_search, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t),
 			    KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
 	 * need to be careful to always dirty the ZIL using the "real"
 	 * TXG (not itxg_txg) even when the SPA is frozen.
 	 */
 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced, hand them to a taskq to be freed.  This should only be done
  * after the uberblocks have been written out (i.e. the txg has been
  * committed), so that we don't inadvertently clean out in-memory log
  * records that would be required by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *stale;
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
 	mutex_enter(&itxg->itxg_lock);
 	stale = itxg->itxg_itxs;
 	if (stale == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		/* Nothing to clean, or the slot belongs to ziltest. */
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT3U(itxg->itxg_txg, !=, 0);
 
 	/* Detach the itxs from the slot before dropping the lock. */
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 
 	/*
 	 * Preferably the old itxs are freed from a task queue, but if
 	 * taskq_dispatch can't allocate resources to do that they are
 	 * freed in-line.  This should be rare.  Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    zil_itxg_clean, stale, TQ_NOSLEEP) == TASKQID_INVALID)
 		zil_itxg_clean(stale);
 }
 
 /*
  * Walk the queued itxs that need to be committed and move them onto the
  * ZIL's zl_itx_commit_list.  Returns the highest txg whose sync itxs were
  * left in place because the ZIL is suspended, or 0 if there is none.
  */
 static uint64_t
 zil_get_commit_list(zilog_t *zilog)
 {
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 	uint64_t otxg, end_txg, txg;
 	uint64_t wtxg = 0;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/* A frozen pool (ziltest support) keeps everything in ZILTEST_TXG. */
 	otxg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 	end_txg = otxg + TXG_CONCURRENT_STATES;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = otxg; txg < end_txg; txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			/*
 			 * Itxs placed on the zl_itx_commit_list imply the
 			 * zil is dirty in this "txg".  Holding the
 			 * itxg_lock keeps spa_sync from cleaning it, so
 			 * that can be asserted here.  Once the itxs are on
 			 * the zl_itx_commit_list they must be committed to
 			 * disk even if that is unnecessary (i.e. the txg
 			 * was synced).
 			 */
 			ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 			    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 
 			list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 
 			if (unlikely(zilog->zl_suspend > 0)) {
 				/*
 				 * ZIL was just suspended, but we lost the
 				 * race.  Earlier itxs may still be
 				 * committed; for new ones the caller is
 				 * asked to do txg_wait_synced(txg).
 				 */
 				if (!list_is_empty(sync_list) && wtxg < txg)
 					wtxg = txg;
 			} else {
 				list_move_tail(commit_list, sync_list);
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 	return (wtxg);
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  *
  * With a non-zero foid only that object's list is moved and its tree node
  * is left in place.  With foid == 0 every list in the async tree is moved
  * and the tree nodes are freed.
  */
 void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t otxg, txg;
-	itx_async_node_t *ian;
+	itx_async_node_t *ian, ian_search;
 	avl_tree_t *t;
 	avl_index_t where;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		/* Skip slots that do not currently hold this txg. */
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If a foid is specified then find that node and append its
 		 * list. Otherwise walk the tree appending all the lists
 		 * to the sync list. We add to the end rather than the
 		 * beginning to ensure the create has happened.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
 		if (foid != 0) {
-			ian = avl_find(t, &foid, &where);
+			ian_search.ia_foid = foid;
+			ian = avl_find(t, &ian_search, &where);
 			if (ian != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 			}
 		} else {
 			void *cookie = NULL;
 
 			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 				list_destroy(&ian->ia_list);
 				kmem_free(ian, sizeof (itx_async_node_t));
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * Prune the commit itxs sitting at the head of the commit list, stopping
  * at the first non-commit itx.  Each pruned itx's waiter is either
  * a) attached to the last lwb that's still pending completion, or
  * b) skipped altogether.
  *
  * This is a performance optimization: it keeps commit itxs from
  * generating new lwbs when that is unnecessary.
  */
 static void
 zil_prune_commit_list(zilog_t *zilog)
 {
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	for (itx = list_head(commit_list);
 	    itx != NULL && itx->itx_lr.lrc_txtype == TX_COMMIT;
 	    itx = list_head(commit_list)) {
 		lwb_t *last_lwb;
 
 		mutex_enter(&zilog->zl_lock);
 
 		last_lwb = zilog->zl_last_lwb_opened;
 		if (last_lwb != NULL &&
 		    last_lwb->lwb_state != LWB_STATE_FLUSH_DONE) {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 		} else {
 			/*
 			 * Everything this waiter was waiting on must have
 			 * completed already (or there never was anything
 			 * for it to wait on), so it is safe to skip the
 			 * waiter and mark it done.
 			 */
 			zil_commit_waiter_skip(itx->itx_private);
 		}
 
 		mutex_exit(&zilog->zl_lock);
 
 		list_remove(commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 }
 
 /*
  * Stall the ZIL write pipeline after zio_alloc_zil() failed to allocate
  * the next lwb block on disk.
  *
  * txg_wait_synced() is called so that all of the lwbs in the zilog's
  * zl_lwb_list are synced and then freed (in zil_sync()); any subsequent
  * ZIL writer (i.e. a call to zil_process_commit_list()) then has to call
  * zil_create() and start a new ZIL chain.
  *
  * Because the allocation failed, the lwb that was previously issued has
  * no pointer to the "next" lwb on disk.  Were another ZIL writer thread
  * to allocate the "next" on-disk lwb, that block could be leaked in the
  * event of a crash, since the previous on-disk lwb would not point to it.
  *
  * The caller must hold the zilog's zl_issuer_lock, so that no new thread
  * enters zil_process_commit_list() until all lwb's in the zl_lwb_list
  * have been synced and freed (which the txg_wait_synced() call achieves).
  */
 static void
 zil_commit_writer_stall(zilog_t *zilog)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
  * created lwbs. Additionally, as a new lwb is created, the previous
  * lwb will be issued to the zio layer to be written to disk.
  *
  * zcw is the caller's commit waiter; processing stops early once that
  * waiter is done or linked to an lwb other than the current one.  Closed
  * lwbs are collected on ilwbs.  Must be called with zl_issuer_lock held.
  */
 static void
 zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_itxs;
 	list_t nolwb_waiters;
 	lwb_t *lwb, *plwb;
 	itx_t *itx;
 	boolean_t first = B_TRUE;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
 	if (list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
 		lwb = zil_create(zilog);
 	} else {
 		/*
 		 * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
 		 * have already been created (zl_lwb_list not empty).
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 		first = (lwb->lwb_state == LWB_STATE_NEW) &&
 		    ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
 		    plwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	}
 
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
 		ASSERT3U(txg, !=, 0);
 
 		if (lrc->lrc_txtype == TX_COMMIT) {
 			DTRACE_PROBE2(zil__process__commit__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		} else {
 			DTRACE_PROBE2(zil__process__normal__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
 		/*
 		 * If the txg of this itx has already been synced out, then
 		 * we don't need to commit this itx to an lwb. This is
 		 * because the data of this itx will have already been
 		 * written to the main pool. This is inherently racy, and
 		 * it's still ok to commit an itx whose txg has already
 		 * been synced; this will result in a write that's
 		 * unnecessary, but will do no harm.
 		 *
 		 * With that said, we always want to commit TX_COMMIT itxs
 		 * to an lwb, regardless of whether or not that itx's txg
 		 * has been synced out. We do this to ensure any OPENED lwb
 		 * will always have at least one zil_commit_waiter_t linked
 		 * to the lwb.
 		 *
 		 * As a counter-example, if we skipped TX_COMMIT itx's
 		 * whose txg had already been synced, the following
 		 * situation could occur if we happened to be racing with
 		 * spa_sync:
 		 *
 		 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
 		 *    itx's txg is 10 and the last synced txg is 9.
 		 * 2. spa_sync finishes syncing out txg 10.
 		 * 3. We move to the next itx in the list, it's a TX_COMMIT
 		 *    whose txg is 10, so we skip it rather than committing
 		 *    it to the lwb used in (1).
 		 *
 		 * If the itx that is skipped in (3) is the last TX_COMMIT
 		 * itx in the commit list, then it's possible for the lwb
 		 * used in (1) to remain in the OPENED state indefinitely.
 		 *
 		 * To prevent the above scenario from occurring, ensuring
 		 * that once an lwb is OPENED it will transition to ISSUED
 		 * and eventually DONE, we always commit TX_COMMIT itx's to
 		 * an lwb here, even if that itx's txg has already been
 		 * synced.
 		 *
 		 * Finally, if the pool is frozen, we _always_ commit the
 		 * itx.  The point of freezing the pool is to prevent data
 		 * from being written to the main pool via spa_sync, and
 		 * instead rely solely on the ZIL to persistently store the
 		 * data; i.e.  when the pool is frozen, the last synced txg
 		 * value can't be trusted.
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
 				if (lwb == NULL) {
 					list_insert_tail(&nolwb_itxs, itx);
 				} else if ((zcw->zcw_lwb != NULL &&
 				    zcw->zcw_lwb != lwb) || zcw->zcw_done) {
 					/*
 					 * Our lwb is done, leave the rest of
 					 * itx list to somebody else who cares.
 					 */
 					first = B_FALSE;
 					break;
 				}
 			} else {
 				if (lrc->lrc_txtype == TX_COMMIT) {
 					zil_commit_waiter_link_nolwb(
 					    itx->itx_private, &nolwb_waiters);
 				}
 				list_insert_tail(&nolwb_itxs, itx);
 			}
 		} else {
 			ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 	}
 
 	if (lwb == NULL) {
 		/*
 		 * This indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
 		 */
 		while ((lwb = list_remove_head(ilwbs)) != NULL)
 			zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
 		 * waiters as "done" here, since without an lwb, we
 		 * can't do this via zil_lwb_flush_vdevs_done() like
 		 * normal.
 		 *
 		 * The iterator gets its own name so that it does not
 		 * shadow the "zcw" parameter of this function.
 		 */
 		zil_commit_waiter_t *nzcw;
 		while ((nzcw = list_remove_head(&nolwb_waiters)) != NULL)
 			zil_commit_waiter_skip(nzcw);
 
 		/*
 		 * And finally, we have to destroy the itx's that
 		 * couldn't be committed to an lwb; this will also call
 		 * the itx's callback if one exists for the itx.
 		 */
 		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
 			zil_itx_destroy(itx);
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * At this point, the ZIL block pointed at by the "lwb"
 		 * variable is in "new" or "opened" state.
 		 *
 		 * If it's "new", then no itxs have been committed to it, so
 		 * there's no point in issuing its zio (i.e. it's "empty").
 		 *
 		 * If it's "opened", then it contains one or more itxs that
 		 * eventually need to be committed to stable storage. In
 		 * this case we intentionally do not issue the lwb's zio
 		 * to disk yet, and instead rely on one of the following
 		 * two mechanisms for issuing the zio:
 		 *
 		 * 1. Ideally, there will be more ZIL activity occurring on
 		 * the system, such that this function will be immediately
 		 * called again by different thread and this lwb will be
 		 * closed by zil_lwb_assign().  This way, the lwb will be
 		 * "full" when it is issued to disk, and we'll make use of
 		 * the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, zil_commit_waiter() will close it and issue
 		 * the zio.  If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
 		 * of ZIL activity occurring on the system at that time.
 		 *
 		 * We do this for a couple of reasons:
 		 *
 		 * 1. To try and reduce the number of IOPs needed to
 		 * write the same number of itxs. If an lwb has space
 		 * available in its buffer for more itxs, and more itxs
 		 * will be committed relatively soon (relative to the
 		 * latency of performing a write), then it's beneficial
 		 * to wait for these "next" itxs. This way, more itxs
 		 * can be committed to stable storage with fewer writes.
 		 *
 		 * 2. To try and use the largest lwb block size that the
 		 * incoming rate of itxs can support. Again, this is to
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 *
 		 * If we had no already running or open LWBs, it can be
 		 * the workload is single-threaded.  And if the ZIL write
 		 * latency is very small or if the LWB is almost full, it
 		 * may be cheaper to bypass the delay.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED && first) {
 			hrtime_t sleep = zilog->zl_last_lwb_latency *
 			    zfs_commit_timeout_pct / 100;
 			if (sleep < zil_min_commit_timeout ||
 			    lwb->lwb_nmax - lwb->lwb_nused <
 			    lwb->lwb_nmax / 8) {
 				list_insert_tail(ilwbs, lwb);
 				lwb = zil_lwb_write_close(zilog, lwb,
 				    LWB_STATE_NEW);
 				zilog->zl_cur_used = 0;
 				if (lwb == NULL) {
 					while ((lwb = list_remove_head(ilwbs))
 					    != NULL)
 						zil_lwb_write_issue(zilog, lwb);
 					zil_commit_writer_stall(zilog);
 				}
 			}
 		}
 	}
 }
 
 /*
  * Make sure the passed in commit waiter (and its associated commit itx)
  * is committed to an lwb.  If the waiter is not already committed to an
  * lwb, all itxs in the zilog's queue of itxs are processed.  The
  * assumption is that the waiter's commit itx will be found in the queue
  * just like the other non-commit itxs, so that once the entire queue
  * has been processed the waiter has been committed to an lwb.
  *
  * The lwb associated with the waiter is not guaranteed to have been
  * issued by the time this function completes.  If it has not been, we
  * rely on future calls to zil_commit_writer() to issue the lwb, or on
  * the timeout mechanism found in zil_commit_waiter().
  *
  * Returns the value reported by zil_get_commit_list(), or 0 when the
  * queue was not processed.
  */
 static uint64_t
 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	uint64_t wtxg = 0;
 	list_t ilwbs;
 	lwb_t *lwb;
 
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	/*
 	 * While we were waiting to acquire the "zl_issuer_lock", another
 	 * thread may have committed this waiter to an lwb.  In that case
 	 * none of the zilog's queue of itxs is processed and the lock is
 	 * dropped right away.
 	 *
 	 * On certain workloads and system configurations, the
 	 * "zl_issuer_lock" can become highly contended, and dropping it
 	 * immediately for an already processed waiter is an attempt to
 	 * reduce that contention.
 	 *
 	 * We've measured this optimization to reduce CPU spent
 	 * contending on this lock by up to 5%, using a system
 	 * with 32 CPUs, low latency storage (~50 usec writes),
 	 * and 1024 threads performing sync writes.
 	 */
 	if (zcw->zcw_lwb == NULL && !zcw->zcw_done) {
 		ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
 
 		wtxg = zil_get_commit_list(zilog);
 		zil_prune_commit_list(zilog);
 		zil_process_commit_list(zilog, zcw, &ilwbs);
 	}
 
 	mutex_exit(&zilog->zl_issuer_lock);
 
 	/* Issue the collected lwbs only after the lock has been dropped. */
 	for (lwb = list_remove_head(&ilwbs); lwb != NULL;
 	    lwb = list_remove_head(&ilwbs))
 		zil_lwb_write_issue(zilog, lwb);
 	list_destroy(&ilwbs);
 
 	return (wtxg);
 }
 
 /*
  * Called from zil_commit_waiter() after the waiter's timed wait has
  * expired, in order to close and issue the lwb the waiter is attached
  * to, provided that lwb is still in the OPENED state.
  *
  * The caller must hold "zcw_lock" and must not hold "zl_issuer_lock".
  * "zcw_lock" may be dropped and re-acquired while this function runs,
  * but it is held again on every return path. "zl_issuer_lock" is never
  * held on return.
  */
 static void
 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 
 	lwb_t *lwb = zcw->zcw_lwb;
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 
 	/*
 	 * If the lwb has already been issued by another thread, we can
 	 * immediately return since there's no work to be done (the
 	 * point of this function is to issue the lwb). Additionally, we
 	 * do this prior to acquiring the zl_issuer_lock, to avoid
 	 * acquiring it when it's not necessary to do so.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED)
 		return;
 
 	/*
 	 * In order to call zil_lwb_write_close() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
 	 * elsewhere.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 	mutex_enter(&zilog->zl_issuer_lock);
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * Since we just dropped and re-acquired the commit waiter's
 	 * lock, we have to re-check to see if the waiter was marked
 	 * "done" during that process. If the waiter was marked "done",
 	 * the "lwb" pointer is no longer valid (it can be free'd after
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
 	if (zcw->zcw_done) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
 	/*
 	 * We've already checked this above, but since we hadn't acquired
 	 * the zilog's zl_issuer_lock, we have to perform this check a
 	 * second time while holding the lock.
 	 *
 	 * We don't need to hold the zl_lock since the lwb cannot transition
 	 * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
 	 * _can_ transition from CLOSED to DONE, but it's OK to race with
 	 * that transition since we treat the lwb the same, whether it's in
 	 * the CLOSED, ISSUED or DONE states.
 	 *
 	 * The important thing, is we treat the lwb differently depending on
 	 * if it's OPENED or CLOSED, and block any other threads that might
 	 * attempt to close/issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
 	 * zil_lwb_write_close() if the lwb had already been closed/issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	/*
 	 * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
 	 * is still open.  But we have to drop it to avoid a deadlock in case
 	 * callback of zio issued by zil_lwb_write_issue() try to get it,
 	 * while zil_lwb_write_issue() is blocked on attempt to issue next
 	 * lwb it found in LWB_STATE_READY state.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
 	 * zil_process_commit_list(), we need to issue this lwb's zio
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
 	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/*
 	 * Since the lwb's zio hadn't been issued by the time this thread
 	 * reached its timeout, we reset the zilog's "zl_cur_used" field
 	 * to influence the zil block size selection algorithm.
 	 *
 	 * By having to issue the lwb's zio here, it means the size of the
 	 * lwb was too large, given the incoming throughput of itxs.  By
 	 * setting "zl_cur_used" to zero, we communicate this fact to the
 	 * block size selection algorithm, so it can take this information
 	 * into account, and potentially select a smaller size for the
 	 * next lwb block that is allocated.
 	 */
 	zilog->zl_cur_used = 0;
 
 	if (nlwb == NULL) {
 		/*
 		 * When zil_lwb_write_close() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
 		 *
 		 * In this case the lwb is issued and the stall is
 		 * performed while "zl_issuer_lock" is still held.
 		 */
 		zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 		mutex_exit(&zilog->zl_issuer_lock);
 	} else {
 		/*
 		 * A next lwb exists, so "zl_issuer_lock" is released
 		 * before this lwb is issued.
 		 */
 		mutex_exit(&zilog->zl_issuer_lock);
 		zil_lwb_write_issue(zilog, lwb);
 	}
 
 	/* Restore the caller's expectation that "zcw_lock" is held. */
 	mutex_enter(&zcw->zcw_lock);
 }
 
 /*
  * This function is responsible for performing the following two tasks:
  *
  * 1. its primary responsibility is to block until the given "commit
  *    waiter" is considered "done".
  *
  * 2. its secondary responsibility is to issue the zio for the lwb that
  *    the given "commit waiter" is waiting on, if this function has
  *    waited "long enough" and the lwb is still in the "open" state.
  *
  * Given a sufficient amount of itxs being generated and written using
  * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
  * function. If this does not occur, this secondary responsibility will
  * ensure the lwb is issued even if there is no other synchronous
  * activity on the system.
  *
  * For more details, see zil_process_commit_list(); more specifically,
  * the comment at the bottom of that function.
  */
 static void
 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * The timeout is scaled based on the lwb latency to avoid
 	 * significantly impacting the latency of each individual itx.
 	 * For more details, see the comment at the bottom of the
 	 * zil_process_commit_list() function.
 	 *
 	 * MAX() keeps the percentage used here from dropping below 1.
 	 * The wakeup time is computed once, as an absolute time, so
 	 * repeated timed waits in the loop below share one deadline.
 	 */
 	int pct = MAX(zfs_commit_timeout_pct, 1);
 	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
 	hrtime_t wakeup = gethrtime() + sleep;
 	boolean_t timedout = B_FALSE;
 
 	while (!zcw->zcw_done) {
 		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 
 		lwb_t *lwb = zcw->zcw_lwb;
 
 		/*
 		 * Usually, the waiter will have a non-NULL lwb field here,
 		 * but it's possible for it to be NULL as a result of
 		 * zil_commit() racing with spa_sync().
 		 *
 		 * When zil_clean() is called, it's possible for the itxg
 		 * list (which may be cleaned via a taskq) to contain
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
 		 * marked done until zil_commit_waiter_skip() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
 		 * "zcw" variable) to be found in this "in between" state;
 		 * where its "zcw_lwb" field is NULL, and it hasn't yet
 		 * been skipped, so its "zcw_done" field is still B_FALSE.
 		 */
 		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
 
 		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
 			/*
 			 * The timeout path is expected to be taken at
 			 * most once per call; after it has run, the lwb
 			 * should no longer be seen in the OPENED state.
 			 */
 			ASSERT3B(timedout, ==, B_FALSE);
 
 			/*
 			 * If the lwb hasn't been issued yet, then we
 			 * need to wait with a timeout, in case this
 			 * function needs to issue the lwb after the
 			 * timeout is reached; responsibility (2) from
 			 * the comment above this function.
 			 */
 			int rc = cv_timedwait_hires(&zcw->zcw_cv,
 			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
 			    CALLOUT_FLAG_ABSOLUTE);
 
 			/*
 			 * A return of -1 is treated as the timeout
 			 * having expired. Any other return, or the
 			 * waiter having been marked done, sends us back
 			 * to the top of the loop to re-evaluate.
 			 */
 			if (rc != -1 || zcw->zcw_done)
 				continue;
 
 			timedout = B_TRUE;
 			zil_commit_waiter_timeout(zilog, zcw);
 
 			if (!zcw->zcw_done) {
 				/*
 				 * If the commit waiter has already been
 				 * marked "done", it's possible for the
 				 * waiter's lwb structure to have already
 				 * been freed.  Thus, we can only reliably
 				 * make these assertions if the waiter
 				 * isn't done.
 				 */
 				ASSERT3P(lwb, ==, zcw->zcw_lwb);
 				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 			}
 		} else {
 			/*
 			 * If the lwb isn't open, then it must have already
 			 * been issued. In that case, there's no need to
 			 * use a timeout when waiting for the lwb to
 			 * complete.
 			 *
 			 * Additionally, if the lwb is NULL, the waiter
 			 * will soon be signaled and marked done via
 			 * zil_clean() and zil_itxg_clean(), so no timeout
 			 * is required.
 			 */
 
 			IMPLY(lwb != NULL,
 			    lwb->lwb_state == LWB_STATE_CLOSED ||
 			    lwb->lwb_state == LWB_STATE_READY ||
 			    lwb->lwb_state == LWB_STATE_ISSUED ||
 			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 		}
 	}
 
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * Allocate a commit waiter from zil_zcw_cache and place it in its
  * initial state: not linked to any lwb, not done, and with no zio
  * error recorded. The allocation uses KM_SLEEP.
  */
 static zil_commit_waiter_t *
 zil_alloc_commit_waiter(void)
 {
 	zil_commit_waiter_t *waiter;
 
 	waiter = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
 
 	/* Plain state fields. */
 	waiter->zcw_zio_error = 0;
 	waiter->zcw_done = B_FALSE;
 	waiter->zcw_lwb = NULL;
 	list_link_init(&waiter->zcw_node);
 
 	/* Synchronization primitives used while waiting on the waiter. */
 	mutex_init(&waiter->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&waiter->zcw_cv, NULL, CV_DEFAULT, NULL);
 
 	return (waiter);
 }
 
 /*
  * Destroy a commit waiter's lock and condition variable and return the
  * structure to zil_zcw_cache.
  *
  * The waiter is expected to be finished with: it must not be on any
  * list, must no longer reference an lwb, and must be marked done.
  */
 static void
 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
 	mutex_destroy(&zcw->zcw_lock);
 	cv_destroy(&zcw->zcw_cv);
 	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
  * Build a TX_COMMIT itx that carries the given waiter in its
  * "itx_private" field, and assign it to the zilog. Doing so links the
  * itx into the ZIL's list of synchronous itxs, where it will later be
  * committed to an lwb (or skipped) by zil_process_commit_list().
  */
 static void
 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	dmu_tx_t *tx;
 	itx_t *commit_itx;
 
 	tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * No new dirty data is going to be created here, and we may even
 	 * help clear the existing dirty data, so the dirty data based
 	 * delays should not apply to us. TXG_NOTHROTTLE is what bypasses
 	 * that delay mechanism.
 	 */
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 
 	commit_itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
 	commit_itx->itx_private = zcw;
 	commit_itx->itx_sync = B_TRUE;
 
 	zil_itx_assign(zilog, commit_itx, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Commit ZFS Intent Log transactions (itxs) to stable storage.
  *
  * When writing ZIL transactions to the on-disk representation of the
  * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
  * itxs can be committed to a single lwb. Once a lwb is written and
  * committed to stable storage (i.e. the lwb is written, and vdevs have
  * been flushed), each itx that was committed to that lwb is also
  * considered to be committed to stable storage.
  *
  * When an itx is committed to an lwb, the log record (lr_t) contained
  * by the itx is copied into the lwb's zio buffer, and once this buffer
  * is written to disk, it becomes an on-disk ZIL block.
  *
  * As itxs are generated, they're inserted into the ZIL's queue of
  * uncommitted itxs. The semantics of zil_commit() are such that it will
  * block until all itxs that were in the queue when it was called, are
  * committed to stable storage.
  *
  * If "foid" is zero, this means all "synchronous" and "asynchronous"
  * itxs, for all objects in the dataset, will be committed to stable
  * storage prior to zil_commit() returning. If "foid" is non-zero, all
  * "synchronous" itxs for all objects, but only "asynchronous" itxs
  * that correspond to the foid passed in, will be committed to stable
  * storage prior to zil_commit() returning.
  *
  * Generally speaking, when zil_commit() is called, the consumer doesn't
  * actually care about _all_ of the uncommitted itxs. Instead, they're
  * simply trying to wait for a specific itx to be committed to disk,
  * but the interface(s) for interacting with the ZIL don't allow such
  * fine-grained communication. A better interface would allow a consumer
  * to create and assign an itx, and then pass a reference to this itx to
  * zil_commit(); such that zil_commit() would return as soon as that
  * specific itx was committed to disk (instead of waiting for _all_
  * itxs to be committed).
  *
  * When a thread calls zil_commit() a special "commit itx" will be
  * generated, along with a corresponding "waiter" for this commit itx.
  * zil_commit() will wait on this waiter's CV, such that when the waiter
  * is marked done, and signaled, zil_commit() will return.
  *
  * This commit itx is inserted into the queue of uncommitted itxs. This
  * provides an easy mechanism for determining which itxs were in the
  * queue prior to zil_commit() having been called, and which itxs were
  * added after zil_commit() was called.
  *
  * The commit itx is special; it doesn't have any on-disk representation.
  * When a commit itx is "committed" to an lwb, the waiter associated
  * with it is linked onto the lwb's list of waiters. Then, when that lwb
  * completes, each waiter on the lwb's list is marked done and signaled
  * -- allowing the thread waiting on the waiter to return from zil_commit().
  *
  * It's important to point out a few critical factors that allow us
  * to make use of the commit itxs, commit waiters, per-lwb lists of
  * commit waiters, and zio completion callbacks like we're doing:
  *
  *   1. The list of waiters for each lwb is traversed, and each commit
  *      waiter is marked "done" and signaled, in the zio completion
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signaled in the zio completion
  *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
  *        that are sent to the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
  *      itxs are added to the queue, they are added to the tail of
  *      in-memory linked lists.
  *
  *      When committing the itxs to lwbs (to be written to disk), they
  *      are committed in the same order in which the itxs were added to
  *      the uncommitted queue's linked list(s); i.e. the linked list of
  *      itxs to commit is traversed from head to tail, and each itx is
  *      committed to an lwb in that order.
  *
  *      * To clarify:
  *
  *        - the order of "sync" itxs is preserved w.r.t. other
  *          "sync" itxs, regardless of the corresponding objects.
  *        - the order of "async" itxs is preserved w.r.t. other
  *          "async" itxs corresponding to the same object.
  *        - the order of "async" itxs is *not* preserved w.r.t. other
  *          "async" itxs corresponding to different objects.
  *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
  *          versa) is *not* preserved, even for itxs that correspond
  *          to the same object.
  *
  *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
  *      zil_get_commit_list(), and zil_process_commit_list().
  *
  *   3. The lwbs represent a linked list of blocks on disk. Thus, any
  *      lwb cannot be considered committed to stable storage, until its
  *      "previous" lwb is also committed to stable storage. This fact,
  *      coupled with the fact described above, means that itxs are
  *      committed in (roughly) the order in which they were generated.
  *      This is essential because itxs are dependent on prior itxs.
  *      Thus, we *must not* deem an itx as being committed to stable
  *      storage, until *all* prior itxs have also been committed to
  *      stable storage.
  *
  *      To enforce this ordering of lwb zio's, while still leveraging as
  *      much of the underlying storage performance as possible, we rely
  *      on two fundamental concepts:
  *
  *          1. The creation and issuance of lwb zio's is protected by
  *             the zilog's "zl_issuer_lock", which ensures only a single
  *             thread is creating and/or issuing lwb's at a time
  *          2. The "previous" lwb is a child of the "current" lwb
  *             (leveraging the zio parent-child dependency graph)
  *
  *      By relying on this parent-child zio relationship, we can have
  *      many lwb zio's concurrently issued to the underlying storage,
  *      but the order in which they complete will be the same order in
  *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	/*
 	 * zil_commit() must never be called on a snapshot, for two
 	 * reasons:
 	 *
 	 * 1. A snapshot may never be modified, so it cannot have any
 	 *    in-flight itxs that would have modified the dataset.
 	 *
 	 * 2. By design, calling zil_commit() assigns a commit itx to
 	 *    this zilog, which dirties the zilog. The zilog of a
 	 *    snapshot must not be dirtied; there are checks in the code
 	 *    that enforce this invariant and will panic if it is not
 	 *    upheld.
 	 */
 	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	/* Nothing to do when synchronous semantics are disabled. */
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	if (!spa_writeable(zilog->zl_spa)) {
 		/*
 		 * With a non-writable SPA there should never be any
 		 * pending itxs waiting to be committed to disk. Were
 		 * that not true, we would skip writing those itxs out
 		 * and break the semantics of zil_commit(); so verify
 		 * it before returning to the caller.
 		 */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 		for (int i = 0; i < TXG_SIZE; i++)
 			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
 	} else if (zilog->zl_suspend > 0) {
 		/*
 		 * While the ZIL is suspended we do not want to dirty it
 		 * through zil_commit_itx_assign(), nor can we write out
 		 * lwbs as zil_commit_writer() would. txg_wait_synced()
 		 * alone is relied upon to provide the required
 		 * semantics.
 		 */
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	} else {
 		zil_commit_impl(zilog, foid);
 	}
 }
 
 /*
  * Perform a commit without any of the checks zil_commit() makes first
  * (sync disabled, SPA not writable, ZIL suspended). Besides zil_commit()
  * this is also called directly by zil_suspend().
  */
 void
 zil_commit_impl(zilog_t *zilog, uint64_t foid)
 {
 	zil_commit_waiter_t *zcw;
 	uint64_t wtxg;
 
 	ZIL_STAT_BUMP(zilog, zil_commit_count);
 
 	/*
 	 * Move the "async" itxs for the given foid over to the "sync"
 	 * queues, so that zil_process_commit_list() will later commit
 	 * them to an lwb (or skip them).
 	 *
 	 * These "async" itxs have to be committed before this call to
 	 * zil_commit returns, which is why this must happen before
 	 * zil_commit_itx_assign() is called.
 	 */
 	zil_async_to_sync(zilog, foid);
 
 	/*
 	 * Allocate a new "waiter" structure; it starts out linked to
 	 * the commit itx through the itx's "itx_private" field. The
 	 * commit itx has no on-disk state, so when it is committed to
 	 * an lwb its lr_t is not copied into the lwb's buffer; instead
 	 * its "waiter" is added to the lwb's list of waiters. Once the
 	 * lwb is committed to stable storage, every waiter on that list
 	 * is marked "done" and signalled.
 	 *
 	 * The waiter must be created, and the commit itx assigned,
 	 * before zil_commit_writer() is called; otherwise our commit
 	 * itx is not guaranteed to have been committed to an lwb by the
 	 * time zil_commit_waiter() is called.
 	 */
 	zcw = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, zcw);
 
 	wtxg = zil_commit_writer(zilog, zcw);
 	zil_commit_waiter(zilog, zcw);
 
 	if (zcw->zcw_zio_error == 0) {
 		/* The ZIL write succeeded; only wait if a txg was given. */
 		if (wtxg != 0)
 			txg_wait_synced(zilog->zl_dmu_pool, wtxg);
 	} else {
 		/*
 		 * Writing out the ZIL blocks this thread is waiting on
 		 * failed, so fall back to relying on spa_sync() to write
 		 * out the data. This obviously has performance
 		 * implications, but it is expected to be an exceptional
 		 * case that does not occur often.
 		 */
 		DTRACE_PROBE2(zil__commit__io__error,
 		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	}
 
 	zil_free_commit_waiter(zcw);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  *
  * Only the first sync pass of a txg does any work here; later passes
  * return immediately. All header and lwb-list changes are made while
  * holding "zl_lock".
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	/* Slot of zl_replayed_seq[] that belongs to this txg. */
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	zil_lwb_flush_wait_all(zilog, txg);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	/*
 	 * Move a pending replay sequence number for this txg into the
 	 * header, then clear the slot so it is not applied again.
 	 */
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		/* Save the log block pointer before the header is zeroed. */
 		blkptr_t blk = zh->zh_log;
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 		memset(zh, 0, sizeof (zil_header_t));
 		memset(zilog->zl_replayed_seq, 0,
 		    sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		} else {
 			/*
 			 * A destroyed ZIL chain can't contain any TX_SETSAXATTR
 			 * records. So, deactivate the feature for this dataset.
 			 * We activate it again when we start a new ZIL chain.
 			 */
 			if (dsl_dataset_feature_is_active(ds,
 			    SPA_FEATURE_ZILSAXATTR))
 				dsl_dataset_deactivate_feature(ds,
 				    SPA_FEATURE_ZILSAXATTR, tx);
 		}
 	}
 
 	/*
 	 * Walk the lwb list from its head. The header's log pointer is
 	 * set to each lwb's block before deciding whether that lwb can
 	 * be freed, so when the loop breaks it refers to the first lwb
 	 * that was kept. An lwb is kept if it has not reached
 	 * LWB_STATE_FLUSH_DONE, or if its alloc/max txg is later than
 	 * the txg being synced.
 	 */
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
 		    lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		if (!BP_IS_HOLE(&lwb->lwb_blk))
 			zio_free(spa, txg, &lwb->lwb_blk);
 		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_is_empty(&zilog->zl_lwb_list))
 			BP_ZERO(&zh->zh_log);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Constructor that zil_init() registers for zil_lwb_cache. It sets up
  * the parts of an lwb_t that zil_lwb_dest() later tears down: the itx
  * list, the waiter list, the vdev AVL tree and the vdev lock.
  * Always returns 0.
  */
 static int
 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 {
 	lwb_t *lwb = vbuf;
 
 	/* Neither the private argument nor the kmem flags are needed. */
 	(void) unused;
 	(void) kmflag;
 
 	list_create(&lwb->lwb_itxs,
 	    sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&lwb->lwb_waiters,
 	    sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t),
 	    offsetof(zil_vdev_node_t, zv_node));
 	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (0);
 }
 
 /*
  * Destructor that zil_init() registers for zil_lwb_cache. It releases
  * what zil_lwb_cons() set up, in the reverse order.
  */
 static void
 zil_lwb_dest(void *vbuf, void *unused)
 {
 	lwb_t *lwb = vbuf;
 
 	/* The private argument is not needed. */
 	(void) unused;
 
 	mutex_destroy(&lwb->lwb_vdev_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
 }
 
 /*
  * Module-wide ZIL setup: create the kmem caches for lwbs and commit
  * waiters, initialize the global sums, and create and install the
  * global "zil" kstat. If the kstat cannot be created it is skipped.
  */
 void
 zil_init(void)
 {
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
 	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
 	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
 	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	zil_sums_init(&zil_sums_global);
 
 	zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	/* Without a kstat there is nothing further to set up. */
 	if (zil_kstats_global == NULL)
 		return;
 
 	zil_kstats_global->ks_data = &zil_stats;
 	zil_kstats_global->ks_update = zil_kstats_global_update;
 	zil_kstats_global->ks_private = NULL;
 	kstat_install(zil_kstats_global);
 }
 
 /*
  * Undo zil_init(): destroy both kmem caches, delete the global kstat
  * if one was created, and tear down the global sums.
  */
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 
 	/* zil_init() leaves this NULL when kstat_create() fails. */
 	if (zil_kstats_global != NULL) {
 		kstat_delete(zil_kstats_global);
 		zil_kstats_global = NULL;
 	}
 
 	zil_sums_fini(&zil_sums_global);
 }
 
 /*
  * Store a new value in the zilog's "zl_sync" field. zil_commit()
  * returns immediately while this is ZFS_SYNC_DISABLED.
  */
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 /*
  * Store a new value in the zilog's "zl_logbias" field.
  */
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 /*
  * Allocate and initialize the in-memory zilog for an objset.
  *
  * The structure is zero-filled, pointed at the objset, its pool and the
  * supplied on-disk header, seeded with the objset's logbias and sync
  * settings, and given its locks, lists and condition variables.
  * Everything set up here is released again by zil_free().
  */
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 	int t;
 
 	/* What this zilog belongs to. */
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 
 	/* Settings taken from the objset. */
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
 
 	/* Starting values for the remaining bookkeeping fields. */
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_dirty_max_txg = 0;
 	zilog->zl_last_lwb_opened = NULL;
 	zilog->zl_last_lwb_latency = 0;
 	zilog->zl_max_block_size = zil_maxblocksize;
 
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* One lock per entry of the zl_itxg[] array. */
 	for (t = 0; t < TXG_SIZE; t++) {
 		mutex_init(&zilog->zl_itxg[t].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	list_create(&zilog->zl_lwb_list,
 	    sizeof (lwb_t), offsetof(lwb_t, lwb_node));
 	list_create(&zilog->zl_itx_commit_list,
 	    sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
 
 /*
  * Release a zilog created by zil_alloc(). The zilog must not be
  * suspended or suspending, and both its lwb list and its itx commit
  * list must already be empty.
  */
 void
 zil_free(zilog_t *zilog)
 {
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		/*
 		 * An itx can be generated without dirtying a txg (e.g.
 		 * ztest TX_TRUNCATE), in which case no zil_clean()
 		 * callback ever removes the entry. Those leftovers are
 		 * removed here.
 		 *
 		 * This also frees up the ziltest itxs.
 		 */
 		if (zilog->zl_itxg[i].itxg_itxs != NULL)
 			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
 	}
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
 	mutex_destroy(&zilog->zl_lwb_io_lock);
 
 	cv_destroy(&zilog->zl_cv_suspend);
 	cv_destroy(&zilog->zl_lwb_io_cv);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  *
  * Looks up the zilog that already belongs to the objset, records the
  * supplied get_data and zil_sums pointers in it, and returns it. The
  * zilog is expected to be closed at this point: no get_data set, no
  * lwb opened, and an empty lwb list.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	ASSERT3P(zilog->zl_get_data, ==, NULL);
 	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_get_data = get_data;
 	zilog->zl_sums = zil_sums;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  *
  * For a non-snapshot objset, all outstanding itxs are committed first.
  * The function then waits for the latest txg of interest to sync,
  * clears "zl_get_data", and frees the one lwb that may remain on the
  * list.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
 	uint64_t txg;
 
 	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
 		zil_commit(zilog, 0);
 	} else {
 		/* A snapshot's zilog is expected to be clean and empty. */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	}
 
 	/*
 	 * Work out the txg to wait for: the largest of the zilog's dirty
 	 * max txg and the alloc/max txg of the lwb at the tail of the
 	 * list, if there is one.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	txg = zilog->zl_dirty_max_txg;
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		txg = MAX(txg, lwb->lwb_alloc_txg);
 		txg = MAX(txg, lwb->lwb_max_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
 	 * on the time when the dmu_tx transaction is assigned in
 	 * zil_lwb_write_issue().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 	/*
 	 * We need to use txg_wait_synced() to wait until that txg is synced.
 	 * zil_sync() will guarantee all lwbs up to that txg have been
 	 * written out, flushed, and cleaned.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	/*
 	 * A zilog that is still dirty is logged, and is fatal only when
 	 * the txg waited for is below the pool's freeze txg.
 	 */
 	if (zilog_is_dirty(zilog))
 		zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog,
 		    (u_longlong_t)txg);
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * We should have only one lwb left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_remove_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
 		/* The lwb's buffer is freed separately from the lwb. */
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /* Tag passed to the hold and release calls made by zil_suspend(). */
 static const char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	/* Look the objset up by name and hold it with suspend_tag. */
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	/*
 	 * From here on the dataset is long-held and the pool hold is
 	 * dropped; every later exit either keeps the long hold for
 	 * zil_resume() or releases it explicitly.
 	 */
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	/* Count this suspender; zil_resume() performs the decrement. */
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * The ZIL has work to do. Ensure that the associated encryption
 	 * key will remain mapped while we are committing the log by
 	 * grabbing a reference to it. If the key isn't loaded we have no
 	 * choice but to return an error until the wrapping key is loaded.
 	 */
 	if (os->os_encrypted &&
 	    dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
 		/* Back out the suspend count and both dataset holds. */
 		zilog->zl_suspend--;
 		mutex_exit(&zilog->zl_lock);
 		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Flag the suspend as in progress so that concurrent callers block
 	 * on zl_cv_suspend (see the zl_suspend > 1 case above) while the
 	 * lock is dropped for the commit/destroy work below.
 	 */
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use zil_commit_impl to ensure we wait for all
 	 * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
 	 * to disk before proceeding. If we used zil_commit instead, it
 	 * would just call txg_wait_synced(), because zl_suspend is set.
 	 * txg_wait_synced() doesn't wait for these lwb's to be
 	 * LWB_STATE_FLUSH_DONE before returning.
 	 */
 	zil_commit_impl(zilog, 0);
 
 	/*
 	 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
 	 * use txg_wait_synced() to ensure the data from the zilog has
 	 * migrated to the main pool before calling zil_destroy().
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	/* Suspend work is done: clear the flag and wake any waiters. */
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	/* Drop the key mapping reference taken before the commit. */
 	if (os->os_encrypted)
 		dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 /*
  * Undo one zil_suspend().  "cookie" is the objset pointer that
  * zil_suspend() handed back (or passes directly when it resumes on the
  * caller's behalf).  The suspend count is dropped under zl_lock, then the
  * long hold and the dataset hold tagged with suspend_tag are released.
  */
 void
 zil_resume(void *cookie)
 {
 	objset_t *objset = cookie;
 	zilog_t *zilog = dmu_objset_zil(objset);
 
 	/* Drop our suspend reference; there must be one outstanding. */
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 
 	/* Release the holds in the same order zil_suspend()'s error path does. */
 	dsl_dataset_long_rele(dmu_objset_ds(objset), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(objset), suspend_tag);
 }
 
 /*
  * State shared between zil_replay() and zil_replay_log_record() for the
  * duration of one replay pass.
  */
 typedef struct zil_replay_arg {
 	/* Table of replay functions, indexed by transaction type. */
 	zil_replay_func_t *const *zr_replay;
 	/* Opaque argument passed as the first parameter of each replay func. */
 	void		*zr_arg;
 	/* Set from BP_SHOULD_BYTESWAP() on the log header's block pointer. */
 	boolean_t	zr_byteswap;
 	/* Scratch buffer (2 * SPA_MAXBLOCKSIZE) holding a copy of the record. */
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 /*
  * Log a warning for a log record that could not be replayed and hand the
  * error back to the caller unchanged.  Because the record was not applied,
  * zl_replaying_seq is stepped back by one.
  */
 static int
 zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
 {
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	const char *ci_tag = (lr->lrc_txtype & TX_CI) ? "CI" : "";
 	u_longlong_t seq = (u_longlong_t)lr->lrc_seq;
 	u_longlong_t txtype = (u_longlong_t)(lr->lrc_txtype & ~TX_CI);
 
 	/* didn't actually replay this one */
 	zilog->zl_replaying_seq--;
 
 	dmu_objset_name(zilog->zl_os, dsname);
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, dsname,
 	    seq, txtype, ci_tag);
 
 	return (error);
 }
 
 /*
  * Replay one log record.  zil_replay() passes this function to zil_parse()
  * together with a zil_replay_arg_t ("zra").
  *
  * Returns 0 when the record is skipped (already replayed, already
  * committed, or its out-of-order target object is gone) or replayed
  * successfully; otherwise returns the error after reporting it through
  * zil_replay_error().
  */
 static int
 zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
     uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	/*
 	 * Note the sequence number being processed.  zil_replaying() copies
 	 * it into zl_replayed_seq[], and zil_replay_error() backs it out.
 	 */
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	/* txtype indexes zr_replay[] below, so reject out-of-range values. */
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	memcpy(zr->zr_lr, lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 * The data is placed directly after the copied record.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 /*
  * Per-block callback that zil_replay() hands to zil_parse(): bump the
  * zilog's count of replayed blocks.  The block pointer, callback argument
  * and claim txg are ignored.  Always returns 0.
  */
 static int
 zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	(void) bp;
 	(void) arg;
 	(void) claim_txg;
 
 	zilog->zl_replay_blks += 1;
 	return (0);
 }
 
 /*
  * Replay this dataset's intent log and then destroy it.
  *
  * When the log header is not flagged ZIL_REPLAY_NEEDED nothing is replayed
  * and the result of zil_destroy(zilog, B_TRUE) is returned.  Otherwise each
  * record is dispatched through replay_func[] (with "arg" as its first
  * argument), the log is destroyed, and B_TRUE is returned.
  */
 boolean_t
 zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *const replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	const uint64_t lr_bufsize = 2 * SPA_MAXBLOCKSIZE;
 	zil_replay_arg_t zr;
 
 	if (!(zh->zh_flags & ZIL_REPLAY_NEEDED))
 		return (zil_destroy(zilog, B_TRUE));
 
 	/* Set up the state consumed by zil_replay_log_record(). */
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lr = vmem_alloc(lr_bufsize, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	/* zl_replay stays set until the destroy below has synced. */
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg, B_TRUE);
 	vmem_free(zr.zr_lr, lr_bufsize);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 
 	return (B_TRUE);
 }
 
 /*
  * Returns B_TRUE when sync is disabled for this zilog or when a replay is
  * in progress, and B_FALSE otherwise.  In the replay case the dataset is
  * dirtied in "tx" and the sequence number currently being replayed is
  * recorded in the zl_replayed_seq[] slot for that tx's txg.
  */
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	/* Sync disabled: report B_TRUE without touching any replay state. */
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (!zilog->zl_replay)
 		return (B_FALSE);
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 	    zilog->zl_replaying_seq;
 
 	return (B_TRUE);
 }
 
 /*
  * Suspend and immediately resume the ZIL of the named dataset by calling
  * zil_suspend() with no cookie.  "arg" is unused.
  *
  * Returns 0 on success.  EACCES and EBUSY from zil_suspend() are passed
  * through; any other failure is reported as EEXIST.
  */
 int
 zil_reset(const char *osname, void *arg)
 {
 	(void) arg;
 
 	int error = zil_suspend(osname, NULL);
 
 	switch (error) {
 	case 0:
 		return (0);
 	case EACCES:	/* EACCES means crypto key not loaded */
 	case EBUSY:
 		return (SET_ERROR(error));
 	default:
 		return (SET_ERROR(EEXIST));
 	}
 }
 
 /* ZIL entry points published through EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(zil_alloc);
 EXPORT_SYMBOL(zil_free);
 EXPORT_SYMBOL(zil_open);
 EXPORT_SYMBOL(zil_close);
 EXPORT_SYMBOL(zil_replay);
 EXPORT_SYMBOL(zil_replaying);
 EXPORT_SYMBOL(zil_destroy);
 EXPORT_SYMBOL(zil_destroy_sync);
 EXPORT_SYMBOL(zil_itx_create);
 EXPORT_SYMBOL(zil_itx_destroy);
 EXPORT_SYMBOL(zil_itx_assign);
 EXPORT_SYMBOL(zil_commit);
 EXPORT_SYMBOL(zil_claim);
 EXPORT_SYMBOL(zil_check_log_chain);
 EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
 EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 EXPORT_SYMBOL(zil_sums_init);
 EXPORT_SYMBOL(zil_sums_fini);
 EXPORT_SYMBOL(zil_kstat_values_update);
 
 /*
  * ZIL tunables registered through ZFS_MODULE_PARAM().  All are declared
  * ZMOD_RW; the string in each entry is its description.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
 	"ZIL block open timeout percentage");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
 	"Minimum delay we care for ZIL block commit");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
 	"Disable intent logging replay");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
 	"Disable ZIL cache flushes");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
 	"Limit in bytes slog sync writes per commit");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
 	"Limit in bytes of ZIL log block size");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
 	"Limit in bytes WR_COPIED size");
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 9de515e8767a..e511b31fee6d 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -1,573 +1,577 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
 #include <sys/abd.h>
 #include <zfs_fletcher.h>
 
 /*
  * Checksum vectors.
  *
  * In the SPA, everything is checksummed.  We support checksum vectors
  * for three distinct reasons:
  *
  *   1. Different kinds of data need different levels of protection.
  *	For SPA metadata, we always want a very strong checksum.
  *	For user data, we let users make the trade-off between speed
  *	and checksum strength.
  *
  *   2. Cryptographic hash and MAC algorithms are an area of active research.
  *	It is likely that future hash functions will be at least as strong
  *	as current best-of-breed, and may be substantially faster as well.
  *	We want the ability to take advantage of these new hashes as soon as
  *	they become available.
  *
  *   3. If someone develops hardware that can compute a strong hash quickly,
  *	we want the ability to take advantage of that hardware.
  *
  * Of course, we don't want a checksum upgrade to invalidate existing
  * data, so we store the checksum *function* in eight bits of the bp.
  * This gives us room for up to 256 different checksum functions.
  *
  * When writing a block, we always checksum it with the latest-and-greatest
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
  *
  * SALTED CHECKSUMS
  *
  * To enable the use of less secure hash algorithms with dedup, we
  * introduce the notion of salted checksums (MACs, really).  A salted
  * checksum is fed both a random 256-bit value (the salt) and the data
  * to be checksummed.  This salt is kept secret (stored on the pool, but
  * never shown to the user).  Thus even if an attacker knew of collision
  * weaknesses in the hash algorithm, they won't be able to mount a known
  * plaintext attack on the DDT, since the actual hash value cannot be
  * known ahead of time.  How the salt is used is algorithm-specific
  * (some might simply prefix it to the data block, others might need to
  * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
  * object in the MOS (DMU_POOL_CHECKSUM_SALT).
  *
  * CONTEXT TEMPLATES
  *
  * Some hashing algorithms need to perform a substantial amount of
  * initialization work (e.g. salted checksums above may need to pre-hash
  * the salt) before being able to process data.  Performing this
  * redundant work for each block would be wasteful, so we instead allow
  * a checksum algorithm to do the work once (the first time it's used)
  * and then keep this pre-initialized context as a template inside the
  * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
  * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
  * construct and destruct the pre-initialized checksum context.  The
  * pre-initialized context is then reused during each checksum
  * invocation and passed to the checksum function.
  */
 
 /*
  * "Checksum" implementation that ignores its input and always produces an
  * all-zero checksum in *zcp.
  */
 static void
 abd_checksum_off(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) abd;
 	(void) size;
 	(void) ctx_template;
 
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
 /*
  * Fletcher-2 checksum of the first "size" bytes of "abd" using the
  * native incremental routine; the result is left in *zcp.  No context
  * template is used.
  */
 static void
 abd_fletcher_2_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 
 	/* Initialize the running checksum, then fold in each ABD chunk. */
 	fletcher_init(zcp);
 
 	(void) abd_iterate_func(abd, 0, size,
 	    fletcher_2_incremental_native, zcp);
 }
 
 /*
  * Fletcher-2 checksum of the first "size" bytes of "abd" using the
  * byteswapping incremental routine; the result is left in *zcp.  No
  * context template is used.
  */
 static void
 abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 
 	/* Initialize the running checksum, then fold in each ABD chunk. */
 	fletcher_init(zcp);
 
 	(void) abd_iterate_func(abd, 0, size,
 	    fletcher_2_incremental_byteswap, zcp);
 }
 
 /*
  * Run the three fletcher_4_abd_ops phases (init, per-chunk iterate, fini)
  * over the first "size" bytes of "abd", using the state in "acdp".
  */
 static inline void
 abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp)
 {
 	fletcher_4_abd_ops.acf_init(acdp);
 
 	/* The iterator's return value is intentionally not examined. */
 	(void) abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter,
 	    acdp);
 
 	fletcher_4_abd_ops.acf_fini(acdp);
 }
 
 /*
  * Fletcher-4 checksum of the first "size" bytes of "abd" in native byte
  * order; the result is written to *zcp.  No context template is used.
  */
 void
 abd_fletcher_4_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 
 	/* Scratch context referenced by the checksum state below. */
 	fletcher_4_ctx_t ctx;
 	zio_abd_checksum_data_t acd = {
 		.acd_ctx	= &ctx,
 		.acd_zcp	= zcp,
 		.acd_byteorder	= ZIO_CHECKSUM_NATIVE
 	};
 
 	abd_fletcher_4_impl(abd, size, &acd);
 }
 
 /*
  * Fletcher-4 checksum of the first "size" bytes of "abd" in byteswapped
  * order; the result is written to *zcp.  No context template is used.
  */
 void
 abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 
 	/* Scratch context referenced by the checksum state below. */
 	fletcher_4_ctx_t ctx;
 	zio_abd_checksum_data_t acd = {
 		.acd_ctx	= &ctx,
 		.acd_zcp	= zcp,
 		.acd_byteorder	= ZIO_CHECKSUM_BYTESWAP
 	};
 
 	abd_fletcher_4_impl(abd, size, &acd);
 }
 
 /*
  * Table of checksum algorithms, indexed by enum zio_checksum.
  *
  * Each entry lists, in order: the pair of checksum functions (selected
  * elsewhere in this file as ci_func[byteswap], so presumably
  * {native, byteswap}), the optional context-template init and free
  * callbacks, the ZCHECKSUM_FLAG_* flags, and the algorithm name.
  */
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
 	{{NULL, NULL}, NULL, NULL, 0, "on"},
 	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "off"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "label"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "gang_header"},
 	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
 	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, 0, "fletcher2"},
 	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
 	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
 	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "noparity"},
 	{{abd_checksum_sha512_native,	abd_checksum_sha512_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
 	{{abd_checksum_skein_native,	abd_checksum_skein_byteswap},
 	    abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
 	{{abd_checksum_edonr_native,	abd_checksum_edonr_byteswap},
 	    abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
 	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
 	{{abd_checksum_blake3_native,	abd_checksum_blake3_byteswap},
 	    abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"},
 };
 
 /*
  * Map a checksum algorithm to the pool feature it depends on, or
  * SPA_FEATURE_NONE when it needs none.
  *
  * The "verify" flag from dedup=[checksum,]verify must already have been
  * cleared by the caller (use ZIO_CHECKSUM_MASK); this is enforced with a
  * VERIFY.
  */
 spa_feature_t
 zio_checksum_to_feature(enum zio_checksum cksum)
 {
 	spa_feature_t feature = SPA_FEATURE_NONE;
 
 	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
 
 	switch (cksum) {
 	case ZIO_CHECKSUM_BLAKE3:
 		feature = SPA_FEATURE_BLAKE3;
 		break;
 	case ZIO_CHECKSUM_SHA512:
 		feature = SPA_FEATURE_SHA512;
 		break;
 	case ZIO_CHECKSUM_SKEIN:
 		feature = SPA_FEATURE_SKEIN;
 		break;
 	case ZIO_CHECKSUM_EDONR:
 		feature = SPA_FEATURE_EDONR;
 		break;
 	default:
 		break;
 	}
 
 	return (feature);
 }
 
 /*
  * Resolve a dataset's checksum setting: "inherit" yields the parent's
  * value, "on" yields ZIO_CHECKSUM_ON_VALUE, and anything else is returned
  * as given.  The parent must already be resolved (neither inherit nor on).
  */
 enum zio_checksum
 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
 	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 
 	switch (child) {
 	case ZIO_CHECKSUM_INHERIT:
 		return (parent);
 	case ZIO_CHECKSUM_ON:
 		return (ZIO_CHECKSUM_ON_VALUE);
 	default:
 		return (child);
 	}
 }
 
 /*
  * Resolve a dataset's dedup checksum setting.  "inherit" yields the
  * parent's value; "on" (with or without the verify flag) yields the
  * pool's dedup checksum from spa_dedup_checksum(), keeping the verify
  * flag if it was present; any other value is returned unchanged.
  */
 enum zio_checksum
 zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
     enum zio_checksum parent)
 {
 	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 
 	if (child == ZIO_CHECKSUM_INHERIT) {
 		return (parent);
 	} else if (child == ZIO_CHECKSUM_ON) {
 		return (spa_dedup_checksum(spa));
 	} else if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) {
 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 	}
 
 	/*
 	 * An explicit algorithm must be dedup-capable, be paired with
 	 * verify, or be "off".
 	 */
 	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
 	    ZCHECKSUM_FLAG_DEDUP) ||
 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 
 	return (child);
 }
 
 /*
  * Build the external verifier for a gang block from the tuple
  * <vdev, offset, txg>, which is guaranteed to be unique for the life of
  * the pool.  The fourth checksum word is set to zero.
  */
 static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 {
 	const dva_t *id_dva = BP_IDENTITY(bp);
 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 
 	ASSERT(BP_IS_GANG(bp));
 
 	uint64_t vdev = DVA_GET_VDEV(id_dva);
 	uint64_t dva_offset = DVA_GET_OFFSET(id_dva);
 
 	ZIO_SET_CHECKSUM(zcp, vdev, dva_offset, birth, 0);
 }
 
 /*
  * Set the external verifier for a label block based on its offset.
  * The vdev is implicit, and the txg is unknowable at pool open time --
  * hence the logic in vdev_uberblock_load() to find the most recent copy.
  *
  * Only the first checksum word carries information (the offset); the
  * remaining three words are set to zero.
  */
 static void
 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 {
 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
 }
 
 /*
  * For a checksum that supports context templates, call its template init
  * function with the pool's salt and install the result in
  * spa->spa_cksum_tmpls[].  Does nothing for checksums without a
  * ci_tmpl_init callback or when a template is already installed.
  */
 static void
 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 
 	/* Unlocked fast path: no template support, or already installed. */
 	if (ci->ci_tmpl_init == NULL ||
 	    spa->spa_cksum_tmpls[checksum] != NULL)
 		return;
 
 	VERIFY(ci->ci_tmpl_free != NULL);
 
 	mutex_enter(&spa->spa_cksum_tmpls_lock);
 	/* Check again now that the lock is held before creating it. */
 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
 		spa->spa_cksum_tmpls[checksum] =
 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
 		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
 	}
 	mutex_exit(&spa->spa_cksum_tmpls_lock);
 }
 
 /*
  * Adjust a freshly computed checksum so that it can share space with an
  * encryption MAC: words 2 and 3 of "cksum" are replaced by the
  * corresponding words of "saved".
  *
  * Weak checksums do not have their entropy spread evenly across the bits
  * of the checksum, so when "xor" is set words 2 and 3 are first folded
  * into words 0 and 1 so that no entropy is lost unnecessarily by the
  * truncation.
  */
 static void
 zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
 {
 	for (int lo = 0; lo < 2; lo++) {
 		int hi = lo + 2;
 
 		if (xor)
 			cksum->zc_word[lo] ^= cksum->zc_word[hi];
 
 		cksum->zc_word[hi] = saved->zc_word[hi];
 	}
 }
 
 /*
  * Generate the checksum.
  *
  * Computes the "checksum" algorithm over "size" bytes of "abd".  For
  * algorithms flagged ZCHECKSUM_FLAG_EMBEDDED the result is written into
  * the zio_eck_t carried inside the buffer itself; otherwise it is stored
  * in the zio's block pointer (bp->blk_cksum).  For encrypted, non-objset
  * blocks the result is adjusted by zio_checksum_handle_crypt().
  */
 void
 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     abd_t *abd, uint64_t size)
 {
 	static const uint64_t zec_magic = ZEC_MAGIC;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t cksum, saved;
 	spa_t *spa = zio->io_spa;
 	/* Algorithms without the DEDUP flag get the XOR fold when truncated. */
 	boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
 	zio_checksum_template_init(checksum, spa);
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t eck;
 		size_t eck_offset;
 
 		memset(&saved, 0, sizeof (zio_cksum_t));
 
 		/*
 		 * Locate the embedded checksum.  ZILOG2 keeps it inside the
 		 * zil_chain_t at the start of the buffer and only checksums
 		 * the used portion, rounded up to ZIL_MIN_BLKSZ; every other
 		 * embedded checksum keeps it in the last sizeof (zio_eck_t)
 		 * bytes of the buffer.
 		 */
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t zilc;
 			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
 
-			size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
-			    uint64_t);
+			uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused,
+			    ZIL_MIN_BLKSZ, uint64_t);
+			ASSERT3U(size, >=, nused);
+			size = nused;
 			eck = zilc.zc_eck;
 			eck_offset = offsetof(zil_chain_t, zc_eck);
 		} else {
+			ASSERT3U(size, >=, sizeof (zio_eck_t));
 			eck_offset = size - sizeof (zio_eck_t);
 			abd_copy_to_buf_off(&eck, abd, eck_offset,
 			    sizeof (zio_eck_t));
 		}
 
 		/*
 		 * Choose the value that occupies the checksum slot while the
 		 * checksum is computed: a gang or label verifier, or the
 		 * block pointer's checksum (remembering the slot's previous
 		 * contents in "saved").
 		 */
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
 			zio_checksum_gang_verifier(&eck.zec_cksum, bp);
 		} else if (checksum == ZIO_CHECKSUM_LABEL) {
 			zio_checksum_label_verifier(&eck.zec_cksum, offset);
 		} else {
 			saved = eck.zec_cksum;
 			eck.zec_cksum = bp->blk_cksum;
 		}
 
 		/* Write the magic and the placeholder into the buffer. */
 		abd_copy_from_buf_off(abd, &zec_magic,
 		    eck_offset + offsetof(zio_eck_t, zec_magic),
 		    sizeof (zec_magic));
 		abd_copy_from_buf_off(abd, &eck.zec_cksum,
 		    eck_offset + offsetof(zio_eck_t, zec_cksum),
 		    sizeof (zio_cksum_t));
 
 		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &cksum);
 		if (bp != NULL && BP_USES_CRYPT(bp) &&
 		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			zio_checksum_handle_crypt(&cksum, &saved, insecure);
 
 		/* Replace the placeholder with the computed checksum. */
 		abd_copy_from_buf_off(abd, &cksum,
 		    eck_offset + offsetof(zio_eck_t, zec_cksum),
 		    sizeof (zio_cksum_t));
 	} else {
 		saved = bp->blk_cksum;
 		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &cksum);
 		if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			zio_checksum_handle_crypt(&cksum, &saved, insecure);
 		bp->blk_cksum = cksum;
 	}
 }
 
 /*
  * Verify the checksum of "size" bytes of "abd" using algorithm "checksum".
  *
  * "bp" may be NULL only for embedded (label) checksums, as asserted by the
  * IMPLY() checks below; "offset" is used only for the label verifier.
  * When "info" is non-NULL it is filled in with the algorithm name and
  * byteswap state.
  *
  * Returns 0 if the checksum matches, EINVAL for an unknown or function-less
  * algorithm, and ECKSUM for a mismatch or an invalid embedded trailer.
  */
 int
 zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
     enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset,
     zio_bad_cksum_t *info)
 {
 	/*
 	 * NOTE(review): this address is formed before "checksum" is range
 	 * checked; it is only dereferenced after the check below passes.
 	 */
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum;
 	zio_eck_t eck;
 	int byteswap;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (SET_ERROR(EINVAL));
 
 	zio_checksum_template_init(checksum, spa);
 
 	IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED);
 	IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL);
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_cksum_t verifier;
 		size_t eck_offset;
 
 		/*
 		 * Locate the embedded checksum; eck_offset ends up pointing
 		 * at the zec_cksum field itself.  For ZILOG2 the on-disk
 		 * zc_nused (byteswapped if the magic says so) bounds the
 		 * checksummed region.
 		 * NOTE(review): a zc_nused close to UINT64_MAX may wrap when
 		 * rounded up -- confirm P2ROUNDUP_TYPED behavior.
 		 */
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t zilc;
 			uint64_t nused;
 
 			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
 
 			eck = zilc.zc_eck;
 			eck_offset = offsetof(zil_chain_t, zc_eck) +
 			    offsetof(zio_eck_t, zec_cksum);
 
 			if (eck.zec_magic == ZEC_MAGIC) {
 				nused = zilc.zc_nused;
 			} else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) {
 				nused = BSWAP_64(zilc.zc_nused);
 			} else {
 				return (SET_ERROR(ECKSUM));
 			}
 
-			if (nused > size) {
+			nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+			if (size < nused)
 				return (SET_ERROR(ECKSUM));
-			}
-
-			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+			size = nused;
 		} else {
+			if (size < sizeof (zio_eck_t))
+				return (SET_ERROR(ECKSUM));
 			eck_offset = size - sizeof (zio_eck_t);
 			abd_copy_to_buf_off(&eck, abd, eck_offset,
 			    sizeof (zio_eck_t));
 			eck_offset += offsetof(zio_eck_t, zec_cksum);
 		}
 
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
 			zio_checksum_label_verifier(&verifier, offset);
 		else
 			verifier = bp->blk_cksum;
 
 		/* A byteswapped magic means the block needs byteswapping. */
 		byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC));
 
 		if (byteswap)
 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 		expected_cksum = eck.zec_cksum;
 
 		/*
 		 * Temporarily put the verifier in the checksum slot, compute
 		 * the checksum over the buffer, then restore the stored
 		 * checksum so the buffer is left as it was found.
 		 */
 		abd_copy_from_buf_off(abd, &verifier, eck_offset,
 		    sizeof (zio_cksum_t));
 
 		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 
 		abd_copy_from_buf_off(abd, &expected_cksum, eck_offset,
 		    sizeof (zio_cksum_t));
 
 		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
 		}
 	} else {
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
 	/*
 	 * MAC checksums are a special case since half of this checksum will
 	 * actually be the encryption MAC. This will be verified by the
 	 * decryption process, so we just check the truncated checksum now.
 	 * Objset blocks use embedded MACs so we don't truncate the checksum
 	 * for them.
 	 */
 	if (bp != NULL && BP_USES_CRYPT(bp) &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
 		if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
 			actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
 			actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
 		}
 
 		actual_cksum.zc_word[2] = 0;
 		actual_cksum.zc_word[3] = 0;
 		expected_cksum.zc_word[2] = 0;
 		expected_cksum.zc_word[3] = 0;
 	}
 
 	/* Report what was checked, whether or not it matched. */
 	if (info != NULL) {
 		info->zbc_checksum_name = ci->ci_name;
 		info->zbc_byteswapped = byteswap;
 		info->zbc_injected = 0;
 		info->zbc_has_cksum = 1;
 	}
 
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (SET_ERROR(ECKSUM));
 
 	return (0);
 }
 
 /*
  * Verify the checksum of the data attached to "zio".
  *
  * With a block pointer, the algorithm and size come from it (gang blocks
  * are checked as a ZIO_CHECKSUM_GANG_HEADER of SPA_GANGBLOCKSIZE bytes);
  * without one they come from the zio's own properties and size.  If the
  * checksum verifies, fault injection is enabled and the zio has no error,
  * an ECKSUM may be injected by zio_handle_fault_injection().
  *
  * "info" is optional.  When non-NULL it is filled in by
  * zio_checksum_error_impl(), and zbc_injected is set for injected faults.
  *
  * Returns 0 on success, otherwise the verification or injected error.
  */
 int
 zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 	int error;
 	uint64_t size = (bp == NULL ? zio->io_size :
 	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 	uint64_t offset = zio->io_offset;
 	abd_t *data = zio->io_abd;
 	spa_t *spa = zio->io_spa;
 
 	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
 	    offset, info);
 
 	if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
 		error = zio_handle_fault_injection(zio, ECKSUM);
 		/* info is optional here, just as in the impl function. */
 		if (error != 0 && info != NULL)
 			info->zbc_injected = 1;
 	}
 
 	return (error);
 }
 
 /*
  * Called by a spa_t that's about to be deallocated.  Walks every checksum
  * slot and, for each one holding a context template, frees it with the
  * algorithm's ci_tmpl_free callback and clears the slot.
  */
 void
 zio_checksum_templates_free(spa_t *spa)
 {
 	enum zio_checksum checksum;
 
 	for (checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
 		zio_checksum_info_t *ci;
 
 		/* Nothing was ever initialized for this algorithm. */
 		if (spa->spa_cksum_tmpls[checksum] == NULL)
 			continue;
 
 		ci = &zio_checksum_table[checksum];
 
 		VERIFY(ci->ci_tmpl_free != NULL);
 		ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
 		spa->spa_cksum_tmpls[checksum] = NULL;
 	}
 }
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 53dcb4dee448..91b2d9fcb531 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1,1791 +1,1799 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  *
  * ZFS volume emulation driver.
  *
  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  * Volumes are accessed through the symbolic links named:
  *
  * /dev/<pool_name>/<dataset_name>
  *
  * Volumes are persistent through reboot and module load.  No user command
  * needs to be run before opening and using a device.
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 /*
  * Note on locking of zvol state structures.
  *
  * These structures are used to maintain internal state used to emulate block
  * devices on top of zvols. In particular, management of device minor number
  * operations - create, remove, rename, and set_snapdev - involves access to
  * these structures. The zvol_state_lock is primarily used to protect the
  * zvol_state_list. The zv->zv_state_lock is used to protect the contents
  * of the zvol_state_t structures, as well as to make sure that when the
  * time comes to remove the structure from the list, it is not in use, and
  * therefore, it can be taken off zvol_state_list and freed.
  *
  * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
  * e.g. for the duration of receive and rollback operations. This lock can be
  * held for significant periods of time. Given that it is undesirable to hold
  * mutexes for long periods of time, the following lock ordering applies:
  * - take zvol_state_lock if necessary, to protect zvol_state_list
  * - take zv_suspend_lock if necessary, by the code path in question
  * - take zv_state_lock to protect zvol_state_t
  *
  * The minor operations are issued to spa->spa_zvol_taskq queues, that are
  * single-threaded (to preserve order of minor operations), and are executed
  * through the zvol_task_cb that dispatches the specific operations. Therefore,
  * these operations are serialized per pool. Consequently, we can be certain
  * that for a given zvol, there is only one operation at a time in progress.
  * That is why one can be sure that first, zvol_state_t for a given zvol is
  * allocated and placed on zvol_state_list, and then other minor operations
  * for this zvol are going to proceed in the order of issue.
  *
  */
 
 #include <sys/dataset_kstats.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
 
 /* When non-zero, the minor-creation entry points return without acting. */
 unsigned int zvol_inhibit_dev = 0;
 /* Volume mode setting; starts out as ZFS_VOLMODE_GEOM. */
 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
 
 /*
  * Hash buckets of zvol_state_t.
  * NOTE(review): presumably what ZVOL_HT_HEAD() indexes - confirm.
  */
 struct hlist_head *zvol_htable;
 /* Every zvol_state_t; changes are made with zvol_state_lock write-held. */
 static list_t zvol_state_list;
 krwlock_t zvol_state_lock;
 
 /* Kinds of asynchronous minor operations; ZVOL_ASYNC_MAX is the count. */
 typedef enum {
 	ZVOL_ASYNC_REMOVE_MINORS,
 	ZVOL_ASYNC_RENAME_MINORS,
 	ZVOL_ASYNC_SET_SNAPDEV,
 	ZVOL_ASYNC_SET_VOLMODE,
 	ZVOL_ASYNC_MAX
 } zvol_async_op_t;
 
 /*
  * One queued asynchronous operation.
  * NOTE(review): name1/name2/value look like per-op arguments (e.g. old and
  * new names for a rename) - confirm against the dispatch code.
  */
 typedef struct {
 	zvol_async_op_t op;
 	char name1[MAXNAMELEN];
 	char name2[MAXNAMELEN];
 	uint64_t value;
 } zvol_task_t;
 
 /*
  * Hash a zvol name with the CRC64 table.  At most MAXNAMELEN - 1 bytes are
  * consumed, stopping early at the terminating NUL.
  */
 uint64_t
 zvol_name_hash(const char *name)
 {
 	const uint8_t *bytes = (const uint8_t *)name;
 	uint64_t crc = -1ULL;
 	int n = 0;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 	while (n < MAXNAMELEN - 1 && bytes[n] != 0) {
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ bytes[n]) & 0xFF];
 		n++;
 	}
 
 	return (crc);
 }
 
 /*
  * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  * return (NULL) without taking the locks. The zv_suspend_lock is always taken
  * before zv_state_lock. The mode argument indicates the mode (including none)
  * for zv_suspend_lock to be taken.
  */
 zvol_state_t *
 zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
 {
 	zvol_state_t *zv;
 	struct hlist_node *p = NULL;
 
 	/* The global lock is held as reader for the whole bucket walk. */
 	rw_enter(&zvol_state_lock, RW_READER);
 	hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
 		zv = hlist_entry(p, zvol_state_t, zv_hlink);
 		/* zv_hash and zv_name are compared under zv_state_lock. */
 		mutex_enter(&zv->zv_state_lock);
 		if (zv->zv_hash == hash &&
 		    strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
 			/*
 			 * this is the right zvol, take the locks in the
 			 * right order
 			 */
 			/*
 			 * Try for zv_suspend_lock without blocking first.
 			 * If that fails, drop zv_state_lock, block on
 			 * zv_suspend_lock, then retake zv_state_lock so the
 			 * documented order (suspend before state) is kept.
 			 */
 			if (mode != RW_NONE &&
 			    !rw_tryenter(&zv->zv_suspend_lock, mode)) {
 				mutex_exit(&zv->zv_state_lock);
 				rw_enter(&zv->zv_suspend_lock, mode);
 				mutex_enter(&zv->zv_state_lock);
 				/*
 				 * zvol cannot be renamed as we continue
 				 * to hold zvol_state_lock
 				 */
 				ASSERT(zv->zv_hash == hash &&
 				    strncmp(zv->zv_name, name, MAXNAMELEN)
 				    == 0);
 			}
 			/* Return with the per-zvol lock(s) still held. */
 			rw_exit(&zvol_state_lock);
 			return (zv);
 		}
 		/* Not a match: release this entry and keep walking. */
 		mutex_exit(&zv->zv_state_lock);
 	}
 	rw_exit(&zvol_state_lock);
 
 	return (NULL);
 }
 
 /*
  * Find a zvol_state_t given the name.
  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  * return (NULL) without taking the locks. The zv_suspend_lock is always taken
  * before zv_state_lock. The mode argument indicates the mode (including none)
  * for zv_suspend_lock to be taken.
  */
 static zvol_state_t *
 zvol_find_by_name(const char *name, int mode)
 {
 	/* Derive the hash here so callers only need the name. */
 	uint64_t hash = zvol_name_hash(name);
 
 	return (zvol_find_by_name_hash(name, hash, mode));
 }
 
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
 void
 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
 	/* arg is the zfs_creat_t carrying the creation properties; cr is unused. */
 	zfs_creat_t *zct = arg;
 	nvlist_t *nvprops = zct->zct_props;
 	int error;
 	uint64_t volblocksize, volsize;
 
 	/* volsize is mandatory; volblocksize falls back to its default. */
 	VERIFY(nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 	if (nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 
 	/*
 	 * These properties must be removed from the list so the generic
 	 * property setting step won't apply to them.
 	 */
 	VERIFY(nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 	(void) nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 
 	/* Claim the data object using the chosen block size. */
 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/* Claim the property ZAP object. */
 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/* Record the volume size as one 8-byte integer under "size". */
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 	ASSERT(error == 0);
 }
 
 /*
  * ZFS_IOC_OBJSET_STATS entry point.
  */
 int
 zvol_get_stats(objset_t *os, nvlist_t *nv)
 {
 	uint64_t volsize;
 	int err;
 
 	/* The volume size lives in the zvol ZAP under "size". */
 	err = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (err != 0)
 		return (SET_ERROR(err));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, volsize);
 
 	/* The block size is reported from the data object's info. */
 	dmu_object_info_t *info = kmem_alloc(sizeof (*info), KM_SLEEP);
 
 	err = dmu_object_info(os, ZVOL_OBJ, info);
 	if (err == 0) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 		    info->doi_data_block_size);
 	}
 	kmem_free(info, sizeof (*info));
 
 	return (SET_ERROR(err));
 }
 
 /*
  * Sanity check volume size.
  */
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 {
 	/* A volume must be non-empty and a whole number of blocks. */
 	if (volsize == 0 || (volsize % blocksize) != 0)
 		return (SET_ERROR(EINVAL));
 
 #ifdef _ILP32
 	/* The last byte must be addressable on ILP32 builds. */
 	if (volsize - 1 > SPEC_MAXOFFSET_T)
 		return (SET_ERROR(EOVERFLOW));
 #endif
 
 	return (0);
 }
 
 /*
  * Update the "size" entry in the zvol ZAP, wait for that change to sync, then
  * free any data past the new end of the volume.
  */
 static int
 zvol_update_volsize(uint64_t volsize, objset_t *os)
 {
 	dmu_tx_t *tx = dmu_tx_create(os);
 	uint64_t txg;
 	int error;
 
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (SET_ERROR(error));
 	}
 
 	/* Note the txg so we can wait for it once the tx is committed. */
 	txg = dmu_tx_get_txg(tx);
 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 	dmu_tx_commit(tx);
 
 	/* The wait happens whether or not the ZAP update succeeded. */
 	txg_wait_synced(dmu_objset_pool(os), txg);
 
 	if (error != 0)
 		return (error);
 
 	/* Free everything from the new size to the end of the object. */
 	return (dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END));
 }
 
 /*
  * ZFS_PROP_VOLSIZE set entry point.  Note that modifying the volume
  * size will result in a udev "change" event being generated.
  */
 int
 zvol_set_volsize(const char *name, uint64_t volsize)
 {
 	objset_t *os = NULL;
 	uint64_t readonly;
 	int error;
 	/* B_TRUE once this function has taken temporary objset ownership. */
 	boolean_t owned = B_FALSE;
 
 	/* Refuse to resize a dataset whose readonly property is set. */
 	error = dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 	if (error != 0)
 		return (SET_ERROR(error));
 	if (readonly)
 		return (SET_ERROR(EROFS));
 
 	/* On a hit, zv comes back with both of its locks held. */
 	zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
 
 	ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_READ_HELD(&zv->zv_suspend_lock)));
 
 	/*
 	 * With no zvol_state_t, or one that has no objset attached, own the
 	 * objset ourselves for the duration of the resize.  In the latter
 	 * case zv_suspend_lock is dropped first while zv_state_lock stays
 	 * held.  Otherwise reuse the objset the zvol already has.
 	 */
 	if (zv == NULL || zv->zv_objset == NULL) {
 		if (zv != NULL)
 			rw_exit(&zv->zv_suspend_lock);
 		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
 		    FTAG, &os)) != 0) {
 			if (zv != NULL)
 				mutex_exit(&zv->zv_state_lock);
 			return (SET_ERROR(error));
 		}
 		owned = B_TRUE;
 		if (zv != NULL)
 			zv->zv_objset = os;
 	} else {
 		os = zv->zv_objset;
 	}
 
 	dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
 
 	/* Validate the new size against the data object's block size. */
 	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
 	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
 		goto out;
 
 	error = zvol_update_volsize(volsize, os);
 	if (error == 0 && zv != NULL) {
 		zv->zv_volsize = volsize;
 		zv->zv_changed = 1;
 	}
 out:
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	/*
 	 * Undo whichever path was taken above: give back the temporary
 	 * ownership, or release the zv_suspend_lock that is still held
 	 * (not owned implies zv != NULL here).
 	 */
 	if (owned) {
 		dmu_objset_disown(os, B_TRUE, FTAG);
 		if (zv != NULL)
 			zv->zv_objset = NULL;
 	} else {
 		rw_exit(&zv->zv_suspend_lock);
 	}
 
 	if (zv != NULL)
 		mutex_exit(&zv->zv_state_lock);
 
 	/* The OS-specific update runs after the zvol locks are released. */
 	if (error == 0 && zv != NULL)
 		zvol_os_update_volsize(zv, volsize);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Sanity check volume block size.
  */
 int
 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
 {
 	/* Record sizes above 128k need the feature to be enabled */
 	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
 		spa_t *spa;
 		int error;
 
 		if ((error = spa_open(name, &spa, FTAG)) != 0)
 			return (error);
 
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 			spa_close(spa, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 
 		/*
 		 * We don't allow setting the property above 1MB,
 		 * unless the tunable has been changed.
 		 */
 		if (volblocksize > zfs_max_recordsize)
 			return (SET_ERROR(EDOM));
 
 		spa_close(spa, FTAG);
 	}
 
 	if (volblocksize < SPA_MINBLOCKSIZE ||
 	    volblocksize > SPA_MAXBLOCKSIZE ||
 	    !ISP2(volblocksize))
 		return (SET_ERROR(EDOM));
 
 	return (0);
 }
 
 /*
  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
  * implement DKIOCFREE/free-long-range.
  */
 static int
 zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 {
 	/* arg1 is the zvol being replayed, arg2 the TX_TRUNCATE log record. */
 	zvol_state_t *zv = arg1;
 	lr_truncate_t *lr = arg2;
 	uint64_t offset, length;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	/*
 	 * The tx carries no holds: it is assigned, handed to zil_replaying()
 	 * and committed.  The range is freed only after that commit.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_mark_netfree(tx);
 	int error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 	} else {
 		(void) zil_replaying(zv->zv_zilog, tx);
 		dmu_tx_commit(tx);
 		error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
 		    length);
 	}
 
 	return (error);
 }
 
 /*
  * Replay a TX_WRITE ZIL transaction that didn't get committed
  * after a system failure
  */
 static int
 zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 {
 	/* arg1 is the zvol being replayed, arg2 the TX_WRITE log record. */
 	zvol_state_t *zv = arg1;
 	lr_write_t *lr = arg2;
 	objset_t *os = zv->zv_objset;
 	char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 	uint64_t offset, length;
 	dmu_tx_t *tx;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
 	/* Only the fixed-size header is swapped; the payload is left as is. */
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 		if (length < blocksize) {
 			/* Round down to the block start and cover the block. */
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
 	}
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 	} else {
 		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 		(void) zil_replaying(zv->zv_zilog, tx);
 		dmu_tx_commit(tx);
 	}
 
 	return (error);
 }
 
 /*
  * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
  * after a system failure.
  *
  * TODO: For now we drop block cloning transactions for ZVOLs as they are
  *       unsupported, but we still need to inform BRT about that as we
  *       claimed them during pool import.
  *       This situation can occur when we try to import a pool from a ZFS
  *       version supporting block cloning for ZVOLs into a system that
  *       has this ZFS version, that doesn't support block cloning for ZVOLs.
  */
 static int
 zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
 {
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	/* arg1 is the zvol being replayed, arg2 the TX_CLONE_RANGE record. */
 	zvol_state_t *zv = arg1;
 	objset_t *os = zv->zv_objset;
 	lr_clone_range_t *lr = arg2;
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	spa_t *spa;
 	uint_t ii;
 	int error;
 
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+	ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
+	    lr_bps[lr->lr_nbps]));
+
 	/* The record is not applied; warn that it is being dropped. */
 	dmu_objset_name(os, name);
 	cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
 	    name);
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	tx = dmu_tx_create(os);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	spa = os->os_spa;
 
 	/* Free every non-hole block pointer carried by the record. */
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		if (!BP_IS_HOLE(bp)) {
 			zio_free(spa, dmu_tx_get_txg(tx), bp);
 		}
 	}
 
 	(void) zil_replaying(zv->zv_zilog, tx);
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Replay handler for transaction types a zvol does not support.
  */
 static int
 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 {
 	(void) arg1;
 	(void) arg2;
 	(void) byteswap;
 
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Callback vectors for replaying records, one slot per transaction type.
  * A zvol replays only TX_WRITE, TX_TRUNCATE and TX_CLONE_RANGE; every other
  * slot is zvol_replay_err, which returns ENOTSUP.
  */
 zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* no such transaction type */
 	zvol_replay_err,	/* TX_CREATE */
 	zvol_replay_err,	/* TX_MKDIR */
 	zvol_replay_err,	/* TX_MKXATTR */
 	zvol_replay_err,	/* TX_SYMLINK */
 	zvol_replay_err,	/* TX_REMOVE */
 	zvol_replay_err,	/* TX_RMDIR */
 	zvol_replay_err,	/* TX_LINK */
 	zvol_replay_err,	/* TX_RENAME */
 	zvol_replay_write,	/* TX_WRITE */
 	zvol_replay_truncate,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
 	zvol_replay_err,	/* TX_CREATE_ATTR */
 	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL */
 	zvol_replay_err,	/* TX_MKDIR_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
 	zvol_replay_err,	/* TX_WRITE2 */
 	zvol_replay_err,	/* TX_SETSAXATTR */
 	zvol_replay_err,	/* TX_RENAME_EXCHANGE */
 	zvol_replay_err,	/* TX_RENAME_WHITEOUT */
 	zvol_replay_clone_range	/* TX_CLONE_RANGE */
 };
 
 /*
  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
  *
  * We store data in the log buffers if it's small enough.
  * Otherwise we will later flush the data out via dmu_sync().
  */
 /* Block sizes above this (in bytes) are candidates for indirect logging. */
 static const ssize_t zvol_immediate_write_sz = 32768;
 
 void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
     uint64_t size, int sync)
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	itx_wr_state_t write_state;
 	/* Remember the full size; "size" is consumed by the loop below. */
 	uint64_t sz = size;
 
 	/* Nothing to log while the ZIL itself is being replayed. */
 	if (zil_replaying(zilog, tx))
 		return;
 
 	/*
 	 * Pick the default state for the whole write: indirect for
 	 * throughput logbias, or for full-block writes of large blocks on a
 	 * pool without slogs; otherwise copied now for sync writes, or
 	 * copied later.
 	 */
 	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 		write_state = WR_INDIRECT;
 	else if (!spa_has_slogs(zilog->zl_spa) &&
 	    size >= blocksize && blocksize > zvol_immediate_write_sz)
 		write_state = WR_INDIRECT;
 	else if (sync)
 		write_state = WR_COPIED;
 	else
 		write_state = WR_NEED_COPY;
 
 	/* Emit one itx per chunk until the whole range is covered. */
 	while (size) {
 		itx_t *itx;
 		lr_write_t *lr;
 		itx_wr_state_t wr_state = write_state;
 		ssize_t len = size;
 
 		/*
 		 * Too much data to copy into the record: defer the copy.
 		 * Indirect chunks never cross a block boundary.
 		 */
 		if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
 			wr_state = WR_NEED_COPY;
 		else if (wr_state == WR_INDIRECT)
 			len = MIN(blocksize - P2PHASE(offset, blocksize), size);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
 		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 		/*
 		 * For WR_COPIED the data is read into the space right after
 		 * the record.  If that read fails, start over with a
 		 * header-only itx and defer the copy.
 		 */
 		if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
 		    offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			wr_state = WR_NEED_COPY;
 		}
 
 		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = offset;
 		lr->lr_length = len;
 		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
 		itx->itx_sync = sync;
 
 		(void) zil_itx_assign(zilog, itx, tx);
 
 		offset += len;
 		size -= len;
 	}
 
 	/* Only the copying states are counted, using the original size. */
 	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
 		dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
 	}
 }
 
 /*
  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
  */
 void
 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
     boolean_t sync)
 {
 	zilog_t *zilog = zv->zv_zilog;
 
 	/* Records being replayed must not be logged again. */
 	if (zil_replaying(zilog, tx))
 		return;
 
 	itx_t *itx = zil_itx_create(TX_TRUNCATE, sizeof (lr_truncate_t));
 	lr_truncate_t *rec = (lr_truncate_t *)&itx->itx_lr;
 
 	rec->lr_foid = ZVOL_OBJ;
 	rec->lr_offset = off;
 	rec->lr_length = len;
 	itx->itx_sync = sync;
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 
 /*
  * Release everything a zgd_t holds (dbuf if any, range lock) and free it.
  * The error argument is ignored.
  */
 static void
 zvol_get_done(zgd_t *zgd, int error)
 {
 	(void) error;
 
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 	zfs_rangelock_exit(zgd->zgd_lr);
 	kmem_free(zgd, sizeof (*zgd));
 }
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
 int
 zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
     struct lwb *lwb, zio_t *zio)
 {
 	/* arg is the zvol_state_t; arg2 is not used here. */
 	zvol_state_t *zv = arg;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
 
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3U(size, !=, 0);
 
 	/* Zero-filled, so zgd_db stays NULL unless a dbuf is held below. */
 	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_lwb = lwb;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
 		    size, RL_READER);
 		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
 	} else { /* indirect write */
 		ASSERT3P(zio, !=, NULL);
 		/*
 		 * Have to lock the whole block to ensure when it's written out
 		 * and its checksum is being calculated that no one can change
 		 * the data. Contrarily to zfs_get_data we need not re-check
 		 * blocksize after we get the lock because it cannot be changed.
 		 */
 		size = zv->zv_volblocksize;
 		offset = P2ALIGN_TYPED(offset, size, uint64_t);
 		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
 		    size, RL_READER);
 		error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
 		    &db);
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db != NULL);
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zvol_get_done, zgd);
 
 			/*
 			 * On success zvol_get_done was handed to dmu_sync()
 			 * together with zgd, so it must not be called here.
 			 */
 			if (error == 0)
 				return (0);
 		}
 	}
 
 	/* Immediate writes and all failures clean up synchronously. */
 	zvol_get_done(zgd, error);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  */
 
 void
 zvol_insert(zvol_state_t *zv)
 {
 	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
 
 	/* Publish in the name-hash bucket and on the global list. */
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 	list_insert_head(&zvol_state_list, zv);
 }
 
 /*
  * Simply remove the zvol from the list of zvols.
  */
 static void
 zvol_remove(zvol_state_t *zv)
 {
 	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
 
 	/* Unlink from the hash bucket and from the global list. */
 	hlist_del(&zv->zv_hlink);
 	list_remove(&zvol_state_list, zv);
 }
 
 /*
  * Set up zv right after taking ownership of zv->zv_objset.
  */
 static int
 zvol_setup_zv(zvol_state_t *zv)
 {
 	objset_t *os = zv->zv_objset;
 	uint64_t ro;
 	uint64_t volsize;
 	int error;
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
 
 	/* Start with no ZIL attached and the written-to flag cleared. */
 	zv->zv_zilog = NULL;
 	zv->zv_flags &= ~ZVOL_WRITTEN_TO;
 
 	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	/* Capacity is passed on as volsize >> 9. */
 	zvol_os_set_capacity(zv, volsize >> 9);
 	zv->zv_volsize = volsize;
 
 	/*
 	 * Writable only when the property allows it, the objset is not a
 	 * snapshot and the pool itself is writeable.
 	 */
 	if (!ro && !dmu_objset_is_snapshot(os) &&
 	    spa_writeable(dmu_objset_spa(os))) {
 		zvol_os_set_disk_ro(zv, 0);
 		zv->zv_flags &= ~ZVOL_RDONLY;
 	} else {
 		zvol_os_set_disk_ro(zv, 1);
 		zv->zv_flags |= ZVOL_RDONLY;
 	}
 
 	return (0);
 }
 
 /*
  * Shutdown every zv_objset related stuff except zv_objset itself.
  * This is the reverse of zvol_setup_zv.
  */
 static void
 zvol_shutdown_zv(zvol_state_t *zv)
 {
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_LOCK_HELD(&zv->zv_suspend_lock));
 
 	/* The ZIL is closed here only if the zvol was written to. */
 	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
 		ASSERT(zv->zv_zilog != NULL);
 		zil_close(zv->zv_zilog);
 	}
 
 	zv->zv_zilog = NULL;
 
 	/* Drop the dnode hold taken in zvol_setup_zv(). */
 	dnode_rele(zv->zv_dn, zv);
 	zv->zv_dn = NULL;
 
 	/*
 	 * Evict cached data. We must write out any dirty data before
 	 * disowning the dataset.
 	 */
 	if (zv->zv_flags & ZVOL_WRITTEN_TO)
 		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 	(void) dmu_objset_evict_dbufs(zv->zv_objset);
 }
 
 /*
  * return the proper tag for rollback and recv
  */
 void *
 zvol_tag(zvol_state_t *zv)
 {
 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 
 	/* An open zvol is its own tag; a closed one has none. */
 	if (zv->zv_open_count > 0)
 		return (zv);
 
 	return (NULL);
 }
 
 /*
  * Suspend the zvol for recv and rollback.
  */
 zvol_state_t *
 zvol_suspend(const char *name)
 {
 	zvol_state_t *zv;
 
 	/* On a hit, zv_suspend_lock is write-held and zv_state_lock held. */
 	zv = zvol_find_by_name(name, RW_WRITER);
 
 	if (zv == NULL)
 		return (NULL);
 
 	/* block all I/O, release in zvol_resume. */
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 
 	/* Dropped again by the atomic_dec() in zvol_resume(). */
 	atomic_inc(&zv->zv_suspend_ref);
 
 	/* Only an open zvol has objset-related state to tear down. */
 	if (zv->zv_open_count > 0)
 		zvol_shutdown_zv(zv);
 
 	/*
 	 * do not hold zv_state_lock across suspend/resume to
 	 * avoid locking up zvol lookups
 	 */
 	mutex_exit(&zv->zv_state_lock);
 
 	/* zv_suspend_lock is released in zvol_resume() */
 	return (zv);
 }
 
 /*
  * Resume a zvol suspended by zvol_suspend(): set an open zvol up again,
  * then release zv_suspend_lock and drop the suspend reference.
  * Returns 0 or the error from zvol_setup_zv().
  */
 int
 zvol_resume(zvol_state_t *zv)
 {
 	int error = 0;
 
 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 
 	mutex_enter(&zv->zv_state_lock);
 
 	if (zv->zv_open_count > 0) {
 		/*
 		 * Refresh zv_objset through a temporary hold and verify
 		 * that the dataset is still owned by this zvol and long
 		 * held before dropping the hold again.
 		 */
 		VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
 		VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
 		VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
 		dmu_objset_rele(zv->zv_objset, zv);
 
 		error = zvol_setup_zv(zv);
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 
 	rw_exit(&zv->zv_suspend_lock);
 	/*
 	 * We need this because we don't hold zvol_state_lock while releasing
 	 * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
 	 * zv_suspend_lock to determine whether it is safe to free, because
 	 * an rwlock is not inherently atomic.
 	 */
 	atomic_dec(&zv->zv_suspend_ref);
 
 	return (SET_ERROR(error));
 }
 
 /*
  * First open of a zvol: own its objset and set up the zvol state.
  * Ownership is given back if the setup fails.
  */
 int
 zvol_first_open(zvol_state_t *zv, boolean_t readonly)
 {
 	objset_t *os;
 	boolean_t ro;
 	int error;
 
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(mutex_owned(&spa_namespace_lock));
 
 	/* Names containing '@' are always owned read-only. */
 	ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
 
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	zv->zv_objset = os;
 
 	error = zvol_setup_zv(zv);
 	if (error == 0)
 		return (0);
 
 	dmu_objset_disown(os, 1, zv);
 	zv->zv_objset = NULL;
 
 	return (error);
 }
 
 /*
  * Last close of a zvol: tear down the objset-related state, then give up
  * ownership of the objset.
  */
 void
 zvol_last_close(zvol_state_t *zv)
 {
 	objset_t *os = zv->zv_objset;
 
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	zvol_shutdown_zv(zv);
 
 	dmu_objset_disown(os, 1, zv);
 	zv->zv_objset = NULL;
 }
 
 /*
  * One dataset queued for minor creation.  Jobs are linked on a list and,
  * for most of them, a prefetch task is dispatched before the minor is made.
  */
 typedef struct minors_job {
 	/* list this job was inserted into */
 	list_t *list;
 	list_node_t link;
 	/* input */
 	/* dataset name; a kmem_strdup() copy released with kmem_strfree() */
 	char *name;
 	/* output */
 	/* result of the prefetch; 0 if no prefetch ran */
 	int error;
 } minors_job_t;
 
 /*
  * Prefetch zvol dnodes for the minors_job
  */
 static void
 zvol_prefetch_minors_impl(void *arg)
 {
 	minors_job_t *job = arg;
 	char *dsname = job->name;
 	objset_t *os = NULL;
 
 	job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
 	    FTAG, &os);
 	if (job->error == 0) {
 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
 		dmu_objset_disown(os, B_TRUE, FTAG);
 	}
 }
 
 /*
  * Mask errors to continue dmu_objset_find() traversal
  */
 static int
 zvol_create_snap_minor_cb(const char *dsname, void *arg)
 {
 	minors_job_t *j = arg;
 	list_t *minors_list = j->list;
 	const char *name = j->name;
 
 	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
 
 	/* skip the designated dataset */
 	if (name && strcmp(dsname, name) == 0)
 		return (0);
 
 	/* at this point, the dsname should name a snapshot */
 	if (strchr(dsname, '@') == 0) {
 		dprintf("zvol_create_snap_minor_cb(): "
 		    "%s is not a snapshot name\n", dsname);
 	} else {
 		minors_job_t *job;
 		char *n = kmem_strdup(dsname);
 		if (n == NULL)
 			return (0);
 
 		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 		job->name = n;
 		job->list = minors_list;
 		job->error = 0;
 		list_insert_tail(minors_list, job);
 		/* don't care if dispatch fails, because job->error is 0 */
 		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
 		    TQ_SLEEP);
 	}
 
 	return (0);
 }
 
 /*
  * If spa_keystore_load_wkey() is called for an encrypted zvol,
  * we need to look for any clones also using the key. This function
  * is "best effort" - so we just skip over it if there are failures.
  */
 static void
 zvol_add_clones(const char *dsname, list_t *minors_list)
 {
 	/* Also check if it has clones */
 	dsl_dir_t *dd = NULL;
 	dsl_pool_t *dp = NULL;
 
 	if (dsl_pool_hold(dsname, FTAG, &dp) != 0)
 		return;
 
 	/* Nothing to do unless the pool has the encryption feature enabled. */
 	if (!spa_feature_is_enabled(dp->dp_spa,
 	    SPA_FEATURE_ENCRYPTION))
 		goto out;
 
 	if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0)
 		goto out;
 
 	/* dd_clones == 0 means there is no clones ZAP to walk. */
 	if (dsl_dir_phys(dd)->dd_clones == 0)
 		goto out;
 
 	zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
 	zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 
 	/* Each entry's first integer is used as a dataset object number. */
 	for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 		dsl_dataset_t *clone;
 		minors_job_t *job;
 
 		/* Clones that cannot be held are skipped (best effort). */
 		if (dsl_dataset_hold_obj(dd->dd_pool,
 		    za->za_first_integer, FTAG, &clone) == 0) {
 
 			char name[ZFS_MAX_DATASET_NAME_LEN];
 			dsl_dataset_name(clone, name);
 
 			/* Queued without dispatching a prefetch task. */
 			char *n = kmem_strdup(name);
 			job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 			job->name = n;
 			job->list = minors_list;
 			job->error = 0;
 			list_insert_tail(minors_list, job);
 
 			dsl_dataset_rele(clone, FTAG);
 		}
 	}
 	zap_cursor_fini(zc);
 	kmem_free(za, sizeof (zap_attribute_t));
 	kmem_free(zc, sizeof (zap_cursor_t));
 
 out:
 	/* dd is still NULL if we bailed out before dsl_dir_hold(). */
 	if (dd != NULL)
 		dsl_dir_rele(dd, FTAG);
 	dsl_pool_rele(dp, FTAG);
 }
 
 /*
  * Mask errors to continue dmu_objset_find() traversal
  */
 static int
 zvol_create_minors_cb(const char *dsname, void *arg)
 {
 	list_t *minors_list = arg;
 	minors_job_t *job;
 	uint64_t snapdev;
 	char *n;
 
 	ASSERT0(MUTEX_HELD(&spa_namespace_lock));
 
 	/* A failed property lookup just skips this dataset. */
 	if (dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL) != 0)
 		return (0);
 
 	/*
 	 * Given the name and the 'snapdev' property, create device minor nodes
 	 * with the linkages to zvols/snapshots as needed.
 	 * If the name represents a zvol, create a minor node for the zvol, then
 	 * check if its snapshots are 'visible', and if so, iterate over the
 	 * snapshots and create device minor nodes for those.
 	 */
 	if (strchr(dsname, '@') != 0) {
 		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
 		    dsname);
 		return (0);
 	}
 
 	n = kmem_strdup(dsname);
 	if (n == NULL)
 		return (0);
 
 	job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 	job->name = n;
 	job->list = minors_list;
 	job->error = 0;
 	list_insert_tail(minors_list, job);
 	/* don't care if dispatch fails, because job->error is 0 */
 	taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
 	    TQ_SLEEP);
 
 	zvol_add_clones(dsname, minors_list);
 
 	if (snapdev != ZFS_SNAPDEV_VISIBLE)
 		return (0);
 
 	/*
 	 * traverse snapshots only, do not traverse children,
 	 * and skip the 'dsname'
 	 */
 	(void) dmu_objset_find(dsname, zvol_create_snap_minor_cb,
 	    (void *)job, DS_FIND_SNAPSHOTS);
 
 	return (0);
 }
 
 /*
  * Create minors for the specified dataset, including children and snapshots.
  * Pay attention to the 'snapdev' property and iterate over the snapshots
  * only if they are 'visible'. This approach allows one to assure that the
  * snapshot metadata is read from disk only if it is needed.
  *
  * The name can represent a dataset to be recursively scanned for zvols and
  * their snapshots, or a single zvol snapshot. If the name represents a
  * dataset, the scan is performed in two nested stages:
  * - scan the dataset for zvols, and
  * - for each zvol, create a minor node, then check if the zvol's snapshots
  *   are 'visible', and only then iterate over the snapshots if needed
  *
  * If the name represents a snapshot, a check is performed if the snapshot is
  * 'visible' (which also verifies that the parent is a zvol), and if so,
  * a minor node for that snapshot is created.
  */
 void
 zvol_create_minors_recursive(const char *name)
 {
 	list_t minors_list;
 	minors_job_t *job;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	/*
 	 * This is the list for prefetch jobs. Whenever we found a match
 	 * during dmu_objset_find, we insert a minors_job to the list and do
 	 * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
 	 * any lock because all list operation is done on the current thread.
 	 *
 	 * We will use this list to do zvol_os_create_minor after prefetch
 	 * so we don't have to traverse using dmu_objset_find again.
 	 */
 	list_create(&minors_list, sizeof (minors_job_t),
 	    offsetof(minors_job_t, link));
 
 
 	if (strchr(name, '@') != NULL) {
 		/* A snapshot gets a minor directly, and only if visible. */
 		uint64_t snapdev;
 
 		int error = dsl_prop_get_integer(name, "snapdev",
 		    &snapdev, NULL);
 
 		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
 			(void) zvol_os_create_minor(name);
 	} else {
 		/* Walk the dataset and its children, queueing jobs. */
 		fstrans_cookie_t cookie = spl_fstrans_mark();
 		(void) dmu_objset_find(name, zvol_create_minors_cb,
 		    &minors_list, DS_FIND_CHILDREN);
 		spl_fstrans_unmark(cookie);
 	}
 
 	/* Let the dispatched prefetch tasks finish before using results. */
 	taskq_wait_outstanding(system_taskq, 0);
 
 	/*
 	 * Prefetch is completed, we can do zvol_os_create_minor
 	 * sequentially.
 	 */
 	while ((job = list_remove_head(&minors_list)) != NULL) {
 		/* Jobs whose prefetch failed are freed without a minor. */
 		if (!job->error)
 			(void) zvol_os_create_minor(job->name);
 		kmem_strfree(job->name);
 		kmem_free(job, sizeof (minors_job_t));
 	}
 
 	list_destroy(&minors_list);
 }
 
 /*
  * Create the minor node for a single zvol or zvol snapshot.
  *
  * A snapshot (a name containing '@') only gets a minor when its "snapdev"
  * property can be read and equals ZFS_SNAPDEV_VISIBLE. Nothing is done
  * when zvol_inhibit_dev is set.
  */
 void
 zvol_create_minor(const char *name)
 {
 	uint64_t snapdev;
 
 	/*
 	 * Note: the dsl_pool_config_lock must not be held.
 	 * Minor node creation needs to obtain the zvol_state_lock.
 	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
 	 * config lock.  Therefore, we can't have the config lock now if
 	 * we are going to wait for the zvol_state_lock, because it
 	 * would be a lock order inversion which could lead to deadlock.
 	 */
 
 	if (zvol_inhibit_dev)
 		return;
 
 	/* Hidden (or unreadable) snapshots are skipped. */
 	if (strchr(name, '@') != NULL &&
 	    (dsl_prop_get_integer(name, "snapdev", &snapdev, NULL) != 0 ||
 	    snapdev != ZFS_SNAPDEV_VISIBLE))
 		return;
 
 	(void) zvol_os_create_minor(name);
 }
 
 /*
  * Remove minors for specified dataset including children and snapshots.
  *
  * zvol_free_task() is the taskq callback used by that removal path: it
  * simply hands its argument to zvol_os_free().
  */
 
 static void
 zvol_free_task(void *arg)
 {
 	zvol_os_free(arg);
 }
 
 /*
  * Remove the minors of every zvol whose name matches 'name'.
  *
  * A zvol matches when 'name' is NULL (match everything), when its name
  * equals 'name', or when its name starts with 'name' followed by '/'
  * (child dataset) or '@' (snapshot). Matching zvols that are open or
  * suspended are left alone.
  *
  * Each removed zvol is freed via system_taskq; if the dispatch fails it
  * is queued on a local list and freed in place after zvol_state_lock has
  * been dropped. Nothing is done when zvol_inhibit_dev is set.
  */
 void
 zvol_remove_minors_impl(const char *name)
 {
 	zvol_state_t *zv, *zv_next;
 	int namelen = ((name) ? strlen(name) : 0);
 	taskqid_t t;
 	list_t free_list;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	list_create(&free_list, sizeof (zvol_state_t),
 	    offsetof(zvol_state_t, zv_next));
 
 	rw_enter(&zvol_state_lock, RW_WRITER);
 
 	/* zv_next is fetched up front because zv may be unlinked below. */
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 		zv_next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
 		    (strncmp(zv->zv_name, name, namelen) == 0 &&
 		    (zv->zv_name[namelen] == '/' ||
 		    zv->zv_name[namelen] == '@'))) {
 			/*
 			 * By holding zv_state_lock here, we guarantee that no
 			 * one is currently using this zv
 			 */
 
 			/* If in use, leave alone */
 			if (zv->zv_open_count > 0 ||
 			    atomic_read(&zv->zv_suspend_ref)) {
 				mutex_exit(&zv->zv_state_lock);
 				continue;
 			}
 
 			zvol_remove(zv);
 
 			/*
 			 * Cleared while holding zvol_state_lock as a writer
 			 * which will prevent zvol_open() from opening it.
 			 */
 			zvol_os_clear_private(zv);
 
 			/* Drop zv_state_lock before zvol_free() */
 			mutex_exit(&zv->zv_state_lock);
 
 			/* Try parallel zv_free, if failed do it in place */
 			t = taskq_dispatch(system_taskq, zvol_free_task, zv,
 			    TQ_SLEEP);
 			if (t == TASKQID_INVALID)
 				list_insert_head(&free_list, zv);
 		} else {
 			mutex_exit(&zv->zv_state_lock);
 		}
 	}
 	rw_exit(&zvol_state_lock);
 
 	/* Drop zvol_state_lock before calling zvol_free() */
 	while ((zv = list_remove_head(&free_list)) != NULL)
 		zvol_os_free(zv);
 
 	/* The list is empty now; pair the list_create() above. */
 	list_destroy(&free_list);
 }
 
 /*
  * Remove the minor of the one volume called exactly 'name'.
  *
  * The first entry of zvol_state_list with that name which is neither open
  * nor suspended is unlinked and, once zvol_state_lock has been released,
  * freed. Nothing is done when zvol_inhibit_dev is set.
  */
 static void
 zvol_remove_minor_impl(const char *name)
 {
 	zvol_state_t *zv = NULL, *next;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	rw_enter(&zvol_state_lock, RW_WRITER);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = next) {
 		next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 
 		/*
 		 * Skip entries with a different name, and matching
 		 * entries that are still in use. Holding zv_state_lock
 		 * guarantees that no one is currently using this zv.
 		 */
 		if (strcmp(zv->zv_name, name) != 0 ||
 		    zv->zv_open_count > 0 ||
 		    atomic_read(&zv->zv_suspend_ref)) {
 			mutex_exit(&zv->zv_state_lock);
 			continue;
 		}
 
 		zvol_remove(zv);
 		zvol_os_clear_private(zv);
 		mutex_exit(&zv->zv_state_lock);
 
 		/* Leave the loop with zv pointing at the victim. */
 		break;
 	}
 
 	/* Drop zvol_state_lock before calling zvol_free() */
 	rw_exit(&zvol_state_lock);
 
 	/* zv is NULL when the loop ran to completion without a match. */
 	if (zv != NULL)
 		zvol_os_free(zv);
 }
 
 /*
  * Rename the minors of 'oldname' and of everything below it.
  *
  * An entry called exactly 'oldname' is renamed to 'newname'. An entry
  * whose name is 'oldname' followed by '/' or '@' keeps that separator and
  * its suffix, with the 'oldname' prefix replaced by 'newname'. Nothing is
  * done when zvol_inhibit_dev is set.
  */
 static void
 zvol_rename_minors_impl(const char *oldname, const char *newname)
 {
 	zvol_state_t *zv, *next;
 	int prefixlen;
 
 	if (zvol_inhibit_dev)
 		return;
 
 	prefixlen = strlen(oldname);
 
 	rw_enter(&zvol_state_lock, RW_READER);
 
 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = next) {
 		next = list_next(&zvol_state_list, zv);
 
 		mutex_enter(&zv->zv_state_lock);
 
 		if (strcmp(zv->zv_name, oldname) == 0) {
 			/* The dataset itself. */
 			zvol_os_rename_minor(zv, newname);
 		} else if (strncmp(zv->zv_name, oldname, prefixlen) == 0) {
 			/* Only read the separator once the prefix matched. */
 			const char sep = zv->zv_name[prefixlen];
 
 			if (sep == '/' || sep == '@') {
 				char *renamed = kmem_asprintf("%s%c%s",
 				    newname, sep,
 				    zv->zv_name + prefixlen + 1);
 
 				zvol_os_rename_minor(zv, renamed);
 				kmem_strfree(renamed);
 			}
 		}
 
 		mutex_exit(&zv->zv_state_lock);
 	}
 
 	rw_exit(&zvol_state_lock);
 }
 
 /* Callback argument handed to zvol_set_snapdev_cb(). */
 typedef struct zvol_snapdev_cb_arg {
 	uint64_t snapdev;	/* compared against ZFS_SNAPDEV_* there */
 } zvol_snapdev_cb_arg_t;
 
 /*
  * dmu_objset_find() callback that applies a snapdev setting to a single
  * snapshot: a minor is created for ZFS_SNAPDEV_VISIBLE and removed for
  * ZFS_SNAPDEV_HIDDEN. Names without '@' and any other snapdev value are
  * ignored. Always returns 0.
  */
 static int
 zvol_set_snapdev_cb(const char *dsname, void *param)
 {
 	const zvol_snapdev_cb_arg_t *cb = param;
 
 	/* Only snapshots are of interest here. */
 	if (strchr(dsname, '@') == NULL)
 		return (0);
 
 	if (cb->snapdev == ZFS_SNAPDEV_VISIBLE)
 		(void) zvol_os_create_minor(dsname);
 	else if (cb->snapdev == ZFS_SNAPDEV_HIDDEN)
 		zvol_remove_minor_impl(dsname);
 
 	return (0);
 }
 
 /*
  * Apply 'snapdev' to the snapshots found under 'name' by running
  * zvol_set_snapdev_cb() on each of them.
  */
 static void
 zvol_set_snapdev_impl(char *name, uint64_t snapdev)
 {
 	zvol_snapdev_cb_arg_t cb = { .snapdev = snapdev };
 	fstrans_cookie_t cookie;
 
 	cookie = spl_fstrans_mark();
 
 	/*
 	 * The zvol_set_snapdev_sync() sets snapdev appropriately
 	 * in the dataset hierarchy. Here, we only scan snapshots.
 	 */
 	(void) dmu_objset_find(name, zvol_set_snapdev_cb, &cb,
 	    DS_FIND_SNAPSHOTS);
 
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Apply 'volmode' to the zvol called 'name' by removing its minor and,
  * depending on the mode, creating it again. Snapshot names are ignored,
  * and nothing happens when an existing zvol already has that volmode or
  * when no zvol exists and the mode is ZFS_VOLMODE_NONE.
  */
 static void
 zvol_set_volmode_impl(char *name, uint64_t volmode)
 {
 	fstrans_cookie_t cookie;
 	zvol_state_t *zv;
 
 	if (strchr(name, '@') != NULL)
 		return;
 
 	/*
 	 * It's unfortunate we need to remove minors before we create new ones:
 	 * this is necessary because our backing gendisk (zvol_state->zv_disk)
 	 * could be different when we set, for instance, volmode from "geom"
 	 * to "dev" (or vice versa).
 	 */
 	zv = zvol_find_by_name(name, RW_NONE);
 	if (zv == NULL) {
 		/* No minor exists and none is wanted. */
 		if (volmode == ZFS_VOLMODE_NONE)
 			return;
 	} else {
 		/*
 		 * zv_state_lock is released right after sampling the
 		 * current mode (presumably zvol_find_by_name() returns
 		 * with it held -- confirm against that function).
 		 */
 		uint64_t old_volmode = zv->zv_volmode;
 
 		mutex_exit(&zv->zv_state_lock);
 		if (old_volmode == volmode)
 			return;
 		zvol_wait_close(zv);
 	}
 
 	cookie = spl_fstrans_mark();
 	switch (volmode) {
 	case ZFS_VOLMODE_NONE:
 		zvol_remove_minor_impl(name);
 		break;
 	case ZFS_VOLMODE_GEOM:
 	case ZFS_VOLMODE_DEV:
 		zvol_remove_minor_impl(name);
 		(void) zvol_os_create_minor(name);
 		break;
 	case ZFS_VOLMODE_DEFAULT:
 		zvol_remove_minor_impl(name);
 		/* if zvol_volmode is invalid defaults to "geom" */
 		if (zvol_volmode != ZFS_VOLMODE_NONE)
 			(void) zvol_os_create_minor(name);
 		break;
 	}
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Allocate and fill in a zvol_task_t describing one asynchronous
  * operation.
  *
  * 'name1' is always copied into the task; 'name2' is copied only when it
  * is not NULL. Returns NULL, without allocating, when 'name1' begins
  * with '$'. The result is released with zvol_task_free().
  */
 static zvol_task_t *
 zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
     uint64_t value)
 {
 	zvol_task_t *t;
 
 	/* Never allow tasks on hidden names. */
 	if (*name1 == '$')
 		return (NULL);
 
 	/* Zeroed allocation, so name2 stays empty unless copied below. */
 	t = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
 
 	(void) strlcpy(t->name1, name1, MAXNAMELEN);
 	if (name2 != NULL)
 		(void) strlcpy(t->name2, name2, MAXNAMELEN);
 	t->value = value;
 	t->op = op;
 
 	return (t);
 }
 
 /*
  * Release a task obtained from zvol_task_alloc().
  */
 static void
 zvol_task_free(zvol_task_t *task)
 {
 	kmem_free(task, sizeof (zvol_task_t));
 }
 
 /*
  * Taskq worker: carry out the asynchronous operation described by the
  * zvol_task_t passed in 'arg', then free that task. An unknown operation
  * code trips VERIFY(0).
  */
 static void
 zvol_task_cb(void *arg)
 {
 	zvol_task_t *job = arg;
 
 	switch (job->op) {
 	case ZVOL_ASYNC_REMOVE_MINORS:
 		/* name1: dataset whose minors go away */
 		zvol_remove_minors_impl(job->name1);
 		break;
 
 	case ZVOL_ASYNC_RENAME_MINORS:
 		/* name1: old name, name2: new name */
 		zvol_rename_minors_impl(job->name1, job->name2);
 		break;
 
 	case ZVOL_ASYNC_SET_SNAPDEV:
 		/* value: snapdev setting to apply */
 		zvol_set_snapdev_impl(job->name1, job->value);
 		break;
 
 	case ZVOL_ASYNC_SET_VOLMODE:
 		/* value: volmode setting to apply */
 		zvol_set_volmode_impl(job->name1, job->value);
 		break;
 
 	default:
 		VERIFY(0);
 		break;
 	}
 
 	/* The worker owns the task and releases it when done. */
 	zvol_task_free(job);
 }
 
 /*
  * Argument block shared by the snapdev and volmode sync tasks below.
  */
 typedef struct zvol_set_prop_int_arg {
 	const char *zsda_name;		/* dataset name given by the caller */
 	uint64_t zsda_value;		/* new property value */
 	zprop_source_t zsda_source;	/* property source */
 	dmu_tx_t *zsda_tx;		/* filled in by the sync function */
 } zvol_set_prop_int_arg_t;
 
 /*
  * Check callback of the snapdev sync task: verify that the dsl_dir named
  * in the argument can be held. The hold is dropped again right away.
  * Returns 0 on success or the dsl_dir_hold() error.
  */
 static int
 zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_dir_t *dd;
 	int error;
 
 	error = dsl_dir_hold(dmu_tx_pool(tx), zsda->zsda_name, FTAG, &dd,
 	    NULL);
 	if (error == 0)
 		dsl_dir_rele(dd, FTAG);
 
 	return (error);
 }
 
 /*
  * Per-dataset callback of zvol_set_snapdev_sync().
  *
  * Reads the "snapdev" value of 'ds' and queues a ZVOL_ASYNC_SET_SNAPDEV
  * task for it on the pool's spa_zvol_taskq. Datasets whose property
  * cannot be read, or for which no task is allocated (hidden names), are
  * skipped. 'arg' is unused. Always returns 0.
  */
 static int
 zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) arg;
 	char dsname[MAXNAMELEN];
 	zvol_task_t *task;
 	uint64_t snapdev;
 	taskqid_t id;
 
 	dsl_dataset_name(ds, dsname);
 	if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
 		return (0);
 	task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
 	if (task == NULL)
 		return (0);
 
 	id = taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 	    task, TQ_SLEEP);
 	/*
 	 * zvol_task_cb() frees the task when it runs. If the dispatch
 	 * failed it never will, so release the task here.
 	 */
 	if (id == TASKQID_INVALID)
 		zvol_task_free(task);
 	return (0);
 }
 
 /*
  * Sync callback of the snapdev sync task.
  *
  * The new value is stored with dsl_prop_set_sync_impl() on the toplevel
  * dataset only. Every child dataset is then visited, and the callback
  * reads the effective "snapdev" of each one, because the value is not
  * guaranteed to be the same in the whole dataset hierarchy.
  */
 static void
 zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 
 	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 	zsda->zsda_tx = tx;
 
 	/* Record the property if the name resolves to a dataset. */
 	if (dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds) == 0) {
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
 		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 		    &zsda->zsda_value, zsda->zsda_tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	/* Queue the per-dataset follow-up work. */
 	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
 	    zsda, DS_FIND_CHILDREN);
 
 	dsl_dir_rele(dd, FTAG);
 }
 
 /*
  * Set the "snapdev" property on 'ddname' by running the snapdev sync
  * task. Returns the result of dsl_sync_task().
  */
 int
 zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
 {
 	/* zsda_tx is filled in later by zvol_set_snapdev_sync(). */
 	zvol_set_prop_int_arg_t zsda = {
 		.zsda_name = ddname,
 		.zsda_value = snapdev,
 		.zsda_source = source,
 	};
 
 	return (dsl_sync_task(ddname, zvol_set_snapdev_check,
 	    zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Check callback of the volmode sync task: verify that the dsl_dir named
  * in the argument can be held. The hold is dropped again right away.
  * Returns 0 on success or the dsl_dir_hold() error.
  */
 static int
 zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_dir_t *dd;
 	int error;
 
 	error = dsl_dir_hold(dmu_tx_pool(tx), zsda->zsda_name, FTAG, &dd,
 	    NULL);
 	if (error == 0)
 		dsl_dir_rele(dd, FTAG);
 
 	return (error);
 }
 
 /*
  * Per-dataset callback of zvol_set_volmode_sync().
  *
  * Reads the "volmode" value of 'ds' and queues a ZVOL_ASYNC_SET_VOLMODE
  * task for it on the pool's spa_zvol_taskq. Datasets whose property
  * cannot be read, or for which no task is allocated (hidden names), are
  * skipped. 'arg' is unused. Always returns 0.
  */
 static int
 zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) arg;
 	char dsname[MAXNAMELEN];
 	zvol_task_t *task;
 	uint64_t volmode;
 	taskqid_t id;
 
 	dsl_dataset_name(ds, dsname);
 	if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
 		return (0);
 	task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
 	if (task == NULL)
 		return (0);
 
 	id = taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 	    task, TQ_SLEEP);
 	/*
 	 * zvol_task_cb() frees the task when it runs. If the dispatch
 	 * failed it never will, so release the task here.
 	 */
 	if (id == TASKQID_INVALID)
 		zvol_task_free(task);
 	return (0);
 }
 
 /*
  * Sync callback of the volmode sync task.
  *
  * The new value is stored with dsl_prop_set_sync_impl() on the toplevel
  * dataset only. Every child dataset is then visited, and the callback
  * reads the effective "volmode" of each one, because the value is not
  * guaranteed to be the same in the whole dataset hierarchy.
  */
 static void
 zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
 {
 	zvol_set_prop_int_arg_t *zsda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 
 	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 	zsda->zsda_tx = tx;
 
 	/* Record the property if the name resolves to a dataset. */
 	if (dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds) == 0) {
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
 		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 		    &zsda->zsda_value, zsda->zsda_tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	/* Queue the per-dataset follow-up work. */
 	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
 	    zsda, DS_FIND_CHILDREN);
 
 	dsl_dir_rele(dd, FTAG);
 }
 
 /*
  * Set the "volmode" property on 'ddname' by running the volmode sync
  * task. Returns the result of dsl_sync_task().
  */
 int
 zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
 {
 	/* zsda_tx is filled in later by zvol_set_volmode_sync(). */
 	zvol_set_prop_int_arg_t zsda = {
 		.zsda_name = ddname,
 		.zsda_value = volmode,
 		.zsda_source = source,
 	};
 
 	return (dsl_sync_task(ddname, zvol_set_volmode_check,
 	    zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
  * Queue removal of the minors for 'name' (and everything below it) on
  * the pool's spa_zvol_taskq.
  *
  * When 'async' is B_FALSE the call waits for the queued task to finish.
  * Nothing is queued for hidden names, i.e. when zvol_task_alloc()
  * returns NULL.
  */
 void
 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
 	task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
 	if (task == NULL)
 		return;
 
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if (id == TASKQID_INVALID) {
 		/*
 		 * The task was never queued, so zvol_task_cb() will not
 		 * run to free it; release it here instead of leaking it.
 		 */
 		zvol_task_free(task);
 		return;
 	}
 
 	if (async == B_FALSE)
 		taskq_wait_id(spa->spa_zvol_taskq, id);
 }
 
 /*
  * Queue a rename of the minors for 'name1' (and everything below it) to
  * 'name2' on the pool's spa_zvol_taskq.
  *
  * When 'async' is B_FALSE the call waits for the queued task to finish.
  * Nothing is queued for hidden names, i.e. when zvol_task_alloc()
  * returns NULL.
  */
 void
 zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
     boolean_t async)
 {
 	zvol_task_t *task;
 	taskqid_t id;
 
 	task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
 	if (task == NULL)
 		return;
 
 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 	if (id == TASKQID_INVALID) {
 		/*
 		 * The task was never queued, so zvol_task_cb() will not
 		 * run to free it; release it here instead of leaking it.
 		 */
 		zvol_task_free(task);
 		return;
 	}
 
 	if (async == B_FALSE)
 		taskq_wait_id(spa->spa_zvol_taskq, id);
 }
 
 /*
  * Report whether 'name' refers to a zvol; the decision is delegated
  * entirely to the platform-specific zvol_os_is_zvol().
  */
 boolean_t
 zvol_is_zvol(const char *name)
 {
 
 	return (zvol_os_is_zvol(name));
 }
 
 /*
  * Set up the global zvol bookkeeping: the zvol_state_list, the
  * zvol_state_lock protecting it, and the zvol_htable hash table with
  * ZVOL_HT_SIZE empty buckets. Always returns 0.
  */
 int
 zvol_init_impl(void)
 {
 	list_create(&zvol_state_list, sizeof (zvol_state_t),
 	    offsetof(zvol_state_t, zv_next));
 	rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
 
 	zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
 	    KM_SLEEP);
 
 	/* kmem_alloc() memory is not zeroed; initialize every bucket. */
 	for (int bucket = 0; bucket < ZVOL_HT_SIZE; bucket++)
 		INIT_HLIST_HEAD(&zvol_htable[bucket]);
 
 	return (0);
 }
 
 /*
  * Tear down what zvol_init_impl() set up: remove every remaining minor,
  * wait for the resulting frees to complete, then release the hash table,
  * the state list and the state lock.
  */
 void
 zvol_fini_impl(void)
 {
 	/* A NULL name matches every zvol. */
 	zvol_remove_minors_impl(NULL);
 
 	/*
 	 * The call to "zvol_remove_minors_impl" may dispatch entries to
 	 * the system_taskq, but it doesn't wait for those entries to
 	 * complete before it returns. Thus, we must wait for all of the
 	 * removals to finish, before we can continue.
 	 */
 	taskq_wait_outstanding(system_taskq, 0);
 
 	/* The size must match the kmem_alloc() in zvol_init_impl(). */
 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 	list_destroy(&zvol_state_list);
 	rw_destroy(&zvol_state_lock);
 }